From 840190629e39f1040b69d81c75c0518d1c506c17 Mon Sep 17 00:00:00 2001 From: rudeboybert Date: Sat, 3 Feb 2018 11:35:03 -0500 Subject: [PATCH] v0.3.0 is a go! --- bib/packages.bib | 37 +- docs/10-hypo.html | 56 +- docs/11-inference-for-regression.html | 12 +- docs/12-thinking-with-data.html | 14 +- docs/2-ci.html | 653 -------- docs/2-getting-started.html | 32 +- docs/2-regression.html | 1444 ----------------- docs/2-sampling.html | 669 -------- docs/3-hypo.html | 1420 ---------------- docs/3-tidy.html | 0 docs/3-viz.html | 16 +- docs/4-tidy.html | 66 +- docs/4-wrangling.html | 0 docs/5-multiple-regression.html | 0 docs/5-wrangling.html | 102 +- docs/6-ci.html | 0 docs/6-regression.html | 16 +- docs/7-hypo.html | 0 docs/7-multiple-regression.html | 12 +- docs/8-inference-for-regression.html | 0 docs/8-sampling.html | 12 +- docs/9-ci.html | 58 +- docs/A-appendixA.html | 12 +- docs/B-appendixB.html | 12 +- docs/C-appendixC.html | 14 +- docs/images/shinyapp.png | Bin 203558 -> 0 bytes docs/images/soup.jpg | Bin 409716 -> 0 bytes docs/images/temp.png | Bin 8003 -> 0 bytes docs/index.html | 14 +- .../figure-html/2numxplot1-1.png | Bin 172913 -> 172925 bytes .../figure-html/2numxplot4-1.png | Bin 196655 -> 196667 bytes .../figure-html/2numxplot5-1.png | Bin 118867 -> 118879 bytes docs/ismaykim_files/figure-html/alpha-1.png | Bin 102925 -> 102937 bytes docs/ismaykim_files/figure-html/badbox-1.png | Bin 32674 -> 32686 bytes docs/ismaykim_files/figure-html/bar-1.png | Bin 36738 -> 36750 bytes docs/ismaykim_files/figure-html/boxplot-1.png | Bin 47612 -> 47624 bytes .../figure-html/carrierpie-1.png | Bin 104206 -> 104218 bytes .../figure-html/catxplot0b-1.png | Bin 75576 -> 75588 bytes .../figure-html/catxplot1-1.png | Bin 57748 -> 57760 bytes .../figure-html/catxplot7-1.png | Bin 71854 -> 71866 bytes .../figure-html/catxplot8-1.png | Bin 41672 -> 41684 bytes .../figure-html/correlation1-1.png | Bin 101049 -> 101061 bytes .../figure-html/correlation2-1.png | Bin 138069 -> 138081 bytes .../figure-html/credit-limit-quartiles-1.png | Bin 42734 -> 42746 bytes docs/ismaykim_files/figure-html/do-plot-1.png | Bin 39928 -> 0 bytes .../figure-html/facet-bar-vert-1.png | Bin 112485 -> 112497 bytes .../figure-html/facethistogram-1.png | Bin 92627 -> 92639 bytes .../figure-html/flightsbar-1.png | Bin 35598 -> 35610 bytes .../figure-html/flightscol-1.png | Bin 35907 -> 35919 bytes .../figure-html/gapminder-1.png | Bin 122162 -> 122174 bytes docs/ismaykim_files/figure-html/geombar-1.png | Bin 23854 -> 23866 bytes docs/ismaykim_files/figure-html/geomcol-1.png | Bin 24217 -> 24229 bytes .../ismaykim_files/figure-html/guatline-1.png | Bin 79364 -> 79376 bytes .../figure-html/height-hist-1.png | Bin 35759 -> 0 bytes .../figure-html/height-hist2-1.png | Bin 38407 -> 0 bytes docs/ismaykim_files/figure-html/here-1.png | Bin 120937 -> 120949 bytes docs/ismaykim_files/figure-html/hist-1.png | Bin 49053 -> 49065 bytes docs/ismaykim_files/figure-html/hist1a-1.png | Bin 35687 -> 35699 bytes docs/ismaykim_files/figure-html/hist1b-1.png | Bin 37060 -> 37072 bytes .../figure-html/hourlytemp-1.png | Bin 91262 -> 89230 bytes docs/ismaykim_files/figure-html/jitter-1.png | Bin 97544 -> 102556 bytes docs/ismaykim_files/figure-html/model1-1.png | Bin 159369 -> 159381 bytes .../figure-html/model1_residuals_hist-1.png | Bin 37624 -> 37636 bytes docs/ismaykim_files/figure-html/model2-1.png | Bin 159193 -> 159205 bytes .../figure-html/model3-residuals-hist-1.png | Bin 40468 -> 40480 bytes .../figure-html/monthtempbox-1.png | Bin 52160 -> 
52172 bytes .../figure-html/monthtempbox2-1.png | Bin 34986 -> 34998 bytes .../figure-html/monthtempbox3-1.png | Bin 136347 -> 136426 bytes .../figure-html/movie-hist-1.png | Bin 49144 -> 49156 bytes docs/ismaykim_files/figure-html/noalpha-1.png | Bin 82821 -> 82833 bytes .../ismaykim_files/figure-html/nolayers-1.png | Bin 38175 -> 38187 bytes .../figure-html/numxcatxplot1-1.png | Bin 160198 -> 160210 bytes .../figure-html/numxcatxplot2-1.png | Bin 158385 -> 158397 bytes .../figure-html/numxplot1-1.png | Bin 112127 -> 112139 bytes .../figure-html/numxplot2-1.png | Bin 132442 -> 132454 bytes .../figure-html/numxplot3-1.png | Bin 120654 -> 120666 bytes .../figure-html/numxplot4-1.png | Bin 117580 -> 117592 bytes .../figure-html/numxplot5-1.png | Bin 118489 -> 118501 bytes .../figure-html/numxplot6-1.png | Bin 106145 -> 106157 bytes .../figure-html/numxplot7-1.png | Bin 108079 -> 108091 bytes .../figure-html/numxplot9-1.png | Bin 54069 -> 54081 bytes .../figure-html/plot-sample1-1.png | Bin 35300 -> 0 bytes docs/ismaykim_files/figure-html/qqplot1-1.png | Bin 52528 -> 52540 bytes .../figure-html/qqplotmean-1.png | Bin 51626 -> 51638 bytes .../figure-html/resid-histogram-1.png | Bin 35060 -> 35072 bytes .../figure-html/resid-plot-1.png | Bin 51183 -> 51195 bytes .../figure-html/residual1-1.png | Bin 37758 -> 37770 bytes .../figure-html/residual2-1.png | Bin 118049 -> 118061 bytes .../figure-html/sample-profiles2-1.png | Bin 37426 -> 0 bytes .../figure-html/sample-profiles3-1.png | Bin 35861 -> 0 bytes .../figure-html/sampling-distribution-1-1.png | Bin 56169 -> 0 bytes .../sampling-distribution-virtual-1.png | Bin 53939 -> 53951 bytes .../sampling-distribution-virtual-2-1.png | Bin 64320 -> 64332 bytes .../figure-html/samplingdistribution-1.png | Bin 56223 -> 56235 bytes .../figure-html/stacked_bar-1.png | Bin 43827 -> 43839 bytes .../figure-html/unnamed-chunk-10-1.png | Bin 40122 -> 0 bytes .../figure-html/unnamed-chunk-109-1.png | Bin 41616 -> 0 bytes .../figure-html/unnamed-chunk-111-1.png | Bin 41616 -> 0 bytes .../figure-html/unnamed-chunk-112-1.png | Bin 41604 -> 41616 bytes .../figure-html/unnamed-chunk-12-1.png | Bin 36009 -> 0 bytes .../figure-html/unnamed-chunk-13-1.png | Bin 36009 -> 0 bytes .../figure-html/unnamed-chunk-160-1.png | Bin 53315 -> 0 bytes .../figure-html/unnamed-chunk-162-1.png | Bin 53315 -> 0 bytes .../figure-html/unnamed-chunk-163-1.png | Bin 53315 -> 0 bytes .../figure-html/unnamed-chunk-167-1.png | Bin 53315 -> 0 bytes .../figure-html/unnamed-chunk-17-1.png | Bin 39041 -> 0 bytes .../figure-html/unnamed-chunk-171-1.png | Bin 144682 -> 0 bytes .../figure-html/unnamed-chunk-172-1.png | Bin 53303 -> 53315 bytes .../figure-html/unnamed-chunk-173-1.png | Bin 144682 -> 0 bytes .../figure-html/unnamed-chunk-174-1.png | Bin 144682 -> 0 bytes .../figure-html/unnamed-chunk-175-1.png | Bin 148057 -> 0 bytes .../figure-html/unnamed-chunk-178-1.png | Bin 118996 -> 0 bytes .../figure-html/unnamed-chunk-179-1.png | Bin 120090 -> 0 bytes .../figure-html/unnamed-chunk-18-1.png | Bin 56976 -> 0 bytes .../figure-html/unnamed-chunk-188-1.png | Bin 118984 -> 118996 bytes .../figure-html/unnamed-chunk-189-1.png | Bin 120078 -> 120090 bytes .../figure-html/unnamed-chunk-193-1.png | Bin 150122 -> 0 bytes .../figure-html/unnamed-chunk-195-1.png | Bin 150122 -> 0 bytes .../figure-html/unnamed-chunk-196-1.png | Bin 150122 -> 0 bytes .../figure-html/unnamed-chunk-20-1.png | Bin 39041 -> 0 bytes .../figure-html/unnamed-chunk-200-1.png | Bin 150122 -> 0 bytes 
.../figure-html/unnamed-chunk-205-1.png | Bin 172925 -> 0 bytes .../figure-html/unnamed-chunk-207-1.png | Bin 172925 -> 0 bytes .../figure-html/unnamed-chunk-208-1.png | Bin 172925 -> 0 bytes .../figure-html/unnamed-chunk-21-1.png | Bin 39041 -> 0 bytes .../figure-html/unnamed-chunk-212-1.png | Bin 172925 -> 0 bytes .../figure-html/unnamed-chunk-214-1.png | Bin 150110 -> 150122 bytes .../figure-html/unnamed-chunk-220-1.png | Bin 38087 -> 0 bytes .../figure-html/unnamed-chunk-221-1.png | Bin 41221 -> 0 bytes .../figure-html/unnamed-chunk-222-1.png | Bin 38087 -> 0 bytes .../figure-html/unnamed-chunk-223-1.png | Bin 38087 -> 0 bytes .../figure-html/unnamed-chunk-224-1.png | Bin 41221 -> 0 bytes .../figure-html/unnamed-chunk-226-1.png | Bin 40122 -> 0 bytes .../figure-html/unnamed-chunk-227-1.png | Bin 38087 -> 0 bytes .../figure-html/unnamed-chunk-228-1.png | Bin 172913 -> 172925 bytes .../figure-html/unnamed-chunk-229-1.png | Bin 40122 -> 0 bytes .../figure-html/unnamed-chunk-23-1.png | Bin 38087 -> 0 bytes .../figure-html/unnamed-chunk-231-1.png | Bin 36009 -> 0 bytes .../figure-html/unnamed-chunk-232-1.png | Bin 36009 -> 0 bytes .../figure-html/unnamed-chunk-233-1.png | Bin 40122 -> 0 bytes .../figure-html/unnamed-chunk-236-1.png | Bin 36009 -> 0 bytes .../figure-html/unnamed-chunk-237-1.png | Bin 39041 -> 0 bytes .../figure-html/unnamed-chunk-239-1.png | Bin 39041 -> 0 bytes .../figure-html/unnamed-chunk-24-1.png | Bin 41221 -> 0 bytes .../figure-html/unnamed-chunk-240-1.png | Bin 39041 -> 0 bytes .../figure-html/unnamed-chunk-243-1.png | Bin 38087 -> 0 bytes .../figure-html/unnamed-chunk-244-1.png | Bin 41221 -> 0 bytes .../figure-html/unnamed-chunk-249-1.png | Bin 39166 -> 39178 bytes .../figure-html/unnamed-chunk-251-1.png | Bin 40791 -> 0 bytes .../figure-html/unnamed-chunk-252-1.png | Bin 40110 -> 40122 bytes .../figure-html/unnamed-chunk-253-1.png | Bin 40791 -> 0 bytes .../figure-html/unnamed-chunk-254-1.png | Bin 40791 -> 0 bytes .../figure-html/unnamed-chunk-255-1.png | Bin 35997 -> 36009 bytes .../figure-html/unnamed-chunk-258-1.png | Bin 40791 -> 0 bytes .../figure-html/unnamed-chunk-26-1.png | Bin 16276 -> 16288 bytes .../figure-html/unnamed-chunk-260-1.png | Bin 39041 -> 0 bytes .../figure-html/unnamed-chunk-262-1.png | Bin 37571 -> 0 bytes .../figure-html/unnamed-chunk-263-1.png | Bin 39029 -> 39041 bytes .../figure-html/unnamed-chunk-264-1.png | Bin 37571 -> 0 bytes .../figure-html/unnamed-chunk-265-1.png | Bin 37571 -> 0 bytes .../figure-html/unnamed-chunk-268-1.png | Bin 41221 -> 0 bytes .../figure-html/unnamed-chunk-269-1.png | Bin 37571 -> 0 bytes .../figure-html/unnamed-chunk-27-1.png | Bin 35995 -> 36007 bytes .../figure-html/unnamed-chunk-270-1.png | Bin 41221 -> 0 bytes .../figure-html/unnamed-chunk-271-1.png | Bin 41221 -> 0 bytes .../figure-html/unnamed-chunk-272-1.png | Bin 46784 -> 0 bytes .../figure-html/unnamed-chunk-273-1.png | Bin 46784 -> 0 bytes .../figure-html/unnamed-chunk-274-1.png | Bin 40791 -> 0 bytes .../figure-html/unnamed-chunk-275-1.png | Bin 41221 -> 0 bytes .../figure-html/unnamed-chunk-276-1.png | Bin 40179 -> 0 bytes .../figure-html/unnamed-chunk-277-1.png | Bin 40779 -> 40791 bytes .../figure-html/unnamed-chunk-278-1.png | Bin 40179 -> 0 bytes .../figure-html/unnamed-chunk-279-1.png | Bin 40179 -> 0 bytes .../figure-html/unnamed-chunk-28-1.png | Bin 44775 -> 44787 bytes .../figure-html/unnamed-chunk-280-1.png | Bin 44691 -> 0 bytes .../figure-html/unnamed-chunk-281-1.png | Bin 34838 -> 0 bytes .../figure-html/unnamed-chunk-282-1.png | Bin 
34838 -> 0 bytes .../figure-html/unnamed-chunk-283-1.png | Bin 37164 -> 37176 bytes .../figure-html/unnamed-chunk-285-1.png | Bin 37571 -> 0 bytes .../figure-html/unnamed-chunk-286-1.png | Bin 34838 -> 0 bytes .../figure-html/unnamed-chunk-287-1.png | Bin 44691 -> 0 bytes .../figure-html/unnamed-chunk-289-1.png | Bin 35997 -> 36009 bytes .../figure-html/unnamed-chunk-29-1.png | Bin 45218 -> 45230 bytes .../figure-html/unnamed-chunk-291-1.png | Bin 41221 -> 0 bytes .../figure-html/unnamed-chunk-292-1.png | Bin 51019 -> 0 bytes .../figure-html/unnamed-chunk-293-1.png | Bin 46784 -> 0 bytes .../figure-html/unnamed-chunk-294-1.png | Bin 40791 -> 0 bytes .../figure-html/unnamed-chunk-295-1.png | Bin 41132 -> 41144 bytes .../figure-html/unnamed-chunk-296-1.png | Bin 46006 -> 0 bytes .../figure-html/unnamed-chunk-297-1.png | Bin 46663 -> 46675 bytes .../figure-html/unnamed-chunk-298-1.png | Bin 40791 -> 0 bytes .../figure-html/unnamed-chunk-299-1.png | Bin 40179 -> 0 bytes .../figure-html/unnamed-chunk-3-1.png | Bin 40561 -> 0 bytes .../figure-html/unnamed-chunk-30-1.png | Bin 45351 -> 45363 bytes .../figure-html/unnamed-chunk-300-1.png | Bin 46006 -> 0 bytes .../figure-html/unnamed-chunk-301-1.png | Bin 38593 -> 0 bytes .../figure-html/unnamed-chunk-302-1.png | Bin 34838 -> 0 bytes .../figure-html/unnamed-chunk-303-1.png | Bin 40167 -> 40179 bytes .../figure-html/unnamed-chunk-304-1.png | Bin 40791 -> 0 bytes .../figure-html/unnamed-chunk-305-1.png | Bin 67468 -> 0 bytes .../figure-html/unnamed-chunk-306-1.png | Bin 34826 -> 34838 bytes .../figure-html/unnamed-chunk-307-1.png | Bin 44679 -> 44691 bytes .../figure-html/unnamed-chunk-308-1.png | Bin 38593 -> 0 bytes .../figure-html/unnamed-chunk-309-1.png | Bin 67468 -> 0 bytes .../figure-html/unnamed-chunk-31-1.png | Bin 40791 -> 0 bytes .../figure-html/unnamed-chunk-311-1.png | Bin 64119 -> 0 bytes .../figure-html/unnamed-chunk-312-1.png | Bin 45705 -> 0 bytes .../figure-html/unnamed-chunk-313-1.png | Bin 38321 -> 0 bytes .../figure-html/unnamed-chunk-314-1.png | Bin 40791 -> 0 bytes .../figure-html/unnamed-chunk-315-1.png | Bin 51019 -> 0 bytes .../figure-html/unnamed-chunk-316-1.png | Bin 46006 -> 0 bytes .../figure-html/unnamed-chunk-317-1.png | Bin 70702 -> 0 bytes .../figure-html/unnamed-chunk-318-1.png | Bin 40779 -> 40791 bytes .../figure-html/unnamed-chunk-319-1.png | Bin 51007 -> 51019 bytes .../figure-html/unnamed-chunk-32-1.png | Bin 58302 -> 0 bytes .../figure-html/unnamed-chunk-320-1.png | Bin 45994 -> 46006 bytes .../figure-html/unnamed-chunk-322-1.png | Bin 70702 -> 0 bytes .../figure-html/unnamed-chunk-323-1.png | Bin 60577 -> 60589 bytes .../figure-html/unnamed-chunk-324-1.png | Bin 40779 -> 40791 bytes .../figure-html/unnamed-chunk-325-1.png | Bin 67468 -> 0 bytes .../figure-html/unnamed-chunk-327-1.png | Bin 64119 -> 0 bytes .../figure-html/unnamed-chunk-328-1.png | Bin 38581 -> 38593 bytes .../figure-html/unnamed-chunk-329-1.png | Bin 67456 -> 67468 bytes .../figure-html/unnamed-chunk-330-1.png | Bin 42448 -> 0 bytes .../figure-html/unnamed-chunk-331-1.png | Bin 64107 -> 64119 bytes .../figure-html/unnamed-chunk-332-1.png | Bin 42448 -> 0 bytes .../figure-html/unnamed-chunk-333-1.png | Bin 42448 -> 0 bytes .../figure-html/unnamed-chunk-334-1.png | Bin 38321 -> 0 bytes .../figure-html/unnamed-chunk-335-1.png | Bin 45705 -> 0 bytes .../figure-html/unnamed-chunk-336-1.png | Bin 43879 -> 0 bytes .../figure-html/unnamed-chunk-337-1.png | Bin 42448 -> 0 bytes .../figure-html/unnamed-chunk-338-1.png | Bin 38309 -> 38321 bytes 
.../figure-html/unnamed-chunk-339-1.png | Bin 45693 -> 45705 bytes .../figure-html/unnamed-chunk-34-1.png | Bin 40791 -> 0 bytes .../figure-html/unnamed-chunk-340-1.png | Bin 40371 -> 0 bytes .../figure-html/unnamed-chunk-341-1.png | Bin 43879 -> 0 bytes .../figure-html/unnamed-chunk-342-1.png | Bin 70690 -> 70702 bytes .../figure-html/unnamed-chunk-343-1.png | Bin 39602 -> 0 bytes .../figure-html/unnamed-chunk-344-1.png | Bin 40371 -> 0 bytes .../figure-html/unnamed-chunk-345-1.png | Bin 41564 -> 0 bytes .../figure-html/unnamed-chunk-346-1.png | Bin 40808 -> 0 bytes .../figure-html/unnamed-chunk-347-1.png | Bin 40192 -> 0 bytes .../figure-html/unnamed-chunk-348-1.png | Bin 39602 -> 0 bytes .../figure-html/unnamed-chunk-349-1.png | Bin 40808 -> 0 bytes .../figure-html/unnamed-chunk-35-1.png | Bin 40791 -> 0 bytes .../figure-html/unnamed-chunk-351-1.png | Bin 42912 -> 0 bytes .../figure-html/unnamed-chunk-352-1.png | Bin 40192 -> 0 bytes .../figure-html/unnamed-chunk-353-1.png | Bin 42448 -> 0 bytes .../figure-html/unnamed-chunk-354-1.png | Bin 39309 -> 0 bytes .../figure-html/unnamed-chunk-355-1.png | Bin 42113 -> 0 bytes .../figure-html/unnamed-chunk-356-1.png | Bin 42912 -> 0 bytes .../figure-html/unnamed-chunk-357-1.png | Bin 42436 -> 0 bytes .../figure-html/unnamed-chunk-358-1.png | Bin 42436 -> 42448 bytes .../figure-html/unnamed-chunk-359-1.png | Bin 39297 -> 39309 bytes .../figure-html/unnamed-chunk-360-1.png | Bin 40371 -> 0 bytes .../figure-html/unnamed-chunk-361-1.png | Bin 43867 -> 0 bytes .../figure-html/unnamed-chunk-362-1.png | Bin 43867 -> 43879 bytes .../figure-html/unnamed-chunk-363-1.png | Bin 39041 -> 0 bytes .../figure-html/unnamed-chunk-364-1.png | Bin 40359 -> 0 bytes .../figure-html/unnamed-chunk-365-1.png | Bin 40359 -> 40371 bytes .../figure-html/unnamed-chunk-366-1.png | Bin 41552 -> 41564 bytes .../figure-html/unnamed-chunk-367-1.png | Bin 41097 -> 0 bytes .../figure-html/unnamed-chunk-368-1.png | Bin 39590 -> 0 bytes .../figure-html/unnamed-chunk-369-1.png | Bin 39590 -> 39602 bytes .../figure-html/unnamed-chunk-37-1.png | Bin 37176 -> 0 bytes .../figure-html/unnamed-chunk-370-1.png | Bin 44339 -> 0 bytes .../figure-html/unnamed-chunk-371-1.png | Bin 41026 -> 0 bytes .../figure-html/unnamed-chunk-372-1.png | Bin 40180 -> 0 bytes .../figure-html/unnamed-chunk-373-1.png | Bin 40180 -> 40192 bytes .../figure-html/unnamed-chunk-374-1.png | Bin 40796 -> 40808 bytes .../figure-html/unnamed-chunk-375-1.png | Bin 42113 -> 0 bytes .../figure-html/unnamed-chunk-376-1.png | Bin 42900 -> 0 bytes .../figure-html/unnamed-chunk-377-1.png | Bin 42900 -> 42912 bytes .../figure-html/unnamed-chunk-378-1.png | Bin 44351 -> 0 bytes .../figure-html/unnamed-chunk-379-1.png | Bin 42101 -> 0 bytes .../figure-html/unnamed-chunk-380-1.png | Bin 42101 -> 42113 bytes .../figure-html/unnamed-chunk-381-1.png | Bin 42521 -> 42533 bytes .../figure-html/unnamed-chunk-383-1.png | Bin 39029 -> 0 bytes .../figure-html/unnamed-chunk-384-1.png | Bin 39029 -> 39041 bytes .../figure-html/unnamed-chunk-387-1.png | Bin 41085 -> 0 bytes .../figure-html/unnamed-chunk-388-1.png | Bin 41085 -> 41097 bytes .../figure-html/unnamed-chunk-389-1.png | Bin 41154 -> 41166 bytes .../figure-html/unnamed-chunk-391-1.png | Bin 41014 -> 0 bytes .../figure-html/unnamed-chunk-392-1.png | Bin 41014 -> 41026 bytes .../figure-html/unnamed-chunk-394-1.png | Bin 44351 -> 0 bytes .../figure-html/unnamed-chunk-398-1.png | Bin 44339 -> 0 bytes .../figure-html/unnamed-chunk-399-1.png | Bin 44339 -> 44351 bytes 
.../figure-html/unnamed-chunk-40-1.png | Bin 126307 -> 0 bytes .../figure-html/unnamed-chunk-41-1.png | Bin 127482 -> 0 bytes .../figure-html/unnamed-chunk-42-1.png | Bin 37516 -> 0 bytes .../figure-html/unnamed-chunk-43-1.png | Bin 36869 -> 0 bytes .../figure-html/unnamed-chunk-44-1.png | Bin 127482 -> 0 bytes .../figure-html/unnamed-chunk-45-1.png | Bin 37516 -> 0 bytes .../figure-html/unnamed-chunk-46-1.png | Bin 37516 -> 0 bytes .../figure-html/unnamed-chunk-48-1.png | Bin 41144 -> 0 bytes .../figure-html/unnamed-chunk-49-1.png | Bin 41144 -> 0 bytes .../figure-html/unnamed-chunk-50-1.png | Bin 46675 -> 0 bytes .../figure-html/unnamed-chunk-51-1.png | Bin 46675 -> 0 bytes .../figure-html/unnamed-chunk-52-1.png | Bin 41144 -> 0 bytes .../figure-html/unnamed-chunk-53-1.png | Bin 54094 -> 54106 bytes .../figure-html/unnamed-chunk-54-1.png | Bin 46675 -> 0 bytes .../figure-html/unnamed-chunk-55-1.png | Bin 64627 -> 64639 bytes .../figure-html/unnamed-chunk-56-1.png | Bin 40179 -> 0 bytes .../figure-html/unnamed-chunk-57-1.png | Bin 40179 -> 0 bytes .../figure-html/unnamed-chunk-59-1.png | Bin 34838 -> 0 bytes .../figure-html/unnamed-chunk-6-1.png | Bin 40122 -> 0 bytes .../figure-html/unnamed-chunk-60-1.png | Bin 34838 -> 0 bytes .../figure-html/unnamed-chunk-61-1.png | Bin 44691 -> 0 bytes .../figure-html/unnamed-chunk-62-1.png | Bin 34838 -> 0 bytes .../figure-html/unnamed-chunk-63-1.png | Bin 44691 -> 0 bytes .../figure-html/unnamed-chunk-64-1.png | Bin 44691 -> 0 bytes .../figure-html/unnamed-chunk-66-1.png | Bin 47466 -> 0 bytes .../figure-html/unnamed-chunk-69-1.png | Bin 47454 -> 47466 bytes .../figure-html/unnamed-chunk-7-1.png | Bin 37297 -> 0 bytes .../figure-html/unnamed-chunk-71-1.png | Bin 40791 -> 0 bytes .../figure-html/unnamed-chunk-72-1.png | Bin 40791 -> 0 bytes .../figure-html/unnamed-chunk-73-1.png | Bin 51019 -> 0 bytes .../figure-html/unnamed-chunk-74-1.png | Bin 46006 -> 0 bytes .../figure-html/unnamed-chunk-75-1.png | Bin 51019 -> 0 bytes .../figure-html/unnamed-chunk-76-1.png | Bin 60589 -> 0 bytes .../figure-html/unnamed-chunk-77-1.png | Bin 60589 -> 0 bytes .../figure-html/unnamed-chunk-78-1.png | Bin 40791 -> 0 bytes .../figure-html/unnamed-chunk-79-1.png | Bin 60589 -> 0 bytes .../figure-html/unnamed-chunk-80-1.png | Bin 40791 -> 0 bytes .../figure-html/unnamed-chunk-81-1.png | Bin 38593 -> 0 bytes .../figure-html/unnamed-chunk-82-1.png | Bin 38593 -> 0 bytes .../figure-html/unnamed-chunk-83-1.png | Bin 67468 -> 0 bytes .../figure-html/unnamed-chunk-84-1.png | Bin 64119 -> 0 bytes .../figure-html/unnamed-chunk-85-1.png | Bin 64119 -> 0 bytes .../figure-html/unnamed-chunk-86-1.png | Bin 67468 -> 0 bytes .../figure-html/unnamed-chunk-87-1.png | Bin 64119 -> 0 bytes .../figure-html/unnamed-chunk-88-1.png | Bin 64119 -> 0 bytes .../figure-html/unnamed-chunk-9-1.png | Bin 36009 -> 0 bytes .../figure-html/unnamed-chunk-91-1.png | Bin 38321 -> 0 bytes .../figure-html/unnamed-chunk-92-1.png | Bin 38321 -> 0 bytes .../figure-html/unnamed-chunk-93-1.png | Bin 45705 -> 0 bytes .../figure-html/unnamed-chunk-94-1.png | Bin 38321 -> 0 bytes .../figure-html/unnamed-chunk-95-1.png | Bin 70702 -> 0 bytes .../figure-html/unnamed-chunk-96-1.png | Bin 70702 -> 0 bytes .../figure-html/unnamed-chunk-98-1.png | Bin 70702 -> 0 bytes .../figure-html/unnamed-chunk-99-1.png | Bin 70702 -> 0 bytes .../css/fontawesome/fontawesome-webfont.ttf | Bin 165548 -> 142072 bytes docs/libs/htmlwidgets-1.0/htmlwidgets.js | 836 ---------- docs/references.html | 20 +- docs/scripts/02-getting-started.R 
| 47 - docs/scripts/03-visualization.R | 290 ---- docs/scripts/04-tidy.R | 140 -- docs/scripts/05-wrangling.R | 192 --- docs/scripts/06-regression.R | 488 ------ docs/scripts/07-multiple-regression.R | 374 ----- docs/scripts/08-sampling.R | 104 -- docs/scripts/09-confidence-intervals.R | 146 -- docs/scripts/10-hypothesis-testing.R | 303 ---- docs/scripts/11-inference-for-regression.R | 71 - docs/scripts/12-thinking-with-data.R | 1 - docs/search_index.json | 18 +- 356 files changed, 245 insertions(+), 7456 deletions(-) delete mode 100644 docs/2-ci.html delete mode 100644 docs/2-regression.html delete mode 100644 docs/2-sampling.html delete mode 100644 docs/3-hypo.html delete mode 100644 docs/3-tidy.html delete mode 100644 docs/4-wrangling.html delete mode 100644 docs/5-multiple-regression.html delete mode 100644 docs/6-ci.html delete mode 100644 docs/7-hypo.html delete mode 100644 docs/8-inference-for-regression.html delete mode 100644 docs/images/shinyapp.png delete mode 100755 docs/images/soup.jpg delete mode 100644 docs/images/temp.png delete mode 100644 docs/ismaykim_files/figure-html/do-plot-1.png delete mode 100644 docs/ismaykim_files/figure-html/height-hist-1.png delete mode 100644 docs/ismaykim_files/figure-html/height-hist2-1.png delete mode 100644 docs/ismaykim_files/figure-html/plot-sample1-1.png delete mode 100644 docs/ismaykim_files/figure-html/sample-profiles2-1.png delete mode 100644 docs/ismaykim_files/figure-html/sample-profiles3-1.png delete mode 100644 docs/ismaykim_files/figure-html/sampling-distribution-1-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-10-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-109-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-111-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-12-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-13-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-160-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-162-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-163-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-167-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-17-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-171-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-173-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-174-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-175-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-178-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-179-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-18-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-193-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-195-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-196-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-20-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-200-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-205-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-207-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-208-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-21-1.png delete mode 100644 
docs/ismaykim_files/figure-html/unnamed-chunk-212-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-220-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-221-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-222-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-223-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-224-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-226-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-227-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-229-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-23-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-231-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-232-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-233-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-236-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-237-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-239-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-24-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-240-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-243-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-244-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-251-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-253-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-254-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-258-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-260-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-262-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-264-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-265-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-268-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-269-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-270-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-271-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-272-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-273-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-274-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-275-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-276-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-278-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-279-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-280-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-281-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-282-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-285-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-286-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-287-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-291-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-292-1.png delete mode 100644 
docs/ismaykim_files/figure-html/unnamed-chunk-293-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-294-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-296-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-298-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-299-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-3-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-300-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-301-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-302-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-304-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-305-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-308-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-309-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-31-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-311-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-312-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-313-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-314-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-315-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-316-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-317-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-32-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-322-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-325-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-327-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-330-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-332-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-333-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-334-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-335-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-336-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-337-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-34-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-340-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-341-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-343-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-344-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-345-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-346-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-347-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-348-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-349-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-35-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-351-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-352-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-353-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-354-1.png delete mode 100644 
docs/ismaykim_files/figure-html/unnamed-chunk-355-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-356-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-357-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-360-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-361-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-363-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-364-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-367-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-368-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-37-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-370-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-371-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-372-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-375-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-376-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-378-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-379-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-383-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-387-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-391-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-394-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-398-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-40-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-41-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-42-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-43-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-44-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-45-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-46-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-48-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-49-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-50-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-51-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-52-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-54-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-56-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-57-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-59-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-6-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-60-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-61-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-62-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-63-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-64-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-66-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-7-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-71-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-72-1.png 
delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-73-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-74-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-75-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-76-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-77-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-78-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-79-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-80-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-81-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-82-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-83-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-84-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-85-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-86-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-87-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-88-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-9-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-91-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-92-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-93-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-94-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-95-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-96-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-98-1.png delete mode 100644 docs/ismaykim_files/figure-html/unnamed-chunk-99-1.png delete mode 100644 docs/libs/htmlwidgets-1.0/htmlwidgets.js delete mode 100644 docs/scripts/02-getting-started.R delete mode 100644 docs/scripts/03-visualization.R delete mode 100644 docs/scripts/04-tidy.R delete mode 100644 docs/scripts/05-wrangling.R delete mode 100644 docs/scripts/06-regression.R delete mode 100644 docs/scripts/07-multiple-regression.R delete mode 100644 docs/scripts/08-sampling.R delete mode 100644 docs/scripts/09-confidence-intervals.R delete mode 100644 docs/scripts/10-hypothesis-testing.R delete mode 100644 docs/scripts/11-inference-for-regression.R delete mode 100644 docs/scripts/12-thinking-with-data.R diff --git a/bib/packages.bib b/bib/packages.bib index ccceb8ccb..c79b71547 100755 --- a/bib/packages.bib +++ b/bib/packages.bib @@ -9,15 +9,15 @@ @Manual{R-base @Manual{R-bookdown, title = {bookdown: Authoring Books and Technical Documents with R Markdown}, author = {Yihui Xie}, - year = {2018}, - note = {R package version 0.6}, + year = {2017}, + note = {R package version 0.5}, url = {https://CRAN.R-project.org/package=bookdown}, } @Manual{R-devtools, title = {devtools: Tools to Make Developing R Packages Easier}, author = {Hadley Wickham and Winston Chang}, year = {2017}, - note = {R package version 1.13.4}, + note = {R package version 1.13.2}, url = {https://CRAN.R-project.org/package=devtools}, } @Manual{R-dplyr, @@ -35,18 +35,17 @@ @Manual{R-dygraphs url = {https://CRAN.R-project.org/package=dygraphs}, } @Manual{R-fivethirtyeight, - title = {fivethirtyeight: Data and Code Behind the Stories and Interactives at -'FiveThirtyEight'}, - author = {Chester Ismay and Jennifer Chunn}, + title = {fivethirtyeight: Data and Code Behind the Stories and 
Interactives at 'FiveThirtyEight'}, + author = {Albert Y. Kim and Chester Ismay and Jennifer Chunn}, + note = {R package version 0.3.0.9000}, + url = {https://github.com/rudeboybert/fivethirtyeight}, year = {2017}, - note = {R package version 0.3.0}, - url = {https://CRAN.R-project.org/package=fivethirtyeight}, } @Manual{R-ggplot2, title = {ggplot2: Create Elegant Data Visualisations Using the Grammar of Graphics}, author = {Hadley Wickham and Winston Chang}, note = {http://ggplot2.tidyverse.org, https://github.com/tidyverse/ggplot2}, - year = {2018}, + year = {2017}, } @Manual{R-ggplot2movies, title = {ggplot2movies: Movies Data}, @@ -58,15 +57,15 @@ @Manual{R-ggplot2movies @Manual{R-knitr, title = {knitr: A General-Purpose Package for Dynamic Report Generation in R}, author = {Yihui Xie}, - year = {2018}, - note = {R package version 1.19}, + year = {2017}, + note = {R package version 1.18}, url = {https://CRAN.R-project.org/package=knitr}, } @Manual{R-mosaic, title = {mosaic: Project MOSAIC Statistics and Mathematics Teaching Utilities}, author = {Randall Pruim and Daniel T. Kaplan and Nicholas J. Horton}, year = {2017}, - note = {R package version 1.1.1}, + note = {R package version 1.0.0}, url = {https://CRAN.R-project.org/package=mosaic}, } @Manual{R-nycflights13, @@ -101,28 +100,28 @@ @Manual{R-rmarkdown @Manual{R-tibble, title = {tibble: Simple Data Frames}, author = {Kirill Müller and Hadley Wickham}, - year = {2018}, - note = {R package version 1.4.2}, + year = {2017}, + note = {R package version 1.3.4}, url = {https://CRAN.R-project.org/package=tibble}, } @Manual{R-tidyr, title = {tidyr: Easily Tidy Data with 'spread()' and 'gather()' Functions}, author = {Hadley Wickham and Lionel Henry}, - year = {2018}, - note = {R package version 0.8.0}, + year = {2017}, + note = {R package version 0.7.2}, url = {https://CRAN.R-project.org/package=tidyr}, } @Manual{R-tufte, title = {tufte: Tufte's Styles for R Markdown Documents}, author = {Yihui Xie and JJ Allaire}, - year = {2018}, - note = {R package version 0.3}, + year = {2016}, + note = {R package version 0.2}, url = {https://CRAN.R-project.org/package=tufte}, } @Manual{R-webshot, title = {webshot: Take Screenshots of Web Pages}, author = {Winston Chang}, year = {2017}, - note = {R package version 0.5.0}, + note = {R package version 0.4.1}, url = {https://CRAN.R-project.org/package=webshot}, } diff --git a/docs/10-hypo.html b/docs/10-hypo.html index ade5451f2..8d82bfbf1 100644 --- a/docs/10-hypo.html +++ b/docs/10-hypo.html @@ -7,7 +7,7 @@ An Introduction to Statistical and Data Sciences via R - + @@ -46,7 +46,7 @@ - + @@ -933,18 +933,18 @@

10.9.2 Comparing action and romance movies

The movies dataset in the ggplot2movies package contains information on a large number of movies that have been rated by users of IMDB.com (Wickham 2015). We are interested here in whether Action movies are rated higher on IMDB than Romance movies. We will first need to do a little data wrangling, using the ideas from Chapter 5, to get the data into the form we would like:

(movies_trimmed <- movies %>% select(title, year, rating, Action, Romance))
# A tibble: 58,788 x 5
-   title                     year rating Action Romance
-   <chr>                    <int>  <dbl>  <int>   <int>
- 1 $                         1971   6.40      0       0
- 2 $1000 a Touchdown         1939   6.00      0       0
- 3 $21 a Day Once a Month    1941   8.20      0       0
- 4 $40,000                   1996   8.20      0       0
- 5 $50,000 Climax Show, The  1975   3.40      0       0
- 6 $pent                     2000   4.30      0       0
- 7 $windle                   2002   5.30      1       0
- 8 '15'                      2002   6.70      0       0
- 9 '38                       1987   6.60      0       0
-10 '49-'17                   1917   6.00      0       0
+                      title  year rating Action Romance
+                      <chr> <int>  <dbl>  <int>   <int>
+ 1                        $  1971    6.4      0       0
+ 2        $1000 a Touchdown  1939    6.0      0       0
+ 3   $21 a Day Once a Month  1941    8.2      0       0
+ 4                  $40,000  1996    8.2      0       0
+ 5 $50,000 Climax Show, The  1975    3.4      0       0
+ 6                    $pent  2000    4.3      0       0
+ 7                  $windle  2002    5.3      1       0
+ 8                     '15'  2002    6.7      0       0
+ 9                      '38  1987    6.6      0       0
+10                  '49-'17  1917    6.0      0       0
 # ... with 58,778 more rows

Note that Action and Romance are binary variables here. To avoid any overlap (and potential confusion), we will remove movies that are both Action and Romance from our population:

movies_trimmed <- movies_trimmed %>%
@@ -1151,16 +1151,16 @@ 

10.9.9 Distribution of

 # A tibble: 10 x 2
    .index diffmean
     <dbl>    <dbl>
- 1   1.00 -0.132
- 2   2.00 -0.197
- 3   3.00 -0.0265
- 4   4.00  0.715
- 5   5.00 -0.474
- 6   6.00 -0.121
- 7   7.00 -0.174
- 8   8.00 -0.209
- 9   9.00 -0.00882
-10  10.0  -0.332
+ 1      1 -0.13235
+ 2      2 -0.19706
+ 3      3 -0.02647
+ 4      4  0.71471
+ 5      5 -0.47353
+ 6      6 -0.12059
+ 7      7 -0.17353
+ 8      8 -0.20882
+ 9      9 -0.00882
+10     10 -0.33235

We can now plot the distribution of these simulated differences in means:

ggplot(data = rand_distn, aes(x = diffmean)) +
   geom_histogram(color = "white", bins = 20)
@@ -1647,11 +1647,9 @@

10.13.2 Script of R code

(function () { var script = document.createElement("script"); script.type = "text/javascript"; - var src = ""; - if (src === "" || src === "true") src = "https://cdn.bootcss.com/mathjax/2.7.1/MathJax.js?config=TeX-MML-AM_CHTML"; - if (location.protocol !== "file:" && /^https?:/.test(src)) - src = src.replace(/^https?:/, ''); - script.src = src; + script.src = "https://cdn.bootcss.com/mathjax/2.7.1/MathJax.js?config=TeX-MML-AM_CHTML"; + if (location.protocol !== "file:" && /^https?:/.test(script.src)) + script.src = script.src.replace(/^https?:/, ''); document.getElementsByTagName("head")[0].appendChild(script); })(); diff --git a/docs/11-inference-for-regression.html b/docs/11-inference-for-regression.html index f8d6191ed..01d9e0cdf 100644 --- a/docs/11-inference-for-regression.html +++ b/docs/11-inference-for-regression.html @@ -7,7 +7,7 @@ An Introduction to Statistical and Data Sciences via R - + @@ -46,7 +46,7 @@ - + @@ -690,11 +690,9 @@

11.1.3 Script of R code

(function () { var script = document.createElement("script"); script.type = "text/javascript"; - var src = ""; - if (src === "" || src === "true") src = "https://cdn.bootcss.com/mathjax/2.7.1/MathJax.js?config=TeX-MML-AM_CHTML"; - if (location.protocol !== "file:" && /^https?:/.test(src)) - src = src.replace(/^https?:/, ''); - script.src = src; + script.src = "https://cdn.bootcss.com/mathjax/2.7.1/MathJax.js?config=TeX-MML-AM_CHTML"; + if (location.protocol !== "file:" && /^https?:/.test(script.src)) + script.src = script.src.replace(/^https?:/, ''); document.getElementsByTagName("head")[0].appendChild(script); })(); diff --git a/docs/12-thinking-with-data.html b/docs/12-thinking-with-data.html index dbbc37cdb..76a33eeb2 100644 --- a/docs/12-thinking-with-data.html +++ b/docs/12-thinking-with-data.html @@ -7,7 +7,7 @@ An Introduction to Statistical and Data Sciences via R - + @@ -46,7 +46,7 @@ - + @@ -489,7 +489,7 @@

12 Thinking with Data

12.1 Effective Data Storytelling

As we’ve progressed throughout this book, you’ve seen how to work with data in a variety of ways. You’ve learned effective strategies for plotting data by understanding which types of plots work best for which combinations of variable types. You’ve summarized data in table form and calculated summary statistics for a variety of different variables. Further, you’ve seen the value of inference as a process to come to conclusions about a population by using a random sample. Lastly, you’ve explored how to use linear regression and the importance of checking the conditions required to make it a valid procedure. All throughout, you’ve learned many computational techniques and focused on reproducible research in writing R code and keeping track of your work in R Markdown. All of these steps go into making a great story using data.

As the textbook comes to a close, we thought it best that you explore some of the stellar work being produced by data journalists around the world who specialize in effective data storytelling. We recommend you read and analyze this article by Walt Hickey entitled The Dollar-And-Cents Case Against Hollywood’s Exclusion of Women. As you read over it, think carefully about how Walt uses data, graphics, and analyses to paint a picture for the reader of the story he wants to tell.

-

In the spirit of reproducibility, the members of FiveThirtyEight have also shared the data that they used to create this story and some R code here. A vignette showing how to reproduce one of the plots at the end of the article using dplyr, ggplot2, and other packages in Hadley’s tidyverse is available here as part of the fivethirtyeight R package (Ismay and Chunn 2017). Great data stories don’t mislead the reader, but rather engulf them in understanding the importance that data plays in our lives through the captivation of storytelling.

+

In the spirit of reproducibility, the members of FiveThirtyEight have also shared the data that they used to create this story and some R code here. A vignette showing how to reproduce one of the plots at the end of the article using dplyr, ggplot2, and other packages in Hadley’s tidyverse is available here as part of the fivethirtyeight R package (Kim, Ismay, and Chunn 2017). Great data stories don’t mislead the reader, but rather engulf them in understanding the importance that data plays in our lives through the captivation of storytelling.

12.2 Examples

@@ -555,11 +555,9 @@

Concluding remarks

(function () { var script = document.createElement("script"); script.type = "text/javascript"; - var src = ""; - if (src === "" || src === "true") src = "https://cdn.bootcss.com/mathjax/2.7.1/MathJax.js?config=TeX-MML-AM_CHTML"; - if (location.protocol !== "file:" && /^https?:/.test(src)) - src = src.replace(/^https?:/, ''); - script.src = src; + script.src = "https://cdn.bootcss.com/mathjax/2.7.1/MathJax.js?config=TeX-MML-AM_CHTML"; + if (location.protocol !== "file:" && /^https?:/.test(script.src)) + script.src = script.src.replace(/^https?:/, ''); document.getElementsByTagName("head")[0].appendChild(script); })(); diff --git a/docs/2-ci.html b/docs/2-ci.html deleted file mode 100644 index ec4d870ab..000000000 --- a/docs/2-ci.html +++ /dev/null @@ -1,653 +0,0 @@ - - - - - - - - An Introduction to Statistical and Data Sciences via R - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- -
- -
- -
-
- - -
-
- -
- -ModernDive - -
-

2 Confidence Intervals

-
-
-

-Note: This chapter is still under construction. If you would like to contribute, please check us out on GitHub at https://github.com/moderndive/moderndive_book. -

-

-Please check out our sneak peek of infer below in the meantime. For more details on infer, visit https://infer.netlify.com/. -

-
-Drawing -
-
-
-
-

2.1 Sneak peek of infer

-
-Drawing -
-
    -
  • Question: Of all the cars in the mtcars dataset, do automatic cars get better gas mileage than manual cars?
  • Approach: 95% confidence interval for difference in means.
-
library(dplyr)
-library(ggplot2)
-library(infer)
-
-# Clean data
-mtcars <- mtcars %>%
-  as_tibble() %>% 
-  mutate(am = factor(am))
-
-# Simulate sampling distribution of two-sample difference in means:
-sampling_distribution <- mtcars %>%
-  specify(mpg ~ am) %>%
-  generate(reps = 1000, type = "bootstrap") %>%
-  calculate(stat = "diff in means", order = c("1", "0")) 
-
-# Compute 95% confidence interval:
-conf_int <- sampling_distribution %>% 
-  pull(stat) %>% 
-  quantile(probs = c(0.025, 0.975))
-
-# Visualize:
-plot <- sampling_distribution %>% 
-  visualize()
-plot +
-  geom_vline(xintercept = conf_int, col = "red", size = 1)
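As a usage note (our addition, not part of the original sneak peek), the bootstrap interval stored in conf_int can be checked against the group means computed directly with dplyr; the manual-minus-automatic difference of these two means should land near the middle of the interval.

# Observed mean mpg by transmission type (am), for comparison with conf_int:
mtcars %>%
  group_by(am) %>%
  summarize(mean_mpg = mean(mpg))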
-

-
-
-
-

2.2 Under construction…

-

Definition: Confidence Interval

-

A confidence interval gives a range of plausible values for a parameter. It depends on a specified confidence level: higher confidence levels correspond to wider confidence intervals and lower confidence levels to narrower ones. Common confidence levels include 90%, 95%, and 99%.

-

Usually we don’t just begin chapters with a definition, but confidence intervals are simple to define and play an important role in the sciences and any field that uses data. You can think of a confidence interval as playing the role of a net when fishing. Instead of just trying to catch a fish with a single spear (estimating an unknown parameter by using a single point estimate/statistic), we can use a net to try to provide a range of possible locations for the fish (use a range of possible values based around our statistic to make a plausible guess as to the location of the parameter).
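To make the fishing-net analogy concrete, here is a small simulation sketch of ours (not from the chapter): we repeatedly draw samples from a population with a known mean, build a 95% interval from each sample with t.test, and record how often the interval captures the truth. The population settings (mean 6, sd 1.5, samples of size 50) are illustrative assumptions.

# Sketch: how often does a 95% "net" catch the true mean? (made-up population)
set.seed(2018)
true_mean <- 6
captures <- replicate(1000, {
  x  <- rnorm(50, mean = true_mean, sd = 1.5)  # one hypothetical sample
  ci <- t.test(x, conf.level = 0.95)$conf.int  # theory-based 95% interval
  ci[1] <= true_mean && true_mean <= ci[2]     # did the interval catch the mean?
})
mean(captures)  # should be roughly 0.95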

-
-

Needed packages

-

Let’s load all the packages needed for this chapter (this assumes you’ve already installed them). If needed, read Section ?? for information on how to install and load R packages.

-
library(dplyr)
-library(ggplot2)
-library(infer)
-library(mosaic)
-library(knitr)
-library(ggplot2movies)
- -
-
-
-
-

2.3 Bootstrapping

-

Just as we did in Chapter 3 with the Lady Tasting Tea, where we made hypotheses about a population total and tested which one was more plausible, we can also use computation to infer conclusions about a population quantitative statistic such as the mean. In this case, we will focus on constructing confidence intervals to produce plausible values for a population mean. (We can do a similar analysis for a population median or other summary measure as well.)

-

Traditionally, the way to construct confidence intervals for a mean is to assume a normal distribution for the population or to invoke the Central Limit Theorem and get what often appear to be magical results. (This is similar to what was done in Section 3.10.) These methods are often not intuitive, especially for those who lack a strong mathematical background. They also come with their fair share of assumptions and often turn Statistics, a field full of useful applications across many different disciplines, into a robotic, procedure-based topic. It doesn’t have to be that way!
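For contrast with the computational approach developed below, here is a rough sketch of that traditional formula-based interval, x̄ ± t*·s/√n, applied to a small made-up sample; it is our illustration, not code from the chapter.

# Traditional theory-based 95% interval for a mean: x_bar +/- t_star * s / sqrt(n)
# (the sample values below are made up for illustration)
x      <- c(5.9, 6.4, 4.8, 7.1, 6.0, 5.2, 6.8, 5.5, 6.3, 5.7)
n      <- length(x)
x_bar  <- mean(x)
se     <- sd(x) / sqrt(n)
t_star <- qt(0.975, df = n - 1)  # critical value for 95% confidence
c(lower = x_bar - t_star * se, upper = x_bar + t_star * se)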

-

In this section, we will introduce the concept of bootstrapping. It will be a useful tool for estimating the variability of our statistic from sample to sample. One neat feature of bootstrapping is that it enables us to approximate the sampling distribution and estimate the distribution’s standard deviation using ONLY the information in the one selected (original) sample. At first glance this may sound just as plagued by the seemingly magical qualities of traditional theory-based inference, but we will see that it provides an intuitive and useful way to make inferences, especially when the samples are of medium to large size.
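Before turning to the chapter's mosaic-based workflow, here is a compact base R sketch of the same idea (ours, using a made-up sample): resample the observed values with replacement, recompute the mean each time, and look at the spread of those resampled means.

# Bootstrapping in a few lines of base R: resample with replacement,
# recompute the mean, repeat, then inspect the spread of the resampled means.
set.seed(2018)
x <- c(5.9, 6.4, 4.8, 7.1, 6.0, 5.2, 6.8, 5.5, 6.3, 5.7)  # made-up sample
boot_means <- replicate(5000, mean(sample(x, size = length(x), replace = TRUE)))
sd(boot_means)                           # bootstrap estimate of the standard error
quantile(boot_means, c(0.025, 0.975))    # a percentile-based 95% interval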

-

To introduce the concept of bootstrapping, we will again use the movies dataset in the ggplot2movies package. Recall that you can also glance at this data frame using the View function and look at the help documentation for movies using the ? function. We will explore many other features of this dataset in the chapters to come, but here we will be focusing on the rating variable corresponding to the average IMDB user rating.

-

You may notice that this dataset is quite large: 58,788 movies have data collected about them here. This will correspond to our population of ALL movies. Remember from Chapter ?? that our population is rarely known. We use this dataset as our population here to show you the power of bootstrapping in estimating population parameters. We’ll see how confidence intervals built using the bootstrap distribution perform at including our population parameter of interest. Here we can actually calculate these values since our population is known, but remember that in general this isn’t the case.
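Since the full movies data frame is being treated as the population, the population mean rating can be computed directly with one dplyr pipeline (our addition, written in the chapter's style and relying on the packages loaded above); this is the parameter the bootstrap intervals will later try to capture.

# The population parameter that our intervals will try to capture:
movies %>%
  summarize(pop_mean_rating = mean(rating))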

-

Let’s take a look at what the distribution of our population ratings looks like. We’ll see that we will use the distribution of our sample(s) as an estimate of this population histogram.

-
movies %>% ggplot(aes(x = rating)) +
-  geom_histogram(color = "white", bins = 20)
-
-Population ratings histogram -

-Figure 2.1: Population ratings histogram -

-
-
-

-Learning check -

-
-

(LC9.1) Why was a histogram chosen as the plot to make for the rating variable above?

-

(LC9.2) What does the shape of the rating histogram tell us about how IMDB users rate movies? What stands out about the plot?

-
- -
-

It’s important to think about what our goal is here. We would like to produce a confidence interval for the population mean rating. We will have to pretend for a moment that we don’t have all 58,788 movies. Let’s say that we only have a random sample of 50 movies from this dataset instead. In order to get a random sample, we can use the resample function in the mosaic package with replace = FALSE. We could also use the sample_n function from dplyr.

-
set.seed(2017)
-movies_sample <- movies %>% 
-  sample_n(50)
-

The sample_n function has chosen, at random, only 50 rows from the larger movies data frame. We store information on these 50 movies in the movies_sample data frame.

-

Let’s now explore what the rating variable looks like for these 50 movies:

-
ggplot(data = movies_sample, aes(x = rating)) +
-  geom_histogram(color = "white", bins = 20)
-
-Sample ratings histogram -

-Figure 2.2: Sample ratings histogram -

-
-

Remember that we can think of this histogram as an estimate of the population distribution histogram that we saw above. We are interested in the population mean rating and in finding a range of plausible values for it. A good start in guessing the population mean is to use the mean rating of our movies_sample data:

(movies_sample_mean <- movies_sample %>% 
   summarize(mean = mean(rating)))

# A tibble: 1 x 1
   mean
  <dbl>
1 5.894

Note the use of the ( ) at the beginning and the end of this creation of the movies_sample_mean object. Wrapping an assignment in parentheses, as we have here, both creates the object and prints it out.


This value of 5.894 is just one guess at the population mean. The idea behind bootstrapping is to sample with replacement from the original sample to create new resamples of the same size as our original sample.


Returning to our example, let’s investigate what one such resample of the movies_sample dataset accomplishes. We can create one resample/bootstrap sample by using the resample function in the mosaic package.

boot1 <- resample(movies_sample) %>%
  arrange(orig.id)

The important thing to note here is the original row numbers from the movies_sample data frame in the far right column, called orig.id. Since we are sampling with replacement, there is a strong likelihood that some of the 50 observational units will be selected more than once.
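As a quick check, and as our own addition rather than something from the original text, we can count how many of the 50 original rows appear more than once in this bootstrap sample:

# Count repeated rows in the bootstrap sample; rows with n > 1 were drawn
# more than once when sampling with replacement.
boot1 %>% 
  count(orig.id) %>% 
  filter(n > 1)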


You may be asking yourself what this means and how it leads us to creating a distribution for the sample mean. Recall that the original sample mean of our data was calculated using the summarize function above.


Learning check

(LC9.3) What happens if we change the seed used for our pseudo-random number generation? Try it out above where we used resample, and describe the resulting movies_sample.


(LC9.4) Why is it important to sample at random from the movies data frame? Why don't we just pick Action movies and do bootstrapping with this Action movies subset?


(LC9.5) What was the purpose of assuming we didn’t have access to the full movies dataset here?


Earlier we calculated the mean of our original sample to be 5.894. Let's now calculate the mean of ratings in our bootstrapped sample:

(movies_boot1_mean <- boot1 %>% summarize(mean = mean(rating)))

# A tibble: 1 x 1
   mean
  <dbl>
1 5.686

More than likely the calculated bootstrap sample mean is different than the original sample mean. This is what was meant earlier by the sample means having some variability. What we are trying to do is replicate many different samples being taken from a larger population. Our best guess at what the population looks like is multiple copies of the sample we collected. We then can sample from that larger “created” population by generating bootstrap samples.
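As a side note, and an assumption on our part rather than something stated in the text, mosaic's resample() amounts to sampling rows with replacement, so an equivalent bootstrap sample can be drawn with dplyr alone:

# A sketch of an equivalent bootstrap sample using only dplyr.
boot_alt <- movies_sample %>% 
  sample_n(size = nrow(movies_sample), replace = TRUE)
boot_alt %>% summarize(mean = mean(rating))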


Similar to what we did in the previous section, we can repeat this process using the do function followed by an asterisk. Let’s look at 10 different bootstrap means for ratings from movies_sample. Note the use of the resample function here.

do(10) * 
  (resample(movies_sample) %>% 
     summarize(mean = mean(rating)))

    mean
1  5.942
2  5.572
3  5.828
4  6.292
5  6.032
6  5.920
7  5.996
8  5.852
9  6.098
10 5.608

You should see some variability begin to tease its way out here. Many of the simulated means will be close to our original sample mean, but some will stray pretty far away. This is a consequence of the randomness of sampling with replacement: for example, an outlier may be selected more than once in a given resample, or smaller values may happen to be selected more often than larger ones.


So what's the next step? Just as we repeated the resampling thousands of times with the "Lady Tasting Tea" example, we can do a similar thing here:

trials <- do(5000) * summarize(resample(movies_sample), mean = mean(rating))

ggplot(data = trials, mapping = aes(x = mean)) +
  geom_histogram(bins = 30, color = "white")
Figure 2.3: Bootstrapped means histogram

The shape of this resulting distribution may look familiar to you. It resembles the well-known normal (bell-shaped) curve. At this point, we can easily calculate a confidence interval. In fact, we have a couple different options. We will first use the percentiles of the distribution we just created to isolate the middle 95% of values. This will correspond to our 95% confidence interval for the population mean rating, denoted by \(\mu\).
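Before using confint below, here is a sketch of ours (not from the original text) of the same percentile calculation done directly with quantile() on the bootstrap means:

# Middle 95% of the 5000 bootstrap means, computed directly.
trials %>% 
  summarize(lower = quantile(mean, 0.025),
            upper = quantile(mean, 0.975))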

(ciq_mean_rating <- confint(trials, level = 0.95, method = "quantile"))

  name lower upper level     method estimate
1 mean  5.46   6.3  0.95 percentile    5.894

It’s always important at this point to interpret the results of this confidence interval calculation. In this context, we can say something like the following:


Based on the sample data and bootstrapping techniques, we can be 95% confident that the true mean rating of ALL IMDB ratings is between 5.46 and 6.3.


This statement may seem a little confusing at first. Another way to think about it is that this confidence interval was constructed from the sample data by a procedure that is 95% reliable: if we generated 100 confidence intervals based on 100 different random samples, we would expect on average that 95 of them would capture the true unknown parameter. This also means that about 5% of the time the interval will miss the parameter entirely. Just as we had a trade-off with \(\alpha\) and \(\beta\) with hypothesis tests, we have a similar trade-off here with setting the confidence level.
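Since our pretend population (the full movies data frame) is actually available here, one rough way to see this 95% reliability in action is to repeat the whole procedure many times and record how often the interval captures the true mean. The sketch below is our own illustration, not part of the original text, and it is slow to run:

# Repeat the sample -> bootstrap -> confidence interval procedure 100 times
# and record whether each interval captures the true population mean.
true_mean <- mean(movies$rating)
set.seed(2018)
covered <- replicate(100, {
  one_sample <- movies %>% sample_n(50)
  boot_means <- do(1000) * summarize(resample(one_sample), mean = mean(rating))
  ci <- confint(boot_means, level = 0.95, method = "quantile")
  ci$lower <= true_mean & true_mean <= ci$upper
})
mean(covered)  # should be roughly 0.95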


To further reiterate this point, the graphic below from Diez, Barr, and Çetinkaya-Rundel (2014) shows us that if we repeated a confidence interval process 25 times with 25 different samples, we would expect about 95% of them to actually contain the population parameter of interest. This parameter is marked with a dotted vertical line. We can see that only one confidence interval does not overlap with this value. (The one marked in red.) Therefore 24 in 25 (96%), which is quite close to our 95% reliability, do include the population parameter.

Figure 2.4: Confidence interval coverage plot from OpenIntro

Remember that we are pretending like we don’t know what the mean IMDB rating for ALL movies is. Our population here is all of the movies listed in the movies data frame from ggplot2movies. So does our bootstrapped confidence interval here contain the actual mean value?

movies %>% summarize(mean_rating = mean(rating))

# A tibble: 1 x 1
  mean_rating
        <dbl>
1       5.933

We see here that the population mean does fall in our range of plausible values generated from the bootstrapped samples.


We can also get an idea of how the theory-based inference techniques would have approximated this confidence interval by using the formula \[\bar{x} \pm (2 * SE),\] where \(\bar{x}\) is our original sample mean and \(SE\) stands for standard error and corresponds to the standard deviation of the bootstrap distribution. The value of 2 here corresponds to it being a 95% confidence interval, since approximately 95% of the values in a normal distribution fall within about 2 standard deviations of the mean. This formula assumes that the bootstrap distribution is symmetric and bell-shaped. This is often the case with bootstrap distributions, especially those in which the original distribution of the sample is not highly skewed.
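Here is a minimal sketch of ours (not from the original text) of this formula applied to the bootstrap distribution stored in trials:

# Standard error method "by hand": sample mean plus/minus 2 standard errors.
xbar <- mean(movies_sample$rating)   # original sample mean
se <- sd(trials$mean)                # SD of the bootstrap distribution
c(lower = xbar - 2 * se, upper = xbar + 2 * se)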


Definition: standard error


The standard error is the standard deviation of the sampling distribution. The sampling distribution may be approximated by the bootstrap distribution or the null distribution depending on the context. Traditional theory-based methodologies for inference also have formulas for standard errors, assuming some conditions are met.


To compute this type of confidence interval, we only need to make a slight modification to the confint function seen above. (The expression after the \(\pm\) sign is known as the margin of error.)

(cise_mean_rating <- confint(trials, level = 0.95, method = "stderr"))

  name lower upper level method estimate margin.of.error
1 mean 5.467 6.316  0.95 stderr    5.894          0.4247

Based on the sample data and bootstrapping techniques, we can be 95% confident that the true mean rating of ALL IMDB ratings is between 5.4667 and 6.3161.


Learning check

(LC9.6) Reproduce the bootstrapping above using a sample of size 25 instead of 50. What changes do you see?


(LC9.7) Reproduce the bootstrapping above using a sample of size 5 instead of 50. What changes do you see?


(LC9.8) How does the sample size affect the analysis above?


(LC9.9) Why must bootstrap samples be the same size as the original sample?


2.3.1 Review of bootstrapping


We can summarize the process to generate a bootstrap distribution here in a series of steps that clearly identify the terminology we will use (R. Lock et al. 2012).

  • Generate bootstrap samples by sampling with replacement from the original sample, using the same sample size.
  • Compute the statistic of interest, called a bootstrap statistic, for each of the bootstrap samples.
  • Collect the statistics for many bootstrap samples to create a bootstrap distribution.

Visually, we can represent this process in the following diagram.

Figure 2.5: Bootstrapping diagram from Lock5 textbook

2.4 Relation to hypothesis testing


Recall that we found a statistically significant difference in the sample mean of romance movie ratings compared to the sample mean of action movie ratings. We concluded Chapter 3 by attempting to understand just how much greater we could expect the population mean romance movie rating to be compared to the population mean action movie rating. In order to do so, we will calculate a confidence interval for the difference \(\mu_r - \mu_a\). We’ll then go back to our population parameter values and see if our confidence interval contains our parameter value.


We could use bootstrapping in a way similar to that done above, except now on a difference in sample means, to create a distribution and then use the confint function with the quantile option to determine a confidence interval for the plausible values of the difference in population means. This is an excellent programming activity and the reader is encouraged to try it.
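Here is one possible sketch of such an activity; it is our own, not from the original text, and it assumes the movies_genre_sample data frame created in the code further below, resampling within each genre so both group sizes stay at 34:

# Bootstrap the difference in sample mean ratings (Romance minus Action).
boot_diffs <- do(5000) * 
  (movies_genre_sample %>% 
     group_by(genre) %>% 
     sample_n(34, replace = TRUE) %>% 
     summarize(mean = mean(rating)) %>% 
     summarize(diffmean = diff(mean)))
confint(boot_diffs, level = 0.95, method = "quantile")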


Recall what the randomization/null distribution looked like for our simulated shuffled sample means:


Note: all of this code was moved over from the hypothesis testing chapter.

(movies_trimmed <- movies %>% select(title, year, rating, Action, Romance))

# A tibble: 58,788 x 5
                      title  year rating Action Romance
                      <chr> <int>  <dbl>  <int>   <int>
 1                        $  1971    6.4      0       0
 2        $1000 a Touchdown  1939    6.0      0       0
 3   $21 a Day Once a Month  1941    8.2      0       0
 4                  $40,000  1996    8.2      0       0
 5 $50,000 Climax Show, The  1975    3.4      0       0
 6                    $pent  2000    4.3      0       0
 7                  $windle  2002    5.3      1       0
 8                     '15'  2002    6.7      0       0
 9                      '38  1987    6.6      0       0
10                  '49-'17  1917    6.0      0       0
# ... with 58,778 more rows
movies_trimmed <- movies_trimmed %>%
  filter(!(Action == 1 & Romance == 1))

movies_trimmed <- movies_trimmed %>%
  mutate(genre = ifelse(Action == 1, "Action",
                        ifelse(Romance == 1, "Romance",
                               "Neither"))) %>%
  filter(genre != "Neither") %>%
  select(-Action, -Romance)

set.seed(2017)
movies_genre_sample <- movies_trimmed %>% 
  group_by(genre) %>%
  sample_n(34) %>% 
  ungroup()
mean_ratings <- movies_genre_sample %>% 
  group_by(genre) %>%
  summarize(mean = mean(rating))
obs_diff <- diff(mean_ratings$mean)

shuffled_ratings <- movies_genre_sample %>% 
  mutate(genre = shuffle(genre)) %>% 
  group_by(genre) %>%
  summarize(mean = mean(rating))
diff(shuffled_ratings$mean)

[1] -0.1324
set.seed(2017)
many_shuffles <- do(5000) * 
  (movies_genre_sample %>% 
     mutate(genre = shuffle(genre)) %>% 
     group_by(genre) %>%
     summarize(mean = mean(rating))
   )

rand_distn <- many_shuffles %>%
  group_by(.index) %>%
  summarize(diffmean = diff(mean))
head(rand_distn, 10)

# A tibble: 10 x 2
   .index  diffmean
    <dbl>     <dbl>
 1      1 -0.132353
 2      2 -0.197059
 3      3 -0.026471
 4      4  0.714706
 5      5 -0.473529
 6      6 -0.120588
 7      7 -0.173529
 8      8 -0.208824
 9      9 -0.008824
10     10 -0.332353
ggplot(data = rand_distn, mapping = aes(x = diffmean)) +
  geom_histogram(color = "white", bins = 20)

Figure 2.6: Simulated shuffled sample means histogram

With this null distribution being quite symmetric and bell-shaped, the standard error method introduced above likely provides a good estimate of a range of plausible values for \(\mu_r - \mu_a\). Another nice option here is that we can use the standard deviation of the null/randomization distribution we just found with our hypothesis test.

(std_err <- rand_distn %>% summarize(se = sd(diffmean)))

# A tibble: 1 x 1
      se
   <dbl>
1 0.3404

We can use the general formula of \(statistic \pm (2 * SE)\) for a confidence interval to obtain the following result for plausible values of the difference in population means at the 95% level.

(lower <- obs_diff - (2 * std_err))

      se
1 0.2692

(upper <- obs_diff + (2 * std_err))

     se
1 1.631

We can, therefore, say that we are 95% confident that the population mean rating for romance movies is between 0.269 and 1.631 points higher than for that of action movies.


The important thing to check here is whether 0 is contained in the confidence interval. If it is, then a difference of 0 between the two population means is plausible, which means the null hypothesis itself is plausible. The results of the hypothesis test and the confidence interval should match, as they do here: we rejected the null hypothesis with our hypothesis test, and we have evidence here that the mean rating for romance movies is higher than for action movies.


2.5 Effect size


The phrase effect size has been thrown around recently as an alternative to \(p\)-values. In combination with the confidence interval, it can often be more valuable than just looking at the results of a hypothesis test. Exactly what is meant by "effect size" depends on the scientific discipline, but in general it refers to the magnitude of the difference between group measurements. For our two sample problem involving movies, it is the observed difference in sample means obs_diff.


It's worth mentioning here that confidence intervals built with the standard error method are centered at the observed statistic. In other words, if you are looking at such a confidence interval and someone asks you what the "effect size" is, you can simply find the midpoint of the stated confidence interval.
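As a quick sanity check of ours (not in the original text), the midpoint of the interval we computed above for the difference in means recovers obs_diff:

# The midpoint of the standard error method interval equals the observed
# difference in sample means, our effect size.
(lower + upper) / 2
obs_diff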


Learning check

(LC9.10) Check to see whether the difference in population mean ratings for the two genres falls in the confidence interval we found here. Are we guaranteed that it will fall in the range of plausible values?


(LC9.11) Why do you think many scientific fields are shifting to preferring inclusion of confidence intervals in articles over just \(p\)-values and hypothesis tests?


(LC9.12) Why is 95% related to a value of 2 in the margin of error? What would approximate values be for 90% and for 99%?


(LC9.13) Why is a 95% confidence interval wider than a 90% confidence interval? Explain by using a concrete example from everyday life about what is meant by “confidence.”


(LC9.14) How would confidence intervals correspond to one-sided hypothesis tests?


(LC9.15) There is a relationship between the significance level and the confidence level. What do you think it is?


(LC9.16) The moment the phrase "standard error" is mentioned, someone inevitably says "The standard error is \(s\) divided by the square root of \(n\)." This standard error formula is used in the theory-based procedure for inference on one mean. But… does it always work? For samp1, samp2, and samp3 below, do the following:

  1. produce a bootstrap distribution based on the sample
  2. calculate the standard deviation of the bootstrap distribution
  3. compare this value of the standard error to what you obtain when you calculate the standard deviation of the sample \(s\) divided by \(\sqrt{n}\).
df1 <- data_frame(samp1 = rexp(50))
df2 <- data_frame(samp2 = rnorm(100))
df3 <- data_frame(samp3 = rbeta(20, 5, 5))

Describe how \(s / \sqrt{n}\) does in approximating the standard error for these three samples and their corresponding bootstrap distributions.
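One possible sketch of this comparison for samp1 (our own, not from the original text); the same pattern applies to samp2 and samp3:

# Bootstrap standard error versus the formula s / sqrt(n) for samp1.
boot_samp1 <- do(5000) * summarize(resample(df1), mean = mean(samp1))
sd(boot_samp1$mean)               # standard deviation of the bootstrap distribution
sd(df1$samp1) / sqrt(nrow(df1))   # theory-based s / sqrt(n)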


2.6 Conclusion


2.6.1 What’s to come?


This concludes the Inference unit of this book. You should now have a thorough introduction to topics in both data science and statistics. In the last chapter of the textbook, we'll summarize the purpose of this book as well as present an excellent example of what goes into making an effective story via data.


2.6.2 Script of R code


An R script file of all R code used in this chapter is available here.

diff --git a/docs/2-getting-started.html b/docs/2-getting-started.html
index 1a3ec5aa2..290d7f527 100644
--- a/docs/2-getting-started.html
+++ b/docs/2-getting-started.html
@@ -707,16 +707,16 @@

2.4.2 flights data frame

# A tibble: 336,776 x 19
     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
- 1  2013     1     1      517            515      2.00      830            819
- 2  2013     1     1      533            529      4.00      850            830
- 3  2013     1     1      542            540      2.00      923            850
- 4  2013     1     1      544            545     -1.00     1004           1022
- 5  2013     1     1      554            600     -6.00      812            837
- 6  2013     1     1      554            558     -4.00      740            728
- 7  2013     1     1      555            600     -5.00      913            854
- 8  2013     1     1      557            600     -3.00      709            723
- 9  2013     1     1      557            600     -3.00      838            846
-10  2013     1     1      558            600     -2.00      753            745
+ 1  2013     1     1      517            515         2      830            819
+ 2  2013     1     1      533            529         4      850            830
+ 3  2013     1     1      542            540         2      923            850
+ 4  2013     1     1      544            545        -1     1004           1022
+ 5  2013     1     1      554            600        -6      812            837
+ 6  2013     1     1      554            558        -4      740            728
+ 7  2013     1     1      555            600        -5      913            854
+ 8  2013     1     1      557            600        -3      709            723
+ 9  2013     1     1      557            600        -3      838            846
+10  2013     1     1      558            600        -2      753            745
 # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
 #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
 #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
@@ -899,11 +899,9 @@

2.5.1 What’s to come?

 (function () {
   var script = document.createElement("script");
   script.type = "text/javascript";
-  var src = "";
-  if (src === "" || src === "true") src = "https://cdn.bootcss.com/mathjax/2.7.1/MathJax.js?config=TeX-MML-AM_CHTML";
-  if (location.protocol !== "file:" && /^https?:/.test(src))
-    src = src.replace(/^https?:/, '');
-  script.src = src;
+  script.src = "https://cdn.bootcss.com/mathjax/2.7.1/MathJax.js?config=TeX-MML-AM_CHTML";
+  if (location.protocol !== "file:" && /^https?:/.test(script.src))
+    script.src = script.src.replace(/^https?:/, '');
   document.getElementsByTagName("head")[0].appendChild(script);
 })();
diff --git a/docs/2-regression.html b/docs/2-regression.html
deleted file mode 100644
index cc97276cf..000000000
--- a/docs/2-regression.html
+++ /dev/null
@@ -1,1444 +0,0 @@

2 Basic Regression


Now that we are equipped with data visualization skills from Chapter ??, data wrangling skills from Chapter ??, and an understanding of the “tidy” data format from Chapter ??, we now proceed with data modeling. The fundamental premise of data modeling is to make explicit the relationship between:

  • An outcome variable \(y\), also called a dependent variable and
  • An explanatory/predictor variable \(x\), also called an independent variable or covariate.

Another way to state this is using mathematical terminology: we will model the outcome variable \(y\) as a function of the explanatory/predictor variable \(x\). Why do we have two different labels, explanatory and predictor, for the variable \(x\)? That’s because roughly speaking data modeling can be used for two purposes:

  1. Modeling for prediction: You want to predict an outcome variable \(y\) based on the information contained in a set of predictor variables. You don’t care so much about understanding how all the variables relate and interact, but so long as you can make good predictions about \(y\), you’re fine. For example, if we know many individuals’ risk factors for lung cancer, such as smoking habits and age, can we predict whether or not they will develop lung cancer? Here we wouldn’t care so much about distinguishing the degree to which the different risk factors contribute to lung cancer, but instead only on whether or not they could be put together to make reliable predictions.
  2. Modeling for explanation: You want to explicitly describe the relationship between an outcome variable \(y\) and a set of explanatory variables, determine the significance of any found relationships, and have measures summarizing these. Continuing our example from above, we would now be interested in describing the individual effects of the different risk factors and quantifying the magnitude of these effects. One reason could be to design an intervention to reduce lung cancer cases in a population, such as targeting smokers of a specific age group with an advertisement for smoking cessation programs. In this book, we’ll focus more on this latter purpose.

Data modeling is used in a wide variety of fields, including statistical inference, causal inference, artificial intelligence, and machine learning. There are many techniques for data modeling, such as tree-based models, neural networks/deep learning, and more. However, we’ll focus on one particular technique: linear regression, one of the most commonly-used and easy-to-understand approaches to modeling. Recall our discussion in Subsection ?? on numerical and categorical variables. Linear regression involves:

  • An outcome variable \(y\) that is numerical
  • Explanatory variables \(\vec{x}\) that are either numerical or categorical

Whereas there is always only one numerical outcome variable \(y\), we have choices on both the number and the type of explanatory variables \(\vec{x}\) to use. We’re going to cover the following regression scenarios:

  • In this chapter, Chapter 2 on basic regression, where we’ll always have only one explanatory variable:
      • A single numerical explanatory variable \(x\) in Section 2.1. This scenario is known as simple linear regression.
      • A single categorical explanatory variable \(x\) in Section 2.2.
  • In the next chapter: Chapter ?? on multiple regression, where we’ll have more than one explanatory variable:
      • Two numerical explanatory variables \(x_1\) and \(x_2\) in Section ??. This can be denoted as \(\vec{x}\) as well since we have more than one explanatory variable.
      • One numerical and one categorical explanatory variable in Section ??. We’ll also introduce interaction models here; there the effect of one explanatory variable depends on the value of another.

We’ll study all four of these regression scenarios using real data, all easily accessible via R packages!


Needed packages


In this chapter we introduce a new package, moderndive, which accompanies this ModernDive book and includes useful functions for linear regression, as well as other functions and data used later in the book. Let's now load all the packages needed for this chapter. If needed, read Section ?? for information on how to install and load R packages.

library(ggplot2)
library(dplyr)
library(moderndive)
library(gapminder)

2.1 One numerical explanatory variable


Why do some professors and instructors at universities and colleges get high teaching evaluations from students while others don't? What factors can explain these differences? Are there biases? These are questions that are of interest to university/college administrators, as teaching evaluations are among the many criteria considered in determining which professors and instructors should get promotions. Researchers at the University of Texas at Austin tried to answer this question: what factors can explain differences in instructors' teaching evaluation scores? To this end, they collected information on \(n = 463\) instructors. A full description of the study can be found at openintro.org.


We’ll keep things simple for now and try to explain differences in instructor evaluation scores as a function of one numerical variable: their “beauty score” which we’ll describe shortly. Could it be that instructors with higher beauty scores also have higher teaching evaluations? Could it be instead that instructors with higher beauty scores tend to have lower teaching evaluations? Or could it be there is no relationship between beauty score and teaching evaluations?


We’ll achieve this by modeling the relationship between these two variables with a particular kind of linear regression called simple linear regression. Simple linear regression is the most basic form of linear regression where we have

  1. A numerical outcome variable \(y\). In this case, their teaching score.
  2. A single numerical explanatory variable \(x\). In this case, their beauty score.

2.1.1 Exploratory data analysis


A crucial step before doing any kind of modeling or analysis is performing an exploratory data analysis, or EDA, of all our data. Exploratory data analysis can give you a sense of the distribution of the data, whether there are outliers and/or missing values, but most importantly it can inform how to build your model. There are many approaches to exploratory data analysis, here are three:

  1. Most fundamentally: just looking at the raw values, in a spreadsheet for example. While this may seem trivial, many people ignore this crucial step!
  2. Computing summary statistics like means, medians, and standard deviations.
  3. Creating data visualizations.

Let's load the data, select only a subset of the variables, and look at the raw values. Recall you can look at the raw values by running View(evals) in the console in RStudio to pop up the spreadsheet viewer. Here, however, we present only a snapshot of 5 randomly chosen rows:

load(url("http://www.openintro.org/stat/data/evals.RData"))
evals <- evals %>%
  select(score, bty_avg, age)
Table 2.1: Random sample of 5 instructors

        score  bty_avg  age
  290     3.6     6.67   34
  341     4.9     3.50   43
  199     3.3     2.33   47
   47     4.4     4.67   33
  215     4.7     3.67   60

While a full description of each of these variables can be found at openintro.org, let's summarize what each of these variables represents:

  1. score: Numerical variable of the average teaching score based on students’ evaluations between 1 and 5. This is the outcome variable \(y\) of interest.
  2. bty_avg: Numerical variable of average “beauty” rating based on a panel of 6 students’ scores between 1 and 10. This is the numerical explanatory variable \(x\) of interest.
  3. age: A numerical variable of age.

Another way to look at the raw values is using the glimpse() function, which gives us a slightly different view of the data. We see Observations: 463, indicating that there are 463 observations in evals, each corresponding to a particular instructor at UT Austin. Expressed differently, each row in the data frame evals corresponds to one of 463 instructors.

glimpse(evals)

Observations: 463
Variables: 3
$ score   <dbl> 4.7, 4.1, 3.9, 4.8, 4.6, 4.3, 2.8, 4.1, 3.4, 4.5, 3.8, 4.5,...
$ bty_avg <dbl> 5.00, 5.00, 5.00, 5.00, 3.00, 3.00, 3.00, 3.33, 3.33, 3.17,...
$ age     <int> 36, 36, 36, 36, 59, 59, 59, 51, 51, 40, 40, 40, 40, 40, 40,...

Since both the outcome variable score and the explanatory variable bty_avg are numerical, we can compute summary statistics about them such as the mean and median. Let’s take evals, then select only the two variables of interest for now, and pipe them into the summary() command which returns: the minimum (smallest) value, the first quartile, the median, the mean (average), the third quartile, and the maximum (largest) value.

evals %>% 
  select(score, bty_avg) %>% 
  summary()

     score         bty_avg    
 Min.   :2.30   Min.   :1.67  
 1st Qu.:3.80   1st Qu.:3.17  
 Median :4.30   Median :4.33  
 Mean   :4.17   Mean   :4.42  
 3rd Qu.:4.60   3rd Qu.:5.50  
 Max.   :5.00   Max.   :8.17  

We get an idea of how the values in both variables are distributed. For example, the mean teaching score was 4.17 out of 5 whereas the mean beauty score was 4.42 out of 10. Furthermore, the middle 50% of teaching scores were between 3.80 and 4.6 (the first and third quartiles) while the middle 50% of beauty scores were between 3.17 and 5.5 out of 10.


The summary() function however only returns what are called univariate summaries, i.e. summaries about single variables at a time. Since we are considering the relationship between two numerical variables, it would be nice to have a summary statistic that simultaneously considers both variables. The correlation coefficient is a bivariate summary statistic that fits this bill. Coefficients in general are quantitative expressions of a specific property of a phenomenon. A correlation coefficient is a quantitative expression between -1 and 1 that summarizes the strength of the linear relationship between two numerical variables:

  • -1 indicates a perfect negative relationship: as the value of one variable goes up, the value of the other variable tends to go down.
  • 0 indicates no relationship: the values of both variables go up/down independently of each other.
  • +1 indicates a perfect positive relationship: as the value of one variable goes up, the value of the other variable tends to go up as well.

Figure 2.1 gives examples of different correlation coefficient values for hypothetical numerical variables \(x\) and \(y\). We see that while for a correlation coefficient of -0.75 there is still a negative relationship between \(x\) and \(y\), it is not as strong as the negative relationship between \(x\) and \(y\) when the correlation coefficient is -1.

Figure 2.1: Different correlation coefficients

The correlation coefficient is computed using the cor() function, where in this case the inputs to the function are the two numerical variables from which we want to calculate the correlation coefficient. Recall from Subsection ?? that the $ pulls out specific variables from a data frame:

cor(evals$score, evals$bty_avg)

[1] 0.187

In our case, the correlation coefficient of 0.187 indicates that the relationship between teaching evaluation score and beauty average is “weakly positive.” There is a certain amount of subjectivity in interpreting correlation coefficients, especially those that aren’t close to -1, 0, and 1. For help developing such intuition and more discussion on the correlation coefficient see Subsection 2.3.1 below.


Let’s now proceed by visualizing this data. Since both the score and bty_avg variables are numerical, a scatterplot is an appropriate graph to visualize this data. Let’s do this using geom_point() and set informative axes labels and title.

ggplot(evals, aes(x = bty_avg, y = score)) +
  geom_point() +
  labs(x = "Beauty Score", y = "Teaching Score", title = "Relationship of teaching and beauty scores")

Figure 2.2: Instructor evaluation scores at UT Austin

However Figure 2.2 suffers from overplotting. Recall from the data visualization Subsection ?? that overplotting occurs when several points are stacked directly on top of each other thereby obscuring the number of points. For example, let’s focus on the 6 points in the top-right of the plot with a beauty score of around 8 out of 10: are there truly only 6 points, or are there many more just stacked on top of each other? You can think of these as ties. Let’s break up these ties with a little random “jitter” added to the points in Figure 2.3. Jittering adds a little random bump to each of the points to break up these ties. Remember that the geom_jitter only alters the visual display of the points; the values in the data frame stay the same.

ggplot(evals, aes(x = bty_avg, y = score)) +
  geom_jitter() +
  labs(x = "Beauty Score", y = "Teaching Score", title = "Relationship of teaching and beauty scores")

Figure 2.3: Instructor evaluation scores at UT Austin: Jittered

From Figure 2.3 we make several observations:

  1. Focusing our attention on the top-right of the plot again, we now see that those originally unjittered 6 points were actually 12!
  2. A further interesting trend is that the jittering revealed a large number of instructors with beauty scores between 3 and 4.5, towards the lower end of the beauty scale.
  3. Most beauty scores lie between 2 and 8.
  4. Most teaching scores lie between 3 and 5.
  5. Recall our earlier computation of the correlation coefficient, which describes the strength of the linear relationship between two numerical variables. Looking at Figure 2.3, it is not immediately apparent that these two variables are positively related. This is to be expected given the positive, but rather weak (close to 0), correlation coefficient of 0.187.

Going back to the unjittered plot in Figure 2.2, let’s improve on it by adding a “regression line” in Figure 2.4. This is easily done by adding a new layer to the ggplot code that created Figure 2.3: + geom_smooth(method="lm"). A regression line is a “best fitting” line in that of all possible lines you could draw on this plot, it is “best” in terms of some mathematical criteria. We discuss the criteria for “best” in Subsection 2.3.3 below, but we suggest you read this only after covering the concept of a residual coming up in Subsection 2.1.3.

ggplot(evals, aes(x = bty_avg, y = score)) +
  geom_point() +
  labs(x = "Beauty Score", y = "Teaching Score", title = "Relationship of teaching and beauty scores") +  
  geom_smooth(method = "lm")

Figure 2.4: Regression line

When viewed on this plot, the regression line is a visual summary of the relationship between two numerical variables, in our case the outcome variable score and the explanatory variable bty_avg. The positive slope of the blue line is consistent with our observed correlation coefficient of 0.187 suggesting that there is a positive relationship between score and bty_avg. We’ll see later however that while the correlation coefficient is not equal to the slope of this line, they always have the same sign: positive or negative.


What are the grey bands surrounding the blue line? These are standard error bands, which can be thought of as error/uncertainty bands. Let's skip this idea for now and suppress these grey bands by adding the argument se = FALSE to geom_smooth(method = "lm"). We'll introduce standard errors in Chapter ?? on sampling, use them for constructing confidence intervals and conducting hypothesis tests in Chapters ?? and ??, and consider them when we revisit regression in Chapter ??.

ggplot(evals, aes(x = bty_avg, y = score)) +
  geom_point() +
  labs(x = "Beauty Score", y = "Teaching Score", title = "Relationship of teaching and beauty scores") +
  geom_smooth(method = "lm", se = FALSE)

Figure 2.5: Regression line without error bands

-Learning check -

-
-

(LC6.1) Conduct a new exploratory data analysis with the same outcome variable \(y\) being score but with age as the new explanatory variable \(x\). Remember, this involves three things:

  1. Looking at the raw values.
  2. Computing summary statistics of the variables of interest.
  3. Creating informative visualizations.

What can you say about the relationship between age and teaching scores based on this exploration?


2.1.2 Simple linear regression


In case you've forgotten from high school algebra, in general the equation of a line is \(y = a + bx\), which is defined by two coefficients. Recall we defined these earlier as "quantitative expressions of a specific property of a phenomenon." These two coefficients are:

  • the intercept coefficient \(a\), or the value of \(y\) when \(x = 0\), and
  • the slope coefficient \(b\), or the increase in \(y\) for every increase of one in \(x\).

However, when defining a line specifically for regression, like the blue regression line in Figure 2.5, we use slightly different notation: the equation of the regression line is \(\widehat{y} = b_0 + b_1 x\) where

  • the intercept coefficient is \(b_0\), or the value of \(\widehat{y}\) when \(x=0\), and
  • the slope coefficient \(b_1\), or the increase in \(\widehat{y}\) for every increase of one in \(x\).

Why do we put a “hat” on top of the \(y\)? It’s a form of notation commonly used in regression, which we’ll introduce in the next Subsection 2.1.3 when we discuss fitted values. For now, let’s ignore the hat and treat the equation of the line as you would from high school algebra recognizing the slope and the intercept. We know looking at Figure 2.5 that the slope coefficient corresponding to bty_avg should be positive. Why? Because as bty_avg increases, professors tend to roughly have larger teaching evaluation scores. However, what are the specific values of the intercept and slope coefficients? Let’s not worry about computing these by hand, but instead let the computer do the work for us, specifically R!

Let’s get the value of the intercept and slope coefficients by outputting something called the linear regression table. This is always done in a two-step process:

  1. First “fit” the linear regression model to the data using the lm() function and save this to score_model. lm stands for “linear model”, given that we are dealing with lines. When we say “fit”, we are saying find the best fitting line to this data.
  2. Then apply the get_regression_table() function from the moderndive R package to score_model.
score_model <- lm(score ~ bty_avg, data = evals)
get_regression_table(score_model, digits = 2)
Table 2.2: Linear regression table

  term       estimate  std_error  statistic  p_value  conf_low  conf_high
  intercept     3.880      0.076      50.96        0     3.731      4.030
  bty_avg       0.067      0.016       4.09        0     0.035      0.099

Whoa! There is a lot going on, both in terms of the inputs and outputs! Let’s unpack this slowly. First, the lm() function that “fits” the linear regression model is typically used as lm(y ~ x, data = DATA_FRAME_NAME) where:

  • y is the outcome variable, followed by a tilde (~), the key to the left of “1” on your keyboard. In our case, y is set to score.
  • x is the explanatory variable. In our case, x is set to bty_avg. We call the combination y ~ x a model formula.
  • DATA_FRAME_NAME is the name of the data frame that contains the variables y and x. In our case the evals data frame.

Then we pipe this output to be the input of the get_regression_table() function, just as when we discussed piping in Section ?? in the data wrangling chapter. An additional argument to the get_regression_table() function is digits, where we specify the number of significant digits of precision (number of digits after the decimal points) we want the regression table to have. digits defaults to 3, meaning if you don’t specify this argument, digits = 3 is used by default. All the get_regression_table() function in the moderndive package does is generate regression table outputs that are clean and easy-to-read while hiding a lot of the code necessary to do so and not much else. This is known as a wrapper function in computer programming, which takes other pre-existing functions and “wraps” them in a single function. While not necessary to understand regression, if you are curious to know what is going on under the hood of get_regression_table(), see Subsection 2.3.4 below.


Now let’s consider the outputted regression table, which has two rows denoted by the first column term: one corresponding to the intercept coefficient \(b_0\) and one corresponding to the slope coefficient \(b_1\) for bty_avg. The second column estimate gives us the “fitted” (or computed) values for both these coefficients. Therefore the blue regression line in Figure 2.5 is \(\widehat{\text{score}} = b_0 + b_{\text{bty avg}} \text{bty avg} = 3.88 + 0.067\text{bty avg}\) where

  • The intercept coefficient \(b_0\) = 3.88, meaning instructors with a hypothetical beauty score of 0 would on average have a teaching score of 3.88. In this case however, while the intercept has a mathematical interpretation when defining the regression line, there is no practical interpretation: since bty_avg is an average of a panel of 6 students' ratings from 1 to 10, a bty_avg of 0 would be impossible. Furthermore, no instructors had a beauty score anywhere near 0.
  • Of more interest is the slope coefficient associated with bty_avg: \(b_{\text{bty avg}}\) = 0.067. This is a numerical quantity that summarizes the relationship between the outcome and explanatory variables. It is interpreted as follows: for every increase of 1 unit in bty_avg, there is an associated increase of on average 0.067 units of score. We note in particular that the sign of this slope is positive, suggesting a positive relationship between beauty scores and teaching scores. We are very careful with our wording:
      • We only stated that there is an associated increase, and not necessarily a causal increase. For example, perhaps it's not that beauty directly affects teaching scores, but instead individuals from wealthier backgrounds tend to have had better education and training, and hence have higher teaching scores, but these same individuals also have higher beauty scores. Avoiding such reasoning can be summarized by the adage "correlation is not necessarily causation". In other words, just because two variables are correlated, it doesn't mean one directly causes the other. We discuss these ideas more in Subsection 2.3.2.
      • We say that this associated increase is on average 0.067 units of teaching score and not that the associated increase is exactly 0.067 units of score across all values of bty_avg. This is because the slope is the average increase across all points as shown by the regression line in Figure 2.5.

But what about the remaining 5 columns: std_error, statistic, p_value, conf_low and conf_high? They give you information on the statistical significance of these results, or their "meaningfulness" from a statistical perspective. We'll revisit these in Chapter ?? on (statistical) inference for regression after we've covered standard errors in Chapter ?? (std_error), confidence intervals in Chapter ?? (conf_low and conf_high), and hypothesis testing in Chapter ?? (statistic and p_value). For now, we'll only focus on the term and estimate columns.

Learning check

(LC6.2) Fit a new simple linear regression using lm(score ~ age, data=evals) where age is the new explanatory variable \(x\). Get information about the “best-fitting” line from the regression table by applying the get_regression_table() function. How do the regression results match up with the results from your exploratory data analysis above?


2.1.3 Observed/fitted values and residuals


We just saw how to get the value of the intercept and the slope of the regression line from the regression table generated by get_regression_table(). Now instead, say we want information on individual points, in this case one of the \(n = 463\) instructors in this dataset, one corresponding to each row of evals.


For example, say we are interested in the 21st instructor in this dataset:

Table 2.3: Data for 21st instructor

  score  bty_avg  age
    4.9     7.33   31

What is the value on the blue line corresponding to this instructor's bty_avg of 7.333? In Figure 2.6 we mark three values in particular corresponding to this instructor. Note we revert back to geom_point() since geom_jitter() adds random noise to each point, making it difficult to identify points exactly.

  • Red circle: This is the observed value \(y\) = 4.9 and corresponds to this instructor's actual teaching score.
  • Red square: This is the fitted value \(\widehat{y}\) and corresponds to the value on the regression line for \(x\) = 7.333. This value is computed using the intercept and slope in the regression table above: \(\widehat{y} = b_0 + b_1 x\) = 3.88 + 0.067 * 7.333 = 4.369
  • Blue arrow: The length of this arrow is the residual and is computed by subtracting the fitted value \(\widehat{y}\) from the observed value \(y\). The residual can be thought of as the error or "lack of fit" of the regression line. In the case of this instructor, it is \(y - \widehat{y}\) = 4.9 - 4.369 = 0.531. In other words, the model was off by 0.531 teaching score units for this instructor. (These three values are verified in the short code sketch below.)
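As a quick verification, here is a sketch of ours (not part of the original text) reproducing these numbers in R from the fitted score_model object:

# Reproduce the observed value, fitted value, and residual for the 21st instructor.
b <- coef(score_model)                      # b[1] = intercept, b[2] = slope for bty_avg
y <- evals$score[21]                        # observed value: 4.9
y_hat <- b[1] + b[2] * evals$bty_avg[21]    # fitted value: about 4.37
y - y_hat                                   # residual: about 0.53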
Figure 2.6: Example of observed value, fitted value, and residual

What if we want both

  1. the fitted value \(\widehat{y} = b_0 + b_1 \times x\)
  2. the residual \(y - \widehat{y}\)

not only for the 21st instructor but for all 463 instructors in the study? Recall that each instructor corresponds to one of the 463 rows in the evals data frame and also to one of the 463 points in the regression plot in Figure 2.5. We could repeat the above calculations by hand 463 times, but that would be tedious and time consuming. Instead, let's use the get_regression_points() function that we've included in the moderndive R package. Note that in the table below we only present the results for the 21st through 24th instructors.

regression_points <- get_regression_points(score_model)
regression_points
Table 2.4: Regression points (for only 21st through 24th instructor)

  ID  score  bty_avg  score_hat  residual
  21    4.9     7.33       4.37     0.531
  22    4.6     7.33       4.37     0.231
  23    4.5     7.33       4.37     0.131
  24    4.4     5.50       4.25     0.153

The inputs to the get_regression_points() function are the same as for the get_regression_table() function; however, the outputs are different. Let's inspect the individual columns:

  • The score column represents the observed value of the outcome variable \(y\)
  • The bty_avg column represents the values of the explanatory variable \(x\)
  • The score_hat column represents the fitted values \(\widehat{y}\)
  • The residual column represents the residuals \(y - \widehat{y}\)

Just as we did for the 21st instructor in the evals dataset (in the first row of the table above), let’s repeat the above calculations for the 24th instructor in the evals dataset (in the fourth row of the table above):

  • score = 4.4 is the observed value \(y\) for this instructor.
  • bty_avg = 5.50 is the value of the explanatory variable \(x\) for this instructor.
  • score_hat = 4.25 = 3.88 + 0.067 * \(x\) = 3.88 + 0.067 * 5.50 is the fitted value \(\widehat{y}\) for this instructor.
  • residual = 0.153 = 4.4 - 4.25 is the value of the residual for this instructor. In other words, the model was off by 0.153 teaching score units for this instructor.

At this point, we suggest you read Subsection 2.3.3, where we explicitly define how a regression line is a “best” fitting line.


2.1.4 Residual analysis


Recall the residuals can be thought of as the error or the “lack-of-fit” between the observed value \(y\) and the fitted value \(\widehat{y}\) on the blue regression line in Figure 2.5. Ideally when we fit a regression model, we’d like there to be no systematic pattern to these residuals. We’ll be more specific as to what we mean by no systematic pattern when we see Figure 2.8 below, but let’s keep this notion imprecise for now. Investigating any such patterns is known as residual analysis and is the theme of this section.


We’ll perform our residual analysis in two ways:

  1. Creating a scatterplot with the residuals on the \(y\)-axis and the original explanatory variable \(x\) on the \(x\)-axis.
  2. Creating a histogram of the residuals, thereby showing the distribution of the residuals.

First, recall in Figure 2.6 above we created a scatterplot where

  • On the vertical axis we had the teaching score \(y\)
  • On the horizontal axis we had the beauty score \(x\)
  • The blue arrow represented the residual for one particular instructor.

Instead, in Figure 2.7 below, let’s create a scatterplot where

  • On the vertical axis we have the residual \(y-\widehat{y}\) instead
  • On the horizontal axis we have the beauty score \(x\) as before
Figure 2.7: Plot of residuals over beauty score
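The text does not show the code that produced Figure 2.7; one possible way to create such a plot, sketched by us under the assumption that the regression_points data frame from above is available, is:

# A residuals-versus-x scatterplot like Figure 2.7.
ggplot(regression_points, aes(x = bty_avg, y = residual)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "blue") +
  labs(x = "Beauty Score", y = "Residual")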

You can think of Figure 2.7 as Figure 2.6 but with the blue line flattened out to \(y=0\). Does it seem like there is no systematic pattern to the residuals? This question is rather qualitative and subjective in nature, thus different people may respond with different answers to the above question. However, it can be argued that there isn’t a drastic pattern in the residuals.


Let’s now get a little more precise in our definition of no systematic pattern in the residuals. Ideally, the residuals should behave randomly and

  1. The residuals should be on average 0. In other words, sometimes the regression model will make a positive error in that \(y - \widehat{y} > 0\), sometimes the regression model will make a negative error in that \(y - \widehat{y} < 0\), but on average the error is 0.
  2. The value and spread of the residuals should not depend on the value of \(x\).

In Figure 2.8 below, we display some hypothetical examples where there are drastic patterns to the residuals. In Example 1, the value of the residual seems to depend on \(x\): the residuals tend to be positive for small and large values of \(x\) in this range, whereas values of \(x\) more in the middle tend to have negative residuals. In Example 2, while the residuals seem to be on average 0 for each value of \(x\), the spread of the residuals varies for different values of \(x\); this situation is known as heteroskedasticity.

Figure 2.8: Examples of less than ideal residual patterns

The second way to perform a residual analysis is to look at the histogram of the residuals:

ggplot(regression_points, aes(x = residual)) +
  geom_histogram(binwidth = 0.25, color = "white") +
  labs(x = "Residual")

Figure: Histogram of residuals

This histogram seems to indicate that we have more positive residuals than negative. Since residual = \(y-\widehat{y} > 0\) when \(y > \widehat{y}\), it seems our fitted teaching score from the regression model tends to underestimate the true teaching score. This histogram has a slight left-skew in that there is a long tail on the left. Another way to say this is that this data exhibits a negative skew. Is this a problem? Again, there is a certain amount of subjectivity in the response. In the authors' opinion, while there is a slight skew/pattern to the residuals, it isn't a large concern. On the other hand, others might disagree with our assessment. Here are examples of an ideal and less than ideal pattern to the residuals when viewed in a histogram:

Figure 2.9: Examples of ideal and less than ideal residual patterns

In fact, we'll see later on that we would like the residuals to be normally distributed with mean 0. In other words, be bell-shaped and centered at 0! While this requirement and residual analysis in general may seem to some of you as not being overly critical at this point, we'll see later when we cover inference for regression in Chapter ?? that for the last five columns of the regression table from earlier (std_error, statistic, p_value, conf_low, and conf_high) to have valid interpretations, the above three conditions should roughly hold.
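As a quick numeric companion to this discussion, here is a small check of ours (not part of the original text) of the average residual and the share of positive residuals:

# Numeric check of the residuals from score_model, using regression_points.
regression_points %>% 
  summarize(mean_residual = mean(residual),
            prop_positive = mean(residual > 0))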


Learning check

(LC6.3) Continuing with our regression using age as the explanatory variable and teaching score as the outcome variable, use the get_regression_points() function to get the observed values, fitted values, and residuals for all 463 instructors and perform a residual analysis and look for any systematic patterns in the residuals. Ideally, there should be little to no pattern.


2.2 One categorical explanatory variable


It’s an unfortunate truth that life expectancy is not the same across various countries in the world; there are a multitude of factors that are associated with how long people live. International development agencies are very interested in studying these differences in the hope of understanding where governments should allocate resources to address this problem. In this section, we’ll explore differences in life expectancy in two ways:

  1. Differences between continents: Are there significant differences in life expectancy, on average, between the five continents of the world: Africa, the Americas, Asia, Europe, and Oceania?
  2. Differences within continents: How does life expectancy vary within the world’s five continents? For example, is the spread of life expectancy among the countries of Africa larger than the spread of life expectancy among the countries of Asia?

To answer such questions, we’ll study the gapminder dataset in the gapminder package. Recall we introduced this dataset in Subsection ?? when we first studied the “Grammar of Graphics”; in particular Figure ??. This dataset has international development statistics such as life expectancy, GDP per capita, and population by country (\(n\) = 142) for 5-year intervals between 1952 and 2007.


We’ll use this data for linear regression again, but note that our explanatory variable \(x\) is now categorical, and not numerical like when we covered simple linear regression in Section 2.1:

  1. A numerical outcome variable \(y\). In this case, life expectancy.
  2. A single categorical explanatory variable \(x\). In this case, the continent the country is part of.

When the explanatory variable \(x\) is categorical, the concept of a “best-fitting” line is a little different than the one we saw previously in Section 2.1 where the explanatory variable \(x\) was numerical. We’ll study these differences shortly in Subsection 2.2.2, but first our exploratory data analysis.


2.2.1 Exploratory data analysis

-

Let’s load the gapminder data, filter() for only observations in 2007, and select() only the variables we’ll need, along with gdpPercap, which is each country’s gross domestic product per capita: a rough measure of that country’s economic performance (this will be used in the upcoming Learning Check). Save this in a data frame gapminder2007:

library(gapminder)
gapminder2007 <- gapminder %>%
  filter(year == 2007) %>%
  select(country, continent, lifeExp, gdpPercap)

Let’s look at the raw data values, both by bringing up RStudio’s spreadsheet viewer and by using the glimpse() function, although in Table 2.5 we only show 5 randomly selected countries out of the 142:

View(gapminder2007)

Table 2.5: Random sample of 5 countries

country          continent  lifeExp  gdpPercap
Slovak Republic  Europe     74.7     18678
Israel           Asia       80.7     25523
Bulgaria         Europe     73.0     10681
Tanzania         Africa     52.5     1107
Myanmar          Asia       62.1     944
glimpse(gapminder2007)
Observations: 142
Variables: 4
$ country   <fctr> Afghanistan, Albania, Algeria, Angola, Argentina, Austra...
$ continent <fctr> Asia, Europe, Africa, Africa, Americas, Oceania, Europe,...
$ lifeExp   <dbl> 43.8, 76.4, 72.3, 42.7, 75.3, 81.2, 79.8, 75.6, 64.1, 79....
$ gdpPercap <dbl> 975, 5937, 6223, 4797, 12779, 34435, 36126, 29796, 1391, ...

We see that the variable continent is indeed categorical, as it is encoded as fctr which stands for “factor”: R’s way of storing categorical variables. Let’s look at a summary of the explanatory variable continent:

summary(gapminder2007$continent)

  Africa Americas     Asia   Europe  Oceania 
      52       25       33       30        2 

We observe that while the other continents each have 25 countries or more, Oceania only has two: Australia and New Zealand. Let’s now compute some summary statistics of the outcome variable lifeExp, in particular the worldwide median and mean life expectancy:

lifeExp_worldwide <- gapminder2007 %>%
  summarize(median = median(lifeExp), mean = mean(lifeExp))

Table 2.6: Worldwide life expectancy

median  mean
71.9    67.0

Given that the global median life expectancy is 71.935, half of the world’s countries (71 countries) have a life expectancy less than 71.935, while half have a life expectancy greater than this value. The mean life expectancy of 67.007 is lower, however. Why are these two values different? Let’s look at a histogram of lifeExp to see why.

ggplot(gapminder2007, aes(x = lifeExp)) +
  geom_histogram(binwidth = 5, color = "white") +
  labs(x = "Life expectancy", y = "Number of countries", title = "Worldwide life expectancy")


We see that this data is left-skewed/negatively skewed: there are a few countries with very low life expectancies that are bringing down the mean life expectancy. However, the median is less sensitive to the effects of such outliers. Hence the median is greater than the mean in this case. Let’s proceed by comparing median and mean life expectancy between continents by adding a group_by(continent) to the above code:

lifeExp_by_continent <- gapminder2007 %>%
  group_by(continent) %>%
  summarize(median = median(lifeExp), mean = mean(lifeExp))

Table 2.7: Life expectancy by continent

continent  median  mean
Africa     52.9    54.8
Americas   72.9    73.6
Asia       72.4    70.7
Europe     78.6    77.6
Oceania    80.7    80.7

We now see that there are differences in life expectancies between the continents. For example, focusing only on medians: while the median life expectancy across all \(n\) = 142 countries in 2007 was 71.935, the median life expectancy across the \(n\) = 52 countries in Africa was only 52.927.


Let’s create a corresponding visualization. One way to compare the life expectancies of countries in different continents would be via a faceted histogram. Recall we saw back in the Data Visualization chapter, specifically Section ??, that facets allow us to split a visualization by the different levels of a categorical variable or factor variable. In Figure 2.10, the variable we facet by is continent, which is categorical with five levels, each corresponding to the five continents of the world.

ggplot(gapminder2007, aes(x = lifeExp)) +
  geom_histogram(binwidth = 5, color = "white") +
  labs(x = "Life expectancy", y = "Number of countries", title = "Life expectancy by continent") +
  facet_wrap(~continent, nrow = 2)

Figure 2.10: Life expectancy in 2007

Another way would be via a geom_boxplot where we map the categorical variable continent to the \(x\)-axis and the different life expectancies within each continent on the \(y\)-axis; we do this in Figure 2.11.

ggplot(gapminder2007, aes(x = continent, y = lifeExp)) +
  geom_boxplot() +
  labs(x = "Continent", y = "Life expectancy (years)", title = "Life expectancy by continent")

Figure 2.11: Life expectancy in 2007

Some people prefer comparing a numerical variable between different levels of a categorical variable, in this case comparing life expectancy between continents, using a boxplot rather than a faceted histogram, since we can make quick comparisons with single horizontal lines. For example, we can see that even the country with the highest life expectancy in Africa still has a lower life expectancy than every country in Oceania.


It’s important to remember, however, that the solid lines in the middle of the boxes correspond to the medians (i.e. the middle value) rather than the means (the average). So, for example, if you look at Asia, the solid line denotes the median life expectancy of around 72 years, indicating that half of all countries in Asia have a life expectancy below 72 years whereas half have a life expectancy above 72 years. Furthermore, note that:

• Africa and Asia have much more spread/variation in life expectancy, as indicated by the interquartile range (the height of the boxes).
• Oceania has almost no spread/variation, but this might in large part be due to the fact that there are only two countries in Oceania: Australia and New Zealand.

Now, let’s start making comparisons of life expectancy between continents. Let’s use Africa as a baseline for comparison. Why Africa? Only because it happens to be first alphabetically; we could’ve just as appropriately used the Americas as the baseline for comparison. Using the “eyeball test” (just using our eyes to see if anything stands out), we make the following observations about differences in median life expectancy compared to the baseline of Africa:

1. The median life expectancy of the Americas is roughly 20 years greater.
2. The median life expectancy of Asia is roughly 20 years greater.
3. The median life expectancy of Europe is roughly 25 years greater.
4. The median life expectancy of Oceania is roughly 27.8 years greater.

Let’s remember these four differences vs Africa corresponding to the Americas, Asia, Europe, and Oceania: 20, 20, 25, 27.8.

Learning check

(LC6.4) Conduct a new exploratory data analysis with the same explanatory variable \(x\) being continent but with gdpPercap as the new outcome variable \(y\). Remember, this involves three things:

1. Looking at the raw values.
2. Computing summary statistics of the variables of interest.
3. Creating informative visualizations.

What can you say about the difference in GDP per capita based on this exploration?
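The following is a minimal sketch of one possible approach, not the book’s own solution; the object name gdp_by_continent and the axis labels are our own choices.

gdp_by_continent <- gapminder2007 %>%
  group_by(continent) %>%
  summarize(median = median(gdpPercap), mean = mean(gdpPercap))
gdp_by_continent
ggplot(gapminder2007, aes(x = continent, y = gdpPercap)) +
  geom_boxplot() +
  labs(x = "Continent", y = "GDP per capita", title = "GDP per capita by continent")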


2.2.2 Linear regression

-

In Subsection 2.1.2 we introduced simple linear regression, which involves modeling the relationship between a numerical outcome variable \(y\) and a numerical explanatory variable \(x\). In our life expectancy example, we now have a categorical explanatory variable \(x\), continent. While we can still fit a regression model, given our categorical explanatory variable we no longer have a concept of a “best-fitting” line; instead we have differences relative to a baseline for comparison.

-

Before we fit our regression model, let’s create a table similar to Table 2.7, but

1. Report the mean life expectancy for each continent.
2. Report the difference in mean life expectancy relative to Africa’s mean life expectancy of 54.806 in the column “mean vs Africa”; this column is simply the “mean” column minus 54.806.

Think back to your observations from the eyeball test of Figure 2.11 at the end of the last subsection. The column “mean vs Africa” uses the same idea of comparing a summary statistic to a baseline, in this case the countries of Africa, but using means instead of medians.

Table 2.8: Mean life expectancy by continent

continent  mean  mean vs Africa
Africa     54.8  0.0
Americas   73.6  18.8
Asia       70.7  15.9
Europe     77.6  22.8
Oceania    80.7  25.9
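The code that produces this table isn’t shown in the original; a minimal sketch of one way it might be computed (the object name mean_vs_africa is our own) is:

mean_vs_africa <- gapminder2007 %>%
  group_by(continent) %>%
  summarize(mean = mean(lifeExp)) %>%
  mutate(`mean vs Africa` = mean - mean[continent == "Africa"])
mean_vs_africa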

Now, let’s use the get_regression_table() function we introduced in Section 2.1.2 to get the regression table for our gapminder2007 analysis:

lifeExp_model <- lm(lifeExp ~ continent, data = gapminder2007)
get_regression_table(lifeExp_model)

Table 2.9: Linear regression table

term               estimate  std_error  statistic  p_value  conf_low  conf_high
intercept          54.8      1.02       53.45      0        52.8      56.8
continentAmericas  18.8      1.80       10.45      0        15.2      22.4
continentAsia      15.9      1.65       9.68       0        12.7      19.2
continentEurope    22.8      1.70       13.47      0        19.5      26.2
continentOceania   25.9      5.33       4.86       0        15.4      36.5

Just as before, we have the term and estimate columns of interest, but unlike before, we now have 5 rows corresponding to 5 outputs in our table: an intercept like before, but also continentAmericas, continentAsia, continentEurope, and continentOceania. What are these values?

1. intercept = 54.8 corresponds to the mean life expectancy for Africa. This mean life expectancy is treated as a baseline for comparison for the other continents.
2. continentAmericas = 18.8 is the difference in mean life expectancy of the Americas minus Africa. Note that \(18.8 = 73.6 - 54.8\) is the “mean vs Africa” value for the Americas in Table 2.8.
3. continentAsia = 15.9 is the difference in mean life expectancy of Asia minus Africa. Note that \(15.9 = 70.7 - 54.8\) is the “mean vs Africa” value for Asia in Table 2.8.
4. continentEurope = 22.8 is the difference in mean life expectancy of Europe minus Africa. Note that \(22.8 = 77.6 - 54.8\) is the “mean vs Africa” value for Europe in Table 2.8.
5. continentOceania = 25.9 is the difference in mean life expectancy of Oceania minus Africa. Note that \(25.9 = 80.7 - 54.8\) is the “mean vs Africa” value for Oceania in Table 2.8.

Let’s generalize this idea a bit. If we fit a linear regression model using a categorical explanatory variable \(x\) that has \(k\) levels, the regression model will return an intercept and \(k - 1\) “slope” coefficients. When \(x\) is a numerical explanatory variable the interpretation is of a “slope” coefficient, but when \(x\) is categorical the meaning is a little trickier: the coefficients are offsets relative to the baseline level.

-

In our case, since there are \(k = 5\) continents, the regression model returns an intercept corresponding to the baseline for comparison, Africa, and \(k - 1 = 4\) “slope” coefficients corresponding to the Americas, Asia, Europe, and Oceania. Africa was chosen as the baseline by R for no other reason than that it comes first alphabetically among the 5 continents. You can manually specify which continent to use as the baseline instead of the default choice of whichever comes first alphabetically, but we leave that to a more advanced course.
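Although the book leaves this to a more advanced course, one minimal sketch of changing the baseline, under the assumption that base R’s relevel() is applied to the factor before fitting (gapminder2007_relevel and lifeExp_model_europe are hypothetical names), is:

# Re-order the factor levels so that "Europe" becomes the baseline level
gapminder2007_relevel <- gapminder2007 %>%
  mutate(continent = relevel(continent, ref = "Europe"))
lifeExp_model_europe <- lm(lifeExp ~ continent, data = gapminder2007_relevel)
get_regression_table(lifeExp_model_europe)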

Learning check

(LC6.5) Fit a new linear regression using lm(gdpPercap ~ continent, data = gapminder2007) where gdpPercap is the new outcome variable \(y\). Get information about the “best-fitting” line from the regression table by applying the get_regression_table() function. How do the regression results match up with the results from your exploratory data analysis above?


2.2.3 Observed/fitted values and residuals

-

Recall in Subsection 2.1.3 when we had a numerical explanatory variable \(x\), we defined:

1. Observed values \(y\), or the observed value of the outcome variable.
2. Fitted values \(\widehat{y}\), or the value on the regression line for a given \(x\) value.
3. Residuals \(y - \widehat{y}\), or the error between the observed value and the fitted value.

What do fitted values \(\widehat{y}\) and residuals \(y - \widehat{y}\) correspond to when the explanatory variable \(x\) is categorical? Let’s investigate these values for the first 10 countries in the gapminder2007 dataset:

Table 2.10: First 10 out of 142 countries

country      continent  lifeExp  gdpPercap
Afghanistan  Asia       43.8     975
Albania      Europe     76.4     5937
Algeria      Africa     72.3     6223
Angola       Africa     42.7     4797
Argentina    Americas   75.3     12779
Australia    Oceania    81.2     34435
Austria      Europe     79.8     36126
Bahrain      Asia       75.6     29796
Bangladesh   Asia       64.1     1391
Belgium      Europe     79.4     33693

Recall the get_regression_points() function we used in Subsection 2.1.3, which returns the observed values of the outcome variable, all explanatory variables, fitted values, and residuals for all points in the regression. Each “point” in this case corresponds to one of the 142 countries in the gapminder2007 dataset; these are also the 142 observations used to construct the boxplots in Figure 2.11.

regression_points <- get_regression_points(lifeExp_model)
regression_points

Table 2.11: Regression points (First 10 out of 142 countries)

ID  lifeExp  continent  lifeExp_hat  residual
1   43.8     Asia       70.7         -26.900
2   76.4     Europe     77.6         -1.226
3   72.3     Africa     54.8         17.495
4   42.7     Africa     54.8         -12.075
5   75.3     Americas   73.6         1.712
6   81.2     Oceania    80.7         0.515
7   79.8     Europe     77.6         2.180
8   75.6     Asia       70.7         4.907
9   64.1     Asia       70.7         -6.666
10  79.4     Europe     77.6         1.792

Notice that:

• The fitted values lifeExp_hat \(\widehat{\text{lifeExp}}\): countries in Africa all have the same fitted value of 54.8, which is the mean life expectancy of Africa; countries in Asia all have the same fitted value of 70.7, which is the mean life expectancy of Asia; and this similarly holds for countries in the Americas, Europe, and Oceania.
• The residual column is simply \(y - \widehat{y}\) = lifeExp - lifeExp_hat. These values can be interpreted as a particular country’s deviation from its continent’s mean life expectancy. For example, the first row of this dataset corresponds to Afghanistan, and the residual of \(-26.9 = 43.8 - 70.7\) is Afghanistan’s life expectancy minus the mean life expectancy of all Asian countries.
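As a quick sanity check of this interpretation (our own sketch, not part of the original text), we could verify that the fitted values within each continent match that continent’s mean life expectancy:

regression_points %>%
  group_by(continent) %>%
  summarize(mean_lifeExp = mean(lifeExp), mean_fitted = mean(lifeExp_hat))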

2.2.4 Residual analysis

-

Recall our discussion of residuals from Section 2.1.4, where our goal was to investigate whether or not there was a systematic pattern to the residuals; ideally, since residuals can be thought of as error, there should be no such pattern. While there are many ways to do such a residual analysis, we focused on two approaches based on visualizations:

1. A plot with residuals on the vertical axis and the predictor (in this case continent) on the horizontal axis.
2. A histogram of all residuals.

First, let’s plot the residuals vs continent in Figure 2.12, but also let’s plot all 142 points with a little horizontal random jitter by setting the width = 0.1 parameter in geom_jitter():

ggplot(regression_points, aes(x = continent, y = residual)) +
  geom_jitter(width = 0.1) +
  labs(x = "Continent", y = "Residual") +
  geom_hline(yintercept = 0, col = "blue")

Figure 2.12: Plot of residuals over continent

We observe:

1. There seems to be a rough balance of both positive and negative residuals for all 5 continents.
2. However, there is one clear outlier in Asia. It has the smallest (most negative) residual, and hence also the smallest life expectancy in Asia.

Let’s investigate the 5 countries in Asia with the shortest life expectancy:

gapminder2007 %>%
  filter(continent == "Asia") %>%
  arrange(lifeExp)

Table 2.12: Countries in Asia with shortest life expectancy

country      continent  lifeExp  gdpPercap
Afghanistan  Asia       43.8     975
Iraq         Asia       59.5     4471
Cambodia     Asia       59.7     1714
Myanmar      Asia       62.1     944
Yemen, Rep.  Asia       62.7     2281

Afghanistan is the outlier we identified earlier, with a residual of -26.9. Unfortunately, given recent geopolitical turmoil, individuals who live in Afghanistan have a drastically lower life expectancy.


Second, let’s look at a histogram of all 142 values of residuals in Figure 2.13. In this case, the residuals form a rather nice bell-shape, although there are a couple of very low and very high values at the tails. As we said previously, searching for patterns in residuals can be somewhat subjective, but ideally we hope there are no “drastic” patterns.

ggplot(regression_points, aes(x = residual)) +
  geom_histogram(binwidth = 5, color = "white") +
  labs(x = "Residual")

Figure 2.13: Histogram of residuals

Learning check

(LC6.6) Continuing with our regression using gdpPercap as the outcome variable and continent as the explanatory variable, use the get_regression_points() function to get the observed values, fitted values, and residuals for all 142 countries in 2007. Perform a residual analysis and look for any systematic patterns in the residuals. Is there a pattern?
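The following is a minimal sketch of one possible approach, not the book’s own solution; gdp_model and gdp_points are hypothetical names, assuming the model from LC6.5 was saved as gdp_model.

# Hypothetical name for the model fit in LC6.5: lm(gdpPercap ~ continent, data = gapminder2007)
gdp_points <- get_regression_points(gdp_model)
# Residuals versus continent, with some horizontal jitter
ggplot(gdp_points, aes(x = continent, y = residual)) +
  geom_jitter(width = 0.1) +
  geom_hline(yintercept = 0, col = "blue")
# Histogram of all residuals
ggplot(gdp_points, aes(x = residual)) +
  geom_histogram(color = "white") +
  labs(x = "Residual")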


2.4 Conclusion

-

In this chapter, you’ve seen what we call “basic regression,” where you only have one explanatory variable. In Chapter ??, we’ll study multiple regression, where we have more than one explanatory variable!


2.4.1 Script of R code

-

An R script file of all R code used in this chapter is available here.

diff --git a/docs/2-sampling.html b/docs/2-sampling.html deleted file mode 100644 index a9357f98c..000000000

2 Sampling

-

In this chapter we kick off the third and final segment of this book, statistical inference, by learning about sampling. The concepts behind sampling form the basis of confidence intervals and hypothesis testing, which we’ll cover in Chapters ?? and ?? respectively. We will see that the tools that you learned in the data science segment of this book (data visualization, “tidy” data format, and data wrangling) will also play an important role here in the development of your understanding. As mentioned before, the concepts throughout this text all build into a culmination allowing you to “think with data.”


Needed packages

-

Let’s load all the packages needed for this chapter (this assumes you’ve already installed them). If needed, read Section ?? for information on how to install and load R packages.

library(dplyr)
library(ggplot2)
library(moderndive)

2.1 Terminology

-

Before we can start studying sampling, we need to define some terminology.

1. Population: The population is the (usually) large pool of observational units that we are interested in.
2. Population parameter: A population parameter is a numerical quantity of interest about a population, such as a proportion or a mean.
3. Census: An enumeration of every member of a population. Ex: the Decennial United States census.
4. Sample: A sample is a smaller collection of observational units that is selected from the population. We would like to make inferences about the population based on this sample.
5. Sampling: Sampling refers to the process of selecting observations from a population. There are both random and non-random ways this can be done.
6. Representative sampling: A sample is said to be a representative sample if the characteristics of the observational units selected are a good approximation of the characteristics of the original population.
7. Generalizability: Generalizability refers to the largest group about which it makes sense to make inferences from the sample collected. This is directly related to how the sample was selected.
8. Bias: Bias corresponds to a favoring of one group in a population over another group. Or, put differently, bias occurs when certain members of a population have a higher chance of being included in a sample than others.
9. Statistic: A statistic is a calculation based on one or more variables measured in the sample.
10. Point estimates/sample statistics: These are statistics, computed based on a sample, that estimate an unknown population parameter.

2.2 “In real life” sampling

-

Consider the following “sampling bowl” consisting of 2400 balls, which are either red, white, or green. We are interested in knowing the proportion of balls in the sampling bowl that are red, but are too lazy to count the number of balls out of 2400 that are red. In other words, we’re not interested in conducting a census. So instead we attempt to estimate the proportion red by using the sampling “shovel” to extract a sample of size \(n\) = 50 balls, and count the proportion of these that are red. However, before we extracted a sample using this shovel, we made sure to give the balls a good stir, ensuring we have random sampling.

Figure 2.1: Sampling from a sampling bowl

We put students to the task of estimating the proportion of balls in the tub that are red because, frankly, we’re too lazy to do so ourselves! Groups of students “in real life” took random samples of size \(n\) = 50. Thank you Niko, Sophie, Caitlin, Yaw, and Drew for doing double duty! In other words, we have 10 samples of size \(n\) = 50:

bowl_samples

Table 2.1: In real life: 10 samples of size 50

group                      red  white  green  n
Kathleen and Max           18   32     0      50
Sean, Jack, and CJ         18   32     0      50
X and Judy                 22   28     0      50
James and Jacob            21   29     0      50
Hannah and Siya            16   34     0      50
Niko, Sophie, and Caitlin  14   36     0      50
Niko, Sophie, and Caitlin  19   31     0      50
Aleja and Ray              20   30     0      50
Yaw and Drew               16   34     0      50
Yaw and Drew               21   29     0      50

For each sample of size \(n\) = 50, what is the sample proportion red? In other words, what are the point estimates \(\widehat{p}\), each based on a sample of size \(n\) = 50, of \(p\), the true proportion of balls in the tub that are red? We can easily compute this using the mutate() function from the dplyr package we studied extensively in Chapter ??:

bowl_samples <- bowl_samples %>%
  mutate(prop_red = red/n) %>%
  select(group, prop_red)
bowl_samples

Table 2.2: In real life: 10 sample proportions red based on samples of size 50

group                      prop_red
Kathleen and Max           0.36
Sean, Jack, and CJ         0.36
X and Judy                 0.44
James and Jacob            0.42
Hannah and Siya            0.32
Niko, Sophie, and Caitlin  0.28
Niko, Sophie, and Caitlin  0.38
Aleja and Ray              0.40
Yaw and Drew               0.32
Yaw and Drew               0.42

We see that one group got a sample proportion \(\widehat{p}\) as low as 0.28 while another got a sample proportion \(\widehat{p}\) as high as 0.44. Why are these different? Why is there this variation? Because of sampling variability! Sampling is inherently random, so across samples of \(n\) = 50 balls, we won’t always get exactly the same number of red balls.

-

Let’s visualize this using our data visualization skills that you honed in Chapter ??! Let’s investigate the distribution of these 10 sample proportions red \(\widehat{p}\), each based on a random sample of size \(n\) = 50, using a histogram, an appropriate visualization since prop_red is numerical:
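The histogram code itself isn’t shown in the original; a minimal sketch of what it might look like (the binwidth of 0.05 is our own choice) is:

ggplot(bowl_samples, aes(x = prop_red)) +
  geom_histogram(binwidth = 0.05, color = "white") +
  labs(x = "Sample proportion red", y = "Number of groups")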

Figure 2.2: In real life: 10 sample proportions red based on 10 samples of size 50

Let’s ask ourselves some questions:

1. Where is the histogram centered?
2. What is the spread of this histogram?

Recall from Section ?? that the mean and the standard deviation are two summary statistics that answer these questions:

bowl_samples %>%
  summarize(mean = mean(prop_red), sd = sd(prop_red))

mean  sd
0.37  0.052

What you have just unpacked are some very deep and very subtle concepts in statistical inference:

1. The histogram in Figure 2.2 is called the sampling distribution of \(\widehat{p}\) based on samples of size \(n=50\). It describes how values of the sample proportion red will vary from sample to sample due to the aforementioned sampling variability.
2. If the sampling is done in an unbiased and random fashion, in other words if we made sure to stir the bowl before we sampled, then the sampling distribution will be centered at the true unknown population proportion red, in other words the true proportion of the 2400 balls that are red. In this case, these 10 values of \(\widehat{p}\) are centered at 0.37.
3. The spread of this histogram, as quantified by the standard deviation of 0.0519, is called the standard error. It quantifies the variability of our estimates \(\widehat{p}\).

2.3 Virtual sampling

-

In the moderndive package, we’ve included a data frame called bowl that is a virtual version of the above sampling bowl in Figure 2.1 with all 2400 balls! While we present a snapshot of the first 10 rows of bowl below, you should View() it in RStudio to convince yourself that bowl is indeed a virtual version of the image above.

View(bowl)

Table 2.3: First 10 balls in virtual sampling bowl

ball_ID  color
1        white
2        white
3        white
4        red
5        white
6        white
7        red
8        white
9        red
10       white

Note that the balls are not actually marked with numbers; the variable ball_ID is merely used as an identification variable for each row of bowl (recall our previous discussion on identification variables in Subsection ?? of the “Data Tidying” Chapter ??).

-

Let’s replicate what the groups of students did above, but virtually. We are now going to simulate on a computer what our students did by hand in Table 2.1, using the rep_sample_n() function, which the moderndive package borrows from the infer package we’ll see shortly. The rep_sample_n() function takes the following arguments:

• tbl: a data frame representing the population you wish to infer about. We’ll set this to bowl, since this is the (virtual) population of interest.
• size: the sample size \(n\) in question. We’ll set this to 50, mimicking the number of slots in the sampling “shovel” in the image in Figure 2.1.
• replace: a logical TRUE/FALSE value indicating whether or not to put each ball back into the bowl after we’ve sampled it. In our case, we’ll set this to FALSE since we are sampling 50 balls at once, not 50 single balls individually.
• reps: the number of samples of size \(n\) = size to extract. We’ll set this to 10, mimicking the data we have in Table 2.1.

Let’s apply this function to mimic our situation above and View() the data. The output is rather large, so we won’t display it below.

all_samples <- rep_sample_n(bowl, size = 50, reps = 10)
View(all_samples)

Scrolling through the spreadsheet viewer, you’ll notice:

1. The values of replicate (1 through 10) come in bunches of 50, representing the 10 groups’ respective samples of size \(n\) = 50.
2. The ball_ID identification variable is all over the place, suggesting we really are randomly sampling (virtual) balls.
3. color represents the color of each of the virtually sampled balls.

What is the proportion red for each group as denoted by the replicate variable? Again, let’s leverage your data ninja skills from Chapter ??.

bowl_samples_virtual <- all_samples %>%
  mutate(is_red = color == "red") %>%
  group_by(replicate) %>%
  summarize(prop_red = mean(is_red))
bowl_samples_virtual

Table 2.4: Virtual simulation: 10 sample proportions red based on samples of size 50

replicate  prop_red
1          0.34
2          0.36
3          0.40
4          0.38
5          0.36
6          0.30
7          0.36
8          0.38
9          0.26
10         0.46

Compare Table 2.2 and Table 2.4: they are similar in output format, and the resulting prop_red values are also similar. Let’s plot this using the same histogram code as in Figure ??, but switching out bowl_samples for bowl_samples_virtual:
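A minimal sketch of what this might look like, mirroring the earlier histogram sketch (again, the binwidth of 0.05 is our own choice):

ggplot(bowl_samples_virtual, aes(x = prop_red)) +
  geom_histogram(binwidth = 0.05, color = "white") +
  labs(x = "Sample proportion red", y = "Number of samples")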

Figure 2.3: Virtual simulation: 10 sample proportions red based on 10 samples of size 50

We’ve replicated the sampling distribution, but using simulated random samples instead of the “in real life” random samples that our students collected in Table 2.1. Let’s compute the center of this histogram and its standard deviation, which recall has a specific name: the standard error.

bowl_samples_virtual %>%
  summarize(mean = mean(prop_red), sd = sd(prop_red))

mean  sd
0.37  0.052

2.4 “Extreme” virtual sampling

-

Say we were feeling particularly unkind to Yaw and Drew and made them draw not 10 samples of size \(n\) = 50, but TEN THOUSAND such samples. They would probably be at work for days! This is where computer simulations really come in handy: doing repetitive and boring tasks quickly. To achieve this virtually, we just use the same code as above but set reps = 10000:
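The code isn’t displayed in the original; a minimal sketch of what it might look like, re-using the pipeline from above (the binwidth of 0.02 is our own choice):

all_samples <- rep_sample_n(bowl, size = 50, reps = 10000)
bowl_samples_virtual <- all_samples %>%
  mutate(is_red = color == "red") %>%
  group_by(replicate) %>%
  summarize(prop_red = mean(is_red))
ggplot(bowl_samples_virtual, aes(x = prop_red)) +
  geom_histogram(binwidth = 0.02, color = "white") +
  labs(x = "Sample proportion red", y = "Number of samples")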

Figure 2.4: Virtual simulation: Ten thousand sample proportions red based on ten thousand samples of size 50

This distribution looks an awful lot like the normal distribution. That’s because it is the normal distribution! Let’s compute the center of this sampling distribution and the standard error again:

bowl_samples_virtual %>%
  summarize(mean = mean(prop_red), sd = sd(prop_red))

mean  sd
0.37  0.052
Learning check

(LC8.1) Repeat the above “extreme” virtual sampling exercise for 10,000 samples of size \(n\) = 100. What do you notice is different about the histogram, i.e. the sampling distribution? (A sketch of one possible approach appears after LC8.3 below.)

-

(LC8.2) Repeat the above “extreme” virtual sampling exercise for 10,000 samples of size \(n\) = 25. What do you notice is different about the histogram, i.e. the sampling distribution, when compared to when the samples were of size \(n\) = 50 and \(n\) = 100?

-

(LC8.3) Repeat the above “extreme” virtual sampling exercise for 10,000 samples of size \(n\) = 50, but where the population is the pennies dataset in the moderndive package representing 800 pennies and where the population parameter of interest is the average year of minting of the 800 pennies. See the help file ?pennies for more information about this dataset.
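The following is a minimal sketch of one possible approach to LC8.1 (the object names samples_100 and props_100 are our own); LC8.2 follows the same pattern, and LC8.3 swaps in the pennies data frame and a mean instead of a proportion.

samples_100 <- rep_sample_n(bowl, size = 100, reps = 10000)
props_100 <- samples_100 %>%
  mutate(is_red = color == "red") %>%
  group_by(replicate) %>%
  summarize(prop_red = mean(is_red))
ggplot(props_100, aes(x = prop_red)) +
  geom_histogram(binwidth = 0.02, color = "white") +
  labs(x = "Sample proportion red", y = "Number of samples")
props_100 %>%
  summarize(mean = mean(prop_red), sd = sd(prop_red))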


2.5 Central Limit Theorem

-

What you have just shown in the previous section is a very famous theorem, or mathematically proven truth, called the Central Limit Theorem, which loosely states that when sample means and sample proportions are based on larger and larger samples, the sampling distributions corresponding to these point estimates get:

1. More and more normal
2. More and more narrow

Shuyi Chiou, Casey Dunn, and Pathikrit Bhattacharyya created the following 3-minute-and-38-second video explaining this crucial statistical theorem using, as examples, what else:

1. The average weight of wild bunny rabbits!
2. The average wing span of dragons!

2.6 Conclusion


2.6.1 What’s to come?

-

This chapter serves as an introduction to the theoretical underpinnings of the statistical inference techniques that will be discussed in greater detail in Chapter ?? for confidence intervals and Chapter ?? for hypothesis testing.


2.6.2 Script of R code

-

An R script file of all R code used in this chapter is available here.

diff --git a/docs/3-hypo.html b/docs/3-hypo.html deleted file mode 100644 index 33d8ed29e..000000000

3 Hypothesis Testing


Note: This chapter is still under construction. If you would like to contribute, please check us out on GitHub at https://github.com/moderndive/moderndive_book.

-

In the meanwhile, please check out our sneak peek of infer below. For more details on infer, visit https://infer.netlify.com/.


3.1 Sneak peek of infer

• Question: Of all the cars in the mtcars dataset, do automatic cars get better gas mileage than manual cars?
• Approach: Two-sample test for difference in means.
library(dplyr)
library(ggplot2)
library(infer)

# Clean data
mtcars <- mtcars %>%
  as_tibble() %>% 
  mutate(am = factor(am))

# Observed test statistic
obs_stat <- mtcars %>% 
  group_by(am) %>%
  summarize(mean = mean(mpg)) %>%
  summarize(obs_stat = diff(mean)) %>%
  pull(obs_stat)

# Simulate null distribution of two-sample difference in means:
null_distribution <- mtcars %>%
  specify(mpg ~ am) %>%
  hypothesize(null = "independence") %>%
  generate(reps = 1000, type = "permute") %>% 
  calculate(stat = "diff in means", order = c("1", "0"))

# Visualize:
plot <- null_distribution %>% 
  visualize()
plot +
  geom_vline(xintercept = obs_stat, col = "red", size = 1)


3.2 Under construction…

-

We saw some of the main concepts of hypothesis testing introduced in Chapter ??. We will expand further on these ideas here and also provide a framework for understanding hypothesis tests in general. Instead of presenting you with lots of different formulas and scenarios, we hope to build a way to think about all hypothesis tests. You can then adapt to different scenarios as needed down the road when you encounter different statistical situations.

-

The same can be said for confidence intervals. There is one general framework that applies to all confidence intervals and we will elaborate on this further in Chapter 2. The specifics may change slightly for each variation, but the important idea is to understand the general framework so that you can apply it to more specific problems. We believe that this approach is much better in the long-term than teaching you specific tests and confidence intervals rigorously. You can find fully-worked out examples for five common hypothesis tests and their corresponding confidence intervals in Appendix ??.

-

We recommend that you carefully review these examples as they also cover how the general frameworks apply to traditional normal-based methodologies like the \(t\)-test and normal-theory confidence intervals. You’ll see there that these methods are just approximations for the general computational frameworks, but require conditions to be met for their results to be valid. The general frameworks using randomization, simulation, and bootstrapping do not hold the same sorts of restrictions and further advance computational thinking, which is one big reason for their emphasis throughout this textbook.


Needed packages

-

Let’s load all the packages needed for this chapter (this assumes you’ve already installed them). If needed, read Section ?? for information on how to install and load R packages.

library(dplyr)
library(ggplot2)
library(mosaic)
library(knitr)
library(nycflights13)
library(ggplot2movies)
library(broom)
-
-

3.3 When inference is not needed

-

Before we delve into the two techniques of inference (hypothesis testing and confidence intervals), it’s good to remember that there are cases where you need not perform a rigorous statistical inference. An important and time-saving skill is to ALWAYS do exploratory data analysis using dplyr and ggplot2 before thinking about running a hypothesis test. Let’s look at such an example by selecting a sample of flights traveling to Boston and to San Francisco from New York City in the flights data frame in the nycflights13 package. (We will first remove flights with missing data using na.omit and then sample 100 flights going to each of the two airports.)

bos_sfo <- flights %>%
  na.omit() %>%
  filter(dest %in% c("BOS", "SFO")) %>%
  group_by(dest) %>%
  sample_n(100)

Suppose we were interested in seeing if the air_time to SFO in San Francisco was statistically greater than the air_time to BOS in Boston. As suggested, let’s begin with some exploratory data analysis to get a sense for how the two variables of air_time and dest relate for these two destination airports:

bos_sfo_summary <- bos_sfo %>% 
  group_by(dest) %>% 
  summarize(mean_time = mean(air_time),
            sd_time = sd(air_time))
kable(bos_sfo_summary)

dest  mean_time  sd_time
BOS   38.75      4.066
SFO   345.93     17.394

Looking at these results, we can clearly see that SFO air_time is much larger than BOS air_time. The standard deviation is also extremely informative here.


Learning check

(LC10.1) Could we make the same type of immediate conclusion that SFO had a statistically greater air_time if, say, its corresponding standard deviation was 200 minutes? What about 100 minutes? Explain.


To further understand just how different the air_time variable is for BOS and SFO, let’s look at a boxplot:

ggplot(data = bos_sfo, mapping = aes(x = dest, y = air_time)) +
  geom_boxplot()

-

Since there is no overlap at all, we can conclude that the air_time for San Francisco flights is statistically greater (at any level of significance) than the air_time for Boston flights. This is a clear example of not needing to do anything more than some simple descriptive statistics to get an appropriate inferential conclusion. This is one reason why you should ALWAYS investigate the sample data first using dplyr and ggplot2 via exploratory data analysis.

-

As you get more and more practice with hypothesis testing, you’ll be better able to determine in many cases whether or not the results will be statistically significant. There are circumstances where it is difficult to tell, but you should always try to make a guess FIRST about significance after you have completed your data exploration and before you actually begin the inferential techniques.


3.4 Basics of hypothesis testing

-

In a hypothesis test, we will use data from a sample to help us decide between two competing hypotheses about a population. We make these hypotheses more concrete by specifying them in terms of at least one population parameter of interest. We refer to the competing claims about the population as the null hypothesis, denoted by \(H_0\), and the alternative (or research) hypothesis, denoted by \(H_a\). The roles of these two hypotheses are NOT interchangeable.

• The claim for which we seek significant evidence is assigned to the alternative hypothesis. The alternative is usually what the experimenter or researcher wants to establish or find evidence for.
• Usually, the null hypothesis is a claim that there really is “no effect” or “no difference.” In many cases, the null hypothesis represents the status quo or that nothing interesting is happening.
• We assess the strength of evidence by assuming the null hypothesis is true and determining how unlikely it would be to see sample results/statistics as extreme as (or more extreme than) those in the original sample.

Hypothesis testing brings about many weird and incorrect notions in the scientific community and society at large. One reason for this is that statistics has traditionally been thought of as a magic box of algorithms and procedures to get to results; this is readily apparent if you do a Google search of “flowchart statistics hypothesis tests”. There are so many different complex ways to determine which test is appropriate.

-

You’ll see that we don’t need to rely on these complicated series of assumptions and procedures to conduct a hypothesis test any longer. These methods were introduced in a time when computers weren’t powerful. Your cellphone (in 2016) has more power than the computers that sent NASA astronauts to the moon after all. We’ll see that ALL hypothesis tests can be broken down into the following framework given by Allen Downey here:

Figure 3.1: Hypothesis Testing Framework

Before we hop into this framework, we will provide another way to think about hypothesis testing that may be useful.


3.5 Criminal trial analogy

-

We can think of hypothesis testing in the same context as a criminal trial in the United States. A criminal trial in the United States is a familiar situation in which a choice between two contradictory claims must be made.

1. The person accused of the crime must be judged either guilty or not guilty.
2. Under the U.S. system of justice, the individual on trial is initially presumed not guilty.
3. Only STRONG EVIDENCE to the contrary causes the not guilty claim to be rejected in favor of a guilty verdict.
4. The phrase “beyond a reasonable doubt” is often used to set the cutoff value for when enough evidence has been given to convict.

Theoretically, we should never say “The person is innocent.” but instead “There is not sufficient evidence to show that the person is guilty.”

-

Now let’s compare that to how we look at a hypothesis test.

1. The decision about the population parameter(s) must be judged to follow one of two hypotheses.
2. We initially assume that \(H_0\) is true.
3. The null hypothesis \(H_0\) will be rejected (in favor of \(H_a\)) only if the sample evidence strongly suggests that \(H_0\) is false. If the sample does not provide such evidence, \(H_0\) will not be rejected.
4. The analogy to “beyond a reasonable doubt” in hypothesis testing is what is known as the significance level. This will be set before conducting the hypothesis test and is denoted as \(\alpha\). Common values for \(\alpha\) are 0.1, 0.01, and 0.05.

3.5.1 Two possible conclusions

-

Therefore, we have two possible conclusions with hypothesis testing:

• Reject \(H_0\)
• Fail to reject \(H_0\)

Gut instinct says that “Fail to reject \(H_0\)” should say “Accept \(H_0\)” but this technically is not correct. Accepting \(H_0\) is the same as saying that a person is innocent. We cannot show that a person is innocent; we can only say that there was not enough substantial evidence to find the person guilty.

-

When you run a hypothesis test, you are the jury of the trial. You decide whether there is enough evidence to convince yourself that \(H_a\) is true (“the person is guilty”) or that there was not enough evidence to convince yourself \(H_a\) is true (“the person is not guilty”). You must convince yourself (using statistical arguments) which hypothesis is the correct one given the sample information.

-

Important note: Therefore, DO NOT WRITE “Accept \(H_0\)” any time you conduct a hypothesis test. Instead write “Fail to reject \(H_0\).”


3.6 Types of errors in hypothesis testing

-

Unfortunately, just as a jury or a judge can make an incorrect decision in regards to a criminal trial by reaching the wrong verdict, there is some chance we will reach the wrong conclusion via a hypothesis test about a population parameter. As with criminal trials, this comes from the fact that we don’t have complete information, but rather a sample from which to try to infer about a population.

-

The possible erroneous conclusions in a criminal trial are

• an innocent person is convicted (found guilty), or
• a guilty person is set free (found not guilty).

The possible errors in a hypothesis test are

• rejecting \(H_0\) when in fact \(H_0\) is true (Type I Error), or
• failing to reject \(H_0\) when in fact \(H_0\) is false (Type II Error).

The risk of error is the price researchers pay for basing an inference about a population on a sample. With any reasonable sample-based procedure, there is some chance that a Type I error will be made and some chance that a Type II error will occur.

-

To help understand the concepts of Type I error and Type II error, observe the following table:

Figure 3.2: Type I and Type II errors

If we are using sample data to make inferences about a parameter, we run the risk of making a mistake. Obviously, we want to minimize our chance of error; we want a small probability of drawing an incorrect conclusion.

• The probability of a Type I Error occurring is denoted by \(\alpha\) and is called the significance level of a hypothesis test.
• The probability of a Type II Error is denoted by \(\beta\).

Formally, we can define \(\alpha\) and \(\beta\) in regards to the table above, but for hypothesis tests instead of a criminal trial.

• \(\alpha\) corresponds to the probability of rejecting \(H_0\) when, in fact, \(H_0\) is true.
• \(\beta\) corresponds to the probability of failing to reject \(H_0\) when, in fact, \(H_0\) is false.

Ideally, we want \(\alpha = 0\) and \(\beta = 0\), meaning that the chance of making an error does not exist. When we have to use incomplete information (sample data), it is not possible to have both \(\alpha = 0\) and \(\beta = 0\). We will always have the possibility of at least one error existing when we use sample data.

-

Usually, what is done is that \(\alpha\) is set before the hypothesis test is conducted and then the evidence is judged against that significance level. Common values for \(\alpha\) are 0.05, 0.01, and 0.10. If \(\alpha = 0.05\), we are using a testing procedure that, used over and over with different samples, rejects a TRUE null hypothesis five percent of the time.

-

So if we can set \(\alpha\) to be whatever we want, why choose 0.05 instead of 0.01 or even better 0.0000000000000001? Well, a small \(\alpha\) means the test procedure requires the evidence against \(H_0\) to be very strong before we can reject \(H_0\). This means we will almost never reject \(H_0\) if \(\alpha\) is very small. If we almost never reject \(H_0\), the probability of a Type II Error – failing to reject \(H_0\) when we should – will increase! Thus, as \(\alpha\) decreases, \(\beta\) increases and as \(\alpha\) increases, \(\beta\) decreases. We, therefore, need to strike a balance in \(\alpha\) and \(\beta\) and the common values for \(\alpha\) of 0.05, 0.01, and 0.10 usually lead to a nice balance.

Learning check

(LC10.2) Reproduce the table above about errors, but for a hypothesis test, instead of the one provided for a criminal trial.


3.6.1 Logic of hypothesis testing

• Take a random sample (or samples) from a population (or multiple populations).
• If the sample data are consistent with the null hypothesis, do not reject the null hypothesis.
• If the sample data are inconsistent with the null hypothesis (in the direction of the alternative hypothesis), reject the null hypothesis and conclude that there is evidence the alternative hypothesis is true (based on the particular sample collected).

3.7 Statistical significance

-

The idea that sample results are more extreme than we would reasonably expect to see by random chance if the null hypothesis were true is the fundamental idea behind statistical hypothesis tests. If data at least as extreme would be very unlikely if the null hypothesis were true, we say the data are statistically significant. Statistically significant data provide convincing evidence against the null hypothesis in favor of the alternative, and allow us to generalize our sample results to the claim about the population.

Learning check

(LC10.3) What is wrong about saying “The defendant is innocent.” based on the US system of criminal trials?

-

(LC10.4) What is the purpose of hypothesis testing?

-

(LC10.5) What are some flaws with hypothesis testing? How could we alleviate them?


3.8 Example: Revisiting the Lady Tasting Tea

-

Recall the “There is Only One Test” diagram from earlier:

Figure 3.3: Hypothesis Testing Framework

We will now walk through how each of the steps in the diagram applies to determining whether the lady tasting tea was actually better than chance at determining whether or not milk was added first. We will see that the process of creating a null distribution is a statistical way of quantifying surprise.


3.8.1 Data

-

Let’s assume as we did in Chapter ?? that the lady is correct in determining whether milk was added first or not in 9 out of 10 trials. Our data, therefore, may look something like

x
Correct
Correct
Correct
Incorrect
Correct
Correct
Correct
Correct
Correct
Correct

3.8.2 Test statistic \(\delta\)

-

We are interested in the number of Correct out of our 10 trials. We can denote this number of successes using the symbol \(t\), where \(t\) corresponds to total. This is our test statistic \(\delta\) in this case.


3.8.3 Observed effect \(\delta^*\)

-

The actual observed value of the test statistic from our observed sample is \(\hat{t}_{obs} = 9\). Thus, \(\delta^* = 9\).


3.8.4 Model of \(H_0\)

-

Our null hypothesis is that the lady is only as good as chance at guessing correctly. Hypotheses always correspond to parameters and are denoted with Greek letters. Thus, symbolically, we have \(H_0: \tau = 5\). Since we are assuming chance and we have 10 trials, each with a 0.5 probability of success, we have \(\tau = 10 \times 0.5 = 5\).


3.8.5 Simulated data

-

We now want to use this null hypothesis to simulate the test statistic assuming that the null hypothesis is true. Therefore, we want to figure out a way to simulate 10 trials, getting either the choice Correct or Incorrect, assuming that the probability of success (getting it Correct) in any given trial is 0.5.

-

Tactile simulation

-

When you are presented with a hypothesis testing problem, frequently the most challenging portion is setting up how to simulate the data assuming the null hypothesis is true. To facilitate this, setting up a tactile, hands-on experiment can help.

-

In this case, flipping a fair coin is a great way to simulate this process. This simulates how the sample could be collected assuming the null hypothesis is true. To simulate 10 trials, we could flip the fair coin and record Heads as Correct and Tails as Incorrect.

-

Some simulated data using this coin flipping procedure may look like the following. Note that this data frame is not tidy, but is a convenient way to look at the results of the simulation in this wide format. The numbers on the far left correspond to the number of the trial.

Table 3.1: A table of three sets of 10 coin flips

      sample1    sample2    sample3
1     Correct    Correct    Correct
2     Correct    Incorrect  Incorrect
3     Incorrect  Incorrect  Correct
4     Incorrect  Incorrect  Correct
5     Correct    Incorrect  Incorrect
6     Correct    Incorrect  Correct
7     Incorrect  Incorrect  Correct
8     Incorrect  Correct    Incorrect
9     Incorrect  Correct    Incorrect
10    Incorrect  Correct    Incorrect

We then use the formula for the Test Statistic to determine the simulated test statistic for each of these simulated samples. So in this case we have

-

\(t_1 = 4\), \(t_2 = 4\), \(t_3 = 5\)
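If we wanted to generate such simulated samples in R rather than with a physical coin, a minimal sketch using the mosaic package’s rflip() (the same function used below) might look like the following; three_samples is a hypothetical name and your simulated values will differ.

library(mosaic)
# Flip a fair coin 10 times and repeat this 3 times; each row records the
# number of heads (i.e. Correct guesses) in one set of 10 flips
three_samples <- do(3) * rflip(10)
three_samples$heads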


3.8.6 Distribution of \(\delta\) under \(H_0\)

-

We could continue this process, say, 5000 times by flipping a coin in sets of 10 for 5000 repetitions and taking note of how many heads out of 10 we have for each set. It’s at this point that you surely realize that a computer can do this procedure much faster and more efficiently than the tactile experiment with a coin.

-

Recall that we’ve already created the distribution of 5000 such coin flips and we’ve stored these values in the heads variable in the simGuesses data frame:

simGuesses <- do(5000) * rflip(10)
ggplot(data = simGuesses, aes(x = factor(heads))) +
  geom_bar()


3.8.7 The p-value


Definition: \(p\)-value:

-

The p-value is the probability of observing a sample statistic as extreme as, or more extreme than, what was observed, assuming that the null hypothesis of purely chance behavior is true.


This definition may be a little intimidating the first time you read it, but it’s important to come back to this “The Lady Tasting Tea” problem whenever you encounter \(p\)-values as you begin to learn about the concept. Here the \(p\)-value corresponds to how many times in our null distribution of heads 9 or more heads occurred.

-

We can use another neat feature of R to calculate the \(p\)-value for this problem. Note that “more extreme” in this case corresponds to looking at values of 9 or greater, since our alternative hypothesis invokes a right-tail test corresponding to a “greater than” hypothesis of \(H_a: \tau > 5\). In other words, we are looking to see how likely it is for the lady to pick 9 or more correct instead of 9 or fewer correct. We’d like to go in the right direction.

pvalue_tea <- simGuesses %>%
  filter(heads >= 9) %>%
  nrow() / nrow(simGuesses)

Let’s walk through each step of this calculation:

-
  1. First, pvalue_tea will be the name of our calculated \(p\)-value and the assignment operator <- directs us to this naming.

  2. We are working with the simGuesses data frame here so that comes immediately before the pipe operator.

  3. We would like to only focus on the rows in our simGuesses data frame that have heads values of 9 or 10. This represents simulated statistics “as extreme or more extreme” than what we observed (9 correct guesses out of 10). To get a glimpse of what we have up to this point, run simGuesses %>% filter(heads >= 9) %>% View().

  4. Now that we have narrowed the focus to only those rows whose number of heads out of 10 flips is 9 or more, we count how many of those there are. The nrow function gives how many entries are in this filtered data frame. Lastly, we calculate the proportion that are at least as extreme as our observed value of 9 by dividing by the total number of simulations (5,000).
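Equivalently, since the comparison heads >= 9 produces a logical vector, taking its mean gives the same proportion in a single step. This is just a quick cross-check sketch, assuming simGuesses exists as created above:

# Proportion of the 5000 simulated sets of 10 flips with 9 or more heads
mean(simGuesses$heads >= 9)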

We can see that the observed statistic of 9 correct guesses is not a likely outcome assuming the null hypothesis is true. Only around 1% of the outcomes in our 5000 simulations fall at or above 9 successes. We have evidence supporting the conclusion that the person is actually better than just guessing at random at determining whether milk has been added first or not. To better visualize this we can also make use of blue shading on the histogram corresponding to the \(p\)-value:

-
ggplot(data = simGuesses, aes(x = factor(heads), fill = (heads >= 9))) +
-  geom_bar() +
-  labs(x = "heads")
-
Figure 3.4: Barplot of heads with p-value highlighted

-
-

This helps us better see just how few of the values of heads are at our observed value or more extreme. This idea of a \(p\)-value can be extended to the more traditional methods based on the normal and \(t\) distributions that introductory statistics courses have typically presented. Those traditional methods were used because statisticians haven’t always been able to run 5000 simulations on a computer within seconds. We’ll elaborate on this more in a few sections.

-
-

-Learning check -

-
-

(LC10.6) How could we make Table 3.1 into a tidy data frame?

-

(LC10.7) What is meant by “pseudo-random number generation?”

-

(LC10.8) How can simulation be used to help us address the question of whether or not an observed result is statistically significant?

-

(LC10.9) In Chapter ??, we noted that barplots should be used when creating a plot of categorical variables. Why are we using barplots to make a plot of the numerical variable heads in this chapter?

-
- -
-
-
-
-
-

3.9 Example: Comparing two means

-
-

3.9.1 Randomization/permutation

-

We will now focus on building hypotheses looking at the difference between two population means in an example. We will denote population means using the Greek symbol \(\mu\) (pronounced “mu”). Thus, we will be looking to see if one group “out-performs” another group. This is quite possibly the most common type of statistical inference and serves as a basis for many other types of analyses when comparing the relationship between two variables.

-

Our null hypothesis will be of the form \(H_0: \mu_1 = \mu_2\), which can also be written as \(H_0: \mu_1 - \mu_2 = 0\). Our alternative hypothesis will be of the form \(H_a: \mu_1 \star \mu_2\) (or \(H_a: \mu_1 - \mu_2 \, \star \, 0\)) where \(\star\) = \(<\), \(\ne\), or \(>\) depending on the context of the problem. You needn’t focus on these new symbols too much at this point. They are just a shortcut way for us to describe our hypotheses.

-

As we saw earlier, simulation is a valuable tool when conducting inferences based on one population variable. We will see that the process of randomization (also known as permutation) will be valuable in conducting tests comparing quantitative values from two groups.

-
-
-

3.9.2 Comparing action and romance movies

-

The movies dataset in the ggplot2movies package contains information on a large number of movies that have been rated by users of IMDB.com (Wickham 2015). We are interested in the question here of whether Action movies are rated higher on IMDB than Romance movies. We will first need to do a little bit of data wrangling using the ideas from Chapter ?? to get the data in the form that we would like:

-
(movies_trimmed <- movies %>% select(title, year, rating, Action, Romance))
-
# A tibble: 58,788 x 5
-                      title  year rating Action Romance
-                      <chr> <int>  <dbl>  <int>   <int>
- 1                        $  1971    6.4      0       0
- 2        $1000 a Touchdown  1939    6.0      0       0
- 3   $21 a Day Once a Month  1941    8.2      0       0
- 4                  $40,000  1996    8.2      0       0
- 5 $50,000 Climax Show, The  1975    3.4      0       0
- 6                    $pent  2000    4.3      0       0
- 7                  $windle  2002    5.3      1       0
- 8                     '15'  2002    6.7      0       0
- 9                      '38  1987    6.6      0       0
-10                  '49-'17  1917    6.0      0       0
-# ... with 58,778 more rows
-

Note that Action and Romance are binary variables here. To remove any overlap of movies (and potential confusion) that are both Action and Romance, we will remove them from our population:

-
movies_trimmed <- movies_trimmed %>%
-  filter(!(Action == 1 & Romance == 1))
-

We will now create a new variable called genre that specifies whether a movie in our movies_trimmed data frame is an "Action" movie, a "Romance" movie, or "Neither". We aren’t really interested in the "Neither" category here so we will exclude those rows as well. Lastly, the Action and Romance columns are not needed anymore since they are encoded in the genre column.

-
movies_trimmed <- movies_trimmed %>%
-  mutate(genre = ifelse(Action == 1, "Action",
-                        ifelse(Romance == 1, "Romance",
-                               "Neither"))) %>%
-  filter(genre != "Neither") %>%
-  select(-Action, -Romance)
-

We are left with 8878 movies in our population dataset that focuses on only "Action" and "Romance" movies.
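As a quick sanity check on this count, we can tally the remaining movies by genre. This is just a sketch, assuming the dplyr package is loaded and movies_trimmed has been created as above:

# How do the 8878 remaining movies split across the two genres?
movies_trimmed %>% 
  count(genre)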

-
-

-Learning check -

-
-

(LC10.10) Why are the different genre variables stored as binary variables (1s and 0s) instead of just listing the genre as a column of values like “Action”, “Comedy”, etc.?

-

(LC10.11) What complications could arise from excluding movies that are both Action and Romance? Should we question the results of our hypothesis test? Explain.

-
- -
-

Let’s now visualize the distributions of rating across both levels of genre. Think about what type(s) of plot is/are appropriate here before you proceed:

-
ggplot(data = movies_trimmed, aes(x = genre, y = rating)) +
-  geom_boxplot()
-
Figure 3.5: Rating vs genre in the population

-
-

We can see that the middle 50% of ratings for "Action" movies is more spread out than that of "Romance" movies in the population. "Romance" has outliers at both the top and bottom of the scale though. We are initially interested in comparing the mean rating across these two groups so a faceted histogram may also be useful:

-
ggplot(data = movies_trimmed, mapping = aes(x = rating)) +
-  geom_histogram(binwidth = 1, color = "white", fill = "dodgerblue") +
-  facet_grid(genre ~ .)
-
Figure 3.6: Faceted histogram of genre vs rating

-
-

Important note: Remember that we hardly ever have access to the population values as we do here. This example and the nycflights13 dataset were used to create a common flow from chapter to chapter. In nearly all circumstances, we’ll need to use only a sample of the population to try to infer conclusions about the unknown population parameter values. These examples do show a nice relationship between statistics (where data is usually small and more focused on experimental settings) and data science (where data is frequently large and collected without experimental conditions).

-
-
-

3.9.3 Sampling \(\rightarrow\) randomization

-

We can use hypothesis testing to determine, for example, whether a treatment has an effect relative to a control, or more generally whether one group performs better than, worse than, or differently from another. We will also use confidence intervals to determine the size of the effect, if it exists. You’ll see more on this in Chapter 2.

-

We are interested here in seeing how we can use a random sample of action movies and a random sample of romance movies from movies to determine if a statistical difference exists in the mean ratings of each group.

-
-

-Learning check -

-
-

(LC10.12) Define the relevant parameters here in terms of the populations of movies.

-
- -
-
-
-

3.9.4 Data

-

Let’s select a random sample of 34 action movies and a random sample of 34 romance movies. (The number 34 was chosen somewhat arbitrarily here.)

-
set.seed(2017)
-movies_genre_sample <- movies_trimmed %>% 
-  group_by(genre) %>%
-  sample_n(34) %>% 
-  ungroup()
-

Note the addition of the ungroup() function here. It will be useful shortly in allowing us to shuffle the values of rating across genre. Without ungroup(), our analysis would not work, since the data would stay grouped by the levels of genre.

-

We can now observe the distributions of our two sample ratings for both groups. Remember that these plots should be rough approximations of our population distributions of movie ratings for "Action" and "Romance" in our population of all movies in the movies data frame.

-
ggplot(data = movies_genre_sample, aes(x = genre, y = rating)) +
-  geom_boxplot()
-
Figure 3.7: Genre vs rating for our sample

-
-
ggplot(data = movies_genre_sample, mapping = aes(x = rating)) +
-  geom_histogram(binwidth = 1, color = "white", fill = "dodgerblue") +
-  facet_grid(genre ~ .)
-
Figure 3.8: Genre vs rating for our sample as faceted histogram

-
-
-

-Learning check -

-
-

(LC10.13) What single value could we change to improve how well the sample distribution approximates the population distribution?

-
- -
-

Do we have reason to believe, based on the sample distributions of rating over the two groups of genre, that there is a significant difference between the mean rating for action movies compared to romance movies? It’s hard to say just based on the plots. The boxplot does show that the median sample rating is higher for romance movies, but the histogram isn’t as clear. The two groups have somewhat differently shaped distributions but they are both over similar values of rating. It’s often useful to calculate the mean and standard deviation as well, conditioned on the two levels.

-
summary_ratings <- movies_genre_sample %>% 
-  group_by(genre) %>%
-  summarize(mean = mean(rating),
-            std_dev = sd(rating),
-            n = n())
-summary_ratings %>% kable()
genre      mean    std_dev    n
Action    5.112      1.489   34
Romance   6.062      1.149   34
-

-Learning check -

-
-

(LC10.14) Why did we not specify na.rm = TRUE here as we did in Chapter ???

-
- -
-

We see that the sample mean rating for romance movies, \(\bar{x}_{r}\), is greater than the similar measure for action movies, \(\bar{x}_a\). But is it statistically significantly greater (thus, leading us to conclude that the means are statistically different)? The standard deviation can provide some insight here but with these standard deviations being so similar it’s still hard to say for sure.

-
-

-Learning check -

-
-

(LC10.15) Why might the standard deviation provide some insight about the means being statistically different or not?

-
- -
-
-
-

3.9.5 Model of \(H_0\)

-

The hypotheses we specified can also be written in another form to better give us an idea of what we will be simulating to create our null distribution.

-
  • \(H_0: \mu_r - \mu_a = 0\)
  • \(H_a: \mu_r - \mu_a \ne 0\)
-
-

3.9.6 Test statistic \(\delta\)

-

We are, therefore, interested in seeing whether the difference in the sample means, \(\bar{x}_r - \bar{x}_a\), is statistically different from 0. R has a built-in command that can calculate the difference in these two sample means.
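That built-in command is diff(), which subtracts each element of a vector from the element that follows it. A toy sketch (the numbers below are made up purely for illustration) shows that, with the genres sorted alphabetically so that Action comes first, diff() returns the Romance mean minus the Action mean:

# diff() returns the second element minus the first element
diff(c(5.11, 6.06))   # returns 0.95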

-
-
-

3.9.7 Observed effect \(\delta^*\)

-
mean_ratings <- movies_genre_sample %>% 
-  group_by(genre) %>%
-  summarize(mean = mean(rating))
-obs_diff <- diff(mean_ratings$mean)
-

We see here that the diff function calculates \(\bar{x}_r - \bar{x}_a = 6.0618 - 5.1118 = 0.95\). We will now proceed similarly to how we conducted the hypothesis test above for the Lady Tasting Tea using simulation. Our goal is to figure out a random process with which to simulate the null hypothesis being true. Earlier in this chapter, we used the flipping of a fair coin as the random process for simulating the null hypothesis being true (\(H_0: \tau = 5\)).

-
-
-

3.9.8 Simulated data

-

Tactile simulation

-

Here, assuming the two population means are equal (\(H_0: \mu_r - \mu_a = 0\)), we can look at this from a tactile point of view by using index cards. There are \(n_r = 34\) data elements corresponding to romance movies and \(n_a = 34\) for action movies. We can write the 34 ratings from our sample for romance movies on one set of 34 index cards and the 34 ratings for action movies on another set of 34 index cards. (Note that the sample sizes need not be the same.)

-

The next step is to put the two stacks of index cards together, creating a new set of 68 cards. If we assume that the two population means are equal, we are saying that there is no association between ratings and genre (romance vs action). We can use the index cards to create two new stacks for romance and action movies. First, we must shuffle all the cards thoroughly. After doing so, in this case with equal values of sample sizes, we split the deck in half.

-

We then calculate the new sample mean rating of the romance deck, and also the new sample mean rating of the action deck. This creates one simulation of the samples that were collected originally. We next want to calculate a statistic from these two samples. Instead of actually doing the calculation using index cards, we can use R as we have before to simulate this process.

-
shuffled_ratings <- #movies_trimmed %>%
-  movies_genre_sample %>% 
-     mutate(genre = shuffle(genre)) %>% 
-     group_by(genre) %>%
-     summarize(mean = mean(rating))
-diff(shuffled_ratings$mean)
-
[1] -0.1324
-
-

-Learning check -

-
-

(LC10.16) How would the tactile shuffling of index cards change if we had different samples of say 20 action movies and 60 romance movies? Describe each step that would change.

-

(LC10.17) Why are we taking the difference in the means of the cards in the new shuffled decks?

-
- -
-
-
-

3.9.9 Distribution of \(\delta\) under \(H_0\)

-

The only new command here is shuffle from the mosaic package, which does what we would expect it to do. It simulates a shuffling of the ratings between the two levels of genre just as we could have done with index cards. We can now proceed in a similar way to what we have done previously with the Lady Tasting Tea example by repeating this process many times to create a null distribution of simulated differences in sample means.

-
set.seed(2017)
-many_shuffles <- do(5000) * 
-  (movies_genre_sample %>% 
-     mutate(genre = shuffle(genre)) %>% 
-     group_by(genre) %>%
-     summarize(mean = mean(rating))
-   )
-

It is a good idea here to View the many_shuffles data frame via View(many_shuffles). We need to figure out a way to subtract the first value of mean from the second value of mean for each of the 5000 simulations. This is a little tricky but the group_by function comes to our rescue here:

-
rand_distn <- many_shuffles %>%
-  group_by(.index) %>%
-  summarize(diffmean = diff(mean))
-head(rand_distn, 10)
-
# A tibble: 10 x 2
-   .index  diffmean
-    <dbl>     <dbl>
- 1      1 -0.132353
- 2      2 -0.197059
- 3      3 -0.026471
- 4      4  0.714706
- 5      5 -0.473529
- 6      6 -0.120588
- 7      7 -0.173529
- 8      8 -0.208824
- 9      9 -0.008824
-10     10 -0.332353
-

We can now plot the distribution of these simulated differences in means:

-
ggplot(data = rand_distn, aes(x = diffmean)) +
-  geom_histogram(color = "white", bins = 20)
-
Figure 3.9: Simulated differences in means histogram

-
-
-
-

3.9.10 The p-value

-

Remember that we are interested in seeing where our observed sample mean difference of 0.95 falls on this null/randomization distribution. We are interested simply in a difference here, so “more extreme” corresponds to values in both tails of the distribution. Let’s shade our null distribution to show a visual representation of our \(p\)-value:

-
ggplot(data = rand_distn, aes(x = diffmean, fill = (abs(diffmean) >= obs_diff))) +
-  geom_histogram(color = "white", bins = 20)
-
Figure 3.10: Shaded histogram to show p-value

-
-

Remember that the observed difference in means was 0.95. We have shaded green all values at or above that value and also shaded green those values at or below its negative value (since this is a two-tailed test). We can add a vertical line to represent both the observed difference and its negative instead. To better estimate how large the \(p\)-value will be, we also increase the number of bins to 100 here from 20:

-
ggplot(data = rand_distn, aes(x = diffmean)) +
-  geom_histogram(color = "white", bins = 100) +
-  geom_vline(xintercept = obs_diff, color = "red") +
-  geom_vline(xintercept = -obs_diff, color = "red")
-
Figure 3.11: Histogram with vertical lines corresponding to observed statistic

-
-

At this point, it is important to take a guess as to what the \(p\)-value may be. We can see that there are only a few shuffled differences as extreme or more extreme than our observed effect (in both directions). Maybe we guess that this \(p\)-value is somewhere around 2%, or maybe 3%, but certainly not 30% or more. You’ll find yourself getting very strange results if you’ve messed up the signs in your calculation of the \(p\)-value, so you should always check that your computed value falls in a reasonable range given the histogram. Lastly, we calculate the \(p\)-value directly using dplyr:

-
(pvalue_movies <- rand_distn %>%
-  filter(abs(diffmean) >= obs_diff) %>%
-  nrow() / nrow(rand_distn))
-
[1] 0.0042
-

We have around 0.42% of values as extreme or more extreme than our observed effect in both directions. Assuming we are using a 5% significance level for \(\alpha\), we have evidence supporting the conclusion that the mean rating for romance movies is different from that of action movies. The next important idea is to better understand just how much higher a mean rating we can expect romance movies to have compared to action movies. This can be addressed by creating a 95% confidence interval, as we will explore in Chapter 2.

-
-

-Learning check -

-
-

(LC10.18) Conduct the same analysis comparing action movies versus romance movies, but use the median rating instead of the mean rating. Make sure to use the %>% pipe operator as much as possible. What was different and what was the same?

-

(LC10.19) What conclusions can you make from viewing the faceted histogram looking at rating versus genre that you couldn’t see when looking at the boxplot?

-

(LC10.20) Describe in a paragraph how we used Allen Downey’s diagram to conclude if a statistical difference existed between mean movie ratings for action and romance movies.

-

(LC10.21) Why are we relatively confident that the distributions of the sample ratings will be good approximations of the population distributions of ratings for the two genres?

-

(LC10.22) Using the definition of “\(p\)-value”, write in words what the \(p\)-value represents for the hypothesis test above comparing the mean rating of romance to action movies.

-

(LC10.23) What is the value of the \(p\)-value for the hypothesis test comparing the mean rating of romance to action movies?

-

(LC10.24) Do the results of the hypothesis test match up with the original plots we made looking at the population of movies? Why or why not?

-
- -
-
-
-

3.9.11 Summary

-

To review, these are the steps to take whenever you’d like to conduct a hypothesis test comparing values from the distributions of two groups (a condensed R sketch of these steps for the movies example follows the list):

-
  • Simulate many samples using a random process that matches the way the original data were collected and that assumes the null hypothesis is true.

  • Collect the values of a sample statistic for each sample created using this random process to build a randomization distribution.

  • Assess the significance of the original sample by determining where its sample statistic lies in the randomization distribution.

  • If the proportion of values as extreme or more extreme than the observed statistic in the randomization distribution is smaller than the pre-determined significance level \(\alpha\), we reject \(H_0\). Otherwise, we fail to reject \(H_0\). (If no significance level is given, one can assume \(\alpha = 0.05\).)
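Put together in code, these steps for the movies example correspond roughly to the following condensed sketch (assuming the dplyr and mosaic packages are loaded and movies_genre_sample exists as before):

# The observed statistic: the difference in sample mean ratings
mean_ratings <- movies_genre_sample %>%
  group_by(genre) %>%
  summarize(mean = mean(rating))
obs_diff <- diff(mean_ratings$mean)

# Simulate the null hypothesis 5000 times by shuffling the genre labels
# and collect the difference in means for each shuffle
many_shuffles <- do(5000) *
  (movies_genre_sample %>%
     mutate(genre = shuffle(genre)) %>%
     group_by(genre) %>%
     summarize(mean = mean(rating)))
rand_distn <- many_shuffles %>%
  group_by(.index) %>%
  summarize(diffmean = diff(mean))

# The two-tailed p-value, to be compared against alpha = 0.05
rand_distn %>%
  filter(abs(diffmean) >= obs_diff) %>%
  nrow() / nrow(rand_distn)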

3.10 Building theory-based methods using computation

-

As a point of reference, we will now discuss the traditional theory-based way to conduct the hypothesis test for determining if there is a statistically significant difference in the sample mean rating of Action movies versus Romance movies. This method and ones like it work very well when the assumptions are met in order to run the test. They are based on probability models and distributions such as the normal and \(t\)-distributions.

-

These traditional methods have been used for many decades, dating back to a time when researchers didn’t have access to computers that could run 5000 simulations in under a minute. They had to base their methods on probability theory instead. Many fields and researchers continue to use these methods, and that is the biggest reason for their inclusion here. It’s important to remember that a \(t\)-test or a \(z\)-test is really just an approximation of what you have already seen in this chapter using simulation and randomization. The focus here is on understanding how the shape of the \(t\)-curve comes about without digging deeply into the mathematical underpinnings.

-
-

3.10.1 Example: \(t\)-test for two independent samples

-

A common technique in statistics is normalization. This entails calculating the mean and standard deviation of a variable, then subtracting the mean from each value of the variable and dividing by the standard deviation. The most common normalization is known as the \(z\)-score. The formula for a \(z\)-score is \[Z = \frac{x - \mu}{\sigma},\] where \(x\) represents the value of a variable, \(\mu\) represents the mean of the variable, and \(\sigma\) represents the standard deviation of the variable. Thus, if your variable has 10 elements, each one has a corresponding \(z\)-score that gives how many standard deviations away that value is from its mean. If the variable itself follows a normal distribution, its \(z\)-scores follow the standard normal distribution with mean 0 and standard deviation 1, which has the common, bell-shaped pattern seen below.

-

-
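As a small sketch of this calculation in R, consider a hypothetical numeric variable x with 10 elements (the values below are made up for illustration); its z-scores can be computed directly from the formula or with the built-in scale() function:

# A hypothetical variable with 10 elements
x <- c(61, 53, 69, 57, 66, 48, 72, 55, 63, 59)

# z-scores: subtract the mean, then divide by the standard deviation
z <- (x - mean(x)) / sd(x)
z

# scale() carries out the same standardization (it returns a one-column matrix)
as.numeric(scale(x))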

Recall that we hardly ever know the mean and standard deviation of the population of interest. This is almost always the case when considering the means of two independent groups. To help account for us not knowing the population parameter values, we can use the sample statistics instead, but this comes with a bit of a price in terms of complexity.

-

Another form of normalization occurs when we need to use the sample standard deviations as estimates for the unknown population standard deviations. This normalization is often called the \(t\)-score. For the two independent samples case like what we have for comparing action movies to romance movies, the formula is \[T =\dfrac{ (\bar{x}_1 - \bar{x}_2) - (\mu_1 - \mu_2)}{ \sqrt{\dfrac{{s_1}^2}{n_1} + \dfrac{{s_2}^2}{n_2}} }\]

-

There is a lot to try to unpack here.

-
  • \(\bar{x}_1\) is the sample mean response of the first group
  • \(\bar{x}_2\) is the sample mean response of the second group
  • \(\mu_1\) is the population mean response of the first group
  • \(\mu_2\) is the population mean response of the second group
  • \(s_1\) is the sample standard deviation of the response of the first group
  • \(s_2\) is the sample standard deviation of the response of the second group
  • \(n_1\) is the sample size of the first group
  • \(n_2\) is the sample size of the second group

Assuming that the null hypothesis is true (\(H_0: \mu_1 - \mu_2 = 0\)), \(T\) is said to follow a \(t\) distribution with degrees of freedom equal to the smaller of \(n_1 - 1\) and \(n_2 - 1\). The “degrees of freedom” can be thought of as measuring how different the \(t\) distribution will be compared to a normal distribution. Small sample sizes lead to small degrees of freedom and, thus, \(t\) distributions that place more probability in their tails. Large sample sizes lead to large degrees of freedom and, thus, \(t\) distributions that closely align with the standard normal, bell-shaped curve.
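One quick way to see this numerically is to compare tail probabilities for different degrees of freedom using R’s built-in pt() and pnorm() functions; this is just an illustrative sketch:

# Probability of falling more than 2 units below the center of the curve
pt(-2, df = 3)      # few degrees of freedom: heavier tails
pt(-2, df = 33)     # the degrees of freedom used in this example
pt(-2, df = 1000)   # many degrees of freedom: nearly normal
pnorm(-2)           # the standard normal, for comparison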

-

So, assuming \(H_0\) is true, our formula simplifies a bit:

-

\[T =\dfrac{ \bar{x}_1 - \bar{x}_2}{ \sqrt{\dfrac{{s_1}^2}{n_1} + \dfrac{{s_2}^2}{n_2}} }.\]

-

We have already built an approximation for what we think the distribution of \(\delta = \bar{x}_1 - \bar{x}_2\) looks like using randomization above. Recall this distribution:

-
ggplot(data = rand_distn, aes(x = diffmean)) +
-  geom_histogram(color = "white", bins = 20)
-
Figure 3.12: Simulated differences in means histogram

-
-

If we’d like to have a guess as to what the distribution of \(T\) might look like instead, we need only divide every value in rand_distn by \[\sqrt{\dfrac{{s_1}^2}{n_1} + \dfrac{{s_2}^2}{n_2}}.\] As we did before, we will assign Romance to be group 1 and Action to be group 2. (We do this to match the order of the difference \(\bar{x}_r - \bar{x}_a\); since Romance comes second alphabetically it sits in the second row of the summary table, which is why the group numbers 1 and 2 don’t match the row indices used below.) Remember that we’ve already calculated these values:

-
kable(summary_ratings)
genre      mean    std_dev    n
Action    5.112      1.489   34
Romance   6.062      1.149   34

We will create some shortcuts here so you can see the value being calculated for the denominator of \(T\).

-
s1 <- summary_ratings$std_dev[2]
-s2 <- summary_ratings$std_dev[1]
-n1 <- summary_ratings$n[2]
-n2 <- summary_ratings$n[1]
-

Here, we have \(s_1 = 1.1494\), \(s_2 = 1.4887\), \(n_1 = 34\), and \(n_2 = 34\).

-

We can calculate the denominator via

-
(denom_T <- sqrt( (s1^2 / n1) + (s2^2 / n2) ))
-
[1] 0.3226
-

Now if we divide all of the values of diffmean in rand_distn by denom_T we can have a simulated distribution of \(T\) test statistics instead:

-
rand_distn <- rand_distn %>% 
-  mutate(t_stat = diffmean / denom_T)
-ggplot(data = rand_distn, aes(x = t_stat)) +
-  geom_histogram(color = "white", bins = 20)
-
Figure 3.13: Simulated T statistics histogram

-
-

We see that the shape of this distribution is the same as that of diffmean. The scale has changed though: because we divided by a number smaller than 1, the t_stat values are more spread out than the diffmean values.

-

A traditional \(t\)-test doesn’t look at this simulated distribution, but instead it looks at the \(t\)-curve with degrees of freedom equal to 33 (the minimum of \(n_1 - 1 = 34 - 1 = 33\) and \(n_2 - 1 = 34 - 1 = 33\)). This curve is frequently called a density curve, and this is the reason why we specify y = ..density.. in the geom_histogram below. We now overlay what this \(t\)-curve looks like on top of the histogram showing the simulated \(T\) statistics.

-
ggplot(data = rand_distn, mapping = aes(x = t_stat)) +
-  geom_histogram(aes(y = ..density..), color = "white", binwidth = 0.3) +
-  stat_function(fun = dt,
-    args = list(df = min(n1 - 1, n2 - 1)), 
-    color = "royalblue", size = 2)
-

-

We can see that the curve does a good job of approximating the randomization distribution here. (More on when to expect this to be the case when we discuss the conditions for the \(t\)-test in a bit.) To calculate the \(p\)-value in this case, we need to figure out how much of the total area under the \(t\)-curve lies at or above our observed \(T\)-statistic, plus the area at or below the negative of the observed \(T\)-statistic. (Remember this is a two-tailed test, so we are looking for a difference in either direction, i.e., values in either tail.) Just as we converted all of the simulated values to \(T\)-statistics, we must also do so for our observed effect \(\delta^*\):

-
(t_obs <- obs_diff / denom_T)
-
[1] 2.945
-

So graphically we are interested in finding the percentage of values that are at or above 2.9452 or at or below -2.9452.

-
ggplot(data = rand_distn, mapping = aes(x = t_stat)) +
-  stat_function(fun = dt,
-    args = list(df = min(n1 - 1, n2 - 1)), 
-    color = "royalblue", size = 2) +
-  geom_vline(xintercept = t_obs, color = "red") +
-  geom_vline(xintercept = -t_obs, color = "red")
-

-

At this point, you should make a guess as to what a reasonable value may be for the \(p\)-value. Let’s say the \(p\)-value is 0.01 or so. To actually perform this calculation by hand, you’d need to do some calculus. Let’s have R do it for us instead using the pt function.

-
pt(t_obs, df = min(n1 - 1, n2 - 1), lower.tail = FALSE) +
-  pt(-t_obs, df = min(n1 - 1, n2 - 1), lower.tail = TRUE)
-
[1] 0.005876
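Since the \(t\)-curve is symmetric about zero, the two tail areas are equal, so the same \(p\)-value can be written a bit more compactly. This is just an equivalent sketch of the calculation above:

# Two-tailed p-value using the symmetry of the t distribution
2 * pt(abs(t_obs), df = min(n1 - 1, n2 - 1), lower.tail = FALSE)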
-
-
-

3.10.2 Conditions for t-test

-

In order for the results of the \(t\)-test to be valid, three conditions must be met:

-
  1. Independent observations in both samples
  2. Nearly normal populations OR large sample sizes (\(n \ge 30\))
  3. Independently selected samples

Condition 1: This is met since we sampled at random using R from our population.

-

Condition 2: Recall from Figure 3.6 that we know how the populations are distributed. Both of them are close to normally distributed. If we are a little concerned about this assumption, we also have samples of size larger than 30 (\(n_1 = n_2 = 34\)).

-

Condition 3: This is met since there is no natural pairing of a movie in the Action group to a movie in the Romance group.

-

Since all three conditions are met, we can be reasonably certain that the theory-based test will match the results of the randomization-based test using shuffling. Remember that theory-based tests can produce incorrect results if these assumptions are not carefully checked. The only assumption for randomization and other computational methods is that the sample is selected at random. These methods are our preference, and we strongly believe they should be yours as well, but it’s also important to see how theory-based tests can be carried out and used as approximations of the computational techniques, at least until more researchers adopt techniques that utilize the power of computers.

-
-
-
-
-

3.11 Resampling-based inference for regression

-

We can also use the concept of shuffling to determine the standard error of our null distribution and conduct a hypothesis test for a population slope. Let’s go back to our example on Alaska Airlines flights, which represents a sample of all Alaska Airlines flights departing NYC in 2013, from Section ??. Let’s test to see if we have evidence that a positive relationship exists between the departure delay and arrival delay for these flights. We will set up this hypothesis testing process as we have each time before via the “There is Only One Test” diagram in Figure 3.1.

-
-

3.11.1 Data

-

Our data is stored in alaska_flights and we are focused on the 50 measurements of dep_delay and arr_delay there.

-
# To ensure the random sample of 50 flights is the same for
-# anyone using this code
-set.seed(2017)
-
-# Load Alaska data, deleting rows that have missing departure delay
-# or arrival delay data
-alaska_flights <- flights %>% 
-  filter(carrier == "AS") %>% 
-  filter(!is.na(dep_delay) & !is.na(arr_delay)) %>% 
-  # Select 50 flights that don't have missing delay data
-  sample_n(50)
-
-
-

3.11.2 Test statistic \(\delta\)

-

Our test statistic here is the sample slope coefficient that we denote with \(b_1\).

-
-
-

3.11.3 Observed effect \(\delta^*\)

-
delay_fit <- lm(formula = arr_delay ~ dep_delay, data = alaska_flights)
-(b1_obs <- tidy(delay_fit)$estimate[2])
-
[1] 1.218
-

The calculated slope value from our observed sample is \(b_1 = 1.2177\).

-
-
-

3.11.4 Model of \(H_0\)

-

We are looking to see if a positive relationship exists so \(H_a: \beta_1 > 0\). Our null hypothesis is always in terms of equality so we have \(H_0: \beta_1 = 0\).

-
-
-

3.11.5 Simulated data

-

Now, to simulate the null hypothesis being true and recreate how our sample was created, we need to think about what it means for \(\beta_1\) to be zero. If \(\beta_1 = 0\), we said above that there is no relationship between the departure delay and arrival delay. If there is no relationship, then any one of the arrival delay values could just as likely have occurred with any of the other departure delay values instead of the one it actually fell with. Shuffling, therefore, gives us another way to simulate data under the null hypothesis.

-

Tactile simulation

-

We could use a deck of 100 note cards to create a tactile simulation of this shuffling process. We would write the 50 departure delay values on 50 of the cards, one value per card, and then do the same for the 50 arrival delay values on the other 50 cards.

-

Next, we would lay out the 50 departure delay cards and shuffle the arrival delay deck well. We would then deal one arrival delay card onto each departure delay card, treat these as the new arrival delay values, and compute a sample slope based on this shuffling. We could repeat this process many times, keeping track of the sample slope after each shuffle.
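A single one of these tactile shuffles can be mirrored directly in R. This sketch assumes the mosaic package is loaded and uses the same shuffle() and coef() functions as the full simulation below; shuffling dep_delay instead of arr_delay is equivalent under the null hypothesis:

# One shuffle: refit the regression after shuffling the departure delays
# and extract the intercept and slope for this single permutation
lm(arr_delay ~ shuffle(dep_delay), data = alaska_flights) %>%
  coef()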

-
-
-

3.11.6 Distribution of \(\delta\) under \(H_0\)

-

We can build our randomization distribution in much the same way we did before using the do and shuffle functions. Here we will take advantage of the coef function we saw earlier to extract the slope and intercept coefficients. (Our focus will be on the slope here though.)

-
rand_slope_distn <- do(5000) *
-  (lm(formula = arr_delay ~ shuffle(dep_delay), data = alaska_flights) %>%
-     coef())
-
names(rand_slope_distn)
-
[1] "Intercept" "dep_delay"
-

We see that the names of our columns are Intercept and dep_delay. We want to look at dep_delay since that corresponds to the slope coefficients.

-
ggplot(data = rand_slope_distn, mapping = aes(x = dep_delay)) +
-  geom_histogram(color = "white", bins = 20)
-

-
-
-

3.11.7 The p-value

-

Recall that we want to see where our observed sample slope \(\delta^* = 1.2177\) falls on this distribution and then count all of the values to the right of it, corresponding to \(H_a: \beta_1 > 0\). To get a sense for where our value falls, we can shade all values at least as big as \(\delta^*\).

-
ggplot(data = rand_slope_distn, aes(x = dep_delay, fill = (dep_delay >= b1_obs))) +
-  geom_histogram(color = "white", bins = 20)
-
Figure 3.14: Shaded histogram to show p-value

-
-

Since 1.2177 falls far to the right of this plot, we can say that we have a \(p\)-value of essentially 0. We thus have evidence to reject the null hypothesis in support of there being a positive association between the departure delay and arrival delay of all Alaska Airlines flights from NYC in 2013.
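We can confirm this numerically using the same dplyr approach as before; since the alternative hypothesis is one-sided (\(H_a: \beta_1 > 0\)), we only count shuffled slopes at or above the observed slope. A sketch, assuming rand_slope_distn and b1_obs exist as created above:

# Proportion of shuffled slopes at least as large as the observed slope b1_obs
rand_slope_distn %>%
  filter(dep_delay >= b1_obs) %>%
  nrow() / nrow(rand_slope_distn)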

-
-

-Learning check -

-
-

(LC10.25) Repeat the inference above but this time for the correlation coefficient instead of the slope.

- -
- -
-
-
-
-
-

3.12 Theory-based inference for regression

-

Recall the regression output table from Section ?? with delay_fit being a least squares linear regression fit with arr_delay as the response and dep_delay as the predictor in the alaska_flights data frame created in Section ??.

term           estimate   std.error   statistic   p.value
(Intercept)     -14.155       2.809      -5.038         0
dep_delay         1.218       0.136       8.951         0
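This table can be regenerated directly from the fitted model. A sketch using the tidy() function from the broom package (used earlier to extract the slope) together with kable(), assuming both packages are loaded:

# Reproduce the regression output table for delay_fit
tidy(delay_fit) %>% 
  kable(digits = 3)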

We saw in Section ?? that random samples have variability and, thus, statistics from those samples have variability as defined by the sampling distribution.
Recall from Section ?? that alaska_flights represents only a random sample of 50 Alaska Airlines flights and not all flights. Hence, if we repeated the analysis with another random sample of 50 flights, the fitted line would likely change slightly due to sampling variability. In this case, the true population least squares line is defined by the formula \(y = \beta_0 + \beta_1 x + \epsilon\) where

-
  • \(\beta_0\) is the true population intercept parameter
  • \(\beta_1\) is the true population slope parameter
  • \(\epsilon\) represents the error term

\(\epsilon\) corresponds to the part of the response variable \(y\) that remains unexplained after considering the predictor variable \(x\). We will see in Section 3.12.2 that, ideally, these errors should exhibit no systematic pattern: they should be normally distributed with mean 0 and constant variance.

-

The values \(b_0 = -14.155\) and \(b_1 = 1.218\) are point estimates of \(\beta_0\) and \(\beta_1\), and thus the second column of the regression output table that has their values is called estimate. The third column std.error represents the standard errors for each estimate using a theory-based approach.

-

The rows of the fourth and fifth columns correspond to theory-based hypothesis tests testing \(H_0: \beta_0 = 0 \mbox{ vs. } H_1: \beta_0 \neq 0\) and \(H_0: \beta_1 = 0 \mbox{ vs. } H_1: \beta_1 \neq 0\). Of particular interest is the second hypothesis test because if \(\beta_1 = 0\) then \(y = \beta_0 + \epsilon\). Hence the value of \(y\) does not depend on the value of \(x\) at all, in other words there is no relationship between them. Recall that any hypothesis test involves 1) an observed test statistic and 2) a \(p\)-value resulting from the comparison of the observed test statistic to a null distribution. The columns “statistic” and “p.value” correspond to these values.

-

In our example, since the \(p\)-value corresponding to the hypothesis test \(H_0: \beta_1 = 0 \mbox{ vs. } H_1: \beta_1 \neq 0\) is 0, for any value of \(\alpha\) we would reject \(H_0\) in favor of \(H_1\) and declare that there is a significant relationship between arrival delay and departure delay.

- -

For the conclusions of the hypothesis tests for regression to be valid, there are certain conditions that must be met, in particular relating to the behavior of the residuals. We will address these assumptions in the upcoming Subsection 3.12.1.

-
-

3.12.1 Conditions for regression

-

In order for all inferences from regression to be valid (in particular, the hypothesis tests from Subsection 3.12), certain conditions must roughly hold.

-
  1. Nearly normal residuals with mean 0 and constant variance. (Check the quantile-quantile plot of the standardized residuals.)
  2. Equal variances across the explanatory variable. (Check the residual plot for non-uniform patterns.)
  3. Independent observations. (Check the residual plot for no time series-like patterns.)

As you can see, the residuals will play a large role in determining whether the conditions are met. In particular, the first two conditions can be roughly interpreted as requiring that there be no systematic pattern to the residuals. The residuals \(\widehat{\epsilon}_i\) are estimates of the error term \(\epsilon\) we discussed with the true population regression line, and this is a big reason why they play an important role in validating regression assumptions.

-
-
-

3.12.2 Residual analysis

-

The following diagram will help you to keep track of what is meant by a residual. Consider the observation marked by the blue dot:

-

-

Recall that \(y_i\) is the observed value of the arr_delay variable (y-position of blue dot), \(\widehat{y}_i\) is the fitted value of the arr_delay (value that is being pointed to on the red line), and the residual is \(\widehat{\epsilon}_i = y_i - \hat{y}_i\). We can quickly extract the values of all 50 residuals by using the augment() function in the broom package. Specifically, we are interested in the .fitted and .resid variables. Let’s look at the residuals corresponding to the first six rows of data.

-
regression_points <- augment(delay_fit) %>% 
-  select(arr_delay, dep_delay, .fitted, .resid)
-regression_points %>% 
-  head() %>% 
-  kable()
arr_delay   dep_delay    .fitted    .resid
      -38          -3    -17.808   -20.192
       86          69     69.864    16.136
      -38           3    -10.502   -27.498
       61          53     50.381    10.619
        3          12      0.457     2.543
       21           2    -11.720    32.720

Let’s begin by analyzing the distribution of the residuals. We would expect the shape of the distribution to be symmetric and roughly bell-shaped with a peak near zero and fewer and fewer values going into the tails on both the left and right sides.

-
ggplot(data = regression_points, mapping = aes(x = .resid)) +
-  geom_histogram(binwidth = 10, color = "white") +
-  geom_vline(xintercept = 0, color = "blue")
-

-

Next, we create a scatterplot looking at how the fitted values relate to the residual values.

-
ggplot(data = regression_points, mapping = aes(x = .fitted, y = .resid)) +
-  geom_point() +
-  geom_abline(intercept = 0, slope = 0, color = "blue")
-
Figure 3.15: Fitted versus Residuals plot

-
-

Lastly, we create a quantile-quantile plot that compares the residual values to what would be expected from a bell-shaped distribution (in particular, the normal distribution).

-
ggplot(data = regression_points, mapping = aes(sample = .resid)) +
-  stat_qq()
-
Figure 3.16: QQ Plot of residuals

-
-

Checking conditions:

-
  1. We are looking to see if the points are scattered about the blue line at 0 relatively evenly as we look from left to right in Figure 3.15. We have some reason for concern here, as the large cluster of values on the left is much more dispersed than the values on the right.

  2. The second condition is called into question if there is a clear repeating up-and-down pattern in the fitted versus residuals plot in Figure 3.15. That is not the case here.

  3. We look at the quantile-quantile plot (“Q-Q plot” for short) in Figure 3.16 to check the normality of the residuals. We are looking to see if the residuals fall on a straight line, which is what we would expect if they were normally distributed. We see some curvature here as well. We should begin to wonder whether regression was valid here, with both the first and third checks in question.

We have reason to doubt whether a linear regression is valid here. Unfortunately, all too frequently regressions are run without checking these assumptions carefully. While small deviations from the assumptions can be OK, larger violations can completely invalidate the results and make any inferences unreliable and questionable.

-
-
-
-
-

3.13 Conclusion

-
-

3.13.1 What’s to come?

-

This chapter examined the basics of hypothesis testing, its terminology, and examples of how to apply the “There is Only One Test” diagram to the Lady Tasting Tea example presented in Chapter ?? and to an example comparing the IMDB ratings of action movies and romance movies. Lastly, we looked at how to use resampling and theory-based methods on regression. We’ll see in Chapter 2 how we can provide a range of possible values for an unknown population parameter instead of just making a yes/no decision from a hypothesis test.

-
-
-

3.13.2 Script of R code

-

An R script file of all R code used in this chapter is available here.

- -
-
-

Chihara, Laura M., and Tim C. Hesterberg. 2011. Mathematical Statistics with Resampling and R. Hoboken, NJ: John Wiley; Sons. https://sites.google.com/site/chiharahesterberg/home.

-
-
-

Diez, David M, Christopher D Barr, and Mine Çetinkaya-Rundel. 2014. Introductory Statistics with Randomization and Simulation. First Edition. https://www.openintro.org/stat/textbook.php?stat_book=isrs.

-
-
-

Grolemund, Garrett, and Hadley Wickham. 2016. R for Data Science. http://r4ds.had.co.nz/.

-
-
-

Lock, Robin, Patti Frazer Lock, Kari Lock Morgan, Eric F. Lock, and Dennis F. Lock. 2012. Statistics: UnLOCKing the Power of Data. Wiley.

-
-
-

Wickham, Hadley. 2015. Ggplot2movies: Movies Data. https://CRAN.R-project.org/package=ggplot2movies.

-
-
-

Xie, Yihui. 2017. Bookdown: Authoring Books and Technical Documents with R Markdown. https://CRAN.R-project.org/package=bookdown.

-
-
-
-
-
-
- -
-
-
- - -
-
diff --git a/docs/3-tidy.html b/docs/3-tidy.html
deleted file mode 100644
index e69de29bb..000000000
diff --git a/docs/3-viz.html b/docs/3-viz.html
index 6b9ca7267..33cda7a0b 100644
--- a/docs/3-viz.html
+++ b/docs/3-viz.html
@@ -7,7 +7,7 @@ An Introduction to Statistical and Data Sciences via R
@@ -46,7 +46,7 @@
@@ -638,7 +638,7 @@

3.1.3 Other components of the Gra

3.1.4 The ggplot2 package

-

In this book, we will be using the ggplot2 package for data visualization, which is an implementation of the Grammar of Graphics for R (Wickham and Chang 2018). You may have noticed that a lot of the previous text in this chapter is written in computer font. This is because the various components of the Grammar of Graphics are specified in the ggplot function, which expects at a bare minimum as arguments:

+

In this book, we will be using the ggplot2 package for data visualization, which is an implementation of the Grammar of Graphics for R (Wickham and Chang 2017). You may have noticed that a lot of the previous text in this chapter is written in computer font. This is because the various components of the Grammar of Graphics are specified in the ggplot function, which expects at a bare minimum as arguments:

  • The data frame where the variables exist: the data argument
  • The mapping of the variables to aesthetic attributes: the mapping argument, which specifies the aesthetic attributes involved
  • @@ -1447,7 +1447,7 @@

    3.8.4 Summary

    3.9 Conclusion

    3.9.1 Review questions

    -

    Review questions have been designed using the fivethirtyeight R package (Ismay and Chunn 2017) with links to the corresponding FiveThirtyEight.com articles in our free DataCamp course Effective Data Storytelling using the tidyverse. The material in this chapter is covered in the chapters of the DataCamp course available below:

    +

    Review questions have been designed using the fivethirtyeight R package (Kim, Ismay, and Chunn 2017) with links to the corresponding FiveThirtyEight.com articles in our free DataCamp course Effective Data Storytelling using the tidyverse. The material in this chapter is covered in the chapters of the DataCamp course available below:

    • Scatterplots & Linegraphs
    • Histograms & Boxplots
    • @@ -1534,11 +1534,9 @@

      3.9.4 Script of R code

      (function () { var script = document.createElement("script"); script.type = "text/javascript"; - var src = ""; - if (src === "" || src === "true") src = "https://cdn.bootcss.com/mathjax/2.7.1/MathJax.js?config=TeX-MML-AM_CHTML"; - if (location.protocol !== "file:" && /^https?:/.test(src)) - src = src.replace(/^https?:/, ''); - script.src = src; + script.src = "https://cdn.bootcss.com/mathjax/2.7.1/MathJax.js?config=TeX-MML-AM_CHTML"; + if (location.protocol !== "file:" && /^https?:/.test(script.src)) + script.src = script.src.replace(/^https?:/, ''); document.getElementsByTagName("head")[0].appendChild(script); })(); diff --git a/docs/4-tidy.html b/docs/4-tidy.html index f2a3dd7de..70ce79adf 100644 --- a/docs/4-tidy.html +++ b/docs/4-tidy.html @@ -7,7 +7,7 @@ An Introduction to Statistical and Data Sciences via R - + @@ -46,7 +46,7 @@ - + @@ -613,7 +613,7 @@

      4.1 What is tidy data?

      In this case, even though the variable “Boeing Price” occurs again, the data is tidy since there are three variables corresponding to three unique pieces of information (Date, Boeing stock price, and the weather that particular day).

      The non-tidy data format in the original table is also known as “wide” format whereas the tidy data format in the second table is also known as “long”/“narrow” data format.

      In this book, we will work with work with datasets that are already in tidy format. But data isn’t always in this nice format that the tidyverse gets its name from. Data actually may come to you in a variety of different formats that require data cleaning and reshaping beyond the scope of this book. For a thorough example of the steps needed to take a messy dataset and turn it into a tidy one, check out the different functions available for data tidying and a case study using data from the World Health Organization in R for Data Science (Grolemund and Wickham 2016).

      -

      Most frequently though, data that isn’t in long format and is instead in wide format can be converted into “tidy” format by using the tidyr package (Wickham and Henry 2018) in the tidyverse. We’ll now investigate how that can be done using the gather() function in tidyr. Before we proceed with reshaping our data, we will discuss how to read data stored in CSV format into R as a data frame.

      +

      Most frequently though, data that isn’t in long format and is instead in wide format can be converted into “tidy” format by using the tidyr package (Wickham and Henry 2017) in the tidyverse. We’ll now investigate how that can be done using the gather() function in tidyr. Before we proceed with reshaping our data, we will discuss how to read data stored in CSV format into R as a data frame.


    @@ -684,18 +684,18 @@

    4.3 Importing CSVs via readr

    dem_score <- read_csv("http://ismayc.github.io/dem_score.csv")
     dem_score
    # A tibble: 96 x 10
    -   country    `1952` `1957` `1962` `1967` `1972` `1977` `1982` `1987` `1992`
    -   <chr>       <int>  <int>  <int>  <int>  <int>  <int>  <int>  <int>  <int>
    - 1 Albania       - 9    - 9    - 9    - 9    - 9    - 9    - 9    - 9      5
    - 2 Argentina     - 9    - 1    - 1    - 9    - 9    - 9    - 8      8      7
    - 3 Armenia       - 9    - 7    - 7    - 7    - 7    - 7    - 7    - 7      7
    - 4 Australia      10     10     10     10     10     10     10     10     10
    - 5 Austria        10     10     10     10     10     10     10     10     10
    - 6 Azerbaijan    - 9    - 7    - 7    - 7    - 7    - 7    - 7    - 7      1
    - 7 Belarus       - 9    - 7    - 7    - 7    - 7    - 7    - 7    - 7      7
    - 8 Belgium        10     10     10     10     10     10     10     10     10
    - 9 Bhutan        -10    -10    -10    -10    -10    -10    -10    -10    -10
    -10 Bolivia       - 4    - 3    - 3    - 4    - 7    - 7      8      9      9
    +      country `1952` `1957` `1962` `1967` `1972` `1977` `1982` `1987` `1992`
    +        <chr>  <int>  <int>  <int>  <int>  <int>  <int>  <int>  <int>  <int>
    + 1    Albania     -9     -9     -9     -9     -9     -9     -9     -9      5
    + 2  Argentina     -9     -1     -1     -9     -9     -9     -8      8      7
    + 3    Armenia     -9     -7     -7     -7     -7     -7     -7     -7      7
    + 4  Australia     10     10     10     10     10     10     10     10     10
    + 5    Austria     10     10     10     10     10     10     10     10     10
    + 6 Azerbaijan     -9     -7     -7     -7     -7     -7     -7     -7      1
    + 7    Belarus     -9     -7     -7     -7     -7     -7     -7     -7      7
    + 8    Belgium     10     10     10     10     10     10     10     10     10
    + 9     Bhutan    -10    -10    -10    -10    -10    -10    -10    -10    -10
    +10    Bolivia     -4     -3     -3     -4     -7     -7      8      9      9
     # ... with 86 more rows

    Second, let’s read in the same data, but using the file you just downloaded on to your computer: Go to the Files pane of RStudio -> Navigate the directories to where your downloaded files are -> Right click dem_score.csv -> Click “Import Dataset…” -> Click “Import”. You’ll see two things happen:

      @@ -712,8 +712,8 @@

      4.4 Converting from wide to long< filter(country == "Guatemala") guat_dem

    # A tibble: 1 x 10
    -  country   `1952` `1957` `1962` `1967` `1972` `1977` `1982` `1987` `1992`
    -  <chr>      <int>  <int>  <int>  <int>  <int>  <int>  <int>  <int>  <int>
    +    country `1952` `1957` `1962` `1967` `1972` `1977` `1982` `1987` `1992`
    +      <chr>  <int>  <int>  <int>  <int>  <int>  <int>  <int>  <int>  <int>
     1 Guatemala      2     -6     -5      3      1     -3     -7      3      3

    Now let’s produce a plot showing how the democracy scores have changed over the 40 years from 1952 to 1992 for Guatemala. Let’s start by laying out how we would map our aesthetics to variables in the data frame:

      @@ -733,17 +733,17 @@

      4.4 Converting from wide to long< - country) guat_tidy

# A tibble: 9 x 3
-  country   year  democracy_score
-  <chr>     <chr>           <int>
-1 Guatemala 1952                2
-2 Guatemala 1957               -6
-3 Guatemala 1962               -5
-4 Guatemala 1967                3
-5 Guatemala 1972                1
-6 Guatemala 1977               -3
-7 Guatemala 1982               -7
-8 Guatemala 1987                3
-9 Guatemala 1992                3
+ country year democracy_score + <chr> <chr> <int> +1 Guatemala 1952 2 +2 Guatemala 1957 -6 +3 Guatemala 1962 -5 +4 Guatemala 1967 3 +5 Guatemala 1972 1 +6 Guatemala 1977 -3 +7 Guatemala 1982 -7 +8 Guatemala 1987 3 +9 Guatemala 1992 3

We can now create the plot to show how the democracy score of Guatemala changed from 1952 to 1992 using a linegraph and ggplot2.

ggplot(data = guat_tidy, mapping = aes(x = year, y = democracy_score)) +
   geom_line()
@@ -807,7 +807,7 @@

4.6 Conclusion

4.6.1 Review questions

-

Review questions have been designed using the fivethirtyeight R package (Ismay and Chunn 2017) with links to the corresponding FiveThirtyEight.com articles in our free DataCamp course Effective Data Storytelling using the tidyverse. The material in this chapter is covered in the Tidy Data chapter of the DataCamp course available here.

+

Review questions have been designed using the fivethirtyeight R package (Kim, Ismay, and Chunn 2017) with links to the corresponding FiveThirtyEight.com articles in our free DataCamp course Effective Data Storytelling using the tidyverse. The material in this chapter is covered in the Tidy Data chapter of the DataCamp course available here.

4.6.2 What’s to come?

@@ -872,11 +872,9 @@

4.6.3 Script of R code

(function () { var script = document.createElement("script"); script.type = "text/javascript"; - var src = ""; - if (src === "" || src === "true") src = "https://cdn.bootcss.com/mathjax/2.7.1/MathJax.js?config=TeX-MML-AM_CHTML"; - if (location.protocol !== "file:" && /^https?:/.test(src)) - src = src.replace(/^https?:/, ''); - script.src = src; + script.src = "https://cdn.bootcss.com/mathjax/2.7.1/MathJax.js?config=TeX-MML-AM_CHTML"; + if (location.protocol !== "file:" && /^https?:/.test(script.src)) + script.src = script.src.replace(/^https?:/, ''); document.getElementsByTagName("head")[0].appendChild(script); })(); diff --git a/docs/4-wrangling.html b/docs/4-wrangling.html deleted file mode 100644 index e69de29bb..000000000 diff --git a/docs/5-multiple-regression.html b/docs/5-multiple-regression.html deleted file mode 100644 index e69de29bb..000000000 diff --git a/docs/5-wrangling.html b/docs/5-wrangling.html index 1f01f64dc..4e53fe1f2 100644 --- a/docs/5-wrangling.html +++ b/docs/5-wrangling.html @@ -7,7 +7,7 @@ An Introduction to Statistical and Data Sciences via R - + @@ -46,7 +46,7 @@ - + @@ -799,17 +799,17 @@

5.5 5MV#3: Group rows using group
# A tibble: 36 x 3
 # Groups:   origin [?]
    origin month count
-   <chr>  <int> <int>
- 1 EWR        1  9893
- 2 EWR        2  9107
- 3 EWR        3 10420
- 4 EWR        4 10531
- 5 EWR        5 10592
- 6 EWR        6 10175
- 7 EWR        7 10475
- 8 EWR        8 10359
- 9 EWR        9  9550
-10 EWR       10 10104
+    <chr> <int> <int>
+ 1    EWR     1  9893
+ 2    EWR     2  9107
+ 3    EWR     3 10420
+ 4    EWR     4 10531
+ 5    EWR     5 10592
+ 6    EWR     6 10175
+ 7    EWR     7 10475
+ 8    EWR     8 10359
+ 9    EWR     9  9550
+10    EWR    10 10104
 # ... with 26 more rows

Alternatively, you can use the shortcut count() function in dplyr to get the same result:

by_monthly_origin <- flights %>% 
@@ -917,52 +917,52 @@ 

5.7 5MV#5: Reorder the data frame summarize(num_flights = n()) freq_dest

# A tibble: 105 x 2
-   dest  num_flights
+    dest num_flights
    <chr>       <int>
- 1 ABQ           254
- 2 ACK           265
- 3 ALB           439
- 4 ANC             8
- 5 ATL         17215
- 6 AUS          2439
- 7 AVL           275
- 8 BDL           443
- 9 BGR           375
-10 BHM           297
+ 1   ABQ         254
+ 2   ACK         265
+ 3   ALB         439
+ 4   ANC           8
+ 5   ATL       17215
+ 6   AUS        2439
+ 7   AVL         275
+ 8   BDL         443
+ 9   BGR         375
+10   BHM         297
 # ... with 95 more rows

You’ll see that, by default, the values of dest are displayed in alphabetical order. We are interested in finding the airports that appear most frequently:

freq_dest %>% 
   arrange(num_flights)
# A tibble: 105 x 2
-   dest  num_flights
+    dest num_flights
    <chr>       <int>
- 1 LEX             1
- 2 LGA             1
- 3 ANC             8
- 4 SBN            10
- 5 HDN            15
- 6 MTJ            15
- 7 EYW            17
- 8 PSP            19
- 9 JAC            25
-10 BZN            36
+ 1   LEX           1
+ 2   LGA           1
+ 3   ANC           8
+ 4   SBN          10
+ 5   HDN          15
+ 6   MTJ          15
+ 7   EYW          17
+ 8   PSP          19
+ 9   JAC          25
+10   BZN          36
 # ... with 95 more rows

This is actually the opposite of what we are looking for: it lists the least frequent destination airports first. To switch the ordering from ascending to descending, we use the desc() (descending) function:

freq_dest %>% 
   arrange(desc(num_flights))
# A tibble: 105 x 2
-   dest  num_flights
+    dest num_flights
    <chr>       <int>
- 1 ORD         17283
- 2 ATL         17215
- 3 LAX         16174
- 4 BOS         15508
- 5 MCO         14082
- 6 CLT         14064
- 7 SFO         13331
- 8 FLL         12055
- 9 MIA         11728
-10 DCA          9705
+ 1   ORD       17283
+ 2   ATL       17215
+ 3   LAX       16174
+ 4   BOS       15508
+ 5   MCO       14082
+ 6   CLT       14064
+ 7   SFO       13331
+ 8   FLL       12055
+ 9   MIA       11728
+10   DCA        9705
 # ... with 95 more rows

@@ -1125,7 +1125,7 @@

5.9.3 Find the top number of valu

5.10 Conclusion

5.10.1 Review questions

-

Review questions have been designed using the fivethirtyeight R package (Ismay and Chunn 2017) with links to the corresponding FiveThirtyEight.com articles in our free DataCamp course Effective Data Storytelling using the tidyverse. The material in this chapter is covered in the chapters of the DataCamp course available below:

+

Review questions have been designed using the fivethirtyeight R package (Kim, Ismay, and Chunn 2017) with links to the corresponding FiveThirtyEight.com articles in our free DataCamp course Effective Data Storytelling using the tidyverse. The material in this chapter is covered in the chapters of the DataCamp course available below:

  • Filtering, Grouping, & Summarizing
  • dplyr Review
  • @@ -1211,11 +1211,9 @@

    5.10.4 Script of R code

    (function () { var script = document.createElement("script"); script.type = "text/javascript"; - var src = ""; - if (src === "" || src === "true") src = "https://cdn.bootcss.com/mathjax/2.7.1/MathJax.js?config=TeX-MML-AM_CHTML"; - if (location.protocol !== "file:" && /^https?:/.test(src)) - src = src.replace(/^https?:/, ''); - script.src = src; + script.src = "https://cdn.bootcss.com/mathjax/2.7.1/MathJax.js?config=TeX-MML-AM_CHTML"; + if (location.protocol !== "file:" && /^https?:/.test(script.src)) + script.src = script.src.replace(/^https?:/, ''); document.getElementsByTagName("head")[0].appendChild(script); })(); diff --git a/docs/6-ci.html b/docs/6-ci.html deleted file mode 100644 index e69de29bb..000000000 diff --git a/docs/6-regression.html b/docs/6-regression.html index 41e0b18c0..48aaf0534 100644 --- a/docs/6-regression.html +++ b/docs/6-regression.html @@ -7,7 +7,7 @@ An Introduction to Statistical and Data Sciences via R - + @@ -46,7 +46,7 @@ - + @@ -1010,8 +1010,8 @@

    6.2.1 Exploratory data analysis
    glimpse(gapminder2007)

Observations: 142
 Variables: 4
-$ country   <fct> Afghanistan, Albania, Algeria, Angola, Argentina, Austral...
-$ continent <fct> Asia, Europe, Africa, Africa, Americas, Oceania, Europe, ...
+$ country   <fctr> Afghanistan, Albania, Algeria, Angola, Argentina, Austra...
+$ continent <fctr> Asia, Europe, Africa, Africa, Americas, Oceania, Europe,...
 $ lifeExp   <dbl> 43.8, 76.4, 72.3, 42.7, 75.3, 81.2, 79.8, 75.6, 64.1, 79....
 $ gdpPercap <dbl> 975, 5937, 6223, 4797, 12779, 34435, 36126, 29796, 1391, ...

We see that the variable continent is indeed categorical, as it is encoded as fctr, which stands for “factor”, R’s way of storing categorical variables. Let’s look at a summary of the explanatory variable continent:
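The summary itself falls outside the hunk shown here. A minimal sketch of what it might look like, assuming gapminder2007 is built from the gapminder package as earlier in the chapter (the construction below is an assumption, not the chapter's exact code):

library(dplyr)
library(gapminder)

# Assumed construction of the 2007 snapshot used in this chapter
gapminder2007 <- gapminder %>% 
  filter(year == 2007) %>% 
  select(country, continent, lifeExp, gdpPercap)

# summary() on a factor reports how many countries fall in each continent
summary(gapminder2007$continent)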

@@ -1722,11 +1722,9 @@

6.4.1 Script of R code

(function () { var script = document.createElement("script"); script.type = "text/javascript"; - var src = ""; - if (src === "" || src === "true") src = "https://cdn.bootcss.com/mathjax/2.7.1/MathJax.js?config=TeX-MML-AM_CHTML"; - if (location.protocol !== "file:" && /^https?:/.test(src)) - src = src.replace(/^https?:/, ''); - script.src = src; + script.src = "https://cdn.bootcss.com/mathjax/2.7.1/MathJax.js?config=TeX-MML-AM_CHTML"; + if (location.protocol !== "file:" && /^https?:/.test(script.src)) + script.src = script.src.replace(/^https?:/, ''); document.getElementsByTagName("head")[0].appendChild(script); })(); diff --git a/docs/7-hypo.html b/docs/7-hypo.html deleted file mode 100644 index e69de29bb..000000000 diff --git a/docs/7-multiple-regression.html b/docs/7-multiple-regression.html index 8468a71f7..51fcc66b5 100644 --- a/docs/7-multiple-regression.html +++ b/docs/7-multiple-regression.html @@ -7,7 +7,7 @@ An Introduction to Statistical and Data Sciences via R - + @@ -46,7 +46,7 @@ - + @@ -1414,11 +1414,9 @@

7.4.2 Script of R code

(function () { var script = document.createElement("script"); script.type = "text/javascript"; - var src = ""; - if (src === "" || src === "true") src = "https://cdn.bootcss.com/mathjax/2.7.1/MathJax.js?config=TeX-MML-AM_CHTML"; - if (location.protocol !== "file:" && /^https?:/.test(src)) - src = src.replace(/^https?:/, ''); - script.src = src; + script.src = "https://cdn.bootcss.com/mathjax/2.7.1/MathJax.js?config=TeX-MML-AM_CHTML"; + if (location.protocol !== "file:" && /^https?:/.test(script.src)) + script.src = script.src.replace(/^https?:/, ''); document.getElementsByTagName("head")[0].appendChild(script); })(); diff --git a/docs/8-inference-for-regression.html b/docs/8-inference-for-regression.html deleted file mode 100644 index e69de29bb..000000000 diff --git a/docs/8-sampling.html b/docs/8-sampling.html index b6cd844b8..aaddbb93d 100644 --- a/docs/8-sampling.html +++ b/docs/8-sampling.html @@ -7,7 +7,7 @@ An Introduction to Statistical and Data Sciences via R - + @@ -46,7 +46,7 @@ - + @@ -975,11 +975,9 @@

8.6.2 Script of R code

(function () { var script = document.createElement("script"); script.type = "text/javascript"; - var src = ""; - if (src === "" || src === "true") src = "https://cdn.bootcss.com/mathjax/2.7.1/MathJax.js?config=TeX-MML-AM_CHTML"; - if (location.protocol !== "file:" && /^https?:/.test(src)) - src = src.replace(/^https?:/, ''); - script.src = src; + script.src = "https://cdn.bootcss.com/mathjax/2.7.1/MathJax.js?config=TeX-MML-AM_CHTML"; + if (location.protocol !== "file:" && /^https?:/.test(script.src)) + script.src = script.src.replace(/^https?:/, ''); document.getElementsByTagName("head")[0].appendChild(script); })(); diff --git a/docs/9-ci.html b/docs/9-ci.html index ed55f4a3c..d846cc235 100644 --- a/docs/9-ci.html +++ b/docs/9-ci.html @@ -7,7 +7,7 @@ An Introduction to Statistical and Data Sciences via R - + @@ -46,7 +46,7 @@ - + @@ -716,18 +716,18 @@

9.4 Relation to hypothesis testin

Note that all of this code was moved over from hypothesis testing.

(movies_trimmed <- movies %>% select(title, year, rating, Action, Romance))
# A tibble: 58,788 x 5
-   title                     year rating Action Romance
-   <chr>                    <int>  <dbl>  <int>   <int>
- 1 $                         1971   6.40      0       0
- 2 $1000 a Touchdown         1939   6.00      0       0
- 3 $21 a Day Once a Month    1941   8.20      0       0
- 4 $40,000                   1996   8.20      0       0
- 5 $50,000 Climax Show, The  1975   3.40      0       0
- 6 $pent                     2000   4.30      0       0
- 7 $windle                   2002   5.30      1       0
- 8 '15'                      2002   6.70      0       0
- 9 '38                       1987   6.60      0       0
-10 '49-'17                   1917   6.00      0       0
+                      title  year rating Action Romance
+                      <chr> <int>  <dbl>  <int>   <int>
+ 1                        $  1971    6.4      0       0
+ 2        $1000 a Touchdown  1939    6.0      0       0
+ 3   $21 a Day Once a Month  1941    8.2      0       0
+ 4                  $40,000  1996    8.2      0       0
+ 5 $50,000 Climax Show, The  1975    3.4      0       0
+ 6                    $pent  2000    4.3      0       0
+ 7                  $windle  2002    5.3      1       0
+ 8                     '15'  2002    6.7      0       0
+ 9                      '38  1987    6.6      0       0
+10                  '49-'17  1917    6.0      0       0
 # ... with 58,778 more rows
movies_trimmed <- movies_trimmed %>%
   filter(!(Action == 1 & Romance == 1))
@@ -767,16 +767,16 @@

9.4 Relation to hypothesis testin
# A tibble: 10 x 2
    .index diffmean
     <dbl>    <dbl>
- 1   1.00 -0.132  
- 2   2.00 -0.197  
- 3   3.00 -0.0265 
- 4   4.00  0.715  
- 5   5.00 -0.474  
- 6   6.00 -0.121  
- 7   7.00 -0.174  
- 8   8.00 -0.209  
- 9   9.00 -0.00882
-10  10.0  -0.332  
+ 1      1 -0.13235
+ 2      2 -0.19706
+ 3      3 -0.02647
+ 4      4  0.71471
+ 5      5 -0.47353
+ 6      6 -0.12059
+ 7      7 -0.17353
+ 8      8 -0.20882
+ 9      9 -0.00882
+10     10 -0.33235
ggplot(data = rand_distn, mapping = aes(x = diffmean)) +
   geom_histogram(color = "white", bins = 20)
@@ -790,7 +790,7 @@

9.4 Relation to hypothesis testin
# A tibble: 1 x 1
      se
   <dbl>
-1 0.340
+1 0.34

We can use the general formula of \(\text{statistic} \pm 2 \cdot SE\) for a confidence interval to obtain the following result for plausible values of the difference in population means at the 95% level.

(lower <- obs_diff - (2 * std_err))
     se
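The matching upper endpoint is cut off by the hunk boundary. A minimal sketch of both endpoints of the statistic ± 2·SE interval; std_err = 0.34 matches the output above, while the value of obs_diff is an illustrative placeholder rather than the chapter's observed difference:

# Placeholder inputs -- only std_err matches the output shown above
obs_diff <- 0.20
std_err  <- 0.34

lower <- obs_diff - (2 * std_err)  # lower bound
upper <- obs_diff + (2 * std_err)  # upper bound
c(lower, upper)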
@@ -900,11 +900,9 @@ 

9.6.2 Script of R code

(function () { var script = document.createElement("script"); script.type = "text/javascript"; - var src = ""; - if (src === "" || src === "true") src = "https://cdn.bootcss.com/mathjax/2.7.1/MathJax.js?config=TeX-MML-AM_CHTML"; - if (location.protocol !== "file:" && /^https?:/.test(src)) - src = src.replace(/^https?:/, ''); - script.src = src; + script.src = "https://cdn.bootcss.com/mathjax/2.7.1/MathJax.js?config=TeX-MML-AM_CHTML"; + if (location.protocol !== "file:" && /^https?:/.test(script.src)) + script.src = script.src.replace(/^https?:/, ''); document.getElementsByTagName("head")[0].appendChild(script); })(); diff --git a/docs/A-appendixA.html b/docs/A-appendixA.html index 5d82f6fcf..04b27ccc4 100644 --- a/docs/A-appendixA.html +++ b/docs/A-appendixA.html @@ -7,7 +7,7 @@ An Introduction to Statistical and Data Sciences via R - + @@ -46,7 +46,7 @@ - + @@ -563,11 +563,9 @@

A.1.6 Outliers

(function () { var script = document.createElement("script"); script.type = "text/javascript"; - var src = ""; - if (src === "" || src === "true") src = "https://cdn.bootcss.com/mathjax/2.7.1/MathJax.js?config=TeX-MML-AM_CHTML"; - if (location.protocol !== "file:" && /^https?:/.test(src)) - src = src.replace(/^https?:/, ''); - script.src = src; + script.src = "https://cdn.bootcss.com/mathjax/2.7.1/MathJax.js?config=TeX-MML-AM_CHTML"; + if (location.protocol !== "file:" && /^https?:/.test(script.src)) + script.src = script.src.replace(/^https?:/, ''); document.getElementsByTagName("head")[0].appendChild(script); })(); diff --git a/docs/B-appendixB.html b/docs/B-appendixB.html index 34731194d..14281ab2d 100644 --- a/docs/B-appendixB.html +++ b/docs/B-appendixB.html @@ -7,7 +7,7 @@ An Introduction to Statistical and Data Sciences via R - + @@ -46,7 +46,7 @@ - + @@ -1575,11 +1575,9 @@

B.7.5 Comparing results

(function () { var script = document.createElement("script"); script.type = "text/javascript"; - var src = ""; - if (src === "" || src === "true") src = "https://cdn.bootcss.com/mathjax/2.7.1/MathJax.js?config=TeX-MML-AM_CHTML"; - if (location.protocol !== "file:" && /^https?:/.test(src)) - src = src.replace(/^https?:/, ''); - script.src = src; + script.src = "https://cdn.bootcss.com/mathjax/2.7.1/MathJax.js?config=TeX-MML-AM_CHTML"; + if (location.protocol !== "file:" && /^https?:/.test(script.src)) + script.src = script.src.replace(/^https?:/, ''); document.getElementsByTagName("head")[0].appendChild(script); })(); diff --git a/docs/C-appendixC.html b/docs/C-appendixC.html index 42f6380b5..a791428d7 100644 --- a/docs/C-appendixC.html +++ b/docs/C-appendixC.html @@ -7,7 +7,7 @@ An Introduction to Statistical and Data Sciences via R - + @@ -46,7 +46,7 @@ - + @@ -525,7 +525,7 @@

C.2.1 Interactive linegraphs

flights_summarized <- select(flights_summarized, -date)
dyRangeSelector(dygraph(flights_summarized))

- +


The syntax here is a little different than what we have covered so far. The dygraph function expects the dates to be given as the rownames of the object. We then remove the date variable from the flights_summarized data frame, since it is now accounted for in the rownames. Lastly, we run dygraph on the new data frame, which contains only the median arrival delay as a column, and add a range selector for zooming in on the interactive plot via dyRangeSelector. (Note that this plot will only be interactive in the HTML version of this book.)
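A minimal sketch of the full workflow just described, assuming flights_summarized starts out as a plain data frame with a date column and a single median-arrival-delay column (this is an assumed reconstruction, not the chapter's exact code):

library(dplyr)
library(dygraphs)

rownames(flights_summarized) <- flights_summarized$date  # dygraph() reads the dates from the rownames
flights_summarized <- select(flights_summarized, -date)  # drop date; it now lives in the rownames
dyRangeSelector(dygraph(flights_summarized))             # interactive linegraph with a zoom selector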

[git binary patch data omitted — base85-encoded image bytes with no recoverable text]

An Introduction to Statistical and Data Sciences via R - + @@ -46,7 +46,7 @@ - + @@ -667,7 +667,7 @@

1.3 Connect and contribute

1.4 About this book

-

This book was written using RStudio’s bookdown package by Yihui Xie (Xie 2018). This package simplifies the publishing of books by having all content written in R Markdown. The bookdown/R Markdown source code for all versions of ModernDive is available on GitHub:

+

This book was written using RStudio’s bookdown package by Yihui Xie (Xie 2017). This package simplifies the publishing of books by having all content written in R Markdown. The bookdown/R Markdown source code for all versions of ModernDive is available on GitHub:

@@ -576,11 +576,9 @@

References

(function () { var script = document.createElement("script"); script.type = "text/javascript"; - var src = ""; - if (src === "" || src === "true") src = "https://cdn.bootcss.com/mathjax/2.7.1/MathJax.js?config=TeX-MML-AM_CHTML"; - if (location.protocol !== "file:" && /^https?:/.test(src)) - src = src.replace(/^https?:/, ''); - script.src = src; + script.src = "https://cdn.bootcss.com/mathjax/2.7.1/MathJax.js?config=TeX-MML-AM_CHTML"; + if (location.protocol !== "file:" && /^https?:/.test(script.src)) + script.src = script.src.replace(/^https?:/, ''); document.getElementsByTagName("head")[0].appendChild(script); })(); diff --git a/docs/scripts/02-getting-started.R b/docs/scripts/02-getting-started.R deleted file mode 100644 index 8a6e723fe..000000000 --- a/docs/scripts/02-getting-started.R +++ /dev/null @@ -1,47 +0,0 @@ -## ---- eval=FALSE--------------------------------------------------------- -## library(ggplot2) -## library(dplyr) - -## ----message=FALSE------------------------------------------------------- -library(dplyr) -library(nycflights13) -library(knitr) - -## ----load_flights-------------------------------------------------------- -flights - -## **_Learning check_** - -## **Learning Check Solutions** - -## NA -## ------------------------------------------------------------------------ -glimpse(flights) - -## **_Learning check_** - -## **Learning Check Solutions** - -## NA -## ----eval=FALSE---------------------------------------------------------- -## airlines -## kable(airlines) - -## ----eval=FALSE---------------------------------------------------------- -## airlines -## airlines$name - -## ----eval=FALSE---------------------------------------------------------- -## ?flights - -## ---- echo=FALSE, warning=FALSE, message=FALSE, results='hide'----------- -# needed_pkgs <- c("nycflights13", "tibble", "dplyr", "ggplot2", "knitr", -# "okcupiddata", "dygraphs", "rmarkdown", "mosaic", -# "ggplot2movies", "fivethirtyeight", "readr") -# -# new.pkgs <- needed_pkgs[!(needed_pkgs %in% installed.packages())] -# -# if(length(new.pkgs)) { -# install.packages(new.pkgs, repos = "http://cran.rstudio.com") -# } - diff --git a/docs/scripts/03-visualization.R b/docs/scripts/03-visualization.R deleted file mode 100644 index 8a55ba6bd..000000000 --- a/docs/scripts/03-visualization.R +++ /dev/null @@ -1,290 +0,0 @@ -## ----message=FALSE------------------------------------------------------- -library(nycflights13) -library(ggplot2) -library(dplyr) -library(knitr) - -## ----message=FALSE, warning=FALSE, echo=FALSE---------------------------- -# Packages needed internally, but not in text. 
-library(gapminder) -library(knitr) - -## ---- echo=FALSE--------------------------------------------------------- -gapminder_2007 <- gapminder %>% - filter(year == 2007) %>% - select(-year) %>% - rename( - Country = country, - Continent = continent, - `Life Expectancy` = lifeExp, - `Population` = pop, - `GDP per Capita` = gdpPercap - ) - -## ---- echo=FALSE--------------------------------------------------------- -gapminder_2007 %>% - head() %>% - kable( - digits=2, - caption = "Gapminder 2007 Data: First 6 of 142 countries", - booktabs = TRUE - ) - -## ----gapminder, echo=FALSE, fig.cap="Life Expectancy over GDP per Capita in 2007"---- -ggplot(data = gapminder_2007, mapping = aes(x=`GDP per Capita`, y=`Life Expectancy`, size=Population, col=Continent)) + - geom_point() - -## ---- echo=FALSE--------------------------------------------------------- -map <- data_frame( - `data variable` = c("GDP per Capita", "Life Expectancy", "Population", "Continent"), - aes = c("x", "y", "size", "color"), - geom = c("point", "point", "point", "point") -) - -map %>% - kable( - caption = "Summary of Grammar of Graphics for this plot", - booktabs = TRUE - ) - -## **_Review questions_** - -## ------------------------------------------------------------------------ -all_alaska_flights <- flights %>% - filter(carrier == "AS") - -## **Learning Check Solutions** - -## ----noalpha, fig.cap="Arrival Delays vs Departure Delays for Alaska Airlines flights from NYC in 2013"---- -ggplot(data = all_alaska_flights, mapping = aes(x = dep_delay, y = arr_delay)) + - geom_point() - -## ----nolayers, fig.cap="Plot with No Layers"----------------------------- -ggplot(data = all_alaska_flights, mapping = aes(x = dep_delay, y = arr_delay)) - -## **Learning Check Solutions** - -## ---- include=show_solutions('3-2'), echo=show_solutions('3-2')---------- -ggplot(data = all_alaska_flights, mapping = aes(x = dep_time, y = dep_delay)) + - geom_point() - -## ----alpha, fig.cap="Delay scatterplot with alpha=0.2"------------------- -ggplot(data = all_alaska_flights, mapping = aes(x = dep_delay, y = arr_delay)) + - geom_point(alpha = 0.2) - -## ----jitter, fig.cap="Jittered delay scatterplot"------------------------ -ggplot(data = all_alaska_flights, mapping = aes(x = dep_delay, y = arr_delay)) + - geom_jitter(width = 30, height = 30) - -## ---- eval = FALSE------------------------------------------------------- -## ggplot(data = all_alaska_flights, mapping = aes(x = dep_delay, y = arr_delay)) + -## geom_jitter(width = 30, height = 30) -## ggplot(all_alaska_flights, aes(x = dep_delay, y = arr_delay)) + -## geom_jitter(width = 30, height = 30) - -## **Learning Check Solutions** - -## ------------------------------------------------------------------------ -early_january_weather <- weather %>% - filter(origin == "EWR" & month == 1 & day <= 15) - -## **Learning Check Solutions** - -## ----hourlytemp, fig.cap="Hourly Temperature in Newark for January 1-15, 2013"---- -ggplot(data = early_january_weather, mapping = aes(x = time_hour, y = temp)) + - geom_line() - -## **Learning Check Solutions** - -## ---- include=show_solutions('3-5'), echo=show_solutions('3-5')---------- -ggplot(data = early_january_weather, mapping = aes(x = time_hour, y = humid)) + - geom_line() - -## ----echo=FALSE, fig.height=0.8, fig.cap="Plot of Hourly Temperature Recordings from NYC in 2013"---- -ggplot(data = weather, mapping = aes(x = temp, y = factor("A"))) + - geom_point() + - theme(axis.ticks.y = element_blank(), - axis.title.y = element_blank(), - axis.text.y = 
element_blank()) -hist_title <- "Histogram of Hourly Temperature Recordings from NYC in 2013" - -## ---- warning=TRUE, fig.cap=hist_title----------------------------------- -ggplot(data = weather, mapping = aes(x = temp)) + - geom_histogram() - -## ----fig.cap=paste(hist_title, "- 60 Bins")------------------------------ -ggplot(data = weather, mapping = aes(x = temp)) + - geom_histogram(bins = 60, color = "white") - -## ----fig.cap=paste(hist_title, "- 60 Colored Bins")---------------------- -ggplot(data = weather, mapping = aes(x = temp)) + - geom_histogram(bins = 60, color = "white", fill = "steelblue") - -## ----fig.cap=paste(hist_title, "- Binwidth = 10"), fig.height=5---------- -ggplot(data = weather, mapping = aes(x = temp)) + - geom_histogram(binwidth = 10, color = "white") - -## **Learning Check Solutions** - -## ---- echo=show_solutions('3-7'), include=show_solutions('3-7'), message=FALSE, warning=FALSE---- -IQR(weather$temp, na.rm=TRUE) - -## ---- echo=show_solutions('3-7'), include=show_solutions('3-7'), message=FALSE, warning=FALSE---- -summary(weather$temp) - -## ----facethistogram, fig.cap="Faceted histogram"------------------------- -ggplot(data = weather, mapping = aes(x = temp)) + - geom_histogram(binwidth = 5, color = "white") + - facet_wrap(~ month, nrow = 4) - -## **Learning Check Solutions** - -## ----badbox, fig.cap="Invalid boxplot specification", fig.height=3.5----- -ggplot(data = weather, mapping = aes(x = month, y = temp)) + - geom_boxplot() - -## ----monthtempbox, fig.cap="Month by temp boxplot", fig.height=3.7------- -ggplot(data = weather, mapping = aes(x = factor(month), y = temp)) + - geom_boxplot() - -## ----monthtempbox2, echo=FALSE, fig.cap="November boxplot", fig.height=3.7---- -weather %>% - filter(month %in% c(11)) %>% - ggplot(mapping = aes(x = factor(month), y = temp)) + - geom_boxplot() - -## ----monthtempbox3, echo=FALSE, fig.cap="November boxplot with points", fig.height=3.7---- -quartiles <- weather %>% filter(month == 11) %>% pull(temp) %>% quantile(prob=c(0.25, 0.5, 0.75)) -weather %>% - filter(month %in% c(11)) %>% - ggplot(mapping = aes(x = factor(month), y = temp)) + - geom_boxplot() + - geom_jitter(width = 0.05, height = 0.5, alpha = 0.2) - -## **Learning Check Solutions** - -## ---- echo=FALSE, eval=FALSE--------------------------------------------- -## weather %>% -## filter(month==5 & temp < 25) - -## ---- include=show_solutions('3-9'), echo=FALSE-------------------------- -weather %>% - filter(month==5 & temp < 25) %>% - kable() - -## There appears to be only one hour and only at JFK that recorded 13.1 F (-10.5 C) in the month of May. This is probably a data entry mistake! - -## ---- echo=FALSE, eval=FALSE--------------------------------------------- -## # weather %>% -## # group_by(month) %>% -## # summarize(IQR = IQR(temp, na.rm=TRUE)) %>% -## # arrange(desc(IQR)) - -## ---- echo=FALSE, include=show_solutions('3-9')-------------------------- -weather %>% - group_by(month) %>% - summarize(IQR = IQR(temp, na.rm=TRUE)) %>% - arrange(desc(IQR)) %>% - kable() - -## **`r paste0("(LC", chap, ".", (lc - 1), ")")`: We looked at the distribution of a continuous variable over a categorical variable here with this boxplot. Why can't we look at the distribution of one continuous variable over the distribution of another continuous variable? 
Say, temperature across pressure, for example?** - -## ------------------------------------------------------------------------ -fruits <- data_frame( - fruit = c("apple", "apple", "apple", "orange", "orange") -) -fruits_counted <- data_frame( - fruit = c("apple", "orange"), - number = c(3, 2) -) - -## ---- echo=FALSE--------------------------------------------------------- -kable( - fruits, - digits=2, - caption = "Fruits", - booktabs = TRUE - ) - -## ---- echo=FALSE--------------------------------------------------------- -kable( - fruits_counted, - digits=2, - caption = "Fruits (Pre-Counted)", - booktabs = TRUE - ) - -## ----geombar, fig.cap="Barplot when counts are not pre-tabulated", fig.height=2.5---- -ggplot(data = fruits, mapping = aes(x = fruit)) + - geom_bar() - -## ---- geomcol, fig.cap="Barplot when counts are pre-tabulated", fig.height=2.5---- -ggplot(data = fruits_counted, mapping = aes(x = fruit, y = number)) + - geom_col() - -## ----flightsbar, fig.cap="Number of flights departing NYC in 2013 by airline using geom_bar", fig.height=2.5---- -ggplot(data = flights, mapping = aes(x = carrier)) + - geom_bar() - -## ------------------------------------------------------------------------ -kable(airlines) - -## ----message=FALSE------------------------------------------------------- -flights_table <- flights %>% - group_by(carrier) %>% - summarize(number = n()) -kable(flights_table) - -## ----flightscol, fig.cap="Number of flights departing NYC in 2013 by airline using geom_col", fig.height=2.5---- -ggplot(data = flights_table, mapping = aes(x = carrier, y = number)) + - geom_col() - -## **Learning Check Solutions** - -## ----carrierpie, echo=FALSE, fig.cap="The dreaded pie chart", fig.height=5---- -ggplot(flights, mapping = aes(x = factor(1), fill = carrier)) + - geom_bar(width = 1) + - coord_polar(theta = "y") + - theme(axis.title.x = element_blank(), - axis.title.y = element_blank(), - axis.ticks = element_blank(), - axis.text.y = element_blank(), - axis.text.x = element_blank(), - panel.grid.major = element_blank(), - panel.grid.minor = element_blank()) + - guides(fill = guide_legend(keywidth = 0.8, keyheight = 0.8)) - -## **Learning Check Solutions** - -## ----message=FALSE------------------------------------------------------- -flights_namedports <- flights %>% - inner_join(airports, by = c("origin" = "faa")) - -## ---- fig.cap="Stacked barplot comparing the number of flights by carrier and airport", fig.height=3.5---- -ggplot(data = flights_namedports, mapping = aes(x = carrier, fill = name)) + - geom_bar() - -## **Learning Check Solutions** - -## ---- fig.cap="Side-by-side barplot comparing the number of flights by carrier and airport", fig.height=5---- -ggplot(data = flights_namedports, mapping = aes(x = carrier, fill = name)) + - geom_bar(position = "dodge") - -## **Learning Check Solutions** - -## ----facet-bar-vert, fig.cap="Faceted barplot comparing the number of flights by carrier and airport", fig.height=7.5---- -ggplot(data = flights_namedports, mapping = aes(x = carrier, fill = name)) + - geom_bar() + - facet_grid(name ~ .) 
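## A minimal follow-up sketch (not from the original script), picking up the
## learning check above about plotting temperature across pressure: geom_boxplot()
## expects a categorical x, so with two numerical variables from `weather` you
## would use a scatterplot instead, or first bin one of them (cut_number() is
## from ggplot2; the weather_no_na name exists only in this sketch).
weather_no_na <- weather %>%
  filter(!is.na(pressure))
ggplot(data = weather_no_na, mapping = aes(x = pressure, y = temp)) +
  geom_point(alpha = 0.2)
ggplot(data = weather_no_na, mapping = aes(x = cut_number(pressure, 4), y = temp)) +
  geom_boxplot() +
  labs(x = "pressure (binned into quartiles)")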
- -## **Learning Check Solutions** - -## ----viz-map, echo=FALSE, fig.cap="Mind map for Data Visualization", out.width="200%"---- -#library(knitr) -#if(knitr:::is_html_output()){ -# include_url("https://coggle.it/diagram/V_G2gzukTDoQ-aZt-", -# height = "1000px") -#} else { - include_graphics("images/coggleviz.png", dpi = 300) -#} - diff --git a/docs/scripts/04-tidy.R b/docs/scripts/04-tidy.R deleted file mode 100644 index 2349ae868..000000000 --- a/docs/scripts/04-tidy.R +++ /dev/null @@ -1,140 +0,0 @@ -## ----setup_tidy, include=FALSE------------------------------------------- -chap <- 4 -lc <- 0 -rq <- 0 -# **`r paste0("(LC", chap, ".", (lc <- lc + 1), ")")`** -# **`r paste0("(RQ", chap, ".", (rq <- rq + 1), ")")`** - -knitr::opts_chunk$set( - tidy = FALSE, - out.width = '\\textwidth' - ) - -# This bit of code is a bug fix on asis blocks, which we use to show/not show LC -# solutions, which are written like markdown text. In theory, it shouldn't be -# necessary for knitr versions <=1.11.6, but I've found I still need to for -# everything to knit properly in asis blocks. More info here: -# https://stackoverflow.com/questions/32944715/conditionally-display-block-of-markdown-text-using-knitr -library(knitr) -knit_engines$set(asis = function(options) { - if (options$echo && options$eval) knit_child(text = options$code) -}) - -# This controls which LC solutions to show. Options for solutions_shown: "ALL" -# (to show all solutions), or subsets of c('4-4', '4-5'), including the -# null vector c('') to show no solutions. -# solutions_shown <- c('4-1', '4-2', '4-3', '4-4') -solutions_shown <- c('') -show_solutions <- function(section){ - return(solutions_shown == "ALL" | section %in% solutions_shown) - } - -## ----warning=FALSE, message=FALSE---------------------------------------- -library(nycflights13) -library(dplyr) -library(tidyr) -library(ggplot2) -library(readr) - -## ----message=FALSE, warning=FALSE, echo=FALSE---------------------------- -# Packages needed internally, but not in text. 
-library(knitr) - -## ----tidyfig, echo=FALSE, fig.cap="Tidy data graphic from http://r4ds.had.co.nz/tidy-data.html"---- -knitr::include_graphics("images/tidy-1.png") - -## ----echo=FALSE---------------------------------------------------------- -stocks <- data_frame( - Date = as.Date('2009-01-01') + 0:4, - `Boeing Stock Price` = paste("$", c("173.55", "172.61", "173.86", "170.77", "174.29"), sep = ""), - `Amazon Stock Price` = paste("$", c("174.90", "171.42", "171.58", "173.89", "170.16"), sep = ""), - `Google Stock Price` = paste("$", c("174.34", "170.04", "173.65", "174.87", "172.19") ,sep = "") -) %>% - slice(1:2) -stocks %>% - kable( - digits = 2, - caption = "Stock Prices (Non-Tidy Format)", - booktabs = TRUE - ) - -## ----echo=FALSE---------------------------------------------------------- -stocks_tidy <- stocks %>% - rename( - Boeing = `Boeing Stock Price`, - Amazon = `Amazon Stock Price`, - Google = `Google Stock Price` - ) %>% - gather(`Stock Name`, `Stock Price`, -Date) -stocks_tidy %>% - kable( - digits = 2, - caption = "Stock Prices (Tidy Format)", - booktabs = TRUE - ) - -## ----echo=FALSE---------------------------------------------------------- -stocks <- data_frame( - Date = as.Date('2009-01-01') + 0:4, - `Boeing Price` = paste("$", c("173.55", "172.61", "173.86", "170.77", "174.29"), sep = ""), - `Weather` = c("Sunny", "Overcast", "Rain", "Rain", "Sunny") -) %>% - slice(1:2) -stocks %>% - kable( - digits = 2, - caption = "Date, Boeing Price, Weather Data", - booktabs = TRUE - ) - -## ----message=FALSE------------------------------------------------------- -if(!file.exists("data/dem_score.csv")){ - download.file(url = "http://ismayc.github.io/dem_score.csv", - destfile = "data/dem_score.csv") -} -dem_score <- read_csv("data/dem_score.csv") -dem_score - -## ------------------------------------------------------------------------ -guat_dem <- dem_score %>% - filter(country == "Guatemala") -guat_dem - -## ------------------------------------------------------------------------ -guat_tidy <- gather(data = guat_dem, - key = year, - value = democracy_score, - - country) -guat_tidy - -## ----errors=TRUE--------------------------------------------------------- -ggplot(data = guat_tidy, mapping = aes(x = year, y = democracy_score)) + - geom_line() - -## ----guatline, fig.cap="Guatemala's democracy score ratings from 1952 to 1992"---- -ggplot(data = guat_tidy, mapping = aes(x = parse_number(year), y = democracy_score)) + - geom_line() + - labs(x = "year") - -## **Learning Check Solutions** - -## `dem_tidy <- gather(data = dem_score, key = year, value = democracy_score, - country)` - -## ------------------------------------------------------------------------ -glimpse(airports) - -## **_Learning check_** - -## **Learning Check Solutions** - -## ----message=FALSE------------------------------------------------------- -library(dplyr) -joined_flights <- inner_join(x = flights, y = airlines, by = "carrier") - -## ----eval=FALSE---------------------------------------------------------- -## View(joined_flights) - -## **_Learning check_** - -## **Learning Check Solutions** - diff --git a/docs/scripts/05-wrangling.R b/docs/scripts/05-wrangling.R deleted file mode 100644 index cb21dc113..000000000 --- a/docs/scripts/05-wrangling.R +++ /dev/null @@ -1,192 +0,0 @@ -## ---- message=FALSE------------------------------------------------------ -library(dplyr) -library(ggplot2) -library(nycflights13) -library(knitr) - -## ---- eval=FALSE--------------------------------------------------------- -## 
portland_flights <- flights %>% -## filter(dest == "PDX") -## View(portland_flights) - -## ---- eval=FALSE--------------------------------------------------------- -## btv_sea_flights_fall <- flights %>% -## filter(origin == "JFK", (dest == "BTV" | dest == "SEA"), month >= 10) -## View(btv_sea_flights_fall) - -## ---- eval=FALSE--------------------------------------------------------- -## not_BTV_SEA <- flights %>% -## filter(!(dest == "BTV" | dest == "SEA")) -## View(not_BTV_SEA) - -## ------------------------------------------------------------------------ -summary_temp <- weather %>% - summarize(mean = mean(temp), std_dev = sd(temp)) -kable(summary_temp) - -## ------------------------------------------------------------------------ -summary_temp <- weather %>% - summarize(mean = mean(temp, na.rm = TRUE), std_dev = sd(temp, na.rm = TRUE)) -kable(summary_temp) - -## ------------------------------------------------------------------------ -#summary_temp$mean - -## ----eval=FALSE---------------------------------------------------------- -## summary_temp <- weather %>% -## summarize(mean = mean(temp, na.rm = TRUE)) %>% -## summarize(std_dev = sd(temp, na.rm = TRUE)) - -## ------------------------------------------------------------------------ -summary_monthly_temp <- weather %>% - group_by(month) %>% - summarize(mean = mean(temp, na.rm = TRUE), - std_dev = sd(temp, na.rm = TRUE)) -kable(summary_monthly_temp) - -## ------------------------------------------------------------------------ -by_origin <- flights %>% - group_by(origin) %>% - summarize(count = n()) -kable(by_origin) - -## ------------------------------------------------------------------------ -by_monthly_origin <- flights %>% - group_by(origin, month) %>% - summarize(count = n()) -kable(by_monthly_origin) - -## ------------------------------------------------------------------------ -by_monthly_origin2 <- flights %>% - dplyr::count(origin, month) -kable(by_monthly_origin2) - -## NA -## ------------------------------------------------------------------------ -flights <- flights %>% - mutate(gain = dep_delay - arr_delay) - -## ------------------------------------------------------------------------ -gain_summary <- flights %>% - summarize( - min = min(gain, na.rm = TRUE), - q1 = quantile(gain, 0.25, na.rm = TRUE), - median = quantile(gain, 0.5, na.rm = TRUE), - q3 = quantile(gain, 0.75, na.rm = TRUE), - max = max(gain, na.rm = TRUE), - mean = mean(gain, na.rm = TRUE), - sd = sd(gain, na.rm = TRUE), - missing = sum(is.na(gain)) - ) -kable(gain_summary) - -## ----message=FALSE, fig.cap="Histogram of gain variable"----------------- -ggplot(data = flights, mapping = aes(x = gain)) + - geom_histogram(color = "white", bins = 20) - -## ------------------------------------------------------------------------ -flights <- flights %>% - mutate( - gain = dep_delay - arr_delay, - hours = air_time / 60, - gain_per_hour = gain / hours - ) - -## ---- eval--------------------------------------------------------------- -freq_dest <- flights %>% - group_by(dest) %>% - summarize(num_flights = n()) -freq_dest - -## ------------------------------------------------------------------------ -freq_dest %>% arrange(num_flights) - -## ------------------------------------------------------------------------ -freq_dest %>% arrange(desc(num_flights)) - -## ----eval=FALSE---------------------------------------------------------- -## View(airlines) - -## ----eval=FALSE---------------------------------------------------------- -## flights_joined <- flights %>% -## 
inner_join(airlines, by = "carrier") -## View(flights) -## View(flights_joined) - -## ----eval=FALSE---------------------------------------------------------- -## View(airports) - -## ---- eval=FALSE--------------------------------------------------------- -## flights %>% -## inner_join(airports, by = c("dest" = "faa")) - -## ---- eval=FALSE--------------------------------------------------------- -## named_dests <- flights %>% -## group_by(dest) %>% -## summarize(num_flights = n()) %>% -## arrange(desc(num_flights)) %>% -## inner_join(airports, by = c("dest" = "faa")) %>% -## rename(airport_name = name) -## View(named_dests) - -## ---- eval=FALSE--------------------------------------------------------- -## glimpse(flights) - -## ---- eval=FALSE--------------------------------------------------------- -## flights %>% -## select(carrier, flight) - -## ---- eval=FALSE--------------------------------------------------------- -## flights_no_year <- flights %>% -## select(-year) -## names(flights_no_year) - -## ---- eval=FALSE--------------------------------------------------------- -## flight_arr_times <- flights %>% -## select(month:day, arr_time:sched_arr_time) -## flight_arr_times - -## ---- eval=FALSE--------------------------------------------------------- -## flights_reorder <- flights %>% -## select(month:day, hour:time_hour, everything()) -## names(flights_reorder) - -## ---- eval=FALSE--------------------------------------------------------- -## flights_begin_a <- flights %>% -## select(starts_with("a")) -## flights_begin_a - -## ---- eval=FALSE--------------------------------------------------------- -## flights_delays <- flights %>% -## select(ends_with("delay")) -## flights_delays - -## ---- eval=FALSE--------------------------------------------------------- -## flights_time <- flights %>% -## select(contains("time")) -## flights_time - -## ---- eval=FALSE--------------------------------------------------------- -## flights_time_new <- flights %>% -## select(contains("time")) %>% -## rename(departure_time = dep_time, -## arrival_time = arr_time) -## names(flights_time) - -## ---- eval=FALSE--------------------------------------------------------- -## named_dests %>% -## top_n(n = 10, wt = num_flights) - -## ---- eval=FALSE--------------------------------------------------------- -## named_dests %>% -## top_n(n = 10, wt = num_flights) %>% -## arrange(desc(num_flights)) - -## ---- eval=FALSE--------------------------------------------------------- -## ten_freq_dests <- flights %>% -## group_by(dest) %>% -## summarize(num_flights = n()) %>% -## arrange(desc(num_flights)) %>% -## top_n(n = 10) -## View(ten_freq_dests) - diff --git a/docs/scripts/06-regression.R b/docs/scripts/06-regression.R deleted file mode 100644 index f88099412..000000000 --- a/docs/scripts/06-regression.R +++ /dev/null @@ -1,488 +0,0 @@ -## ---- message=FALSE, warning=FALSE--------------------------------------- -library(ggplot2) -library(dplyr) -library(moderndive) -library(gapminder) - -## ---- message=FALSE, warning=FALSE, echo=FALSE--------------------------- -# Packages needed internally, but not in text. 
-library(mvtnorm) -library(tidyr) -library(forcats) -library(gridExtra) -library(broom) -library(janitor) - -## ----eval=FALSE---------------------------------------------------------- -## load(url("http://www.openintro.org/stat/data/evals.RData")) -## evals <- evals %>% -## select(score, bty_avg) - -## ----echo=FALSE---------------------------------------------------------- -if(!file.exists("data/evals.RData")){ - download.file(url = "http://www.openintro.org/stat/data/evals.RData", - destfile = "data/evals.RData") -} -load("data/evals.RData") -evals <- evals %>% - select(score, bty_avg) - -## ---- echo=FALSE--------------------------------------------------------- -set.seed(76) -evals %>% - sample_n(5) %>% - knitr::kable( - digits = 3, - caption = "Random sample of 5 instructors", - booktabs = TRUE - ) - -## ------------------------------------------------------------------------ -glimpse(evals) - -## ------------------------------------------------------------------------ -evals %>% - select(score, bty_avg) %>% - summary() - -## ----correlation1, echo=FALSE, fig.cap="Different correlation coefficients"---- -correlation <- c(-0.9999, -0.75, 0, 0.75, 0.9999) -n_sim <- 100 - -values <- NULL -for(i in 1:length(correlation)){ - rho <- correlation[i] - sigma <- matrix(c(5, rho * sqrt(50), rho * sqrt(50), 10), 2, 2) - sim <- rmvnorm( - n = n_sim, - mean = c(20,40), - sigma = sigma - ) %>% - as_data_frame() %>% - mutate(correlation = round(rho, 2)) - - values <- bind_rows(values, sim) -} - -ggplot(data = values, mapping = aes(V1, V2)) + - geom_point() + - facet_wrap(~ correlation, nrow = 2) + - labs(x = "x", y = "y") + - theme( - axis.text.x = element_blank(), - axis.text.y = element_blank(), - axis.ticks = element_blank() - ) - -## ------------------------------------------------------------------------ -cor(evals$score, evals$bty_avg) - -## ----numxplot1, warning=FALSE, fig.cap="Instructor evaluation scores at UT Austin"---- -ggplot(evals, aes(x = bty_avg, y = score)) + - geom_point() + - labs(x = "Beauty Score", y = "Teaching Score", title = "Relationship of teaching and beauty scores") - -## ----numxplot2, warning=FALSE, fig.cap="Instructor evaluation scores at UT Austin: Jittered"---- -ggplot(evals, aes(x = bty_avg, y = score)) + - geom_jitter() + - labs(x = "Beauty Score", y = "Teaching Score", title = "Relationship of teaching and beauty scores") - -## ----numxplot3, warning=FALSE, fig.cap="Regression line"----------------- -ggplot(evals, aes(x = bty_avg, y = score)) + - geom_jitter() + - labs(x = "Beauty Score", y = "Teaching Score", title = "Relationship of teaching and beauty scores") + - geom_smooth(method = "lm") - -## ----numxplot4, warning=FALSE, fig.cap="Regression line without error bands"---- -ggplot(evals, aes(x = bty_avg, y = score)) + - geom_jitter() + - labs(x = "Beauty Score", y = "Teaching Score", title = "Relationship of teaching and beauty scores") + - geom_smooth(method = "lm", se = FALSE) - -## ---- eval=FALSE--------------------------------------------------------- -## score_model <- lm(score ~ bty_avg, data = evals) -## get_regression_table(score_model, digits = 2) - -## ---- echo=FALSE--------------------------------------------------------- -score_model <- lm(score ~ bty_avg, data = evals) -evals_line <- score_model %>% - get_regression_table() %>% - pull(estimate) - -## ----numxplot4b, echo=FALSE---------------------------------------------- -get_regression_table(score_model) %>% - knitr::kable( - digits = 3, - caption = "Linear regression table", - booktabs = 
TRUE - ) - -## ---- echo=FALSE--------------------------------------------------------- -index <- which(evals$bty_avg == 7.333 & evals$score == 4.9) -target_point <- score_model %>% - get_regression_points() %>% - slice(index) -x <- target_point$bty_avg -y <- target_point$score -y_hat <- target_point$score_hat -resid <- target_point$residual -evals %>% - slice(index) %>% - knitr::kable( - digits = 3, - caption = "Data for 21st instructor", - booktabs = TRUE - ) - -## ----numxplot5, echo=FALSE, warning=FALSE, fig.cap="Example of observed value, fitted value, and residual"---- -best_fit_plot <- ggplot(evals, aes(x = bty_avg, y = score)) + - geom_jitter() + - labs(x = "Beauty Score", y = "Teaching Score", title = "Relationship of teaching and beauty scores") + - geom_point(method = "lm", se = FALSE) + - annotate("point", x = x, y = y, col = "red", size = 3) + - annotate("point", x = x, y = y_hat, col = "red", shape = 15, size = 3) + - annotate("segment", x = x, xend = x, y = y, yend = y_hat, color = "blue", - arrow = arrow(type = "closed", length = unit(0.02, "npc"))) -best_fit_plot - -## ---- eval=FALSE--------------------------------------------------------- -## regression_points <- get_regression_points(score_model) -## regression_points - -## ---- echo=FALSE--------------------------------------------------------- -set.seed(76) -regression_points <- get_regression_points(score_model) -regression_points %>% - slice(c(index, index + 1, index + 2, index + 3)) %>% - knitr::kable( - digits = 3, - caption = "Regression points (for only 21st through 24th instructor)", - booktabs = TRUE - ) - -## ----numxplot6, echo=FALSE, warning=FALSE, fig.cap="Plot of residuals over beauty score"---- -ggplot(regression_points, aes(x = bty_avg, y = residual)) + - geom_point() + - labs(x = "Beauty Score", y = "Residual") + - geom_hline(yintercept = 0, col = "blue", size = 1) + - annotate("point", x = x, y = resid, col = "red", size = 3) + - annotate("point", x = x, y = 0, col = "red", shape = 15, size = 3) + - annotate("segment", x = x, xend = x, y = resid, yend = 0, color = "blue", - arrow = arrow(type = "closed", length = unit(0.02, "npc"))) - -## ----numxplot7, echo=FALSE, warning=FALSE, fig.cap="Examples of less than ideal residual patterns"---- -resid_ex <- evals -resid_ex$ex_1 <- ((evals$bty_avg - 5) ^ 2 - 6 + rnorm(nrow(evals), 0, 0.5)) * 0.4 -resid_ex$ex_2 <- (rnorm(nrow(evals), 0, 0.075 * evals$bty_avg ^ 2)) * 0.4 - -resid_ex <- resid_ex %>% - select(bty_avg, ex_1, ex_2) %>% - gather(type, eps, -bty_avg) %>% - mutate(type = ifelse(type == "ex_1", "Example 1", "Example 2")) - -ggplot(resid_ex, aes(x = bty_avg, y = eps)) + - geom_point() + - labs(x = "Beauty Score", y = "Residual") + - geom_hline(yintercept = 0, col = "blue", size = 1) + - facet_wrap(~type) - -## ----model1_residuals_hist, warning=FALSE, fig.cap="Histogram of residuals"---- -ggplot(regression_points, aes(x = residual)) + - geom_histogram(binwidth = 0.25, color = "white") + - labs(x = "Residual") - -## ----numxplot9, echo=FALSE, warning=FALSE, fig.cap="Examples of ideal and less than ideal residual patterns"---- -resid_ex <- evals -resid_ex$`Ideal` <- rnorm(nrow(resid_ex), 0, sd = sd(regression_points$residual)) -resid_ex$`Less than ideal` <- - rnorm(nrow(resid_ex), 0, sd = sd(regression_points$residual))^2 -resid_ex$`Less than ideal` <- resid_ex$`Less than ideal` - mean(resid_ex$`Less than ideal` ) - -resid_ex <- resid_ex %>% - select(bty_avg, `Ideal`, `Less than ideal`) %>% - gather(type, eps, -bty_avg) - -ggplot(resid_ex, aes(x = eps)) 
+ - geom_histogram(binwidth = 0.25, color = "white") + - labs(x = "Residual") + - facet_wrap( ~ type, scales = "free") - -## ---- eval=FALSE--------------------------------------------------------- -## load(url("http://www.openintro.org/stat/data/evals.RData")) -## evals <- evals %>% -## select(score, age) - -## ---- warning=FALSE, message=FALSE--------------------------------------- -library(gapminder) -gapminder2007 <- gapminder %>% - filter(year == 2007) %>% - select(country, continent, lifeExp) - -## ---- eval=FALSE--------------------------------------------------------- -## View(gapminder2007) - -## ----model2-data-preview, echo=FALSE------------------------------------- -gapminder2007 %>% - sample_n(5) %>% - knitr::kable( - digits = 3, - caption = "Random sample of 5 countries", - booktabs = TRUE - ) - -## ------------------------------------------------------------------------ -glimpse(gapminder2007) - -## ------------------------------------------------------------------------ -summary(gapminder2007$continent) - -## ---- eval=TRUE---------------------------------------------------------- -lifeExp_worldwide <- gapminder2007 %>% - summarize(median = median(lifeExp), mean = mean(lifeExp)) - -## ---- echo=FALSE--------------------------------------------------------- -lifeExp_worldwide %>% - knitr::kable( - digits = 3, - caption = "Worldwide life expectancy", - booktabs = TRUE - ) - -## ------------------------------------------------------------------------ -ggplot(gapminder2007, aes(x = lifeExp)) + - geom_histogram(binwidth = 5, color = "white") + - labs(x = "Life expectancy", y = "Number of countries", title = "Worldwide life expectancy") - -## ---- eval=TRUE---------------------------------------------------------- -lifeExp_by_continent <- gapminder2007 %>% - group_by(continent) %>% - summarize(median = median(lifeExp), mean = mean(lifeExp)) - -## ----catxplot0, echo=FALSE----------------------------------------------- -lifeExp_by_continent %>% - knitr::kable( - digits = 3, - caption = "Life expectancy by continent", - booktabs = TRUE - ) - -## ---- echo=FALSE--------------------------------------------------------- -median_africa <- lifeExp_by_continent %>% - filter(continent == "Africa") %>% - pull(median) -mean_africa <- lifeExp_by_continent %>% - filter(continent == "Africa") %>% - pull(mean) -n_countries <- gapminder2007 %>% nrow() -n_countries_africa <- gapminder2007 %>% filter(continent == "Africa") %>% nrow() - -## ----catxplot0b, warning=FALSE, fig.cap="Life expectancy in 2007"-------- -ggplot(gapminder2007, aes(x = lifeExp)) + - geom_histogram(binwidth = 5, color = "white") + - labs(x = "Life expectancy", y = "Number of countries", title = "Life expectancy by continent") + - facet_wrap(~continent, nrow = 2) - -## ----catxplot1, warning=FALSE, fig.cap="Life expectancy in 2007"--------- -ggplot(gapminder2007, aes(x = continent, y = lifeExp)) + - geom_boxplot() + - labs(x = "Continent", y = "Life expectancy (years)", title = "Life expectancy by continent") - -## ----continent-mean-life-expectancies, echo=FALSE------------------------ -gapminder2007 %>% - group_by(continent) %>% - summarize(mean = mean(lifeExp)) %>% - mutate(`mean vs Africa` = mean - mean_africa) %>% - knitr::kable( - digits = 3, - caption = "Mean life expectancy by continent", - booktabs = TRUE - ) - -## ---- eval=FALSE--------------------------------------------------------- -## lifeExp_model <- lm(lifeExp ~ continent, data = gapminder2007) -## get_regression_table(lifeExp_model) - -## ---- 
echo=FALSE--------------------------------------------------------- -lifeExp_model <- lm(lifeExp ~ continent, data = gapminder2007) -evals_line <- get_regression_table(lifeExp_model) %>% - pull(estimate) - -## ----catxplot4b, echo=FALSE---------------------------------------------- -get_regression_table(lifeExp_model) %>% - knitr::kable( - digits = 3, - caption = "Linear regression table", - booktabs = TRUE - ) - -## ---- echo=FALSE--------------------------------------------------------- -gapminder2007 %>% - slice(1:10) %>% - knitr::kable( - digits = 3, - caption = "First 10 out of 142 countries", - booktabs = TRUE - ) - -## ---- eval=FALSE--------------------------------------------------------- -## regression_points <- get_regression_points(lifeExp_model) -## regression_points - -## ---- echo=FALSE--------------------------------------------------------- -regression_points <- get_regression_points(lifeExp_model) -regression_points %>% - slice(1:10) %>% - knitr::kable( - digits = 3, - caption = "Regression points (First 10 out of 142 countries)", - booktabs = TRUE - ) - -## ----catxplot7, warning=FALSE, fig.cap="Plot of residuals over continent"---- -ggplot(regression_points, aes(x = continent, y = residual)) + - geom_jitter(width = 0.1) + - labs(x = "Continent", y = "Residual") + - geom_hline(yintercept = 0, col = "blue") - -## ---- eval=FALSE--------------------------------------------------------- -## gapminder2007 %>% -## filter(continent == "Asia") %>% -## arrange(lifeExp) - -## ---- echo=FALSE--------------------------------------------------------- -gapminder2007 %>% - filter(continent == "Asia") %>% - arrange(lifeExp) %>% - slice(1:5) %>% - knitr::kable( - digits = 3, - caption = "Countries in Asia with shortest life expectancy", - booktabs = TRUE - ) - -## ----catxplot8, warning=FALSE, fig.cap="Histogram of residuals"---------- -ggplot(regression_points, aes(x = residual)) + - geom_histogram(binwidth = 5, color = "white") + - labs(x = "Residual") - -## ---- eval = FALSE------------------------------------------------------- -## # The following commands reloads the gapminder from scratch: -## data("gapminder") -## -## gapminder2007 <- gapminder %>% -## filter(year == 2007) %>% -## select(country, continent, gdpPercap) - -## ----correlation2, echo=FALSE, fig.cap="Different Correlation Coefficients"---- -correlation <- c(-0.9999, -0.9, -0.75, -0.3, 0, 0.3, 0.75, 0.9, 0.9999) -n_sim <- 100 - -values <- NULL -for(i in 1:length(correlation)){ - rho <- correlation[i] - sigma <- matrix(c(5, rho * sqrt(50), rho * sqrt(50), 10), 2, 2) - sim <- rmvnorm( - n = n_sim, - mean = c(20,40), - sigma = sigma - ) %>% - as_data_frame() %>% - mutate(correlation = round(rho,2)) - - values <- bind_rows(values, sim) -} - -ggplot(data = values, mapping = aes(V1, V2)) + - geom_point() + - facet_wrap(~ correlation, ncol = 3) + - labs(x = "x", y = "y") + - theme( - axis.text.x = element_blank(), - axis.text.y = element_blank(), - axis.ticks = element_blank() - ) - -## ----echo=FALSE---------------------------------------------------------- -if(!file.exists("data/evals.RData")){ - download.file(url = "http://www.openintro.org/stat/data/evals.RData", - destfile = "data/evals.RData") -} -load("data/evals.RData") -evals <- evals %>% - select(score, bty_avg) -index <- which(evals$bty_avg == 2.333 & evals$score == 2.7) -target_point <- get_regression_points(score_model) %>% - slice(index) -x <- target_point$bty_avg -y <- target_point$score -y_hat <- target_point$score_hat -resid <- target_point$residual - 
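## Added comment (not in the original script): the annotate() calls below layer a
## second highlighted instructor onto best_fit_plot -- a red circle at the observed
## (bty_avg, score), a red square at the fitted value score_hat, and a blue arrow
## marking the residual between the two.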
-best_fit_plot <- best_fit_plot + - annotate("point", x = x, y = y, col = "red", size = 3) + - annotate("point", x = x, y = y_hat, col = "red", shape = 15, size = 3) + - annotate("segment", x = x, xend = x, y = y, yend = y_hat, color = "blue", - arrow = arrow(type = "closed", length = unit(0.02, "npc"))) -best_fit_plot - -## ---- echo=FALSE--------------------------------------------------------- -index <- which(evals$bty_avg == 3.667 & evals$score == 4.4) -score_model <- lm(score ~ bty_avg, data = evals) -target_point <- get_regression_points(score_model) %>% - slice(index) -x <- target_point$bty_avg -y <- target_point$score -y_hat <- target_point$score_hat -resid <- target_point$residual - -best_fit_plot <- best_fit_plot + - annotate("point", x = x, y = y, col = "red", size = 3) + - annotate("point", x = x, y = y_hat, col = "red", shape = 15, size = 3) + - annotate("segment", x = x, xend = x, y = y, yend = y_hat, - color = "blue", - arrow = arrow(type = "closed", length = unit(0.02, "npc"))) -best_fit_plot - -## ----here, echo=FALSE---------------------------------------------------- -index <- which(evals$bty_avg == 6 & evals$score == 3.8) -score_model <- lm(score ~ bty_avg, data = evals) -target_point <- get_regression_points(score_model) %>% - slice(index) -x <- target_point$bty_avg -y <- target_point$score -y_hat <- target_point$score_hat -resid <- target_point$residual - -best_fit_plot <- best_fit_plot + - annotate("point", x = x, y = y, col = "red", size = 3) + - annotate("point", x = x, y = y_hat, col = "red", shape = 15, size = 3) + - annotate("segment", x = x, xend = x, y = y, yend = y_hat, color = "blue", - arrow = arrow(type = "closed", length = unit(0.02, "npc"))) -best_fit_plot - -## ---- eval = FALSE------------------------------------------------------- -## lm(score ~ bty_avg, data = evals) %>% -## get_regression_table() - -## ---- echo = FALSE------------------------------------------------------- -lm(score ~ bty_avg, data = evals) %>% - get_regression_table() %>% - knitr::kable() - -## ---- eval = FALSE------------------------------------------------------- -## library(broom) -## library(janitor) -## lm(score ~ bty_avg, data = evals) %>% -## tidy(conf.int = TRUE) %>% -## mutate_if(is.numeric, round, digits = 3) %>% -## clean_names() - -## ---- echo = FALSE------------------------------------------------------- -library(broom) -library(janitor) -lm(score ~ bty_avg, data = evals) %>% - tidy(conf.int = TRUE) %>% - mutate_if(is.numeric, round, digits = 3) %>% - clean_names() %>% - knitr::kable() - diff --git a/docs/scripts/07-multiple-regression.R b/docs/scripts/07-multiple-regression.R deleted file mode 100644 index aa0e4ac3f..000000000 --- a/docs/scripts/07-multiple-regression.R +++ /dev/null @@ -1,374 +0,0 @@ -## ---- message=FALSE, warning=FALSE--------------------------------------- -library(ggplot2) -library(dplyr) -library(moderndive) -library(ISLR) - -## ---- message=FALSE, warning=FALSE, echo=FALSE--------------------------- -# Packages needed internally, but not in text. 
-library(mvtnorm) -library(tidyr) -library(forcats) -library(gridExtra) - -## ---- warning=FALSE, message=FALSE--------------------------------------- -library(ISLR) -Credit <- Credit %>% - select(Balance, Limit, Income) - -## ---- eval=FALSE--------------------------------------------------------- -## View(Credit) - -## ----model3-data-preview, echo=FALSE------------------------------------- -Credit %>% - sample_n(5) %>% - knitr::kable( - digits = 3, - caption = "Random sample of 5 credit card holders", - booktabs = TRUE - ) - -## ------------------------------------------------------------------------ -glimpse(Credit) - -## ------------------------------------------------------------------------ -summary(Credit) - -## ---- eval=FALSE--------------------------------------------------------- -## cor(Credit$Balance, Credit$Limit) -## cor(Credit$Balance, Credit$Income) - -## ---- eval=FALSE--------------------------------------------------------- -## cor(Credit) - -## ----model3-correlation, echo=FALSE-------------------------------------- -Credit %>% - cor() %>% - knitr::kable( - digits = 3, - caption = "Correlations between credit card balance, credit limit, and credit rating", - booktabs = TRUE - ) - -## ---- eval=FALSE--------------------------------------------------------- -## ggplot(Credit, aes(x = Limit, y = Balance)) + -## geom_point() + -## labs(x = "Credit limit (in $)", y = "Credit card balance (in $)", -## title = "Relationship between balance and credit limit") + -## geom_smooth(method = "lm", se = FALSE) -## -## ggplot(Credit, aes(x = Income, y = Balance)) + -## geom_point() + -## labs(x = "Income (in $1000)", y = "Credit card balance (in $)", -## title = "Relationship between balance and income") + -## geom_smooth(method = "lm", se = FALSE) - -## ----2numxplot1, echo=FALSE, fig.height=4, fig.cap="Relationship between credit card balance and credit limit/income"---- -model3_balance_vs_limit_plot <- ggplot(Credit, aes(x = Limit, y = Balance)) + - geom_point() + - labs(x = "Credit limit (in $)", y = "Credit card balance (in $)", - title = "Balance vs credit limit") + - geom_smooth(method = "lm", se = FALSE) -model3_balance_vs_income_plot <- ggplot(Credit, aes(x = Income, y = Balance)) + - geom_point() + - labs(x = "Income (in $1000)", y = "Credit card balance (in $)", - title = "Balance vs income") + - geom_smooth(method = "lm", se = FALSE) -grid.arrange(model3_balance_vs_limit_plot, model3_balance_vs_income_plot, nrow = 1) - -## ---- eval=FALSE, echo=FALSE--------------------------------------------- -## # Save as 798 x 562 images/credit_card_balance_3D_scatterplot.png -## library(ISLR) -## library(plotly) -## plot_ly(showscale=FALSE) %>% -## add_markers( -## x = Credit$Income, -## y = Credit$Limit, -## z = Credit$Balance, -## hoverinfo = 'text', -## text = ~paste("x1 - Income: ", Credit$Income, -## "
x2 - Limit: ", Credit$Limit, -## "
y - Balance: ", Credit$Balance) -## ) %>% -## layout( -## scene = list( -## xaxis = list(title = "x1 - Income (in $10K)"), -## yaxis = list(title = "x2 - Limit ($)"), -## zaxis = list(title = "y - Balance ($)") -## ) -## ) - -## ---- eval=FALSE, echo=FALSE--------------------------------------------- -## # Save as 798 x 562 images/credit_card_balance_regression_plane.png -## library(ISLR) -## library(plotly) -## library(tidyverse) -## -## # setup hideous grid required by plotly -## model_lm <- lm(Balance ~ Income + Limit, data=Credit) -## x_grid <- seq(from=min(Credit$Income), to=max(Credit$Income), length=100) -## y_grid <- seq(from=min(Credit$Limit), to=max(Credit$Limit), length=200) -## z_grid <- expand.grid(x_grid, y_grid) %>% -## tbl_df() %>% -## rename( -## x_grid = Var1, -## y_grid = Var2 -## ) %>% -## mutate(z = coef(model_lm)[1] + coef(model_lm)[2]*x_grid + coef(model_lm)[3]*y_grid) %>% -## .[["z"]] %>% -## matrix(nrow=length(x_grid)) %>% -## t() -## -## # plot points and plane -## plot_ly(showscale = FALSE) %>% -## add_markers( -## x = Credit$Income, -## y = Credit$Limit, -## z = Credit$Balance, -## hoverinfo = 'text', -## text = ~paste("x1 - Income: ", Credit$Income, "
x2 - Limit: ", -## Credit$Limit, "
y - Balance: ", Credit$Balance) -## ) %>% -## layout( -## scene = list( -## xaxis = list(title = "x1 - Income (in $10K)"), -## yaxis = list(title = "x2 - Limit ($)"), -## zaxis = list(title = "y - Balance ($)") -## ) -## ) %>% -## add_surface( -## x = x_grid, -## y = y_grid, -## z = z_grid -## ) - -## ---- eval=FALSE--------------------------------------------------------- -## Balance_model <- lm(Balance ~ Limit + Income, data = Credit) -## get_regression_table(Balance_model) - -## ---- echo=FALSE--------------------------------------------------------- -Balance_model <- lm(Balance ~ Limit + Income, data = Credit) -Credit_line <- get_regression_table(Balance_model) %>% - pull(estimate) - -## ----model3-table-output, echo=FALSE------------------------------------- -get_regression_table(Balance_model) %>% - knitr::kable( - digits = 3, - caption = "Multiple regression table", - booktabs = TRUE - ) - -## ---- eval=FALSE--------------------------------------------------------- -## regression_points <- get_regression_points(Balance_model) -## regression_points - -## ----model3-points-table, echo=FALSE------------------------------------- -set.seed(76) -regression_points <- get_regression_points(Balance_model) -regression_points %>% - slice(1:5) %>% - knitr::kable( - digits = 3, - caption = "Regression points (first 5 rows of 400)", - booktabs = TRUE - ) - -## ---- eval=FALSE--------------------------------------------------------- -## ggplot(regression_points, aes(x = Limit, y = residual)) + -## geom_point() + -## labs(x = "Credit limit (in $)", y = "Residual", title = "Residuals vs credit limit") -## -## ggplot(regression_points, aes(x = Income, y = residual)) + -## geom_point() + -## labs(x = "Income (in $1000)", y = "Residual", title = "Residuals vs income") - -## ---- echo=FALSE, fig.height=4, fig.cap="Residuals vs credit limit and income"---- -model3_residual_vs_limit_plot <- ggplot(regression_points, aes(x = Limit, y = residual)) + - geom_point() + - labs(x = "Credit limit (in $)", y = "Residual", - title = "Residuals vs credit limit") -model3_residual_vs_income_plot <- ggplot(regression_points, aes(x = Income, y = residual)) + - geom_point() + - labs(x = "Income (in $1000)", y = "Residual", - title = "Residuals vs income") -grid.arrange(model3_residual_vs_limit_plot, model3_residual_vs_income_plot, nrow = 1) - -## ----model3-residuals-hist, fig.height=4, fig.cap="Relationship between credit card balance and credit limit/income"---- -ggplot(regression_points, aes(x = residual)) + - geom_histogram(color = "white") + - labs(x = "Residual") - -## ----eval=FALSE---------------------------------------------------------- -## load(url("http://www.openintro.org/stat/data/evals.RData")) -## evals <- evals %>% -## select(score, age, gender) - -## ----echo=FALSE---------------------------------------------------------- -if(!file.exists("data/evals.RData")){ - download.file(url = "http://www.openintro.org/stat/data/evals.RData", - destfile = "data/evals.RData") -} -load(file = "data/evals.RData") -evals <- evals %>% - select(score, bty_avg, age, gender) - -## ---- eval=FALSE--------------------------------------------------------- -## View(evals) - -## ----model4-data-preview, echo=FALSE------------------------------------- -evals %>% - sample_n(5) %>% - knitr::kable( - digits = 3, - caption = "Random sample of 5 instructors", - booktabs = TRUE - ) - -## ------------------------------------------------------------------------ -summary(evals) - -## ----numxcatxplot1, warning=FALSE, fig.cap="Instructor 
evaluation scores at UT Austin split by gender (jittered)"---- -ggplot(evals, aes(x = age, y = score, col = gender)) + - geom_jitter() + - labs(x = "Age", y = "Teaching Score", color = "Gender") + - geom_smooth(method = "lm", se = FALSE) - -## ---- eval=FALSE--------------------------------------------------------- -## score_model_2 <- lm(score ~ age + gender, data = evals) -## get_regression_table(score_model_2) - -## ---- echo=FALSE--------------------------------------------------------- -score_model_2 <- lm(score ~ age + gender, data = evals) -get_regression_table(score_model_2) %>% - knitr::kable( - digits = 3, - caption = "Regression table", - booktabs = TRUE - ) - -## ----numxcatxplot2, echo=FALSE, warning=FALSE, fig.cap="Instructor evaluation scores at UT Austin by gender: same slope"---- -coeff <- lm(score ~ age + gender, data = evals) %>% - coef() %>% - as.numeric() -slopes <- evals %>% - group_by(gender) %>% - summarise(min = min(age), max = max(age)) %>% - mutate(intercept = coeff[1]) %>% - mutate(intercept = ifelse(gender == "male", intercept + coeff[3], intercept)) %>% - gather(point, age, -c(gender, intercept)) %>% - mutate(y_hat = intercept + age * coeff[2]) - -ggplot(evals, aes(x = age, y = score, col = gender)) + - geom_jitter() + - labs(x = "Age", y = "Teaching Score", color = "Gender") + - geom_line(data = slopes, aes(y = y_hat), size = 1) - -## ---- eval=FALSE--------------------------------------------------------- -## score_model_interaction <- lm(score ~ age * gender, data = evals) -## get_regression_table(score_model_interaction) - -## ---- echo=FALSE--------------------------------------------------------- -score_model_interaction <- lm(score ~ age * gender, data = evals) -get_regression_table(score_model_interaction) %>% - knitr::kable( - digits = 3, - caption = "Regression table", - booktabs = TRUE - ) - -## ---- echo=FALSE--------------------------------------------------------- -data_frame( - Gender = c("Male instructors", "Female instructors"), - Intercept = c(4.437, 4.883), - `Slope for age` = c(-0.004, -0.018) -) %>% - knitr::kable( - digits = 3, - caption = "Comparison of male and female intercepts and age slopes", - booktabs = TRUE - ) - -## ---- eval=FALSE--------------------------------------------------------- -## regression_points <- get_regression_points(score_model_interaction) -## regression_points - -## ----model4-points-table, echo=FALSE------------------------------------- -set.seed(76) -regression_points <- get_regression_points(score_model_interaction) -regression_points %>% - slice(1:5) %>% - knitr::kable( - digits = 3, - caption = "Regression points (first 5 rows of 463)", - booktabs = TRUE - ) - -## ----residual1, warning=FALSE, fig.cap="Interaction model histogram of residuals"---- -ggplot(regression_points, aes(x = residual)) + - geom_histogram(binwidth = 0.25, color = "white") + - labs(x = "Residual") - -## ----residual2, warning=FALSE, fig.cap="Interaction model residuals vs predictor"---- -ggplot(regression_points, aes(x = age, y = residual)) + - geom_point() + - labs(x = "age", y = "Residual") + - geom_hline(yintercept = 0, col = "blue", size = 1) + - facet_wrap(~gender) - -## ---- eval=FALSE--------------------------------------------------------- -## library(ISLR) -## data(Credit) -## Credit %>% -## select(Balance, Limit, Income) %>% -## mutate(Income = Income * 1000) %>% -## cor() - -## ----cor-credit-2, echo=FALSE-------------------------------------------- -library(ISLR) -data(Credit) -Credit %>% - select(Balance, Limit, Income) 
%>% - mutate(Income = Income * 1000) %>% - cor() %>% - knitr::kable( - digits = 3, - caption = "Correlation between income (in $) and credit card balance", - booktabs = TRUE - ) - -## ----echo=FALSE, fig.height=4, fig.cap="Relationship between credit card balance and credit limit/income"---- -grid.arrange(model3_balance_vs_limit_plot, model3_balance_vs_income_plot, nrow = 1) - -## ----credit-limit-quartiles, echo=FALSE, fig.height=4, fig.cap="Histogram of credit limits and quartiles"---- -ggplot(Credit, aes(x = Limit)) + - geom_histogram(color = "white") + - geom_vline(xintercept = quantile(Credit$Limit, probs = c(0.25, 0.5, 0.75)), col = "red", linetype = "dashed") - -## ---- 2numxplot4, fig.height=4, echo=FALSE, fig.cap="Relationship between credit card balance and income for different credit limit brackets"---- -Credit <- Credit %>% - mutate(limit_bracket = cut_number(Limit, 4)) %>% - mutate(limit_bracket = fct_recode(limit_bracket, - "low" = "[855,3.09e+03]", - "medium-low" = "(3.09e+03,4.62e+03]", - "medium-high" = "(4.62e+03,5.87e+03]", - "high" = "(5.87e+03,1.39e+04]" - )) - -model3_balance_vs_income_plot_colored <- ggplot(Credit, aes(x = Income, y = Balance, col = limit_bracket)) + - geom_point() + - geom_smooth(method = "lm", se = FALSE) + - labs(x = "Income (in $1000)", y = "Credit card balance (in $)", - color = "Credit limit\nbracket", title = "Balance vs income") + - theme(legend.position = "bottom") - -grid.arrange(model3_balance_vs_income_plot, model3_balance_vs_income_plot_colored, nrow = 1) -#cowplot::plot_grid(model3_balance_vs_income_plot, model3_balance_vs_income_plot_colored, nrow = 1, rel_widths = c(2/5, 3/5)) - -## ---- 2numxplot5, echo=FALSE, warning=FALSE, fig.cap="Relationship between credit card balance and income for different credit limit brackets"---- -ggplot(Credit, aes(x = Income, y = Balance)) + - geom_point() + - facet_wrap(~limit_bracket) + - geom_smooth(method = "lm", se = FALSE) + - labs(x = "Income (in $1000)", y = "Credit card balance (in $)") - diff --git a/docs/scripts/08-sampling.R b/docs/scripts/08-sampling.R deleted file mode 100644 index a78d0ff5d..000000000 --- a/docs/scripts/08-sampling.R +++ /dev/null @@ -1,104 +0,0 @@ -## ----message=FALSE, warning=FALSE---------------------------------------- -library(dplyr) -library(ggplot2) -library(moderndive) - -library(okcupiddata) -library(mosaic) - -## ----message=FALSE, warning=FALSE, echo=FALSE---------------------------- -# Packages needed internally, but not in text. 
- -## ----height-hist, warning=FALSE------------------------------------------ -ggplot(data = profiles, mapping = aes(x = height)) + - geom_histogram(bins = 20, color = "white") - -## ----filter-profiles----------------------------------------------------- -profiles_subset <- profiles %>% filter(between(height, 55, 85)) - -## ----height-hist2, warning=FALSE----------------------------------------- -ggplot(data = profiles_subset, mapping = aes(x = height)) + - geom_histogram(bins = 20, color = "white") - -## ----sample-profiles----------------------------------------------------- -set.seed(2017) -profiles_sample1 <- profiles_subset %>% - resample(size = 100, replace = FALSE) - -## ----plot-sample1-------------------------------------------------------- -ggplot(data = profiles_sample1, mapping = aes(x = height)) + - geom_histogram(bins = 20, color = "white", fill = "red") + - coord_cartesian(xlim = c(55, 85)) - -## ----sample-profiles2---------------------------------------------------- -profiles_sample2 <- profiles_subset %>% resample(size = 100, replace = FALSE) -ggplot(data = profiles_sample2, mapping = aes(x = height)) + - geom_histogram(bins = 20, color = "black", fill = "yellow") + - coord_cartesian(xlim = c(55, 85)) - -## ----sample-profiles3---------------------------------------------------- -profiles_sample3 <- profiles_subset %>% filter(height >= 72) -ggplot(data = profiles_sample3, mapping = aes(x = height)) + - geom_histogram(bins = 20, color = "white", fill = "blue") + - coord_cartesian(xlim = c(55, 85)) - -## ----mean1--------------------------------------------------------------- -profiles_sample1 %>% summarize(mean(height)) - -## ----mean2--------------------------------------------------------------- -profiles_sample2 %>% summarize(mean(height)) - -## ----mean3--------------------------------------------------------------- -profiles_sample3 %>% summarize(mean(height)) - -## ----do-first, include=FALSE--------------------------------------------- -if(!file.exists("rds/sample_means.rds")){ - sample_means <- do(5000) * - (profiles_subset %>% resample(size = 100, replace = FALSE) %>% - summarize(mean_height = mean(height)) - ) - saveRDS(object = sample_means, "rds/sample_means.rds") -} else { - sample_means <- readRDS("rds/sample_means.rds") -} - -## ----do-first-read, eval=FALSE------------------------------------------- -## sample_means <- do(5000) * -## (profiles_subset %>% resample(size = 100, replace = FALSE) %>% -## summarize(mean_height = mean(height))) - -## ----do-plot------------------------------------------------------------- -ggplot(data = sample_means, mapping = aes(x = mean_height)) + - geom_histogram(color = "white", bins = 20) - -## ----message=FALSE------------------------------------------------------- -set.seed(2017) -do(1) * rflip(1) - -## ------------------------------------------------------------------------ -do(13) * rflip(10) - -## ----include=FALSE------------------------------------------------------- -if(!file.exists("rds/simGuesses.rds")){ - simGuesses <- do(5000) * rflip(10) - saveRDS(object = simGuesses, "rds/simGuesses.rds") -} else { - simGuesses <- readRDS("rds/simGuesses.rds") -} - -## ----eval=FALSE---------------------------------------------------------- -## simGuesses <- do(5000) * rflip(10) - -## ------------------------------------------------------------------------ -simGuesses %>% - group_by(heads) %>% - summarize(count = n()) - -## ----fig.cap="Histogram of number of heads in simulation - needs tweaking"---- -ggplot(data = 
simGuesses, mapping = aes(x = heads)) + - geom_histogram(binwidth = 1, color = "white") - -## ----fig.cap="Barplot of number of heads in simulation"------------------ -ggplot(data = simGuesses, mapping = aes(x = factor(heads))) + - geom_bar() - diff --git a/docs/scripts/09-confidence-intervals.R b/docs/scripts/09-confidence-intervals.R deleted file mode 100644 index b98225333..000000000 --- a/docs/scripts/09-confidence-intervals.R +++ /dev/null @@ -1,146 +0,0 @@ -## ----message=FALSE, warning=FALSE---------------------------------------- -library(dplyr) -library(ggplot2) -library(mosaic) -library(knitr) -library(ggplot2movies) - -## ----message=FALSE, warning=FALSE, echo=FALSE---------------------------- -# Packages needed internally, but not in text. - -## ----fig.cap="Population ratings histogram"------------------------------ -movies %>% ggplot(aes(x = rating)) + - geom_histogram(color = "white", bins = 20) - -## **_Learning check_** - -## ------------------------------------------------------------------------ -set.seed(2017) -movies_sample <- movies %>% - sample_n(50) - -## ----fig.cap="Sample ratings histogram"---------------------------------- -ggplot(data = movies_sample, aes(x = rating)) + - geom_histogram(color = "white", bins = 20) - -## ------------------------------------------------------------------------ -(movies_sample_mean <- movies_sample %>% - summarize(mean = mean(rating))) - -## ------------------------------------------------------------------------ -boot1 <- resample(movies_sample) %>% - arrange(orig.id) - -## ------------------------------------------------------------------------ -(movies_boot1_mean <- boot1 %>% summarize(mean = mean(rating))) - -## ------------------------------------------------------------------------ -do(10) * - (resample(movies_sample) %>% - summarize(mean = mean(rating))) - -## ---- include=FALSE------------------------------------------------------ -if(!file.exists("rds/trials.rds")){ - trials <- do(5000) * summarize(resample(movies_sample), mean = mean(rating)) - saveRDS(object = trials, "rds/trials.rds") -} else { - trials <- readRDS("rds/trials.rds") -} - -## ---- eval=FALSE--------------------------------------------------------- -## trials <- do(5000) * summarize(resample(movies_sample), mean = mean(rating)) - -## ---- fig.cap="Bootstrapped means histogram"----------------------------- -ggplot(data = trials, mapping = aes(x = mean)) + - geom_histogram(bins = 30, color = "white") - -## ------------------------------------------------------------------------ -(ciq_mean_rating <- confint(trials, level = 0.95, method = "quantile")) - -## ------------------------------------------------------------------------ -movies %>% summarize(mean_rating = mean(rating)) - -## ----warning=FALSE, message=FALSE---------------------------------------- -(cise_mean_rating <- confint(trials, level = 0.95, method = "stderr")) - -## ----message=FALSE, warning=FALSE---------------------------------------- -(movies_trimmed <- movies %>% select(title, year, rating, Action, Romance)) - -## ------------------------------------------------------------------------ -movies_trimmed <- movies_trimmed %>% - filter(!(Action == 1 & Romance == 1)) - -## ------------------------------------------------------------------------ -movies_trimmed <- movies_trimmed %>% - mutate(genre = ifelse(Action == 1, "Action", - ifelse(Romance == 1, "Romance", - "Neither"))) %>% - filter(genre != "Neither") %>% - select(-Action, -Romance) - -## 
------------------------------------------------------------------------ -set.seed(2017) -movies_genre_sample <- movies_trimmed %>% - group_by(genre) %>% - sample_n(34) %>% - ungroup() - -## ------------------------------------------------------------------------ -mean_ratings <- movies_genre_sample %>% - group_by(genre) %>% - summarize(mean = mean(rating)) -obs_diff <- diff(mean_ratings$mean) - -## ----message=FALSE, warning=FALSE---------------------------------------- -shuffled_ratings <- #movies_trimmed %>% - movies_genre_sample %>% - mutate(genre = shuffle(genre)) %>% - group_by(genre) %>% - summarize(mean = mean(rating)) -diff(shuffled_ratings$mean) - -## ----include=FALSE------------------------------------------------------- -set.seed(2017) -if(!file.exists("rds/many_shuffles.rds")){ - many_shuffles <- do(5000) * - (movies_genre_sample %>% - mutate(genre = shuffle(genre)) %>% - group_by(genre) %>% - summarize(mean = mean(rating)) - ) - saveRDS(object = many_shuffles, "rds/many_shuffles.rds") -} else { - many_shuffles <- readRDS("rds/many_shuffles.rds") -} - -## ----eval=FALSE---------------------------------------------------------- -## set.seed(2017) -## many_shuffles <- do(5000) * -## (movies_genre_sample %>% -## mutate(genre = shuffle(genre)) %>% -## group_by(genre) %>% -## summarize(mean = mean(rating)) -## ) - -## ------------------------------------------------------------------------ -rand_distn <- many_shuffles %>% - group_by(.index) %>% - summarize(diffmean = diff(mean)) -head(rand_distn, 10) - -## ----fig.cap="Simulated shuffled sample means histogram"----------------- -ggplot(data = rand_distn, mapping = aes(x = diffmean)) + - geom_histogram(color = "white", bins = 20) - -## ------------------------------------------------------------------------ -(std_err <- rand_distn %>% summarize(se = sd(diffmean))) - -## ------------------------------------------------------------------------ -(lower <- obs_diff - (2 * std_err)) -(upper <- obs_diff + (2 * std_err)) - -## ------------------------------------------------------------------------ -df1 <- data_frame(samp1 = rexp(50)) -df2 <- data_frame(samp2 = rnorm(100)) -df3 <- data_frame(samp3 = rbeta(20, 5, 5)) - diff --git a/docs/scripts/10-hypothesis-testing.R b/docs/scripts/10-hypothesis-testing.R deleted file mode 100644 index bcb622f30..000000000 --- a/docs/scripts/10-hypothesis-testing.R +++ /dev/null @@ -1,303 +0,0 @@ -## ----message=FALSE, warning=FALSE---------------------------------------- -library(dplyr) -library(ggplot2) -library(mosaic) -library(knitr) -library(nycflights13) -library(ggplot2movies) -library(broom) - -## ----message=FALSE, warning=FALSE, echo=FALSE---------------------------- -# Packages needed internally, but not in text. 
- -## ------------------------------------------------------------------------ -bos_sfo <- flights %>% - na.omit() %>% - filter(dest %in% c("BOS", "SFO")) %>% - group_by(dest) %>% - sample_n(100) - -## ------------------------------------------------------------------------ -bos_sfo_summary <- bos_sfo %>% group_by(dest) %>% - summarize(mean_time = mean(air_time), - sd_time = sd(air_time)) -kable(bos_sfo_summary) - -## ------------------------------------------------------------------------ -ggplot(data = bos_sfo, mapping = aes(x = dest, y = air_time)) + - geom_boxplot() - -## ----echo=FALSE---------------------------------------------------------- -choice <- c(rep("Correct", 3), "Incorrect", rep("Correct", 6)) -kable(choice) - -## ----sample-table, echo=FALSE-------------------------------------------- -set.seed(2017) -sim1 <- resample(x = c("Correct", "Incorrect"), size = 10, prob = c(0.5, 0.5)) -sim2 <- resample(x = c("Correct", "Incorrect"), size = 10, prob = c(0.5, 0.5)) -sim3 <- resample(x = c("Correct", "Incorrect"), size = 10, prob = c(0.5, 0.5)) -sims <- data.frame(sample1 = sim1, sample2 = sim2, sample3 = sim3) -kable(sims, row.names = TRUE, caption = 'A table of three sets of 10 coin flips') - -## ----echo=FALSE---------------------------------------------------------- -t1 <- sum(sim1 == "Correct") -t2 <- sum(sim2 == "Correct") -t3 <- sum(sim3 == "Correct") - -## ------------------------------------------------------------------------ -ggplot(data = simGuesses, aes(x = factor(heads))) + - geom_bar() - -## ------------------------------------------------------------------------ -pvalue_tea <- simGuesses %>% - filter(heads >= 9) %>% - nrow() / nrow(simGuesses) - -## ----fig.cap="Barplot of heads with p-value highlighted"----------------- -ggplot(data = simGuesses, aes(x = factor(heads), fill = (heads >= 9))) + - geom_bar() + - labs(x = "heads") - -## ----message=FALSE, warning=FALSE---------------------------------------- -(movies_trimmed <- movies %>% select(title, year, rating, Action, Romance)) - -## ------------------------------------------------------------------------ -movies_trimmed <- movies_trimmed %>% - filter(!(Action == 1 & Romance == 1)) - -## ------------------------------------------------------------------------ -movies_trimmed <- movies_trimmed %>% - mutate(genre = ifelse(Action == 1, "Action", - ifelse(Romance == 1, "Romance", - "Neither"))) %>% - filter(genre != "Neither") %>% - select(-Action, -Romance) - -## ----fig.cap="Rating vs genre in the population"------------------------- -ggplot(data = movies_trimmed, aes(x = genre, y = rating)) + - geom_boxplot() - -## ----movie-hist, warning=FALSE, fig.cap="Faceted histogram of genre vs rating"---- -ggplot(data = movies_trimmed, mapping = aes(x = rating)) + - geom_histogram(binwidth = 1, color = "white", fill = "dodgerblue") + - facet_grid(genre ~ .) - -## ------------------------------------------------------------------------ -set.seed(2017) -movies_genre_sample <- movies_trimmed %>% - group_by(genre) %>% - sample_n(34) %>% - ungroup() - -## ----fig.cap="Genre vs rating for our sample"---------------------------- -ggplot(data = movies_genre_sample, aes(x = genre, y = rating)) + - geom_boxplot() - -## ----warning=FALSE, fig.cap="Genre vs rating for our sample as faceted histogram"---- -ggplot(data = movies_genre_sample, mapping = aes(x = rating)) + - geom_histogram(binwidth = 1, color = "white", fill = "dodgerblue") + - facet_grid(genre ~ .) 
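## A minimal sketch (not from the original script) to make the sign of the test
## statistic explicit: the chunk below (like the matching chunk in
## 09-confidence-intervals.R above) computes obs_diff with diff(mean_ratings$mean).
## Since group_by() + summarize() returns the genres alphabetically ("Action" then
## "Romance"), that difference is mean(Romance) - mean(Action); an equivalent,
## more explicit version is:
mean_ratings <- movies_genre_sample %>%
  group_by(genre) %>%
  summarize(mean = mean(rating))
obs_diff <- mean_ratings$mean[mean_ratings$genre == "Romance"] -
  mean_ratings$mean[mean_ratings$genre == "Action"]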
- -## ------------------------------------------------------------------------ -summary_ratings <- movies_genre_sample %>% - group_by(genre) %>% - summarize(mean = mean(rating), - std_dev = sd(rating), - n = n()) -summary_ratings %>% kable() - -## ------------------------------------------------------------------------ -mean_ratings <- movies_genre_sample %>% - group_by(genre) %>% - summarize(mean = mean(rating)) -obs_diff <- diff(mean_ratings$mean) - -## ----message=FALSE, warning=FALSE---------------------------------------- -shuffled_ratings <- #movies_trimmed %>% - movies_genre_sample %>% - mutate(genre = shuffle(genre)) %>% - group_by(genre) %>% - summarize(mean = mean(rating)) -diff(shuffled_ratings$mean) - -## ----include=FALSE------------------------------------------------------- -set.seed(2017) -if(!file.exists("rds/many_shuffles.rds")){ - many_shuffles <- do(5000) * - (movies_genre_sample %>% - mutate(genre = shuffle(genre)) %>% - group_by(genre) %>% - summarize(mean = mean(rating)) - ) - saveRDS(object = many_shuffles, "rds/many_shuffles.rds") -} else { - many_shuffles <- readRDS("rds/many_shuffles.rds") -} - -## ----eval=FALSE---------------------------------------------------------- -## set.seed(2017) -## many_shuffles <- do(5000) * -## (movies_genre_sample %>% -## mutate(genre = shuffle(genre)) %>% -## group_by(genre) %>% -## summarize(mean = mean(rating)) -## ) - -## ------------------------------------------------------------------------ -rand_distn <- many_shuffles %>% - group_by(.index) %>% - summarize(diffmean = diff(mean)) -head(rand_distn, 10) - -## ----fig.cap="Simulated differences in means histogram"------------------ -ggplot(data = rand_distn, aes(x = diffmean)) + - geom_histogram(color = "white", bins = 20) - -## ----fig.cap="Shaded histogram to show p-value"-------------------------- -ggplot(data = rand_distn, aes(x = diffmean, fill = (abs(diffmean) >= obs_diff))) + - geom_histogram(color = "white", bins = 20) - -## ----fig.cap="Histogram with vertical lines corresponding to observed statistic"---- -ggplot(data = rand_distn, aes(x = diffmean)) + - geom_histogram(color = "white", bins = 100) + - geom_vline(xintercept = obs_diff, color = "red") + - geom_vline(xintercept = -obs_diff, color = "red") - -## ------------------------------------------------------------------------ -(pvalue_movies <- rand_distn %>% - filter(abs(diffmean) >= obs_diff) %>% - nrow() / nrow(rand_distn)) - -## ----echo=FALSE---------------------------------------------------------- -ggplot(data.frame(x = c(-4, 4)), aes(x)) + stat_function(fun = dnorm) - -## ----fig.cap="Simulated differences in means histogram"------------------ -ggplot(data = rand_distn, aes(x = diffmean)) + - geom_histogram(color = "white", bins = 20) - -## ------------------------------------------------------------------------ -kable(summary_ratings) - -## ------------------------------------------------------------------------ -s1 <- summary_ratings$std_dev[2] -s2 <- summary_ratings$std_dev[1] -n1 <- summary_ratings$n[2] -n2 <- summary_ratings$n[1] - -## ------------------------------------------------------------------------ -(denom_T <- sqrt( (s1^2 / n1) + (s2^2 / n2) )) - -## ---- fig.cap="Simulated T statistics histogram"------------------------- -rand_distn <- rand_distn %>% - mutate(t_stat = diffmean / denom_T) -ggplot(data = rand_distn, aes(x = t_stat)) + - geom_histogram(color = "white", bins = 20) - -## ------------------------------------------------------------------------ -ggplot(data = rand_distn, mapping = 
aes(x = t_stat)) + - geom_histogram(aes(y = ..density..), color = "white", binwidth = 0.3) + - stat_function(fun = dt, - args = list(df = min(n1 - 1, n2 - 1)), - color = "royalblue", size = 2) - -## ------------------------------------------------------------------------ -(t_obs <- obs_diff / denom_T) - -## ------------------------------------------------------------------------ -ggplot(data = rand_distn, mapping = aes(x = t_stat)) + - stat_function(fun = dt, - args = list(df = min(n1 - 1, n2 - 1)), - color = "royalblue", size = 2) + - geom_vline(xintercept = t_obs, color = "red") + - geom_vline(xintercept = -t_obs, color = "red") - -## ------------------------------------------------------------------------ -pt(t_obs, df = min(n1 - 1, n2 - 1), lower.tail = FALSE) + - pt(-t_obs, df = min(n1 - 1, n2 - 1), lower.tail = TRUE) - -## ----warning=FALSE------------------------------------------------------- -# To ensure the random sample of 50 flights is the same for -# anyone using this code -set.seed(2017) - -# Load Alaska data, deleting rows that have missing departure delay -# or arrival delay data -alaska_flights <- flights %>% - filter(carrier == "AS") %>% - filter(!is.na(dep_delay) & !is.na(arr_delay)) %>% - # Select 50 flights that don't have missing delay data - sample_n(50) - -## ---- echo=FALSE--------------------------------------------------------- -# USED INTERNALLY: Least squares line values, used for in-text output -delay_fit <- lm(formula = arr_delay ~ dep_delay, data = alaska_flights) -intercept <- tidy(delay_fit, conf.int=TRUE)$estimate[1] %>% round(3) -slope <- tidy(delay_fit, conf.int=TRUE)$estimate[2] %>% round(3) -CI_intercept <- c(tidy(delay_fit, conf.int=TRUE)$conf.low[1], tidy(delay_fit, conf.int=TRUE)$conf.high[1]) %>% round(3) -CI_slope <- c(tidy(delay_fit, conf.int=TRUE)$conf.low[2], tidy(delay_fit, conf.int=TRUE)$conf.high[2]) %>% round(3) - -## ------------------------------------------------------------------------ -delay_fit <- lm(formula = arr_delay ~ dep_delay, data = alaska_flights) -(b1_obs <- tidy(delay_fit)$estimate[2]) - -## ---- include=FALSE------------------------------------------------------ -if(!file.exists("rds/rand_slope_distn.rds")){ - rand_slope_distn <- do(5000) * - (lm(formula = arr_delay ~ shuffle(dep_delay), data = alaska_flights) %>% - coef()) - saveRDS(object = rand_slope_distn, "rds/rand_slope_distn.rds") -} else { - rand_slope_distn <- readRDS("rds/rand_slope_distn.rds") -} - -## ----many_shuffles_reg, eval=FALSE--------------------------------------- -## rand_slope_distn <- do(5000) * -## (lm(formula = arr_delay ~ shuffle(dep_delay), data = alaska_flights) %>% -## coef()) - -## ------------------------------------------------------------------------ -names(rand_slope_distn) - -## ------------------------------------------------------------------------ -ggplot(data = rand_slope_distn, mapping = aes(x = dep_delay)) + - geom_histogram(color = "white", bins = 20) - -## ----fig.cap="Shaded histogram to show p-value"-------------------------- -ggplot(data = rand_slope_distn, aes(x = dep_delay, fill = (dep_delay >= b1_obs))) + - geom_histogram(color = "white", bins = 20) - -## ---- echo=FALSE--------------------------------------------------------- -delay_fit <- lm(formula = arr_delay ~ dep_delay, data = alaska_flights) -tidy(delay_fit) %>% - kable() - -## ----echo=FALSE---------------------------------------------------------- -ggplot(data = alaska_flights, - mapping = aes(x = dep_delay, y = arr_delay)) + - geom_point() + - geom_smooth(method = 
"lm", se = FALSE, color = "red") + - annotate("point", x = 44, y = 7, color = "blue", size = 3) + - annotate("segment", x = 44, xend = 44, y = 7, yend = -14.155 + 1.218 * 44, - color = "blue", arrow = arrow(length = unit(0.03, "npc"))) - -## ------------------------------------------------------------------------ -regression_points <- augment(delay_fit) %>% - select(arr_delay, dep_delay, .fitted, .resid) -regression_points %>% - head() %>% - kable() - -## ----resid-histogram----------------------------------------------------- -ggplot(data = regression_points, mapping = aes(x = .resid)) + - geom_histogram(binwidth = 10, color = "white") + - geom_vline(xintercept = 0, color = "blue") - -## ----resid-plot, fig.cap="Fitted versus Residuals plot"------------------ -ggplot(data = regression_points, mapping = aes(x = .fitted, y = .resid)) + - geom_point() + - geom_abline(intercept = 0, slope = 0, color = "blue") - -## ----qqplot1, fig.cap="QQ Plot of residuals"----------------------------- -ggplot(data = regression_points, mapping = aes(sample = .resid)) + - stat_qq() - diff --git a/docs/scripts/11-inference-for-regression.R b/docs/scripts/11-inference-for-regression.R deleted file mode 100644 index 7fc9760b2..000000000 --- a/docs/scripts/11-inference-for-regression.R +++ /dev/null @@ -1,71 +0,0 @@ -## ---- echo=FALSE--------------------------------------------------------- -library(tidyr) - -## ----eval=FALSE---------------------------------------------------------- -## library(ggplot2) -## library(dplyr) -## library(moderndive) -## -## load(url("http://www.openintro.org/stat/data/evals.RData")) -## evals <- evals %>% -## select(score, ethnicity, gender, language, age, bty_avg, rank) - -## ----echo=FALSE---------------------------------------------------------- -library(ggplot2) -library(dplyr) -library(moderndive) -if(!file.exists("data/evals.RData")){ - download.file(url = "http://www.openintro.org/stat/data/evals.RData", - destfile = "data/evals.RData") -} -load(file = "data/evals.RData") -evals <- evals %>% - select(score, ethnicity, gender, language, age, bty_avg, rank) - -## ----model1, echo=FALSE, warning=FALSE, fig.cap="Model 1: no interaction effect included"---- -coeff <- lm(score ~ age + gender, data = evals) %>% coef() %>% as.numeric() -slopes <- evals %>% - group_by(gender) %>% - summarise(min = min(age), max = max(age)) %>% - mutate(intercept = coeff[1]) %>% - mutate(intercept = ifelse(gender == "male", intercept + coeff[3], intercept)) %>% - gather(point, age, -c(gender, intercept)) %>% - mutate(y_hat = intercept + age * coeff[2]) - - ggplot(evals, aes(x = age, y = score, col = gender)) + - geom_jitter() + - labs(x = "Age", y = "Teaching Score", color = "Gender") + - geom_line(data = slopes, aes(y = y_hat), size = 1) - -## ----model2, echo=FALSE, warning=FALSE, fig.cap="Model 2: interaction effect included"---- -ggplot(evals, aes(x = age, y = score, col = gender)) + - geom_jitter() + - labs(x = "Age", y = "Teaching Score", color = "Gender") + - geom_smooth(method = "lm", se = FALSE) - -## ---- eval=FALSE--------------------------------------------------------- -## score_model_2 <- lm(score ~ age + gender, data = evals) -## get_regression_table(score_model_2) - -## ---- echo=FALSE--------------------------------------------------------- -score_model_2 <- lm(score ~ age + gender, data = evals) -get_regression_table(score_model_2) %>% - knitr::kable( - digits = 3, - caption = "Model 1: Regression table with no interaction effect included", - booktabs = TRUE - ) - -## ---- 
eval=FALSE--------------------------------------------------------- -## score_model_3 <- lm(score ~ age * gender, data = evals) -## get_regression_table(score_model_3) - -## ---- echo=FALSE--------------------------------------------------------- -score_model_3 <- lm(score ~ age * gender, data = evals) -get_regression_table(score_model_3) %>% - knitr::kable( - digits = 3, - caption = "Model 2: Regression table with interaction effect included", - booktabs = TRUE - ) - diff --git a/docs/scripts/12-thinking-with-data.R b/docs/scripts/12-thinking-with-data.R deleted file mode 100644 index 8b1378917..000000000 --- a/docs/scripts/12-thinking-with-data.R +++ /dev/null @@ -1 +0,0 @@ - diff --git a/docs/search_index.json b/docs/search_index.json index 1139b1043..ef7abde35 100644 --- a/docs/search_index.json +++ b/docs/search_index.json @@ -1,16 +1,16 @@ [ -["index.html", "An Introduction to Statistical and Data Sciences via R 1 Introduction 1.1 Introduction for students 1.2 Introduction for instructors 1.3 Connect and contribute 1.4 About this book 1.5 About the authors", " An Introduction to Statistical and Data Sciences via R Chester Ismay and Albert Y. Kim February 3, 2018 1 Introduction Help! I’m new to R and RStudio and I need to learn about them! However, I’m completely new to coding! What do I do? If you’re asking yourself this question, then you’ve come to the right place! Start with our Introduction for Students. Are you an instructor hoping to use this book in your courses? Then click here for more information on how to teach with this book. Are you looking to connect with and contribute to ModernDive? Then click here for information on how. Are you curious about the publishing of this book? Then click here for more information on the open-source technology, in particular R Markdown and the bookdown package. This is version 0.3.0 of ModernDive published on February 3, 2018. For previous versions of ModernDive, see Section 1.4. 1.1 Introduction for students This book assumes no prerequisites: no algebra, no calculus, and no prior programming/coding experience. This is intended to be a gentle introduction to the practice of analyzing data and answering questions using data the way data scientists, statisticians, data journalists, and other researchers would. Here is a flowchart of what you’ll cover: We get started with data in Chapter 2: R vs RStudio, coding in R, R packages, and exploring your first real data: all domestic departure flights from a New York City airport in 2013. Then we build up your data science toolbox via the tidyverse, an opinionated collection of R packages designed for data science. Specifically Chapter 3 on data visualization via the ggplot2 package Chapter 4 on the “tidy” data format Chapter 5 on data wrangling via the dplyr package. Equipped with your new data science toolbox, in Chapters 6 and 7 we’ll make your first forays into data modeling using one of the most commonly-used and easy to understand approaches: linear regression. We’ll use regression as a descriptive tool for now and leverage the moderndive accompaniment package to this book to help digest the results. We the proceed to cover topics related to statistical inference, the bread and butter of statistics. To this end, we’ll leverage a new package for tidyverse-friendly inference called infer. 
In particular: Chapter 8 on sampling theory Chapter 9 on confidence intervals (Under construction) Chapter 10 on hypothesis testing (Under construction) (Under construction) After studying simple instances of statistical inference, we revisit the data modeling topics from Chapters 6 and 7 and boost your abilities to interpret the results of regression in Chapter 11 on inference for regression. (Under construction) We’ll end with a discussion on what it means to “think with data” in Chapter 12. 1.1.1 What you will learn from this book We hope that by the end of this book, you’ll have learned How to use R to explore data. How to answer statistical questions using tools like confidence intervals and hypothesis tests. How to effectively create “data stories” using these tools. What do we mean by data stories? We mean any analysis involving data that engages the reader in answering questions with careful visuals and thoughtful discussion, such as How strong is the relationship between per capita income and crime in Chicago neighborhoods? and How many f**ks does Quentin Tarantino give (as measured by the amount of swearing in his films)?. Further discussions on data stories can be found in this Think With Google article. For other examples of data stories constructed by students like yourselves, look at the final projects for two courses that have previously used ModernDive: Middlebury College MATH 116 Introduction to Statistical and Data Sciences using student collected data. Pacific University SOC 301 Social Statistics using data from the fivethirtyeight R package. This book will help you develop your “data science toolbox”, including tools such as data visualization, data formatting, data wrangling, and data modeling using regression. With these tools, you’ll be able to perform the entirety of the “data/science pipeline” while building data communication skills (see Subsection 1.1.2 for more details). In particular, this book will lean heavily on data visualization. In today’s world, we are bombarded with graphics that attempt to convey ideas. We will explore what makes a good graphic and what the standard ways are to convey relationships with data. You’ll also see the use of visualization to introduce concepts like mean, median, standard deviation, distributions, etc. In general, we’ll use visualization as a way of building almost all of the ideas in this book. To impart the statistical lessons in this book, we have intentionally minimized the number of mathematical formulas used and instead have focused on developing a conceptual understanding via data visualization, statistical computing, and simulations. We hope this is a more intuitive experience than the way statistics has traditionally been taught in the past and how it is commonly perceived. Finally, you’ll learn the importance of literate programming. By this we mean you’ll learn how to write code that is useful not just for a computer to execute but also for readers to understand exactly what your analysis is doing and how you did it. This is part of a greater effort to encourage reproducible research (see Subsection 1.1.3 for more details). Hal Abelson coined the phrase that we will follow throughout this book: “Programs must be written for people to read, and only incidentally for machines to execute.” We understand that there may be challenging moments as you learn to program. Both of us continue to struggle and find ourselves often using web searches to find answers and reach out to colleagues for help. 
In the long run though, we all can solve problems faster and more elegantly via programming. We wrote this book as our way to help you get started and you should know that there is a huge community of R users that are always happy to help everyone along as well. This community exists in particular on the internet on various forums and websites such as stackoverflow.com. 1.1.2 Data/science pipeline You may think of statistics as just being a bunch of numbers. We commonly hear the phrase “statistician” when listening to broadcasts of sporting events. Statistics (in particular, data analysis), in addition to describing numbers like with baseball batting averages, plays a vital role in all of the sciences. You’ll commonly hear the phrase “statistically significant” thrown around in the media. You’ll see articles that say “Science now shows that chocolate is good for you.” Underpinning these claims is data analysis. By the end of this book, you’ll be able to better understand whether these claims should be trusted or whether we should be wary. Inside data analysis are many sub-fields that we will discuss throughout this book (though not necessarily in this order): data collection data wrangling data visualization data modeling inference correlation and regression interpretation of results data communication/storytelling These sub-fields are summarized in what Grolemund and Wickham term the “data/science pipeline” in Figure 1.1. Figure 1.1: Data/Science Pipeline We will begin by digging into the gray Understand portion of the cycle with data visualization, then with a discussion on what is meant by tidy data and data wrangling, and then conclude by talking about interpreting and discussing the results of our models via Communication. These steps are vital to any statistical analysis. But why should you care about statistics? “Why did they make me take this class?” There’s a reason so many fields require a statistics course. Scientific knowledge grows through an understanding of statistical significance and data analysis. You needn’t be intimidated by statistics. It’s not the beast that it used to be and, paired with computation, you’ll see how reproducible research in the sciences particularly increases scientific knowledge. 1.1.3 Reproducible research “The most important tool is the mindset, when starting, that the end product will be reproducible.” – Keith Baggerly Another goal of this book is to help readers understand the importance of reproducible analyses. The hope is to get readers into the habit of making their analyses reproducible from the very beginning. This means we’ll be trying to help you build new habits. This will take practice and be difficult at times. You’ll see just why it is so important for you to keep track of your code and well-document it to help yourself later and any potential collaborators as well. Copying and pasting results from one program into a word processor is not the way that efficient and effective scientific research is conducted. It’s much more important for time to be spent on data collection and data analysis and not on copying and pasting plots back and forth across a variety of programs. In a traditional analyses if an error was made with the original data, we’d need to step through the entire process again: recreate the plots and copy and paste all of the new plots and our statistical analysis into your document. This is error prone and a frustrating use of time. 
We’ll see how to use R Markdown to get away from this tedious activity so that we can spend more time doing science. “We are talking about computational reproducibility.” - Yihui Xie Reproducibility means a lot of things in terms of different scientific fields. Are experiments conducted in a way that another researcher could follow the steps and get similar results? In this book, we will focus on what is known as computational reproducibility. This refers to being able to pass all of one’s data analysis, data-sets, and conclusions to someone else and have them get exactly the same results on their machine. This allows for time to be spent interpreting results and considering assumptions instead of the more error prone way of starting from scratch or following a list of steps that may be different from machine to machine. 1.1.4 Final note for students At this point, if you are interested in instructor perspectives on this book, ways to contribute and collaborate, or the technical details of this book’s construction and publishing, then continue with the rest of the chapter below. Otherwise, let’s get started with R and RStudio in Chapter 2! 1.2 Introduction for instructors This book is inspired by the following books: “Mathematical Statistics with Resampling and R” (Chihara and Hesterberg 2011), “OpenIntro: Intro Stat with Randomization and Simulation” (Diez, Barr, and Çetinkaya-Rundel 2014), and “R for Data Science” (Grolemund and Wickham 2016). The first book, while designed for upper-level undergraduates and graduate students, provides an excellent resource on how to use resampling to impart statistical concepts like sampling distributions using computation instead of large-sample approximations and other mathematical formulas. The last two books are free options to learning introductory statistics and data science, providing an alternative to the many traditionally expensive introductory statistics textbooks. When looking over the large number of introductory statistics textbooks that currently exist, we found that there wasn’t one that incorporated many newly developed R packages directly into the text, in particular the many packages included in the tidyverse collection of packages, such as ggplot2, dplyr, tidyr, and broom. Additionally, there wasn’t an open-source and easily reproducible textbook available that exposed new learners all of three of the learning goals listed at the outset of Subsection 1.1.1. 1.2.1 Who is this book for? This book is intended for instructors of traditional introductory statistics classes using RStudio, either the desktop or server version, who would like to inject more data science topics into their syllabus. We assume that students taking the class will have no prior algebra, calculus, nor programming/coding experience. Here are some principles and beliefs we kept in mind while writing this text. If you agree with them, this might be the book for you. Blur the lines between lecture and lab With increased availability and accessibility of laptops and open-source non-proprietary statistical software, the strict dichotomy between lab and lecture can be loosened. It’s much harder for students to understand the importance of using software if they only use it once a week or less. They forget the syntax in much the same way someone learning a foreign language forgets the rules. Frequent reinforcement is key. Focus on the entire data/science research pipeline We believe that the entirety of Grolemund and Wickham’s data/science pipeline should be taught. 
We believe in “minimizing prerequisites to research”: students should be answering questions with data as soon as possible. It’s all about the data We leverage R packages for rich, real, and realistic data-sets that at the same time are easy-to-load into R, such as the nycflights13 and fivethirtyeight packages. We believe that data visualization is a gateway drug for statistics and that the Grammar of Graphics as implemented in the ggplot2 package is the best way to impart such lessons. However, we often hear: “You can’t teach ggplot2 for data visualization in intro stats!” We, like David Robinson, are much more optimistic. dplyr has made data wrangling much more accessible to novices, and hence much more interesting data-sets can be explored. Use simulation/resampling to introduce statistical inference, not probability/mathematical formulas Instead of using formulas, large-sample approximations, and probability tables, we teach statistical concepts using resampling-based inference. This allows for a de-emphasis of traditional probability topics, freeing up room in the syllabus for other topics. Don’t fence off students from the computation pool, throw them in! Computing skills are essential to working with data in the 21st century. Given this fact, we feel that to shield students from computing is to ultimately do them a disservice. We are not teaching a course on coding/programming per se, but rather just enough of the computational and algorithmic thinking necessary for data analysis. Complete reproducibility and customizability We are frustrated when textbooks give examples, but not the source code and the data itself. We give you the source code for all examples as well as the whole book! Ultimately the best textbook is one you’ve written yourself. You know best your audience, their background, and their priorities. You know best your own style and the types of examples and problems you like best. Customization is the ultimate end. For more about how to make this book your own, see About this Book. 1.3 Connect and contribute If you would like to connect with ModernDive, check out the following links: If you would like to receive periodic updates about ModernDive (roughly every 3 months), please sign up for our mailing list. Contact Albert at albert@moderndive.com and Chester chester@moderndive.com We’re on Twitter at ModernDive. If you would like to contribute to ModernDive, there are many ways! Let’s all work together to make this book as great as possible for as many students and instructors as possible! Please let us know if you find any errors, typos, or areas from improvement on our GitHub issues page. If you are familiar with GitHub and would like to contribute more, please see Section 1.4 below. The authors would like to thank Nina Sonneborn, Kristin Bott, and the participants of our USCOTS 2017 workshop for their feedback and suggestions. A special thanks goes to Prof. Yana Weinstein, cognitive psychological scientist and co-founder of The Learning Scientists, for her extensive contributions. 1.4 About this book This book was written using RStudio’s bookdown package by Yihui Xie (Xie 2018). This package simplifies the publishing of books by having all content written in R Markdown. The bookdown/R Markdown source code for all versions of ModernDive is available on GitHub: Latest published version The most up-to-date release: Version 0.3.0 released on February 3, 2018 (source code). 
Available at ModernDive.com Development version The working copy of the next version which is currently being edited: Preview of development version is available at http://moderndive.netlify.com/ Source code: Available on ModernDive’s GitHub repository page Previous versions Older versions that may be out of date: Version 0.2.0 released on August 02, 2017 (source code) Version 0.1.3 released on February 09, 2017 (source code) Version 0.1.2 released on January 22, 2017 (source code) Could this be a new paradigm for textbooks? Instead of the traditional model of textbook companies publishing updated editions of the textbook every few years, we apply a software design influenced model of publishing more easily updated versions. We can then leverage open-source communities of instructors and developers for ideas, tools, resources, and feedback. As such, we welcome your pull requests. Finally, feel free to modify the book as you wish for your own needs, but please list the authors at the top of index.Rmd as “Chester Ismay, Albert Y. Kim, and YOU!” 1.5 About the authors Who we are! Chester Ismay Albert Y. Kim Chester Ismay - Data Science Curriculum Lead, DataCamp. Portland, OR, USA. Email: chester@moderndive.com Webpage: http://ismayc.github.io/ Twitter: old_man_chester GitHub: https://github.com/ismayc Albert Y. Kim - Lecturer of Statistics, Amherst College. Amherst, MA, USA. Email: albert@moderndive.com Webpage: http://rudeboybert.rbind.io/ Twitter: rudeboybert GitHub: https://github.com/rudeboybert "], -["2-getting-started.html", "2 Getting Started with Data in R 2.1 What are R and RStudio? 2.2 How do I code in R? 2.3 What are R packages? 2.4 Explore your first dataset 2.5 Conclusion", " 2 Getting Started with Data in R Before we can start exploring data in R, there are some key concepts to understand first: What are R and RStudio? How do I code in R? What are R packages? If you are already familiar with these concepts, feel free to skip to Section 2.4 below introducing some of the datasets we will explore in depth in this book. Much of this chapter is based on two sources which you should feel free to use as references if you are looking for additional details: Ismay’s Getting used to R, RStudio, and R Markdown (Ismay 2016), which includes video screen recordings that you can follow along and pause as you learn. DataCamp’s online tutorials. DataCamp is a browser-based interactive platform for learning data science and their tutorials will help facilitate your learning of the above concepts (and other topics in this book). Go to DataCamp and create an account before continuing. 2.1 What are R and RStudio? For much of this book, we will assume that you are using R via RStudio. First time users often confuse the two. At its simplest: R is like a car’s engine RStudio is like a car’s dashboard R: Engine RStudio: Dashboard More precisely, R is a programming language that runs computations while RStudio is an integrated development environment (IDE) that provides an interface by adding many convenient features and tools. So the way of having access to a speedometer, rearview mirrors, and a navigation system makes driving much easier, using RStudio’s interface makes using R much easier as well. Optional: For a more in-depth discussion on the difference between R and RStudio IDE, watch this DataCamp video (2m52s). 2.1.1 Installing R and RStudio If your instructor has provided you with a link and access to RStudio Server, then you can skip this section. 
We do recommend though after a few months of working on the RStudio Server that you return to these instructions. If you don’t know what RStudio Server is, then please read this section. You will first need to download and install both R and RStudio (Desktop version) on your computer. Download and install R. Note: You must do this first. Click on the download link corresponding to your computer’s operating system. Download and install RStudio. Scroll down to “Installers for Supported Platforms” Click on the download link corresponding to your computer’s operating system. Optional: If you need more detailed instructions on how to install R and RStudio, watch this DataCamp video (1m22s). 2.1.2 Using R via RStudio Recall our car analogy from above. Much as we don’t drive a car by interacting directly with the engine but rather by using elements on the car’s dashboard, we won’t be using R directly but rather we will use RStudio’s interface. After you install R and RStudio on your computer, you’ll have two new programs AKA applications you can open. We will always work in RStudio and not R. In other words: R: Do not open this RStudio: Open this After you open RStudio, you should see the following: Watch the following DataCamp video (4m10s) to learn about the different panes in RStudio, in particular the Console pane where you will later run R code. 2.2 How do I code in R? Now that you’re set up with R and RStudio, you are probably asking yourself “OK. Now how do I use R?” The first thing to note as that unlike other software like Excel, STATA, or SAS that provide point and click interfaces, R is an interpreted language, meaning you have to enter in R commands written in R code i.e. you have to program in R (we use the terms “coding” and “programming” interchangeably in this book). While it is not required to be a seasoned coder/computer programmer to use R, there is still a set of basic programming concepts that R users need to understand. Consequently, while this book is not a book on programming, you will still learn just enough of these basic programming concepts needed to explore and analyze data effectively. 2.2.1 Basic programming concepts and terminology To introduce you to many of these basic programming concepts and terminology, we direct you to the following DataCamp online interactive tutorials. For each of the tutorials, we give a list of the basic programming concepts covered. Note that in this book, we will use a different font to distinguish regular font from computer_code. It is important to note that while these tutorials serve as excellent introductions, a single pass through them is insufficient for long-term learning and retention. The ultimate tools for long-term learning and retention are “learning by doing” and repetition, something we will have you do over the course of the entire book and we encourage this process as much as possible as you learn any new skill. From the Introduction to R course complete the following chapters. As you work through the chapters, carefully note the important terms and what they are used for. We recommend you do so in a notebook that you can easily refer back to. Chapter 1 Intro to basics: Console pane: where you enter in commands Objects: where values are saved, how to assign values to objects. Data types: integers, doubles/numerics, logicals, characters. Chapter 2 Vectors: Vectors: a series of values. Chapter 4 Factors: Categorical data (as opposed to numerical data) are represented in R as factors. 
Chapter 5 Data frames: Data frames are analogous to rectangular spreadsheets: they are representations of datasets in R where the rows correspond to observations and the columns correspond to variables that describe the observations. We will revisit this later in Section 2.4. From the Intermediate R course complete the following chapters: Chapter 1 Conditionals and Control Flow: Testing for equality in R using == (and not = which is typically used for assignment). Ex: 2 + 1 == 3 compares 2 + 1 to 3 and is correct R syntax, while 2 + 1 = 3 is not and is incorrect R syntax. Boolean algebra: TRUE/FALSE statements and mathematical operators such as < (less than), <= (less than or equal), and != (not equal to). Logical operators: & representing "and", | representing "or". Ex: (2 + 1 == 3) & (2 + 1 == 4) returns FALSE while (2 + 1 == 3) | (2 + 1 == 4) returns TRUE. Chapter 3 Functions: Concept of functions: they take in inputs (called arguments) and return outputs. You either manually specify a function's arguments or use the function's defaults. This list is by no means an exhaustive list of all the programming concepts and terminology needed to become a savvy R user; such a list would be so large it wouldn't be very useful, especially for novices. Rather, we feel this is the bare minimum you need to know before you get started; the rest we feel you can learn as you go. Remember that your knowledge of all of these concepts will build as you get better and better at "speaking R" and getting used to its syntax. 2.2.2 Tips on learning to code Learning to code/program is very much like learning a foreign language: it can be very daunting and frustrating at first. However just as with learning a foreign language, if you put in the effort and are not afraid to make mistakes, anybody can learn. Lastly, there are a few useful things to keep in mind as you learn to program: Computers are stupid: You have to tell a computer everything it needs to do. Furthermore, your instructions can't have any mistakes in them, nor can they be ambiguous in any way. Take the "copy/paste/tweak" approach: Especially when learning your first programming language, it is often much easier to take existing code that you know works and modify it to suit your ends, rather than trying to write new code from scratch. We call this the copy/paste/tweak approach. So early on, we suggest not trying to code from scratch, but please take the code we provide throughout this book and play around with it! Practice is key: Just as the only solution to improving your foreign language skills is practice, so also the only way to get better at R is through practice. Don't worry however, we'll give you plenty of opportunities to practice! 2.3 What are R packages? Another point of confusion with new R users is the notion of a package. R packages extend the functionality of R by providing additional functions, data, and documentation and can be downloaded for free from the internet. They are written by a world-wide community of R users. For example, among the many packages we will use in this book are the ggplot2 package for data visualization in Chapter 3 dplyr package for data wrangling in Chapter 5 There are two key things to remember about R packages: Installation: Most packages are not installed by default when you install R and RStudio. You need to install a package before you can use it. Once you've installed it, you likely don't need to install it again unless you want to update it to a newer version of the package. 
Loading: Packages are not loaded automatically when you open RStudio. You need to load them everytime you open RStudio using the library() command. A good analogy for R packages is they are like apps you can download onto a mobile phone: R: A new phone R Packages: Apps you can download So, expanding on this analogy a bit: R is like a new mobile phone. It has a certain amount of functionality when you use it for the first time, but it doesn’t have everything. R packages are like the apps you can download onto your phone, much like those offered in the App Store and Google Play. For example: Instagram. In order to use a package, just like in order to use Instagram, you must: First download it and install it. You do this only once. Load it, or in other words, “open” it, using the library() command. So just as you can only start sharing photos with your friends on Instagram if you first install the app and then open it, you can only access an R package’s data and functions if you first install the package and then load it with the library() command. Let’s cover these two steps: 2.3.1 Package installation (Note that if you are working on an RStudio Server, you probably will not need to install your own packages as that has been already done for you. Still it is important that you know this process for later when you are not using the RStudio Server but rather your own installation of RStudio Desktop.) There are two ways to install an R package. For example, to install the ggplot2 package: Easy way: In the Files pane of RStudio: Click on the “Packages” tab Click on “Install” Type the name of the package under “Packages (separate multiple with space or comma):” In this case, type ggplot2 Click “Install” Alternative way: In the Console pane run install.packages("ggplot2") (you must include the quotation marks). Repeat this for the dplyr and nycflights13 packages. Note: You only have to install a package once, unless you want to update an already installed package to the latest version. If you want to update a package to the latest version, then re-install it by repeating the above steps. 2.3.2 Package loading After you’ve installed a package, you can now load it using the library() command. For example, to load the ggplot2 and dplyr packages, run the following code in the Console pane: library(ggplot2) library(dplyr) Note: You have to reload each package you want to use every time you open a new session of RStudio. This is a little annoying to get used to and will be your most common error as you begin. When you see an error such as Error: could not find function remember that this likely comes from you trying to use a function in a package that has not been loaded. Remember to run the library() function with the appropriate package to fix this error. 2.4 Explore your first dataset Let’s put everything we’ve learned so far into practice and start exploring some real data! Data comes to us in a variety of formats, from pictures to text to numbers. Throughout this book, we’ll focus on datasets that can be stored in a spreadsheet as that is among the most common way data is collected in the many fields. Remember from Subsection 2.2.1 that these “spreadsheet”-type datasets are called data frames in R and we will focus on working with data frames throughout this book. Let’s first load all the packages needed for this chapter (This assumes you’ve already installed them. Read Section 2.3 for information on how to install and load R packages if you haven’t already.) 
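To tie the installation and loading steps together, here is a minimal sketch of the install-once, load-every-session workflow; the nycflights13 package is used purely as an example and any package name could be substituted:

# Installation: run once per computer (or again only when updating to a newer version)
install.packages("nycflights13")
# Loading: run at the start of every new RStudio session in which you need the package
library(nycflights13)
# Skipping the library() step is what leads to errors such as
# "Error: could not find function" when you later try to use the package.
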
At the beginning of all subsequent chapters in this text, we’ll always have a list of packages similar to what follows that you should have installed and loaded to work with that chapter’s R code. library(dplyr) # Be sure to install these first! library(nycflights13) library(knitr) 2.4.1 nycflights13 package We likely have all flown on airplanes or know someone who has. Air travel has become an ever-present aspect in many people’s lives. If you live in or are visiting a relatively large city and you walk around that city’s airport, you see gates showing flight information from many different airlines. And you will frequently see that some flights are delayed because of a variety of conditions. Are there ways that we can avoid having to deal with these flight delays? We’d all like to arrive at our destinations on time whenever possible. (Unless you secretly love hanging out at airports. If you are one of these people, pretend for the moment that you are very much anticipating being at your final destination.) Throughout this book, we’re going to analyze data related to flights contained in the nycflights13 package (Wickham 2017). Specifically, this package contains five datasets saved as “data frames” (see Section 2.2) with information about all domestic flights departing from New York City in 2013, from either Newark Liberty International (EWR), John F. Kennedy International (JFK), or LaGuardia (LGA) airports: flights: information on all 336,776 flights airlines: translation between two letter IATA carrier codes and names (16 in total) planes: construction information about each of 3,322 planes used weather: hourly meteorological data (about 8710 observations) for each of the three NYC airports airports: airport names and locations 2.4.2 flights data frame We will begin by exploring the flights data frame that is included in the nycflights13 package and getting an idea of its structure. Run the following in your code in your console: it loads in the flights dataset into your Console. Note depending on the size of your monitor, the output may vary slightly. flights # A tibble: 336,776 x 19 year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time <int> <int> <int> <int> <int> <dbl> <int> <int> 1 2013 1 1 517 515 2.00 830 819 2 2013 1 1 533 529 4.00 850 830 3 2013 1 1 542 540 2.00 923 850 4 2013 1 1 544 545 -1.00 1004 1022 5 2013 1 1 554 600 -6.00 812 837 6 2013 1 1 554 558 -4.00 740 728 7 2013 1 1 555 600 -5.00 913 854 8 2013 1 1 557 600 -3.00 709 723 9 2013 1 1 557 600 -3.00 838 846 10 2013 1 1 558 600 -2.00 753 745 # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>, # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>, # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm> Let’s unpack this output: A tibble: 336,776 x 19: a tibble is a kind of data frame. This particular data frame has 336,776 rows 19 columns corresponding to 19 variables describing each observation year month day dep_time sched_dep_time dep_delay arr_time are different columns, in other words variables, of this data frame. We then have the first 10 rows of observations corresponding to 10 flights. ... with 336,766 more rows, and 11 more variables: indicating to us that 336,766 more rows of data and 11 more variables could not fit in this screen. Unfortunately, this output does not allow us to explore the data very well. Let’s look at different tools to explore data frames. 
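As a quick optional aside not covered in the text, a few base R helpers can confirm the dimensions reported in the tibble header above; a minimal sketch, assuming the nycflights13 package is loaded:

library(nycflights13)
dim(flights)     # number of rows and columns: 336776 and 19, matching "A tibble: 336,776 x 19"
nrow(flights)    # 336776 rows, one per flight
ncol(flights)    # 19 columns, one per variable
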
2.4.3 Exploring data frames Among the many ways of getting a feel for the data contained in a data frame such as flights, we present three functions that take as their argument the data frame in question: Using the View() function built for use in RStudio. We will use this the most. Using the glimpse() function loaded via dplyr package Using the kable() function in the knitr package Using the $ operator to view a single variable in a data frame 1. View(): Run View(flights) in your Console in RStudio and explore this data frame in the resulting pop-up viewer. You should get into the habit of always Viewing any data frames that come your way. Note the capital “V” in View. R is case-sensitive so you’ll receive an error is you run view(flights) instead of View(flights). Learning check (LC2.1) What does any ONE row in this flights dataset refer to? A. Data on an airline B. Data on a flight C. Data on an airport D. Data on multiple flights By running View(flights), we see the different variables listed in the columns and we see that there are different types of variables. Some of the variables like distance, day, and arr_delay are what we will call quantitative variables. These variables are numerical in nature. Other variables here are categorical. Note that if you look in the leftmost column of the View(flights) output, you will see a column of numbers. These are the row numbers of the dataset. If you glance across a row with the same number, say row 5, you can get an idea of what each row corresponds to. In other words, this will allow you to identify what object is being referred to in a given row. This is often called the observational unit. The observational unit in this example is an individual flight departing New York City in 2013. You can identify the observational unit by determining what the thing is that is being measured in each of the variables. 2. glimpse(): The second way to explore a data frame is using the glimpse() function that you can access after you’ve loaded the dplyr package. It provides us with much of the above information and more. glimpse(flights) Observations: 336,776 Variables: 19 $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013... $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1... $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1... $ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 55... $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 60... $ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2,... $ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 8... $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 8... $ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7,... $ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6"... $ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301... $ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N... $ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LG... $ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IA... $ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149... $ distance <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 73... $ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6... $ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59... $ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-0... 
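As a small supplement not shown in the text, the two ideas above can be combined: dplyr's slice() pulls out a single row, say row 5, so that glimpse() displays one observational unit (a single flight) on its own:

library(dplyr)
library(nycflights13)
# Inspect only the flight in row 5 of the flights data frame
flights %>%
  slice(5) %>%
  glimpse()
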
Learning check (LC2.2) What are some examples in this dataset of categorical variables? What makes them different than quantitative variables? (LC2.3) What does int, dbl, and chr mean in the output above? We see that glimpse will give you the first few entries of each variable in a row after the variable. In addition, the data type (See Subsection 2.2.1) of the variable is given immediately after each variable's name inside < >. Here, int and dbl refer to quantitative variables. In contrast, chr refers to categorical variables. One more type of variable is given here with the time_hour variable: dttm. As you may suspect, this variable corresponds to a specific date and time of day. 3. kable(): The final way to explore the entirety of a data frame is using the kable() function from the knitr package. Let's explore the different carrier codes for all the airlines in our dataset two ways. Run both of these in your Console: airlines kable(airlines) At first glance of both outputs, it may not appear that there is much difference. However, we'll see later on, especially when using a tool for document production called R Markdown, that the latter produces output that is much more legible. 4. $ operator Lastly, the $ operator allows us to explore a single variable within a data frame. For example, run the following in your console airlines airlines$name We used the $ operator to extract only the name variable and return it as a vector of length 16. We will only be occasionally exploring data frames using this operator. 2.4.4 Help files Another nice feature of R is the help system. You can get help in R by entering a ? before the name of a function or data frame in question and you will be presented with a page showing the documentation. For example, let's look at the help file for the flights data frame: ?flights A help file should pop up in the Help pane of RStudio. Note the content of this particular help file is also accessible on the web on page 3 of the PDF document. You should get in the habit of consulting the help file of any function or data frame in R about which you have questions. 2.5 Conclusion We've given you what we feel are the most essential concepts to know before you can start exploring data in R. Is this chapter exhaustive? Absolutely not. To try to include everything in this chapter would make the chapter so large it wouldn't be useful! However, as we stated earlier, the best way to learn R is to learn by doing. Now let's get into learning about how to create good stories about and with data. In Chapter 3, we start with what we feel is the most important tool in a data scientist's toolbox: data visualization. 2.5.1 What's to come? We'll now start the "data science" portion of the book in Chapter 3, where we will further explore the datasets included in the nycflights13 package. We'll see that data visualization is a powerful tool to add to our toolbox for exploring what is going on in a dataset beyond the View and glimpse functions we introduced in this chapter. "], -["3-viz.html", "3 Data Visualization via ggplot2 3.1 The Grammar of Graphics 3.2 Five Named Graphs - The 5NG 3.3 5NG#1: Scatterplots 3.4 5NG#2: Linegraphs 3.5 5NG#3: Histograms 3.6 Facets 3.7 5NG#4: Boxplots 3.8 5NG#5: Barplots 3.9 Conclusion", " 3 Data Visualization via ggplot2 We begin the development of your data science toolbox with data visualization. By visualizing our data, we will be able to gain valuable insights from our data that we couldn't initially see from just looking at the raw data in spreadsheet form. 
We will use the ggplot2 package as it provides an easy way to customize your plots and is rooted in the data visualization theory known as The Grammar of Graphics (Wilkinson 2005). At the most basic level, graphics/plots/charts (we use these terms interchangeably in this book) provide a nice way for us to get a sense for how quantitative variables compare in terms of their center (where the values tend to be located) and their spread (how they vary around the center). The most important thing to know about graphics is that they should be created to make it obvious for your audience to understand the findings and insight you want to get across. This does however require a balancing act. On the one hand, you want to highlight as many meaningful relationships and interesting findings as possible, but on the other you don’t want to include so many as to overwhelm your audience. As we will see, plots/graphics also help us to identify patterns and outliers in our data. We will see that a common extension of these ideas is to compare the distribution of one quantitative variable (i.e., what the spread of a variable looks like or how the variable is distributed in terms of its values) as we go across the levels of a different categorical variable. Needed packages Let’s load all the packages needed for this chapter (this assumes you’ve already installed them). Read Section 2.3 for information on how to install and load R packages. library(nycflights13) library(ggplot2) library(dplyr) library(knitr) 3.1 The Grammar of Graphics We begin with a discussion of a theoretical framework for data visualization known as the “The Grammar of Graphics,” which serves as the basis for the ggplot2 package. Much like how we construct sentences in any language by using a linguistic grammar (nouns, verbs, subjects, objects, etc.), the theoretical framework given by Leland Wilkinson (Wilkinson 2005) allows us to specify the components of a statistical graphic. 3.1.1 Components of the Grammar In short, the grammar tells us that: A statistical graphic is a mapping of data variables to aesthetic attributes of geometric objects. Specifically, we can break a graphic into the following three essential components: data: the data-set comprised of variables that we map. geom: the geometric object in question. This refers to our type of objects we can observe in our plot. For example, points, lines, bars, etc. aes: aesthetic attributes of the geometric object that we can perceive on a graphic. For example, x/y position, color, shape, and size. Each assigned aesthetic attribute can be mapped to a variable in our data-set. Let’s break down the grammar with an example. 3.1.2 Gapminder In February 2006, a statistician named Hans Rosling gave a TED talk titled “The best stats you’ve ever seen” where he presented global economic, health, and development data from the website gapminder.org. For example, from the 1704 countries included from 2007, consider only the first 6 countries when listed alphabetically: Table 3.1: Gapminder 2007 Data: First 6 of 142 countries Country Continent Life Expectancy Population GDP per Capita Afghanistan Asia 43.83 31889923 974.6 Albania Europe 76.42 3600523 5937.0 Algeria Africa 72.30 33333216 6223.4 Angola Africa 42.73 12420476 4797.2 Argentina Americas 75.32 40301927 12779.4 Australia Oceania 81.23 20434176 34435.4 Each row in this table corresponds to a country in 2007. For each row, we have 5 columns: Country: Name of country. Continent: Which of the five continents the country is part of. 
(Note that Americas groups North and South America and that Antarctica is excluded here.) Life Expectancy: Life expectancy in years. Population: Number of people living in the country. GDP per Capita: Gross domestic product (in US dollars). Now consider Figure 3.1, which plots this data for all 142 countries in the data frame. Note that R will deal with large numbers using scientific notation. So in the legend for “Population”, 1.25e+09 = \\(1.25 \\times 10^{9}\\) = 1,250,000,000 = 1.25 billion. Figure 3.1: Life Expectancy over GDP per Capita in 2007 Let’s view this plot through the grammar of graphics: The data variable GDP per Capita gets mapped to the x-position aesthetic of the points. The data variable Life Expectancy gets mapped to the y-position aesthetic of the points. The data variable Population gets mapped to the size aesthetic of the points. The data variable Continent gets mapped to the color aesthetic of the points. Recall that data here corresponds to each of the variables being in the same data frame and the “data variable” corresponds to a column in a data frame. While in this example we are considering one type of geometric object (of type point), graphics are not limited to just points. Some plots involve lines while others involve bars. Let’s summarize the three essential components of the grammar in a table: Table 3.2: Summary of Grammar of Graphics for this plot data variable aes geom GDP per Capita x point Life Expectancy y point Population size point Continent color point 3.1.3 Other components of the Grammar There are other components of the Grammar of Graphics we can control. As you start to delve deeper into the Grammar of Graphics, you’ll start to encounter these topics more and more often. In this book, we’ll only work with the two other components below (The other components are left to a more advanced text such as R for Data Science (Grolemund and Wickham 2016)): faceting breaks up a plot into small multiples corresponding to the levels of another variable (Section 3.6) position adjustments for barplots (Section 3.8) In general, the Grammar of Graphics allows for a high degree of customization and also a consistent framework for easy updating/modification of plots. 3.1.4 The ggplot2 package In this book, we will be using the ggplot2 package for data visualization, which is an implementation of the Grammar of Graphics for R (Wickham and Chang 2018). You may have noticed that a lot of the previous text in this chapter is written in computer font. This is because the various components of the Grammar of Graphics are specified in the ggplot function, which expects at a bare minimum as arguments: The data frame where the variables exist: the data argument The mapping of the variables to aesthetic attributes: the mapping argument, which specifies the aesthetic attributes involved After we’ve specified these components, we then add layers to the plot using the + sign. The most essential layer to add to a plot is the specification of which type of geometric object we want the plot to involve; e.g. points, lines, bars. Other layers we can add include the specification of the plot title, axes labels, facets, and visual themes for the plot. Let’s now put the theory of the Grammar of Graphics into practice. 3.2 Five Named Graphs - The 5NG For our purposes, we will be limiting consideration to five different types of graphs. 
We term these five named graphs the 5NG: scatterplots linegraphs boxplots histograms barplots We will discuss some variations of these plots, but with this basic repertoire in your toolbox you can visualize a wide array of different data variable types. Note that certain plots are only appropriate for categorical/logical variables and others only for quantitative variables. You’ll want to quiz yourself often as we go along on which plot makes sense given a particular problem or data-set. 3.3 5NG#1: Scatterplots The simplest of the 5NG are scatterplots (also called bivariate plots); they allow you to investigate the relationship between two numerical variables. While you may already be familiar with this type of plot, let’s view it through the lens of the Grammar of Graphics. Specifically, we will graphically investigate the relationship between the following two numerical variables in the flights data frame: dep_delay: departure delay on the horizontal “x” axis and arr_delay: arrival delay on the vertical “y” axis for Alaska Airlines flights leaving NYC in 2013. This requires paring down the flights data frame to a smaller data frame all_alaska_flights consisting of only Alaska Airlines (carrier code “AS”) flights. Don’t worry for now if you don’t fully understand what this code is doing; we’ll explain it in detail in Chapter 5. For now, just run it all and understand that we are taking all flights and only considering those corresponding to Alaska Airlines. all_alaska_flights <- flights %>% filter(carrier == "AS") This code snippet makes use of functions in the dplyr package for data wrangling to achieve our goal: it takes the flights data frame and filters it to only return the rows that meet the condition carrier == "AS". Recall from Section 2.2 that testing for equality is specified with == and not =. You will see many more examples of == and filter() in Chapter 5. Learning check (LC3.1) Take a look at both the flights and all_alaska_flights data frames by running View(flights) and View(all_alaska_flights) in the console. In what respect do these data frames differ? 3.3.1 Scatterplots via geom_point We proceed to create the scatterplot using the ggplot() function: ggplot(data = all_alaska_flights, mapping = aes(x = dep_delay, y = arr_delay)) + geom_point() Figure 3.2: Arrival Delays vs Departure Delays for Alaska Airlines flights from NYC in 2013 In Figure 3.2 we see that a positive relationship exists between dep_delay and arr_delay: as departure delays increase, arrival delays tend to also increase. We also note that the majority of points fall near the point (0, 0). There is a large mass of points clustered there. Let’s break this down, keeping in mind our discussion in Section 3.1: Within the ggplot() function call, we specify two of the components of the grammar: The data frame to be all_alaska_flights by setting data = all_alaska_flights The aesthetic mapping by setting aes(x = dep_delay, y = arr_delay). Specifically the variable dep_delay maps to the x position aesthetic the variable arr_delay maps to the y position aesthetic We add a layer to the ggplot() function call using the + sign. The layer in question specifies the third component of the grammar: the geometric object. In this case the geometric objects are points, set by specifying geom_point(). Some notes on layers: Note that the + sign comes at the end of lines, and not at the beginning. You’ll get an error in R if you put it at the beginning. 
When adding layers to a plot, you are encouraged to hit Return on your keyboard after entering the + so that the code for each layer is on a new line. As we add more and more layers to plots, you’ll see this will greatly improve the legibility of your code. To stress the importance of adding layers, in particular the layer specifying the geometric object, consider Figure 3.3 where no layers are added. A not very useful plot! ggplot(data = all_alaska_flights, mapping = aes(x = dep_delay, y = arr_delay)) Figure 3.3: Plot with No Layers Learning check (LC3.2) What are some practical reasons why dep_delay and arr_delay have a positive relationship? (LC3.3) What variables (not necessarily in the flights data frame) would you expect to have a negative correlation (i.e. a negative relationship) with dep_delay? Why? Remember that we are focusing on numerical variables here. (LC3.4) Why do you believe there is a cluster of points near (0, 0)? What does (0, 0) correspond to in terms of the Alaskan flights? (LC3.5) What are some other features of the plot that stand out to you? (LC3.6) Create a new scatterplot using different variables in the all_alaska_flights data frame by modifying the example above. 3.3.2 Over-plotting The large mass of points near (0, 0) in Figure 3.2 can cause some confusion. This is the result of a phenomenon called overplotting. As one may guess, this corresponds to values being plotted on top of each other over and over again. It is often difficult to know just how many values are plotted in this way when looking at a basic scatterplot as we have here. There are two ways to address this issue: By adjusting the transparency of the points via the alpha argument By jittering the points via geom_jitter() The first way of relieving overplotting is by changing the alpha argument in geom_point() which controls the transparency of the points. By default, this value is set to 1. We can change this to any value between 0 and 1 where 0 sets the points to be 100% transparent and 1 sets the points to be 100% opaque. Note how the following function call is identical to the one in Section 3.3, but with alpha = 0.2 added to the geom_point(). ggplot(data = all_alaska_flights, mapping = aes(x = dep_delay, y = arr_delay)) + geom_point(alpha = 0.2) Figure 3.4: Delay scatterplot with alpha=0.2 The key feature to note in Figure 3.4 is that the transparency of the points is cumulative: areas with a high-degree of overplotting are darker, whereas areas with a lower degree are less dark. Note that there is no aes() surrounding alpha = 0.2 here. Since we are NOT mapping a variable to an aesthetic but instead are just changing a setting, we don’t need to create a mapping with aes(). In fact, you’ll receive an error if you try to change the second line above to geom_point(aes(alpha = 0.2)). The second way of relieving overplotting is to jitter the points a bit. In other words, we are going to add just a bit of random noise to the points to better see them and remove some of the overplotting. You can think of “jittering” as shaking the points around a bit on the plot. Instead of using geom_point, we use geom_jitter to perform this shaking. To specify how much jitter to add, we adjust the width and height arguments. This corresponds to how hard you’d like to shake the plot in units corresponding to those for both the horizontal and vertical variables (in this case, minutes). 
ggplot(data = all_alaska_flights, mapping = aes(x = dep_delay, y = arr_delay)) + geom_jitter(width = 30, height = 30) Figure 3.5: Jittered delay scatterplot Note how this function call is identical to the one in Subsection 3.3.1, but with geom_point() replaced with geom_jitter(). The plot in Figure 3.5 helps us a little bit in getting a sense for the overplotting, but with a relatively large data-set like this one (714 flights), it can be argued that changing the transparency of the points by setting alpha proved more effective. We’ll see later on that the two following R commands will yield the exact same plot: ggplot(data = all_alaska_flights, mapping = aes(x = dep_delay, y = arr_delay)) + geom_jitter(width = 30, height = 30) ggplot(all_alaska_flights, aes(x = dep_delay, y = arr_delay)) + geom_jitter(width = 30, height = 30) In other words, you can drop the data = and mapping = if you keep the order of the two arguments the same. Since the ggplot() function is expecting its first argument data to be a data frame and its second argument to correspond to mapping =, you can omit both and you’ll get the same plot. As you get more and more practice, you’ll likely find yourself not including the argument names like this. But for now, to keep things straightforward, let’s make it a point to include the data = and mapping =. Learning check (LC3.7) Why is setting the alpha argument value useful with scatterplots? What further information does it give you that a regular scatterplot cannot? (LC3.8) After viewing Figure 3.4 above, give an approximate range of arrival delays and departure delays that occur the most frequently. How has that region changed compared to when you observed the same plot without the alpha = 0.2 set in Figure 3.2? 3.3.3 Summary Scatterplots display the relationship between two numerical variables. They are among the most commonly used plots because they can provide an immediate way to see the trend in one variable versus another. However, if you try to create a scatterplot where either one of the two variables is not numerical, you will get strange results. Be careful! With medium to large data-sets, you may need to play with either geom_jitter() or the alpha argument in order to get a good feel for relationships in your data. This tweaking is often a fun part of data visualization since you’ll have the chance to see different relationships come about as you make subtle changes to your plots. 3.4 5NG#2: Linegraphs The next of the 5NG is the linegraph. Linegraphs are most frequently used when the x-axis represents time and the y-axis represents some other numerical variable; such plots are known as time series. Time is a variable that connects observations in sequence, with each day following the previous one. In other words, time has a natural ordering. Linegraphs should be avoided when there is not a clear sequential ordering to the explanatory variable, i.e. the x-variable or the predictor variable. Our focus now turns to the temp variable in the weather data-set. By looking over the weather data-set (type View(weather) in the console) and running ?weather to bring up the help file, we can see that the temp variable corresponds to hourly temperature (in Fahrenheit) recordings at weather stations near airports in New York City. Instead of considering all hours in 2013 for all three airports in NYC, let’s focus on the hourly temperature at Newark airport (origin code “EWR”) for the first 15 days in January 2013. 
The weather data frame in the nycflights13 package contains this data, but we first need to filter it to only include those rows that correspond to Newark in the first 15 days of January. early_january_weather <- weather %>% filter(origin == "EWR" & month == 1 & day <= 15) This is similar to the previous use of the filter command in Section 3.3, however we now use the & operator. The above selects only those rows in weather where the originating airport is "EWR" and we are in the first month and the day is from 1 to 15 inclusive. Learning check (LC3.9) Take a look at both the weather and early_january_weather data frames by running View(weather) and View(early_january_weather) in the console. In what respect do these data frames differ? (LC3.10) View() the flights data frame again. Why does the time_hour variable uniquely identify the hour of the measurement whereas the hour variable does not? 3.4.1 Linegraphs via geom_line We plot a linegraph of hourly temperature using geom_line(): ggplot(data = early_january_weather, mapping = aes(x = time_hour, y = temp)) + geom_line() Figure 3.6: Hourly Temperature in Newark for January 1-15, 2013 Much as with the ggplot() call in Chapter 3.3.1, we describe the components of the Grammar of Graphics: Within the ggplot() function call, we specify two of the components of the grammar: The data frame to be early_january_weather by setting data = early_january_weather The aesthetic mapping by setting aes(x = time_hour, y = temp). Specifically time_hour (i.e. the time variable) maps to the x position temp maps to the y position We add a layer to the ggplot() function call using the + sign The layer in question specifies the third component of the grammar: the geometric object in question. In this case the geometric object is a line, set by specifying geom_line(). Learning check (LC3.11) Why should linegraphs be avoided when there is not a clear ordering of the horizontal axis? (LC3.12) Why are linegraphs frequently used when time is the explanatory variable? (LC3.13) Plot a time series of a variable other than temp for Newark Airport in the first 15 days of January 2013. 3.4.2 Summary Linegraphs, just like scatterplots, display the relationship between two numerical variables. However, the variable on the x-axis (i.e. the explanatory variable) should have a natural ordering, like some notion of time. We can mislead our audience if that isn’t the case. 3.5 5NG#3: Histograms Let’s consider the temp variable in the weather data frame once again, but now unlike with the linegraphs in Chapter 3.4, let’s say we don’t care about the relationship of temperature to time, but rather we care about the (statistical) distribution of temperatures. We could just produce points where each of the different values appear on something similar to a number line: Figure 3.7: Plot of Hourly Temperature Recordings from NYC in 2013 This gives us a general idea of how the values of temp differ. We see that temperatures vary from around 11 up to 100 degrees Fahrenheit. The area between 40 and 60 degrees appears to have more points plotted than outside that range. 3.5.1 Histograms via geom_histogram What is commonly produced instead of the above plot is a plot known as a histogram. The histogram shows how many elements of a single numerical variable fall in specified bins. In this case, these bins may correspond to between 0-10°F, 10-20°F, etc. 
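Before plotting, it may help to see exactly what a histogram tabulates. The following is a rough sketch, not code from the book: it counts how many hourly temp recordings fall into 10-degree-wide bins, with the bin edges and the use of dplyr’s count() function chosen purely for illustration:
library(dplyr)
library(nycflights13)
# Assign each temperature to a 10-degree bin, then count how many recordings fall in each
weather %>% 
  mutate(temp_bin = cut(temp, breaks = seq(0, 110, by = 10))) %>% 
  count(temp_bin)
A histogram simply draws one bar per bin, with the bar’s height given by these counts.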
We produce a histogram of the hourly temperatures at all three NYC airports in 2013: ggplot(data = weather, mapping = aes(x = temp)) + geom_histogram() `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. Warning: Removed 1 rows containing non-finite values (stat_bin). Figure 3.8: Histogram of Hourly Temperature Recordings from NYC in 2013 Note here: There is only one variable being mapped in aes(): the single numerical variable temp. You don’t need to compute the y-aesthetic: it gets computed automatically. We set the geometric object to be geom_histogram() We got a warning message that 1 row containing non-finite values was removed. This is due to one of the values of temperature being missing. R is alerting us that this happened. The other message encourages us to specify the number of bins we’d like to create. 3.5.2 Adjusting the bins We can adjust characteristics of the bins in one of two ways: By adjusting the number of bins via the bins argument By adjusting the width of the bins via the binwidth argument First, we have the power to specify how many bins we would like to put the data into as an argument in the geom_histogram() function. By default, this is chosen to be 30 somewhat arbitrarily; we have received a warning above our plot that this was done. ggplot(data = weather, mapping = aes(x = temp)) + geom_histogram(bins = 60, color = "white") Figure 3.9: Histogram of Hourly Temperature Recordings from NYC in 2013 - 60 Bins Note the addition of the color argument. If you’d like to be able to more easily differentiate each of the bins, you can specify the color of the outline as done above. You can also adjust the color of the bars by setting the fill argument. Type colors() in your console to see all 657 available colors. ggplot(data = weather, mapping = aes(x = temp)) + geom_histogram(bins = 60, color = "white", fill = "steelblue") Figure 3.10: Histogram of Hourly Temperature Recordings from NYC in 2013 - 60 Colored Bins Second, instead of specifying the number of bins, we can also specify the width of the bins by using the binwidth argument in the geom_histogram function. ggplot(data = weather, mapping = aes(x = temp)) + geom_histogram(binwidth = 10, color = "white") Figure 3.11: Histogram of Hourly Temperature Recordings from NYC in 2013 - Binwidth = 10 Learning check (LC3.14) What does changing the number of bins from 30 to 60 tell us about the distribution of temperatures? (LC3.15) Would you classify the distribution of temperatures as symmetric or skewed? (LC3.16) What would you guess is the “center” value in this distribution? Why did you make that choice? (LC3.17) Is this data spread out greatly from the center or is it close? Why? 3.5.3 Summary Histograms, unlike scatterplots and linegraphs, present information on only a single numerical variable. In particular they are visualizations of the (statistical) distribution of values. 3.6 Facets Before continuing with the 5NG, we briefly introduce a new concept called faceting. Faceting is used when we’d like to create small multiples of the same plot over a different categorical variable. By default, all of the small multiples will have the same vertical axis. For example, suppose we were interested in looking at how the temperature histograms we saw in Section 3.5 varied by month. This is what is meant by “the distribution of a variable over another variable”: temp is one variable and month is the other variable. In order to look at histograms of temp for each month, we add a layer facet_wrap(~ month). 
You can also specify how many rows you’d like the small multiple plots to be in using nrow or how many columns using ncol inside of facet_wrap. ggplot(data = weather, mapping = aes(x = temp)) + geom_histogram(binwidth = 5, color = "white") + facet_wrap(~ month, nrow = 4) Figure 3.12: Faceted histogram Note the use of the ~ before month in facet_wrap. The tilde (~) is required and you’ll receive the error Error in as.quoted(facets) : object 'month' not found if you don’t include it before month here. As we might expect, the temperature tends to increase as summer approaches and then decrease as winter approaches. Learning check (LC3.18) What other things do you notice about the faceted plot above? How does a faceted plot help us see relationships between two variables? (LC3.19) What do the numbers 1-12 correspond to in the plot above? What about 25, 50, 75, 100? (LC3.20) For which types of data-sets would these types of faceted plots not work well in comparing relationships between variables? Give an example describing the nature of these variables and other important characteristics. (LC3.21) Does the temp variable in the weather data-set have a lot of variability? Why do you say that? 3.7 5NG#4: Boxplots While using faceted histograms can provide a way to compare distributions of a numerical variable split by groups of a categorical variable as in Section 3.6, an alternative plot called a boxplot (also called a side-by-side boxplot) achieves the same task and is frequently preferred. The boxplot uses the information provided in the five-number summary referred to in Appendix A. It gives a way to compare this summary information across the different levels of a categorical variable. 3.7.1 Boxplots via geom_boxplot Let’s create a boxplot to compare the monthly temperatures as we did above with the faceted histograms. ggplot(data = weather, mapping = aes(x = month, y = temp)) + geom_boxplot() Figure 3.13: Invalid boxplot specification Warning messages: 1: Continuous x aesthetic -- did you forget aes(group=...)? 2: Removed 1 rows containing non-finite values (stat_boxplot). Note the set of warnings given here. The second warning corresponds to missing values in the data frame and it is turned off on subsequent plots. Let’s focus on the first warning. Observe that this plot does not look like what we were expecting. We were expecting to see the distribution of temperatures for each month (so 12 different boxplots). The first warning is letting us know that we are plotting a numerical, and not a categorical, variable on the x-axis. This gives us the overall boxplot without any other groupings. We can get around this by introducing a new function for our x variable: ggplot(data = weather, mapping = aes(x = factor(month), y = temp)) + geom_boxplot() Figure 3.14: Month by temp boxplot We have introduced a new function called factor() here. One of the things this function does is to convert a discrete value like month (1, 2, …, 12) into a categorical variable. The “box” part of this plot represents the 25th percentile, the median (50th percentile), and the 75th percentile. The dots correspond to outliers. (The specific formulation for these outliers is discussed in Appendix A.) The lines show the variability of the data outside the middle 50% defined by the first and third quartiles. Longer lines correspond to more variability and shorter lines correspond to less variability. Looking at this plot we can see, as expected, that summer months (6 through 8) have higher median temperatures. 
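To connect the picture back to the numbers it encodes, here is a hedged sketch (not from the book) that computes, for each month, the first quartile, median, and third quartile of temp that each box in Figure 3.14 displays. It previews the group_by() and summarize() functions covered in Chapter 5:
library(dplyr)
library(nycflights13)
# For each month, compute the three quartiles that define the box in a boxplot
weather %>% 
  group_by(month) %>% 
  summarize(q1 = quantile(temp, 0.25, na.rm = TRUE), 
            median = median(temp, na.rm = TRUE), 
            q3 = quantile(temp, 0.75, na.rm = TRUE))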
We can easily compare temperatures across months by drawing imaginary horizontal lines across the plot. Furthermore, the heights of the 12 boxes are informative too; they tell us about the variability, or spread, of temperatures recorded in a given month. But to really bring home what boxplots show, let’s focus only on the month of November’s 2138 temperature recordings. Figure 3.15: November boxplot Now let’s plot all 2138 temperature recordings for November on top of the boxplot in Figure 3.16. Figure 3.16: November boxplot with points What the boxplot does is summarize the 2138 points for you, in particular: 25% of points (about 534 observations) fall below the bottom edge of the box, which is the first quartile of 35.96 degrees Fahrenheit (2.2 degrees Celsius). In other words, 25% of observations were colder than 35.96 degrees Fahrenheit. 25% of points fall between the bottom edge of the box and the solid middle line, which is the median of 46.04 degrees Fahrenheit (7.8 degrees Celsius). In other words, 25% of observations were between 35.96 and 46.04 degrees Fahrenheit. 25% of points fall between the solid middle line and the top edge of the box, which is the third quartile of 51.98 degrees Fahrenheit (11.1 degrees Celsius). In other words, 25% of observations were between 46.04 and 51.98 degrees Fahrenheit. 25% of points fall over the top edge of the box. In other words, 25% of observations were warmer than 51.98 degrees Fahrenheit. Learning check (LC3.22) What does the dot at the bottom of the plot for May correspond to? Explain what might have occurred in May to produce this point. (LC3.23) Which months have the highest variability in temperature? Why do you think this is? (LC3.24) We looked at the distribution of a numerical variable over a categorical variable here with this boxplot. Why can’t we look at the distribution of one numerical variable over the distribution of another numerical variable? Say, temperature across pressure, for example? (LC3.25) Boxplots provide a simple way to identify outliers. Why may outliers be easier to identify when looking at a boxplot instead of a faceted histogram? 3.7.2 Summary Boxplots provide a way to compare and contrast the distribution of one quantitative variable across multiple levels of one categorical variable. One can see where the median falls across the different groups by looking at the center line in the box. To see how spread out the variable is across the different groups, look at both the height of the box and also how far the lines stretch vertically from the box. (If the lines stretch far from the box but the box has a small height, the variability of the values closer to the center is much smaller than the variability of the outer ends of the variable.) Outliers are even more easily identified when looking at a boxplot than when looking at a histogram. 3.8 5NG#5: Barplots Both histograms and boxplots represent ways to visualize the variability of numerical variables. Another common task is to present the distribution of a categorical variable. This is a simpler task, focused on how many elements from the data fall into different categories of the categorical variable. Often the best way to visualize these different counts (also known as frequencies) is via a barplot, also known as a barchart. One complication, however, is how your counts are represented in your data. For example, run the following code in your Console. This code manually creates two data frames representing counts of fruit. 
fruits <- data_frame( fruit = c("apple", "apple", "apple", "orange", "orange") ) fruits_counted <- data_frame( fruit = c("apple", "orange"), number = c(3, 2) ) We see both the fruits and fruits_counted data frames represent the same collection of fruit: three apples and two oranges. However, whereas fruits just lists the fruit: Table 3.3: Fruits fruit apple apple apple orange orange fruits_counted has a variable number, where the counts are pre-tabulated. Table 3.4: Fruits (Pre-Counted) fruit number apple 3 orange 2 Compare the barcharts in Figures 3.17 and 3.18, which are identical, but are based on two different data frames: ggplot(data = fruits, mapping = aes(x = fruit)) + geom_bar() Figure 3.17: Barplot when counts are not pre-tabulated ggplot(data = fruits_counted, mapping = aes(x = fruit, y = number)) + geom_col() Figure 3.18: Barplot when counts are pre-tabulated Observe that: The code that generates Figure 3.17 based on fruits does not have an explicit y aesthetic and uses geom_bar() The code that generates Figure 3.18 based on fruits_counted has an explicit y aesthetic (to the variable number) and uses geom_col() (Please note that this is one of ggplot2’s trickier aspects that causes the most confusion, and fortunately this is as complicated as our use of ggplot2 is going to get.) Stating the above differently: When the categorical variable you want to plot is not pre-tabulated in your data frame, you need to use geom_bar(). When the categorical variable is pre-tabulated (in the above fruits_counted example in the variable number), you need to use geom_col() with the y aesthetic explicitly mapped. 3.8.1 Barplots via geom_bar/geom_col Consider the distribution of airlines that flew out of New York City in 2013. Here we explore the number of flights from each airline/carrier. This can be plotted by invoking the geom_bar function in ggplot2: ggplot(data = flights, mapping = aes(x = carrier)) + geom_bar() Figure 3.19: Number of flights departing NYC in 2013 by airline using geom_bar To see which airline names correspond to these carrier codes, we can look at the airlines data frame in the nycflights13 package. airlines carrier name 9E Endeavor Air Inc. AA American Airlines Inc. AS Alaska Airlines Inc. B6 JetBlue Airways DL Delta Air Lines Inc. EV ExpressJet Airlines Inc. F9 Frontier Airlines Inc. FL AirTran Airways Corporation HA Hawaiian Airlines Inc. MQ Envoy Air OO SkyWest Airlines Inc. UA United Air Lines Inc. US US Airways Inc. VX Virgin America WN Southwest Airlines Co. YV Mesa Airlines Inc. Going back to our barplot, we see that United Air Lines, JetBlue Airways, and ExpressJet Airlines had the most flights depart New York City in 2013. To get the actual number of flights by each airline we can use the group_by(), summarize(), and n() functions in the dplyr package on the carrier variable in flights, which we will introduce formally in Chapter 5. flights_table <- flights %>% group_by(carrier) %>% summarize(number = n()) flights_table carrier number 9E 18460 AA 32729 AS 714 B6 54635 DL 48110 EV 54173 F9 685 FL 3260 HA 342 MQ 26397 OO 32 UA 58665 US 20536 VX 5162 WN 12275 YV 601 In this table, the counts of the carriers are pre-tabulated. To create a barchart using the data frame flights_table, we use geom_col and map the y aesthetic to the variable number. Compare this barplot using geom_col in Figure 3.20 with the earlier barplot using geom_bar in Figure 3.19. They are identical. 
ggplot(data = flights_table, mapping = aes(x = carrier, y = number)) + geom_col() Figure 3.20: Number of flights departing NYC in 2013 by airline using geom_col Learning check (LC3.26) Why are histograms inappropriate for visualizing categorical variables? (LC3.27) What is the difference between histograms and barplots? (LC3.28) How many Envoy Air flights departed NYC in 2013? (LC3.29) What was the seventh highest airline in terms of departed flights from NYC in 2013? How could we better present the table to get this answer quickly? 3.8.2 Must avoid pie charts! Unfortunately, one of the most common plots seen today for categorical data is the pie chart. While they may seem harmless enough, they actually present a problem in that humans are unable to judge angles well. As Naomi Robbins describes in her book “Creating More Effective Graphs” (Robbins 2013), we overestimate angles greater than 90 degrees and we underestimate angles less than 90 degrees. In other words, it is difficult for us to determine the relative size of one piece of the pie compared to another. Let’s examine our previous barplot example on the number of flights departing NYC by airline. This time we will use a pie chart. As you review this chart, try to identify how much larger the portion of the pie is for ExpressJet Airlines (EV) compared to US Airways (US), what the third largest carrier is in terms of departing flights, and how many carriers have fewer flights than United Airlines (UA). Figure 3.21: The dreaded pie chart While it is quite easy to look back at the barplot to get the answer to these questions, it’s quite difficult to get the answers correct when looking at the pie graph. Barplots can always present the information in a way that is easier for the eye to determine relative position. There may be one exception from Nathan Yau at FlowingData.com but we will leave this for the reader to decide: Figure 3.22: The only good pie chart Learning check (LC3.30) Why should pie charts be avoided and replaced by barplots? (LC3.31) What is your opinion as to why pie charts continue to be used? 3.8.3 Using barplots to compare two categorical variables Barplots are the go-to way to visualize the frequency of different categories of a categorical variable. They make it easy to order the counts and to compare the frequencies of one group to another. Another use of barplots (unfortunately, sometimes inappropriately and confusingly) is to compare two categorical variables together. Let’s examine the distribution of outgoing flights from NYC by carrier and airport. We begin by getting the names of the airports in NYC that were included in the flights data-set. Here, we preview the inner_join() function from Chapter 5. This function will join the data frame flights with the data frame airports by matching rows that have the same airport code. However, in flights the airport code is included in the origin variable whereas in airports the airport code is included in the faa variable. We will revisit such examples in Section 5.8 on joining data-sets. flights_namedports <- flights %>% inner_join(airports, by = c("origin" = "faa")) After running View(flights_namedports), we see that name now corresponds to the name of the airport as referenced by the origin variable. We will now plot carrier as the horizontal variable. When we specify geom_bar, it will specify count as being the vertical variable. A new addition here is fill = name. Look over what was produced from the plot to get an idea of what this argument gives. 
ggplot(data = flights_namedports, mapping = aes(x = carrier, fill = name)) + geom_bar() Figure 3.23: Stacked barplot comparing the number of flights by carrier and airport This plot is what is known as a stacked barplot. While simple to make, it often leads to many problems. For example in this plot, it is difficult to compare the heights of the different colors (corresponding to the number of flights from each airport) between the bars (corresponding to the different carriers). Note that fill is an aesthetic just like x is an aesthetic, and thus must be included within the parentheses of the aes() mapping. The following code, where the fill aesthetic is specified on the outside will yield an error. This is a fairly common error that new ggplot users make: ggplot(data = flights_namedports, mapping = aes(x = carrier), fill = name) + geom_bar() Learning check (LC3.32) What kinds of questions are not easily answered by looking at the above figure? (LC3.33) What can you say, if anything, about the relationship between airline and airport in NYC in 2013 in regards to the number of departing flights? Another variation on the stacked barplot is the side-by-side barplot. ggplot(data = flights_namedports, mapping = aes(x = carrier, fill = name)) + geom_bar(position = "dodge") Figure 3.24: Side-by-side barplot comparing the number of flights by carrier and airport Learning check (LC3.34) Why might the side-by-side barplot be preferable to a stacked barplot in this case? (LC3.35) What are the disadvantages of using a side-by-side barplot, in general? Lastly, an often preferred type of barplot is the faceted barplot. We already saw this concept of faceting and small multiples in Section 3.6. This gives us a nicer way to compare the distributions across both carrier and airport/name. ggplot(data = flights_namedports, mapping = aes(x = carrier, fill = name)) + geom_bar() + facet_grid(name ~ .) Figure 3.25: Faceted barplot comparing the number of flights by carrier and airport Note how the facet_grid function arguments are written here. We are wanting the names of the airports vertically and the carrier listed horizontally. As you may have guessed, this argument and other formulas of this sort in R are in y ~ x order. We will see more examples of this in Chapter 6. If you’d like to create small multiples in a vertical direction, you’ll want to use facet_grid() with the name of the variable before the ~ as we did in Figure 3.25. This corresponds to vertical going with y in the formula. If instead you’d like the small multiples to be in the horizontal direction, you’d use facet_grid() with the name of the variable after the ~, corresponding to the x position in the formula. Further, you can use facet_wrap() if you would like the small multiples to wrap into multiple rows as we saw earlier in the faceted histogram example in Figure 3.12. Additionally, you could use facet_grid() with one variable in the y position and another variable in the x position creating a grid of all possible combinations of the two variables. Learning check (LC3.36) Why is the faceted barplot preferred to the side-by-side and stacked barplots in this case? (LC3.37) What information about the different carriers at different airports is more easily seen in the faceted barplot? 3.8.4 Summary Barplots are the preferred way of displaying categorical variables. They are easy-to-understand and make it easy to compare across groups of a categorical variable. 
When dealing with more than one categorical variable, faceted barplots are frequently preferred over side-by-side or stacked barplots. Stacked barplots are sometimes nice to look at, but it is quite difficult to compare across the levels since the sizes of the bars are all of different sizes. Side-by-side barplots can provide an improvement on this, but the issue about comparing across groups still must be dealt with. 3.9 Conclusion 3.9.1 Review questions Review questions have been designed using the fivethirtyeight R package (Ismay and Chunn 2017) with links to the corresponding FiveThirtyEight.com articles in our free DataCamp course Effective Data Storytelling using the tidyverse. The material in this chapter is covered in the chapters of the DataCamp course available below: Scatterplots & Linegraphs Histograms & Boxplots Barplots ggplot2 Review 3.9.2 What’s to come? In Chapter 4, we’ll introduce the concept of “tidy data” and how it is used as a key data format for all the packages we use in this textbook. You’ll see that the concept appears to be simple, but actually can be a little challenging to decipher without careful practice. We’ll also investigate how to import CSV (comma-separated value) files into R using the readr package. 3.9.3 Resources An excellent resource as you begin to create plots using the ggplot2 package is a cheatsheet that RStudio has put together entitled “Data Visualization with ggplot2” available by clicking here or by clicking the RStudio Menu Bar -> Help -> Cheatsheets -> “Data Visualization with ggplot2” This cheatsheet covers more than what we’ve discussed in this chapter but provides nice visual descriptions of what each function produces. In addition, we’ve created a mind map to help you remember which types of plots are most appropriate in a given situation by identifying the types of variables involved in the problem. Figure 3.26: Mind map for Data Visualization 3.9.4 Script of R code An R script file of all R code used in this chapter is available here. "], -["4-tidy.html", "4 Tidy Data via tidyr 4.1 What is tidy data? 4.2 Back to nycflights13 4.3 Importing CSVs via readr 4.4 Converting from wide to long 4.5 Optional: Normal forms of data 4.6 Conclusion", " 4 Tidy Data via tidyr In Subsection 2.2.1 we introduced the concept of a data frame: a rectangular spreadsheet-like representation of data in R where the rows correspond to observations and the columns correspond to variables describing each observation. In Section 2.4, we started explorations of our first data frame flights included in the nycflights13 package. In Chapter 3 we made graphics using data contained in flights and other data frames. In this chapter, we extend some of these ideas by discussing a type of data formatting called “tidy” data. You will see that having data stored in “tidy” format is about more than what the colloquial definition of the term “tidy” might suggest of having your data “neatly organized” in a spreadsheet. Instead, we define the term “tidy” in a more rigorous fashion, outlining a set of rules by which data can be stored and the implications of these rules on analyses. Although knowledge of this type of data formatting was not necessary in our treatment of data visualization in Chapter 3 since all the data was already in tidy format, we’ll see going forward that having tidy data will allow you to more easily create data visualizations in a wide range of settings. 
Furthermore, it will also help you with data wrangling in Chapter 5 and in all subsequent chapters in this book when we cover regression and discuss statistical inference. Needed packages Let’s load all the packages needed for this chapter (this assumes you’ve already installed them). If needed, read Section 2.3 for information on how to install and load R packages. library(dplyr) library(ggplot2) library(nycflights13) library(tidyr) library(readr) 4.1 What is tidy data? You have surely heard the word “tidy” in your life: “Tidy up your room!” “Please write your homework in a tidy way so that it is easier to grade and to provide feedback.” Marie Kondo’s best-selling book The Life-Changing Magic of Tidying Up: The Japanese Art of Decluttering and Organizing “I am not by any stretch of the imagination a tidy person, and the piles of unread books on the coffee table and by my bed have a plaintive, pleading quality to me - ‘Read me, please!’” - Linda Grant What does it mean for your data to be “tidy”? Beyond just being organized, in the context of this book having “tidy” data means that your data follows a standardized format. This makes it easier for you and others to visualize your data, to wrangle/transform your data, and to model your data. We will follow Hadley Wickham’s definition of tidy data here (Wickham 2014): A dataset is a collection of values, usually either numbers (if quantitative) or strings AKA text data (if qualitative). Values are organised in two ways. Every value belongs to a variable and an observation. A variable contains all values that measure the same underlying attribute (like height, temperature, duration) across units. An observation contains all values measured on the same unit (like a person, or a day, or a city) across attributes. Tidy data is a standard way of mapping the meaning of a dataset to its structure. A dataset is messy or tidy depending on how rows, columns and tables are matched up with observations, variables and types. In tidy data: Each variable forms a column. Each observation forms a row. Each type of observational unit forms a table. Figure 4.1: Tidy data graphic from http://r4ds.had.co.nz/tidy-data.html For example, say the following table consists of stock prices: Table 4.1: Stock Prices (Non-Tidy Format) Date Boeing Stock Price Amazon Stock Price Google Stock Price 2009-01-01 $173.55 $174.90 $174.34 2009-01-02 $172.61 $171.42 $170.04 Although the data are neatly organized in a spreadsheet-type format, they are not in tidy format since there are three variables corresponding to three unique pieces of information (Date, Stock Name, and Stock Price), but there are not three columns. In tidy data format each variable should be its own column, as shown below. Notice that both tables present the same information, but in different formats. Table 4.2: Stock Prices (Tidy Format) Date Stock Name Stock Price 2009-01-01 Boeing $173.55 2009-01-02 Boeing $172.61 2009-01-01 Amazon $174.90 2009-01-02 Amazon $171.42 2009-01-01 Google $174.34 2009-01-02 Google $170.04 However, consider the following table Table 4.3: Date, Boeing Price, Weather Data Date Boeing Price Weather 2009-01-01 $173.55 Sunny 2009-01-02 $172.61 Overcast In this case, even though the variable “Boeing Price” occurs again, the data is tidy since there are three variables corresponding to three unique pieces of information (Date, Boeing stock price, and the weather that particular day). 
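To make the wide-versus-tidy contrast concrete, here is a small sketch, not from the book, that builds a data frame like Table 4.1 and reshapes it into the tidy format of Table 4.2. It previews the gather() function covered in Section 4.4, and the column names stock and price are our own choices:
library(dplyr)
library(tidyr)
# A wide data frame: one column per stock, as in Table 4.1
stocks <- data_frame(
  Date = c("2009-01-01", "2009-01-02"), 
  Boeing = c(173.55, 172.61), 
  Amazon = c(174.90, 171.42), 
  Google = c(174.34, 170.04)
)
# Reshape to tidy/long format: one row per Date-stock combination, as in Table 4.2
stocks_tidy <- stocks %>% 
  gather(key = stock, value = price, -Date)
stocks_tidy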
The non-tidy data format in the original table is also known as “wide” format whereas the tidy data format in the second table is also known as “long”/“narrow” data format. In this book, we will work with datasets that are already in tidy format. But data isn’t always in this nice “tidy” format from which the tidyverse gets its name. Data actually may come to you in a variety of different formats that require data cleaning and reshaping beyond the scope of this book. For a thorough example of the steps needed to take a messy dataset and turn it into a tidy one, check out the different functions available for data tidying and a case study using data from the World Health Organization in R for Data Science (Grolemund and Wickham 2016). Most frequently though, data that isn’t in long format and is instead in wide format can be converted into “tidy” format by using the tidyr package (Wickham and Henry 2018) in the tidyverse. We’ll now investigate how that can be done using the gather() function in tidyr. Before we proceed with reshaping our data, we will discuss how to read data stored in CSV format into R as a data frame. 4.2 Back to nycflights13 Recall the nycflights13 package with data about all domestic flights departing from New York City in 2013 that we introduced in Section 2.4 and used extensively in Chapter 3 to create visualizations. In particular, let’s revisit the flights data frame by running View(flights) in your console. We see that flights has a rectangular shape with each row corresponding to a different flight and each column corresponding to a characteristic of that flight. This matches exactly with how Hadley Wickham defined tidy data: Each variable forms a column. Each observation forms a row. But what about the third property? Each type of observational unit forms a table. 4.2.1 Observational units We identified earlier that the observational unit in the flights dataset is an individual flight. And we have shown that this dataset consists of 336,776 flights with 19 variables. In other words, rows of this dataset don’t refer to a measurement on an airline or on an airport; they refer to characteristics/measurements on a given flight from New York City in 2013. Also included in the nycflights13 package are datasets with different observational units (Wickham 2017): airlines: translation between two-letter IATA carrier codes and names (16 in total) planes: construction information about each of 3,322 planes used weather: hourly meteorological data (about 8710 observations) for each of the three NYC airports airports: airport names and locations The organization of this data follows the third “tidy” data property: observations corresponding to the same observational unit should be saved in the same table/data frame. 4.2.2 Identification vs measurement variables There is a subtle difference between the kinds of variables that you will encounter in data frames: measurement variables and identification variables. The airports data frame you worked with above contains both these types of variables. Recall that in airports the observational unit is an airport, and thus each row corresponds to one particular airport. Let’s pull them apart using the glimpse function: glimpse(airports) Observations: 1,458 Variables: 8 $ faa <chr> "04G", "06A", "06C", "06N", "09J", "0A9", "0G6", "0G7", "0P2"... $ name <chr> "Lansdowne Airport", "Moton Field Municipal Airport", "Schaum... $ lat <dbl> 41.13, 32.46, 41.99, 41.43, 31.07, 36.37, 41.47, 42.88, 39.79... 
$ lon <dbl> -80.62, -85.68, -88.10, -74.39, -81.43, -82.17, -84.51, -76.7... $ alt <int> 1044, 264, 801, 523, 11, 1593, 730, 492, 1000, 108, 409, 875,... $ tz <dbl> -5, -6, -6, -5, -5, -5, -5, -5, -5, -8, -5, -6, -5, -5, -5, -... $ dst <chr> "A", "A", "A", "A", "A", "A", "A", "A", "U", "A", "A", "U", "... $ tzone <chr> "America/New_York", "America/Chicago", "America/Chicago", "Am... The variables faa and name are what we will call identification variables: variables that uniquely identify each observational unit. They are mainly used to provide a name to the observational unit. faa gives the code provided by the FAA for that airport while the name variable gives the longer, more natural name of the airport. The remaining variables (lat, lon, alt, tz, dst, tzone) are often called measurement or characteristic variables: variables that describe properties of each observational unit, in other words each observation in each row. For example, lat and lon describe the latitude and longitude of each airport. While it is not an absolute rule, for organizational purposes it is considered good practice to have your identification variables in the far left-most columns of your data frame. Learning check (LC4.1) What properties of the observational unit do each of lat, lon, alt, tz, dst, and tzone describe for the airports data frame? Note that you may want to use ?airports to get more information. (LC4.2) Provide the names of variables in a data frame with at least three variables in which one of them is an identification variable and the other two are not. In other words, create your own tidy dataset that matches these conditions. 4.3 Importing CSVs via readr Up to this point, we’ve used data either stored inside of an R package or we’ve manually created the data such as the fruits and fruits_counted data in Section 3.8. Another common way to get data into R is via reading in data from a spreadsheet either stored on your computer or stored online. For our purposes here, we will work with downloading data stored online. First, let’s download a Comma Separated Values (CSV) file of ratings of the level of democracy in different countries spanning 1952 to 1992: http://ismayc.github.io/dem_score.csv. After downloading it, open it and take a look. You can think of a CSV file as a bare-bones spreadsheet where: Each line in the file corresponds to a row of data/one observation. Values for each line are separated with commas. In other words, the values of different variables are separated by commas. The first line is usually a header row indicating the names of the columns/variables. As opposed to a bare-bones CSV file, Excel files contain a lot of metadata, or put more simply, data about the data. Examples include the use of bold and italic fonts, colored cells, different column widths, etc. However, going forward we will avail ourselves of just the data, and not the metadata, as saved in a CSV file. There are many ways to read this data into RStudio. Here are two of the simplest; for the purposes of practice, we suggest you try both. 
First, we can use the read_csv() function from the readr package to read in the data directly off the web: dem_score <- read_csv("http://ismayc.github.io/dem_score.csv") dem_score # A tibble: 96 x 10 country `1952` `1957` `1962` `1967` `1972` `1977` `1982` `1987` `1992` <chr> <int> <int> <int> <int> <int> <int> <int> <int> <int> 1 Albania - 9 - 9 - 9 - 9 - 9 - 9 - 9 - 9 5 2 Argentina - 9 - 1 - 1 - 9 - 9 - 9 - 8 8 7 3 Armenia - 9 - 7 - 7 - 7 - 7 - 7 - 7 - 7 7 4 Australia 10 10 10 10 10 10 10 10 10 5 Austria 10 10 10 10 10 10 10 10 10 6 Azerbaijan - 9 - 7 - 7 - 7 - 7 - 7 - 7 - 7 1 7 Belarus - 9 - 7 - 7 - 7 - 7 - 7 - 7 - 7 7 8 Belgium 10 10 10 10 10 10 10 10 10 9 Bhutan -10 -10 -10 -10 -10 -10 -10 -10 -10 10 Bolivia - 4 - 3 - 3 - 4 - 7 - 7 8 9 9 # ... with 86 more rows Second, let’s read in the same data, but using the file you just downloaded on to your computer: Go to the Files pane of RStudio -> Navigate the directories to where your downloaded files are -> Right click dem_score.csv -> Click “Import Dataset…” -> Click “Import”. You’ll see two things happen: The RStudio Viewer will pop open with your data. In the console, the command that read-in the data will run. You can copy and paste this code to reload your data again later. In this dem_score data frame, the minimum value of -10 corresponds to a highly autocratic nation whereas a value of 10 corresponds to a highly democratic nation. Note also that backticks surround the different names of the columns here. Variable names are not allowed to start with a number but this can be worked around by surrounding the column name in backticks. Variable names also can’t include spaces so if you’d like to refer to the variable Stock Names above, for example, you’ll need to surround it in backticks: `Stock Names`. 4.4 Converting from wide to long Let’s focus on only the data corresponding to the country of Guatemala. guat_dem <- dem_score %>% filter(country == "Guatemala") guat_dem # A tibble: 1 x 10 country `1952` `1957` `1962` `1967` `1972` `1977` `1982` `1987` `1992` <chr> <int> <int> <int> <int> <int> <int> <int> <int> <int> 1 Guatemala 2 -6 -5 3 1 -3 -7 3 3 Now let’s produce a plot showing how the democracy scores have changed over the 40 years from 1952 to 1992 for Guatemala. Let’s start by laying out how we would map our aesthetics to variables in the data frame: The data frame is guat_dem by setting data = guat_dem What are the names of the variables to plot? We’d like to see how the democracy score has changed over the years. Now we are stuck in a predicament. We see that we have a variable named country but its only value is "Guatemala". We have other variables denoted by different year values. Unfortunately, we’ve run into a dataset that is not in the appropriate format to apply the Grammar of Graphics and ggplot2. Remember that ggplot2 is a package in the tidyverse and, thus, needs data to be in a tidy format. We’d like to finish off our mapping of aesthetics to variables by doing something like The aesthetic mapping is set by aes(x = year, y = democracy_score) but this is not possible with our wide-formatted data. We need to take the values of the current column names in guat_dem (aside from country) and convert them into a new variable that will act as a key called year. Then, we’d like to take the numbers on the inside of the table and turn them into a column that will act as values called democracy_score. Our resulting data frame will have three columns: country, year, and democracy_score. 
The gather() function in the tidyr package can complete this task for us. The first argument to gather(), just as with ggplot(), is the data argument where we specify which data frame we would like to tidy. The next two arguments to gather() are key and value, which specify what we’d like to call the new columns that convert our wide data into long format. Lastly, we include a specification for variables we’d like to NOT include in this tidying process using a -. guat_tidy <- gather(data = guat_dem, key = year, value = democracy_score, -country) guat_tidy # A tibble: 9 x 3 country year democracy_score <chr> <chr> <int> 1 Guatemala 1952 2 2 Guatemala 1957 -6 3 Guatemala 1962 -5 4 Guatemala 1967 3 5 Guatemala 1972 1 6 Guatemala 1977 -3 7 Guatemala 1982 -7 8 Guatemala 1987 3 9 Guatemala 1992 3 We can now create the plot to show how the democracy score of Guatemala changed from 1952 to 1992 using a linegraph and ggplot2. ggplot(data = guat_tidy, mapping = aes(x = year, y = democracy_score)) + geom_line() geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic? Observe that the year variable in guat_tidy is stored as a character vector since we had to circumvent the naming rules in R by adding backticks around the different year columns in guat_dem. This leads to ggplot not knowing exactly how to plot a line using a categorical variable. We can fix this by using the parse_number() function in the readr package and then specifying the horizontal axis label to be "year": ggplot(data = guat_tidy, mapping = aes(x = parse_number(year), y = democracy_score)) + geom_line() + labs(x = "year") Figure 4.2: Guatemala’s democracy score ratings from 1952 to 1992 We’ll see in Chapter 5 how we could use the mutate() function to change year to be a numeric variable instead after we have done our tidying. Notice now that the mappings of aesthetics to variables make sense in Figure 4.2: The data frame is guat_tidy by setting data = guat_tidy The x aesthetic is mapped to year The y aesthetic is mapped to democracy_score The geom_etry chosen is line Learning check (LC4.3) Convert the dem_score data frame into a tidy data frame and assign the name of dem_tidy to the resulting long-formatted data frame. (LC4.4) Read in the life expectancy data stored at http://ismayc.github.io/le_mess.csv and convert it to a tidy data frame. 
The key variable that we frequently join by is one of the identification variables mentioned above. library(dplyr) joined_flights <- inner_join(x = flights, y = airlines, by = "carrier") View(joined_flights) If we View this dataset, we see a new variable has been created called name. (We will see in Subsection 5.9.2 ways to change name to a more descriptive variable name.) More discussion about joining data frames together will be given in Chapter 5. We will see there that the names of the columns to be linked need not match as they did here with "carrier". Learning check (LC4.5) What are common characteristics of “tidy” datasets? (LC4.6) What makes “tidy” datasets useful for organizing data? (LC4.7) What are some advantages of data in normal forms? What are some disadvantages? 4.6 Conclusion 4.6.1 Review questions Review questions have been designed using the fivethirtyeight R package (Ismay and Chunn 2017) with links to the corresponding FiveThirtyEight.com articles in our free DataCamp course Effective Data Storytelling using the tidyverse. The material in this chapter is covered in the Tidy Data chapter of the DataCamp course available here. 4.6.2 What’s to come? In Chapter 5, we’ll further explore data in tidy format by grouping our data, creating summaries based on those groupings, filtering our data to match conditions, and performing other wranglings with our data including defining new columns/variables. These data wrangling procedures will go hand-in-hand with the data visualizations you’ve produced in Chapter 3. 4.6.3 Script of R code An R script file of all R code used in this chapter is available here. "], -["5-wrangling.html", "5 Data Wrangling via dplyr 5.1 The pipe %>% 5.2 Five Main Verbs - The 5MV 5.3 5MV#1: Filter observations using filter 5.4 5MV#2: Summarize variables using summarize 5.5 5MV#3: Group rows using group_by 5.6 5MV#4: Create new variables/change old variables using mutate 5.7 5MV#5: Reorder the data frame using arrange 5.8 Joining data frames 5.9 Optional: Other verbs 5.10 Conclusion", " 5 Data Wrangling via dplyr Let’s briefly recap where we have been so far and where we are headed. In Chapter 4, we discussed what it means for data to be tidy. We saw that this refers to observations corresponding to rows and variables being stored in columns (one variable for every column). The entries in the data frame correspond to different combinations of observations (specific instances of observational units) and variables. In the flights data frame, we saw that each row corresponds to a different flight leaving New York City. In other words, the observational unit of the flights tidy data frame is a flight. The variables are listed as columns, and for flights these columns include both quantitative variables like dep_delay and distance and also categorical variables like carrier and origin. An entry in the table corresponds to a particular flight on a given day and a particular value of a given variable representing that flight. Armed with this knowledge and looking back on Chapter 3, we see that organizing data in this tidy way makes it easy for us to produce graphics, specifically a set of 5 common graphics we termed the 5 Named Graphics (5NG): scatterplots linegraphs boxplots histograms barplots We can simply specify what variable/column we would like on one axis, (if applicable) what variable we’d like on the other axis, and what type of plot we’d like to make by specifying the geometric object in question. 
We can also vary aesthetic attributes of the geometric objects in question (points, lines, bar), such as the size and color, along the values of another variable in this tidy dataset. Recall the Gapminder example from Figure 3.1. Lastly, in a few spots in Chapter 3 and Chapter 4, we hinted at some ways to summarize and wrangle data to suit your needs, using the filter() and inner_join() functions. This chapter expands on these functions by giving a variety of examples using what we term the Five Main Verbs (5MV) in the dplyr package (Wickham et al. 2017). Needed packages Let’s load all the packages needed for this chapter (this assumes you’ve already installed them). If needed, read Section 2.3 for information on how to install and load R packages. library(dplyr) library(ggplot2) library(nycflights13) library(knitr) 5.1 The pipe %>% Before we introduce the five main verbs, we first introduce the pipe operator (%>%). Just as the + sign was used to add layers to a plot created using ggplot(), the pipe operator allows us to chain together dplyr data wrangling functions. The pipe operator can be read as “then”. The %>% operator allows us to go from one step in dplyr to the next easily so we can, for example: filter our data frame to only focus on a few rows then group_by another variable to create groups then summarize this grouped data to calculate the mean for each level of the group. The piping syntax will be our major focus throughout the rest of this book and you’ll find that you’ll quickly be addicted to the chaining with some practice. 5.2 Five Main Verbs - The 5MV The d in dplyr stands for data frames, so the functions in dplyr are built for working with objects of the data frame type. For now, we focus on the 5MV: the five most commonly used functions that help wrangle and summarize data. A description of these verbs follows, with each section devoted to an example of that verb, or a combination of a few verbs, in action. filter(): Pick rows based on conditions about their values summarize(): Compute summary measures known as “summary statistics” of variables group_by(): Group rows of observations together mutate(): Create a new variable in the data frame by mutating existing ones arrange(): Arrange/sort the rows based on one or more variables Just as we had the Five Named Graphs for data visualization using ggplot2 in Chapter 3, we have the 5MV here (The Five Main Verbs in dplyr) for data wrangling. All of the 5MVs follow the same syntax, with the argument before the pipe %>% being the name of the data frame, then the name of the verb, followed with other arguments specifying which criteria you’d like the verb to work with in parentheses. Keep in mind, there are more advanced functions than just these five and you’ll see some examples of this near the end of this chapter in 5.9, but with the 5MV you’ll be able to perform a broad array of data wrangling tasks. 5.3 5MV#1: Filter observations using filter Figure 5.1: Filter diagram from Data Wrangling with dplyr and tidyr cheatsheet The filter function here works much like the “Filter” option in Microsoft Excel; it allows you to specify criteria about values of a variable in your dataset and then chooses only those rows that match that criteria. We begin by focusing only on flights from New York City to Portland, Oregon. The dest code (or airport code) for Portland, Oregon is "PDX". 
Run the following and look at the resulting spreadsheet to ensure that only flights heading to Portland are chosen here: portland_flights <- flights %>% filter(dest == "PDX") View(portland_flights) Note the following: The ordering of the commands: Take the data frame flights then filter the data frame so that only those where the dest equals "PDX" are included. The double equal sign == for testing for equality, and not =. You are almost guaranteed to make the mistake at least once of only including one equals sign. You can combine multiple criteria together using operators that make comparisons: | corresponds to “or” & corresponds to “and” We can often skip the use of & and just separate our conditions with a comma. You’ll see this in the example below. In addition, you can use other mathematical checks (similar to ==): > corresponds to “greater than” < corresponds to “less than” >= corresponds to “greater than or equal to” <= corresponds to “less than or equal to” != corresponds to “not equal to” To see many of these in action, let’s select all flights that left JFK airport heading to Burlington, Vermont ("BTV") or Seattle, Washington ("SEA") in the months of October, November, or December. Run the following btv_sea_flights_fall <- flights %>% filter(origin == "JFK", (dest == "BTV" | dest == "SEA"), month >= 10) View(btv_sea_flights_fall) Note: even though colloquially speaking one might say “all flights leaving Burlington, Vermont and Seattle, Washington,” in terms of computer logical operations, we really mean “all flights leaving Burlington, Vermont or Seattle, Washington.” For a given row in the data, dest can be “BTV”, “SEA”, or something else, but not “BTV” and “SEA” at the same time. Another example uses the ! to pick rows that don’t match a condition. The ! can be read as “not”. Here we are selecting rows corresponding to flights that didn’t go to Burlington, VT or Seattle, WA. not_BTV_SEA <- flights %>% filter(!(dest == "BTV" | dest == "SEA")) View(not_BTV_SEA) As a final note we point out that filter() should often be the first verb you’ll apply to your data. This cleans your dataset to only those rows you care about, or put differently, it narrows down the scope to just the observations your care about. Learning check (LC5.1) What’s another way using the “not” operator ! we could filter only the rows that are not going to Burlington, VT nor Seattle, WA in the flights data frame? Test this out using the code above. 5.4 5MV#2: Summarize variables using summarize The next common task when working with data is to be able to summarize data: take a large number of values and summarize then with a single value. While this may seem like a very abstract idea, something as simple as the sum, the smallest value, and the largest values are all summaries of a large number of values. 
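For instance, here is a minimal base R sketch using a short made-up vector of values (the vector x below is purely illustrative and not part of the flights or weather data):
x <- c(23, 41, 32, 58, 26)  # five made-up values
sum(x)  # the sum: 180
min(x)  # the smallest value: 23
max(x)  # the largest value: 58
Each call collapses the five values into a single summary value; the summarize() verb below does the same kind of collapsing inside a data frame.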
Figure 5.2: Summarize diagram from Data Wrangling with dplyr and tidyr cheatsheet Figure 5.3: Another summarize diagram from Data Wrangling with dplyr and tidyr cheatsheet We can calculate the standard deviation and mean of the temperature variable temp in the weather data frame of nycflights13 in one step using the summarize (or equivalently using the UK spelling summarise) function in dplyr (See Appendix A): summary_temp <- weather %>% summarize(mean = mean(temp), std_dev = sd(temp)) kable(summary_temp) mean std_dev NA NA In this chapter we’ll display the contents of certain data frames using the kable() function from the knitr package in Subsection 2.4.3; all it does is yield an alternative formating of all data frames. We’ve created a small data frame here called summary_temp that includes both the mean and the std_dev of the temp variable in weather. Notice as shown in Figures 5.2 and 5.3, the data frame weather went from many rows to a single row of just the summary values in the data frame summary_temp. But why are the values returned NA? This stands for “not available or not applicable” and is how R encodes missing values; if in a data frame for a particular row and column no value exists, NA is stored instead. Furthermore, by default any time you try to summarize a number of values (using mean() and sd() for example) that has one or more missing values, then NA is returned. Values can missing for many reasons. Perhaps the data was collected but someone forgot to enter it? Perhaps the data was not collected at all because it was too difficult? Perhaps there was an erroneous value that someone entered that has been correct to read as missing? You’ll often encounter issues with missing values. You can summarize all non-missing values by setting the na.rm argument to TRUE (rm is short for “remove”). This will remove any NA missing values and only return the summary value for all non-missing values. So the code below computes the mean and standard deviation of all non-missing values. Notice how the na.rm=TRUE are set as arguments to the mean() and sd() functions, and not to the summarize() function. summary_temp <- weather %>% summarize(mean = mean(temp, na.rm = TRUE), std_dev = sd(temp, na.rm = TRUE)) kable(summary_temp) mean std_dev 55.2 17.78 It is not good practice to include a na.rm = TRUE in your summary commands by default; you should attempt to run them without this argument. This is because removing missing data can have an impact on your analyses. In fact, an entire branch of the field of statistics deals with missing data. The take away point is that na.rm = TRUE should only be used after you aware of the implications of its use (see the Learning Checks below for an example). What other summary functions can we use inside the summarize() verb? Any function in R that takes a vector of values and returns just one. Here are just a few: mean(): the mean AKA the average sd(): the standard deviation, which is a measure of spread min() and max(): the minimum and maximum values respectively IQR(): Interquartile range sum(): the sum n(): a count of the number of rows/observations in each group. This particular summary function will make more sense when group_by() is covered in Section 5.5. Learning check (LC5.2) Say a doctor is studying the effect of smoking on lung cancer for a large number of patients who have records measured at five year intervals. 
She notices that a large number of patients have missing data points because the patient has died, so she chooses to ignore these patients in his analysis. What is wrong with this doctor’s approach? (LC5.3) Modify the above summarize function to create summary_temp to also use the n() summary function: summarize(count = n()). What does the returned value correspond to? (LC5.4) Why doesn’t the following code work? Run the code line by line instead of all at once, and then look at the data. In other words, run summary_temp <- weather %>% summarize(mean = mean(temp, na.rm = TRUE)) first. summary_temp <- weather %>% summarize(mean = mean(temp, na.rm = TRUE)) %>% summarize(std_dev = sd(temp, na.rm = TRUE)) 5.5 5MV#3: Group rows using group_by Figure 5.4: Group by and summarize diagram from Data Wrangling with dplyr and tidyr cheatsheet It’s often more useful to summarize a variable based on the groupings of another variable. Let’s say, we are interested in the mean and standard deviation of temperatures but grouped by month. To be more specific: we want the mean and standard deviation of temperatures split by month. sliced by month. aggregated by month. collapsed over month. Run the following code: summary_monthly_temp <- weather %>% group_by(month) %>% summarize(mean = mean(temp, na.rm = TRUE), std_dev = sd(temp, na.rm = TRUE)) kable(summary_monthly_temp) month mean std_dev 1 35.64 10.185 2 34.15 6.940 3 39.81 6.225 4 51.67 8.785 5 61.59 9.609 6 72.14 7.603 7 80.01 7.148 8 74.40 5.171 9 67.43 8.476 10 60.03 8.830 11 45.11 10.502 12 38.37 9.941 This code is identical to the previous code that created summary_temp, with an extra group_by(month) added. Grouping the weather dataset by month and then passing this new data frame into summarize yields a data frame that shows the mean and standard deviation of temperature for each month in New York City. Note: Since each row in summary_monthly_temp represents a summary of different rows in weather, the observational units have changed. It is important to note that group_by doesn’t change the data frame. It sets meta-data (data about the data), specifically the group structure of the data. It is only after we apply the summarize function that the data frame changes. If we would like to remove this group structure meta-data, we can pipe the resulting data frame into the ungroup() function. For example, say the group structure meta-data is set to be by month via group_by(month), all future summarizations will be reported on a month-by-month basis. If however, we would like to no longer have this and have all summarizations be for all data in a single group (in this case over the entire year of 2013), then pipe the data frame in question through and ungroup() to remove this. We now revisit the n() counting summary function we introduced in the previous section. For example, suppose we’d like to get a sense for how many flights departed each of the three airports in New York City: by_origin <- flights %>% group_by(origin) %>% summarize(count = n()) kable(by_origin) origin count EWR 120835 JFK 111279 LGA 104662 We see that Newark ("EWR") had the most flights departing in 2013 followed by "JFK" and lastly by LaGuardia ("LGA"). Note there is a subtle but important difference between sum() and n(). While sum() simply adds up a large set of numbers, the latter counts the number of times each of many different values occur. You are not limited to grouping by one variable! 
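As a quick aside, here is a minimal sketch contrasting the two, assuming the precip column in the weather data frame (treat it as illustrative rather than part of the main example): n() counts how many rows fall in each group, whereas sum() adds up the values of a variable within each group.
weather %>%
  group_by(month) %>%
  summarize(n_obs = n(),  # number of hourly weather observations in each month
            total_precip = sum(precip, na.rm = TRUE))  # sum of the recorded precipitation values in each month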
Say you wanted to know the number of flights leaving each of the three New York City airports for each month, we can also group by a second variable month: group_by(origin, month). by_monthly_origin <- flights %>% group_by(origin, month) %>% summarize(count = n()) by_monthly_origin # A tibble: 36 x 3 # Groups: origin [?] origin month count <chr> <int> <int> 1 EWR 1 9893 2 EWR 2 9107 3 EWR 3 10420 4 EWR 4 10531 5 EWR 5 10592 6 EWR 6 10175 7 EWR 7 10475 8 EWR 8 10359 9 EWR 9 9550 10 EWR 10 10104 # ... with 26 more rows Alternatively, you can use the shortcut count() function in dplyr to get the same result: by_monthly_origin <- flights %>% count(origin, month) by_monthly_origin Learning check (LC5.5) Recall from Chapter 3 when we looked at plots of temperatures by months in NYC. What does the standard deviation column in the summary_monthly_temp data frame tell us about temperatures in New York City throughout the year? (LC5.6) What code would be required to get the mean and standard deviation temperature for each day in 2013 for NYC? (LC5.7) Recreate by_monthly_origin, but instead of grouping via group_by(origin, month), group variables in a different order group_by(month, origin). What differs in the resulting dataset? (LC5.8) How could we identify how many flights left each of the three airports for each carrier? (LC5.9) How does the filter operation differ from a group_by followed by a summarize? 5.6 5MV#4: Create new variables/change old variables using mutate Figure 5.5: Mutate diagram from Data Wrangling with dplyr and tidyr cheatsheet When looking at the flights dataset, there are some clear additional variables that could be calculated based on the values of variables already in the dataset. Passengers are often frustrated when their flights departs late, but change their mood a bit if pilots can make up some time during the flight to get them to their destination close to when they expected to land. This is commonly referred to as “gain” and we will create this variable using the mutate function. Note that we have also overwritten the flights data frame with what it was before as well as an additional variable gain here, or put differently, the mutate() command outputs a new data frame which then gets saved over the original flights data frame. flights <- flights %>% mutate(gain = dep_delay - arr_delay) Why did we overwrite flights instead of assigning the resulting data frame to a new object, like flights_with_gain? As a rough rule of thumb, as long as you are not losing information that you might need later, it’s acceptable practice to overwrite data frames. However, if you overwrite existing variables and/or change the observational units, recovering the original information might prove difficult. In this case, it might make sense to create a new data object. Let’s look at summary measures of this gain variable and even plot it in the form of a histogram: gain_summary <- flights %>% summarize( min = min(gain, na.rm = TRUE), q1 = quantile(gain, 0.25, na.rm = TRUE), median = quantile(gain, 0.5, na.rm = TRUE), q3 = quantile(gain, 0.75, na.rm = TRUE), max = max(gain, na.rm = TRUE), mean = mean(gain, na.rm = TRUE), sd = sd(gain, na.rm = TRUE), missing = sum(is.na(gain)) ) kable(gain_summary) min q1 median q3 max mean sd missing -196 -3 7 17 109 5.66 18.04 9430 We’ve recreated the summary function we saw in Chapter 3 here using the summarize function in dplyr. 
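For comparison, a minimal sketch of the base R equivalent is the one-liner below; it reports the minimum, quartiles, mean, maximum, and the number of NA's, though not the standard deviation computed above.
summary(flights$gain)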
ggplot(data = flights, mapping = aes(x = gain)) + geom_histogram(color = "white", bins = 20) Figure 5.6: Histogram of gain variable We can also create multiple columns at once and even refer to columns that were just created in a new column. Hadley and Garrett produce one such example in Chapter 5 of “R for Data Science” (Grolemund and Wickham 2016): flights <- flights %>% mutate( gain = dep_delay - arr_delay, hours = air_time / 60, gain_per_hour = gain / hours ) Learning check (LC5.10) What do positive values of the gain variable in flights correspond to? What about negative values? And what about a zero value? (LC5.11) Could we create the dep_delay and arr_delay columns by simply subtracting dep_time from sched_dep_time and similarly for arrivals? Try the code out and explain any differences between the result and what actually appears in flights. (LC5.12) What can we say about the distribution of gain? Describe it in a few sentences using the plot and the gain_summary data frame values. 5.7 5MV#5: Reorder the data frame using arrange One of the most common things people working with data would like to do is sort the data frames by a specific variable in a column. Have you ever been asked to calculate a median by hand? This requires you to put the data in order from smallest to highest in value. The dplyr package has a function called arrange that we will use to sort/reorder our data according to the values of the specified variable. This is often used after we have used the group_by and summarize functions as we will see. Let’s suppose we were interested in determining the most frequent destination airports from New York City in 2013: freq_dest <- flights %>% group_by(dest) %>% summarize(num_flights = n()) freq_dest # A tibble: 105 x 2 dest num_flights <chr> <int> 1 ABQ 254 2 ACK 265 3 ALB 439 4 ANC 8 5 ATL 17215 6 AUS 2439 7 AVL 275 8 BDL 443 9 BGR 375 10 BHM 297 # ... with 95 more rows You’ll see that by default the values of dest are displayed in alphabetical order here. We are interested in finding those airports that appear most: freq_dest %>% arrange(num_flights) # A tibble: 105 x 2 dest num_flights <chr> <int> 1 LEX 1 2 LGA 1 3 ANC 8 4 SBN 10 5 HDN 15 6 MTJ 15 7 EYW 17 8 PSP 19 9 JAC 25 10 BZN 36 # ... with 95 more rows This is actually giving us the opposite of what we are looking for. It tells us the least frequent destination airports first. To switch the ordering to be descending instead of ascending we use the desc (descending) function: freq_dest %>% arrange(desc(num_flights)) # A tibble: 105 x 2 dest num_flights <chr> <int> 1 ORD 17283 2 ATL 17215 3 LAX 16174 4 BOS 15508 5 MCO 14082 6 CLT 14064 7 SFO 13331 8 FLL 12055 9 MIA 11728 10 DCA 9705 # ... with 95 more rows 5.8 Joining data frames Another common task is joining/merging two different datasets. For example, in the flights data, the variable carrier lists the carrier code for the different flights. While "UA" and "AA" might be somewhat easy to guess for some (United and American Airlines), what are “VX”, “HA”, and “B6”? This information is provided in a separate data frame airlines. View(airlines) We see that in airports, carrier is the carrier code while name is the full name of the airline. Using this table, we can see that “VX”, “HA”, and “B6” correspond to Virgin America, Hawaiian Airlines, and JetBlue respectively. However, will we have to continually look up the carrier’s name for each flight in the airlines dataset? No! 
Instead of having to do this manually, we can have R automatically do the “looking up” for us. Note that the values in the variable carrier in flights match the values in the variable carrier in airlines. In this case, we can use the variable carrier as a key variable to join/merge/match the two data frames by. Hadley and Garrett (Grolemund and Wickham 2016) created the following diagram to help us understand how the different datasets are linked: Figure 5.7: Data relationships in nycflights13 from R for Data Science 5.8.1 Joining by key variables In both flights and airlines, the key variable we want to join/merge/match the two data frames with has the same name in both datasets: carrier. We make use of the inner_join() function to join by the variable carrier. flights_joined <- flights %>% inner_join(airlines, by = "carrier") View(flights) View(flights_joined) We observe that flights and flights_joined are identical except that flights_joined has an additional variable name whose values were drawn from airlines. A visual representation of the inner_join is given below (Grolemund and Wickham 2016): Figure 5.8: Diagram of inner join from R for Data Science There are more complex joins available, but the inner_join will solve nearly all of the problems you’ll face in our experience. 5.8.2 Joining by key variables with different names Say instead you are interested in all the destinations of flights from NYC in 2013 and ask yourself: “What cities are these airports in?” “Is "ORD" Orlando?” “Where is "FLL"?” The airports data frame contains airport codes: View(airports) However, looking at both the airports and flights data frames and the visual representation of the relations between them in Figure 5.7, we see that in: airports the airport code is in the variable faa flights the airport code of the destination is in the variable dest So to join these two datasets, our inner_join operation involves a by argument that accounts for the different names: flights %>% inner_join(airports, by = c("dest" = "faa")) Let’s construct the sequence of commands that computes the number of flights from NYC to each destination, but also includes information about each destination airport: named_dests <- flights %>% group_by(dest) %>% summarize(num_flights = n()) %>% arrange(desc(num_flights)) %>% inner_join(airports, by = c("dest" = "faa")) %>% rename(airport_name = name) View(named_dests) In case you didn’t know, "ORD" is the airport code of Chicago O’Hare airport and "FLL" is the main airport in Fort Lauderdale, Florida, which we can now see in our named_dests data frame. Learning check (LC5.13) Looking at Figure 5.7, when joining flights and weather (or, in other words, matching the hourly weather values with each flight), why do we need to join by all of year, month, day, hour, and origin, and not just hour? (LC5.14) What surprises you about the top 10 destinations from NYC in 2013? 5.9 Optional: Other verbs On top of the following examples of other verbs, if you’d like to see more examples of using dplyr, the 5MV, and %>% with the nycflights13 dataset, check out Chapter 5 of Hadley and Garrett’s book (Grolemund and Wickham 2016). 5.9.1 Select variables using select Figure 5.9: Select diagram from Data Wrangling with dplyr and tidyr cheatsheet We’ve seen that the flights data frame in the nycflights13 package contains many different variables. The names function gives a listing of all the columns in a data frame; in our case you would run names(flights). 
You can also identify these variables by running the glimpse function in the dplyr package: glimpse(flights) However, suppose you only want to consider two of these variables, say carrier and flight. You can select these: flights %>% select(carrier, flight) Another one of these variables is year. If you remember the original description of the flights data frame (or by running ?flights), you’ll remember that this data corresponds to flights in 2013 departing New York City. The year variable isn’t really a variable here in that it doesn’t vary… flights actually comes from a larger dataset that covers many years. We may want to remove the year variable from our dataset since it won’t be helpful for analysis in this case. We can deselect year by using the - sign: flights_no_year <- flights %>% select(-year) names(flights_no_year) Or we could specify a range of columns: flight_arr_times <- flights %>% select(month:day, arr_time:sched_arr_time) flight_arr_times The select function can also be used to reorder columns in combination with the everything helper function. Let’s suppose we’d like the hour, minute, and time_hour variables, which appear at the end of the flights dataset, to actually appear immediately after the day variable: flights_reorder <- flights %>% select(month:day, hour:time_hour, everything()) names(flights_reorder) In this case, everything() picks up all remaining variables. Lastly, the helper functions starts_with, ends_with, and contains can be used to choose column names that match those conditions: flights_begin_a <- flights %>% select(starts_with("a")) flights_begin_a flights_delays <- flights %>% select(ends_with("delay")) flights_delays flights_time <- flights %>% select(contains("time")) flights_time 5.9.2 Rename variables using rename Another useful function is rename, which, as you may suspect, renames one column to another name. Suppose we wanted dep_time and arr_time to be departure_time and arrival_time instead in the flights_time data frame: flights_time_new <- flights %>% select(contains("time")) %>% rename(departure_time = dep_time, arrival_time = arr_time) names(flights_time_new) Note that in this case we used a single = sign with the rename(). Ex: departure_time = dep_time. This is because we are not testing for equality like we would using ==, but instead we want to assign a new variable departure_time to have the same values as dep_time and then drop the old name dep_time. It’s easy to forget if the new name comes before or after the equals sign. I usually remember this as “New Before, Old After” or NBOA. You’ll receive an error if you try to do it the other way: Error: Unknown variables: departure_time, arrival_time. 5.9.3 Find the top number of values using top_n We can also use the top_n function, which automatically picks out the rows with the largest values of a variable; here we use it to get the 10 airports with the largest num_flights: named_dests %>% top_n(n = 10, wt = num_flights) We’ll still need to arrange this by num_flights though: named_dests %>% top_n(n = 10, wt = num_flights) %>% arrange(desc(num_flights)) Note: Remember that I didn’t pull the n and wt arguments out of thin air. They can be found by using the ? function on top_n. 
We can go one stop further and tie together the group_by and summarize functions we used to find the most frequent flights: ten_freq_dests <- flights %>% group_by(dest) %>% summarize(num_flights = n()) %>% arrange(desc(num_flights)) %>% top_n(n = 10) View(ten_freq_dests) Learning check (LC5.15) What are some ways to select all three of the dest, air_time, and distance variables from flights? Give the code showing how to do this in at least three different ways. (LC5.16) How could one use starts_with, ends_with, and contains to select columns from the flights data frame? Provide three different examples in total: one for starts_with, one for ends_with, and one for contains. (LC5.17) Why might we want to use the select function on a data frame? (LC5.18) Create a new data frame that shows the top 5 airports with the largest arrival delays from NYC in 2013. 5.10 Conclusion 5.10.1 Review questions Review questions have been designed using the fivethirtyeight R package (Ismay and Chunn 2017) with links to the corresponding FiveThirtyEight.com articles in our free DataCamp course Effective Data Storytelling using the tidyverse. The material in this chapter is covered in the chapters of the DataCamp course available below: Filtering, Grouping, & Summarizing dplyr Review 5.10.2 What’s to come? Congratulations! We’ve completed the “data science” portion of this book! We’ll now move to the “data modeling” portion in Chapters 6 and 7, where you’ll leverage your data visualization and wrangling skills to model the relationships between different variables of datasets. However, we’re going to leave “Inference for Regression” (Chapter 11) until later. 5.10.3 Resources As we saw with the RStudio cheatsheet on data visualization, RStudio has also created a cheatsheet for data wrangling entitled “Data Transformation with dplyr”. 5.10.4 Script of R code An R script file of all R code used in this chapter is available here. "], -["6-regression.html", "6 Basic Regression 6.1 One numerical explanatory variable 6.2 One categorical explanatory variable 6.3 Related topics 6.4 Conclusion", " 6 Basic Regression Now that we are equipped with data visualization skills from Chapter 3, data wrangling skills from Chapter 5, and an understanding of the “tidy” data format from Chapter 4, we now proceed with data modeling. The fundamental premise of data modeling is to make explicit the relationship between: An outcome variable \\(y\\), also called a dependent variable and An explanatory/predictor variable \\(x\\), also called an independent variable or covariate. Another way to state this is using mathematical terminology: we will model the outcome variable \\(y\\) as a function of the explanatory/predictor variable \\(x\\). Why do we have two different labels, explanatory and predictor, for the variable \\(x\\)? That’s because roughly speaking data modeling can be used for two purposes: Modeling for prediction: You want to predict an outcome variable \\(y\\) based on the information contained in a set of predictor variables. You don’t care so much about understanding how all the variables relate and interact, but so long as you can make good predictions about \\(y\\), you’re fine. For example, if we know many individuals’ risk factors for lung cancer, such as smoking habits and age, can we predict whether or not they will develop lung cancer? 
Here we wouldn’t care so much about distinguishing the degree to which the different risk factors contribute to lung cancer, but instead only on whether or not they could be put together to make reliable predictions. Modeling for explanation: You want to explicitly describe the relationship between an outcome variable \\(y\\) and a set of explanatory variables, determine the significance of any found relationships, and have measures summarizing these. Continuing our example from above, we would now be interested in describing the individual effects of the different risk factors and quantifying the magnitude of these effects. One reason could be to design an intervention to reduce lung cancer cases in a population, such as targeting smokers of a specific age group with an advertisement for smoking cessation programs. In this book, we’ll focus more on this latter purpose. Data modeling is used in a wide variety of fields, including statistical inference, causal inference, artificial intelligence, and machine learning. There are many techniques for data modeling, such as tree-based models, neural networks/deep learning, and more. However, we’ll focus on one particular technique: linear regression, one of the most commonly-used and easy-to-understand approaches to modeling. Recall our discussion in Subsection 2.4.3 on numerical and categorical variables. Linear regression involves: An outcome variable \\(y\\) that is numerical Explanatory variables \\(\\vec{x}\\) that are either numerical or categorical Whereas there is always only one numerical outcome variable \\(y\\), we have choices on both the number and the type of explanatory variables \\(\\vec{x}\\) to use. We’re going to cover the following regression scenarios: In this chapter, Chapter 6 on basic regression, where we’ll always have only one explanatory variable: A single numerical explanatory variable \\(x\\) in Section 6.1. This scenario is known as simple linear regression. A single categorical explanatory variable \\(x\\) in Section 6.2. In the next chapter: Chapter 7 on multiple regression, where we’ll have more than one explanatory variable: Two numerical explanatory variables \\(x_1\\) and \\(x_2\\) in Section 7.1. This can be denoted as \\(\\vec{x}\\) as well since we have more than one explanatory variable. One numerical and one categorical explanatory variable in Section 7.1. We’ll also introduce interaction models here; there the effect of one explanatory variable depends on the value of another. We’ll study all four of these regression scenarios using real data, all easily accessible via R packages! Needed packages In this chapter we introduce a new package, moderndive, that is an accompaniment package to this ModernDive book that includes useful functions for linear regression and other functions and data used later in the book. Let’s now load all the packages needed for this chapter. If needed, read Section 2.3 for information on how to install and load R packages. library(ggplot2) library(dplyr) library(moderndive) library(gapminder) 6.1 One numerical explanatory variable Why do some professors and instructors at universities and colleges get high teaching evaluations from students while others don’t? What factors can explain these differences? Are there biases? These are questions that are of interest to university/college administrators, as teaching evaluations are among the many criteria considered in determining which professors and instructors should get promotions. 
Researchers at the University of Texas in Austin tried to answer this question: what factors can explain differences in instructor’s teaching evaluation scores? To this end, they collected information on \\(n = 463\\) instructors. A full description of the study can be found at openintro.org. We’ll keep things simple for now and try to explain differences in instructor evaluation scores as a function of one numerical variable: their “beauty score” which we’ll describe shortly. Could it be that instructors with higher beauty scores also have higher teaching evaluations? Could it be instead that instructors with higher beauty scores tend to have lower teaching evaluations? Or could it be there is no relationship between beauty score and teaching evaluations? We’ll achieve this by modeling the relationship between these two variables with a particular kind of linear regression called simple linear regression. Simple linear regression is the most basic form of linear regression where we have A numerical outcome variable \\(y\\). In this case, their teaching score. A single numerical explanatory variable \\(x\\). In this case, their beauty score. 6.1.1 Exploratory data analysis A crucial step before doing any kind of modeling or analysis is performing an exploratory data analysis, or EDA, of all our data. Exploratory data analysis can give you a sense of the distribution of the data, whether there are outliers and/or missing values, but most importantly it can inform how to build your model. There are many approaches to exploratory data analysis, here are three: Most fundamentally: just looking at the raw values, in a spreadsheet for example. While this may seem trivial, many people ignore this crucial step! Computing summary statistics likes means, medians, and standard deviations. Creating data visualizations. Let’s load the data, select only a subset of the variables, and look at the raw values. Recall you can look at the raw values by running View(evals) in the console in RStudio to pop-up the spreadsheet viewer. Here, however, we present only a snapshot of 5 randomly chosen rows: load(url("http://www.openintro.org/stat/data/evals.RData")) evals <- evals %>% select(score, bty_avg, age) Table 6.1: Random sample of 5 instructors score bty_avg age 290 3.6 6.67 34 341 4.9 3.50 43 199 3.3 2.33 47 47 4.4 4.67 33 215 4.7 3.67 60 While a full description of each of these variables can be found at openintro.org, let’s summarize what each of these variables represent score: Numerical variable of the average teaching score based on students’ evaluations between 1 and 5. This is the outcome variable \\(y\\) of interest. bty_avg: Numerical variable of average “beauty” rating based on a panel of 6 students’ scores between 1 and 10. This is the numerical explanatory variable \\(x\\) of interest. age: A numerical variable of age. Another way to look at the raw values is using the glimpse() function, which gives us a slightly different view of the data. We see Observations: 463, indicating that there are 463 observations in evals, each corresponding to a particular instructor at UT Austin. Expressed differently, each row in the data frame evals corresponds to one of 463 instructors. glimpse(evals) Observations: 463 Variables: 3 $ score <dbl> 4.7, 4.1, 3.9, 4.8, 4.6, 4.3, 2.8, 4.1, 3.4, 4.5, 3.8, 4.5,... $ bty_avg <dbl> 5.00, 5.00, 5.00, 5.00, 3.00, 3.00, 3.00, 3.33, 3.33, 3.17,... $ age <int> 36, 36, 36, 36, 59, 59, 59, 51, 51, 40, 40, 40, 40, 40, 40,... 
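As an aside, a random 5-row snapshot like the one in Table 6.1 can be produced with dplyr's sample_n() function; this is a sketch of one way to do it, not necessarily the exact code behind the table (your 5 rows will differ since they are drawn at random):
evals %>% sample_n(5)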
Since both the outcome variable score and the explanatory variable bty_avg are numerical, we can compute summary statistics about them such as the mean and median. Let’s take evals, then select only the two variables of interest for now, and pipe them into the summary() command which returns: the minimum (smallest) value, the first quartile, the median, the mean (average), the third quartile, and the maximum (largest) value. evals %>% select(score, bty_avg) %>% summary() score bty_avg Min. :2.30 Min. :1.67 1st Qu.:3.80 1st Qu.:3.17 Median :4.30 Median :4.33 Mean :4.17 Mean :4.42 3rd Qu.:4.60 3rd Qu.:5.50 Max. :5.00 Max. :8.17 We get an idea of how the values in both variables are distributed. For example, the mean teaching score was 4.17 out of 5 whereas the mean beauty score was 4.42 out of 10. Furthermore, the middle 50% of teaching scores were between 3.80 and 4.6 (the first and third quartiles) while the middle 50% of beauty scores were between 3.17 and 5.5 out of 10. The summary() function however only returns what are called univariate summaries, i.e. summaries about single variables at a time. Since we are considering the relationship between two numerical variables, it would be nice to have a summary statistic that simultaneously considers both variables. The correlation coefficient is a bivariate summary statistic that fits this bill. Coefficients in general are quantitative expressions of a specific property of a phenomenon. A correlation coefficient is a quantitative expression between -1 and 1 that summarizes the strength of the linear relationship between two numerical variables: -1 indicates a perfect negative relationship: as the value of one variable goes up, the value of the other variable tends to go down. 0 indicates no relationship: the values of both variables go up/down independently of each other. +1 indicates a perfect positive relationship: as the value of one variable goes up, the value of the other variable tends to go up as well. Figure 6.1 gives examples of different correlation coefficient values for hypothetical numerical variables \\(x\\) and \\(y\\). We see that while for a correlation coefficient of -0.75 there is still a negative relationship between \\(x\\) and \\(y\\), it is not as strong as the negative relationship between \\(x\\) and \\(y\\) when the correlation coefficient is -1. Figure 6.1: Different correlation coefficients The correlation coefficient is computed using the cor() function, where in this case the inputs to the function are the two numerical variables from which we want to calculate the correlation coefficient. Recall from Subsection 2.4.3 that the $ pulls out specific variables from a data frame: cor(evals$score, evals$bty_avg) [1] 0.187 In our case, the correlation coefficient of 0.187 indicates that the relationship between teaching evaluation score and beauty average is “weakly positive.” There is a certain amount of subjectivity in interpreting correlation coefficients, especially those that aren’t close to -1, 0, and 1. For help developing such intuition and more discussion on the correlation coefficient see Subsection 6.3.1 below. Let’s now proceed by visualizing this data. Since both the score and bty_avg variables are numerical, a scatterplot is an appropriate graph to visualize this data. Let’s do this using geom_point() and set informative axes labels and title. 
ggplot(evals, aes(x = bty_avg, y = score)) + geom_point() + labs(x = "Beauty Score", y = "Teaching Score", title = "Relationship of teaching and beauty scores") Figure 6.2: Instructor evaluation scores at UT Austin However Figure 6.2 suffers from overplotting. Recall from the data visualization Subsection 3.3.2 that overplotting occurs when several points are stacked directly on top of each other thereby obscuring the number of points. For example, let’s focus on the 6 points in the top-right of the plot with a beauty score of around 8 out of 10: are there truly only 6 points, or are there many more just stacked on top of each other? You can think of these as ties. Let’s break up these ties with a little random “jitter” added to the points in Figure 6.3. Jittering adds a little random bump to each of the points to break up these ties. Remember that the geom_jitter only alters the visual display of the points; the values in the data frame stay the same. ggplot(evals, aes(x = bty_avg, y = score)) + geom_jitter() + labs(x = "Beauty Score", y = "Teaching Score", title = "Relationship of teaching and beauty scores") Figure 6.3: Instructor evaluation scores at UT Austin: Jittered From Figure 6.3 we make several observations: Focusing our attention on the top-right of the plot again, we now see that those originally unjittered 6 points actually were actually 12! A further interesting trend is that the jittering revealed a large number of instructors with beauty scores of between 3 and 4.5, towards the lower end of the beauty scale. Most beauty scores lie between 2 and 8. Most teaching scores lie between 3 and 5. Recall our earlier computation of the correlation coefficient, which describes the strength of the linear relationship between two numerical variables. Looking at Figure 6.3, it is not immediately apparent that these two variables are positively related. This is to be expected given the positive, but rather weak (close to 0), correlation coefficient of 0.187. Going back to the unjittered plot in Figure 6.2, let’s improve on it by adding a “regression line” in Figure 6.4. This is easily done by adding a new layer to the ggplot code that created Figure 6.3: + geom_smooth(method="lm"). A regression line is a “best fitting” line in that of all possible lines you could draw on this plot, it is “best” in terms of some mathematical criteria. We discuss the criteria for “best” in Subsection 6.3.3 below, but we suggest you read this only after covering the concept of a residual coming up in Subsection 6.1.3. ggplot(evals, aes(x = bty_avg, y = score)) + geom_point() + labs(x = "Beauty Score", y = "Teaching Score", title = "Relationship of teaching and beauty scores") + geom_smooth(method = "lm") Figure 6.4: Regression line When viewed on this plot, the regression line is a visual summary of the relationship between two numerical variables, in our case the outcome variable score and the explanatory variable bty_avg. The positive slope of the blue line is consistent with our observed correlation coefficient of 0.187 suggesting that there is a positive relationship between score and bty_avg. We’ll see later however that while the correlation coefficient is not equal to the slope of this line, they always have the same sign: positive or negative. What are the grey bands surrounding the blue line? These are standard error bands, which can be thought of as error/uncertainty bands. 
Let’s skip this idea for now and suppress these grey bars for now by adding the argument se = FALSE to geom_smooth(method = "lm"). We’ll introduce standard errors in Chapter 8 on sampling, use them for constructing confidence intervals and conducting hypothesis tests in Chapters 9 and 10, and consider them when we revisit regression in Chapter 11. ggplot(evals, aes(x = bty_avg, y = score)) + geom_point() + labs(x = "Beauty Score", y = "Teaching Score", title = "Relationship of teaching and beauty scores") + geom_smooth(method = "lm", se = FALSE) Figure 6.5: Regression line without error bands Learning check (LC6.1) Conduct a new exploratory data analysis with the same outcome variable \\(y\\) being score but with age as the new explanatory variable \\(x\\). Remember, this involves three things: Looking at the raw values Computing summary statistics of the variables of interest. Creating informative visualizations What can you say about the relationship between age and teaching scores based on this exploration? 6.1.2 Simple linear regression If case you’ve forgotten from high school algebra, in general, the equation of a line is \\(y = a + bx\\), which is defined by two coefficients. Recall we defined this earlier as “quantitative expressions of a specific property of a phenomenon. These two coefficients are: the intercept coefficient \\(a\\), or the value of \\(y\\) when \\(x = 0\\), and the slope coefficient \\(b\\), or the increase in \\(y\\) for every increase of one in \\(x\\). However, when defining a line specifically for regression, like the blue regression line in Figure 6.5, we use slightly different notation: the equation of the regression line is \\(\\widehat{y} = b_0 + b_1 x\\) where the intercept coefficient is \\(b_0\\), or the value of \\(\\widehat{y}\\) when \\(x=0\\), and the slope coefficient \\(b_1\\), or the increase in \\(\\widehat{y}\\) for every increase of one in \\(x\\). Why do we put a “hat” on top of the \\(y\\)? It’s a form of notation commonly used in regression, which we’ll introduce in the next Subsection 6.1.3 when we discuss fitted values. For now, let’s ignore the hat and treat the equation of the line as you would from high school algebra recognizing the slope and the intercept. We know looking at Figure 6.5 that the slope coefficient corresponding to bty_avg should be positive. Why? Because as bty_avg increases, professors tend to roughly have larger teaching evaluation scores. However, what are the specific values of the intercept and slope coefficients? Let’s not worry about computing these by hand, but instead let the computer do the work for us, specifically R! Let’s get the value of the intercept and slope coefficients by outputting something called the linear regression table. This is always done in a two-step process: First “fit” the linear regression model to the data using the lm() function and save this to score_model. lm stands for “linear model”, given that we are dealing with lines. When we say “fit”, we are saying find the best fitting line to this data. Then apply the get_regression_table() function from the moderndive R package to score_model. score_model <- lm(score ~ bty_avg, data = evals) get_regression_table(score_model, digits = 2) Table 6.2: Linear regression table term estimate std_error statistic p_value conf_low conf_high intercept 3.880 0.076 50.96 0 3.731 4.030 bty_avg 0.067 0.016 4.09 0 0.035 0.099 Whoa! There is a lot going on, both in terms of the inputs and outputs! Let’s unpack this slowly. 
First, the lm() function that “fits” the linear regression model is typically used as lm(y ~ x, data = DATA_FRAME_NAME) where: y is the outcome variable, followed by a tilde (~), the key to the left of “1” on your keyboard. In our case, y is set to score. x is the explanatory variable. In our case, x is set to bty_avg. We call the combination y ~ x a model formula. DATA_FRAME_NAME is the name of the data frame that contains the variables y and x. In our case, the evals data frame. Then we pass this saved output, score_model, to the get_regression_table() function as its input, in the same spirit as the piping we discussed in Section 5.1 in the data wrangling chapter. An additional argument to the get_regression_table() function is digits, where we specify the number of digits of precision (number of digits after the decimal point) we want the regression table to have. digits defaults to 3, meaning if you don’t specify this argument, digits = 3 is used by default. All the get_regression_table() function in the moderndive package does is generate regression table outputs that are clean and easy-to-read while hiding a lot of the code necessary to do so and not much else. This is known as a wrapper function in computer programming, which takes other pre-existing functions and “wraps” them in a single function. While not necessary to understand regression, if you are curious to know what is going on under the hood of get_regression_table(), see Subsection 6.3.4 below. Now let’s consider the outputted regression table, which has two rows denoted by the first column term: one corresponding to the intercept coefficient \\(b_0\\) and one corresponding to the slope coefficient \\(b_1\\) for bty_avg. The second column estimate gives us the “fitted” (or computed) values for both these coefficients. Therefore the blue regression line in Figure 6.5 is \\(\\widehat{\\text{score}} = b_0 + b_{\\text{bty avg}} \\cdot \\text{bty avg} = 3.88 + 0.067 \\cdot \\text{bty avg}\\) where The intercept coefficient \\(b_0\\) = 3.88, meaning that instructors with a hypothetical beauty score of 0 would, on average, have a teaching score of 3.88. In this case however, while the intercept has a mathematical interpretation when defining the regression line, it has no practical interpretation: since bty_avg is an average of a panel of 6 students’ ratings from 1 to 10, a bty_avg of 0 is impossible. Furthermore, no instructors had a beauty score anywhere near 0. Of more interest is the slope coefficient associated with bty_avg: \\(b_{\\text{bty avg}}\\) = 0.067. This is a numerical quantity that summarizes the relationship between the outcome and explanatory variables. It is interpreted as follows: for every increase of 1 unit in bty_avg, there is an associated increase of, on average, 0.067 units of score. We note in particular that the sign of this slope is positive, suggesting a positive relationship between beauty scores and teaching scores. We are very careful with our wording: We only stated that there is an associated increase, and not necessarily a causal increase. For example, perhaps it’s not that beauty directly affects teaching scores, but instead individuals from wealthier backgrounds tend to have had better education and training, and hence have higher teaching scores, but these same individuals also have higher beauty scores. Avoiding such reasoning can be summarized by the adage “correlation is not necessarily causation”. In other words, just because two variables are correlated, it doesn’t mean one directly causes the other. 
We discuss these ideas more in Subsection 6.3.2. We say that this associated increase is on average 0.067 units of teaching score and not that the associated increase is exactly 0.067 units of score across all values of bty_avg. This is because the slope is the average increase across all points as shown by the regression line in Figure 6.5. But what about the remaining 5 columns: std_error, statistic, p_value, conf_low and conf_high? They give you information on the statistical significance of these results, or their “meaningfulness” from a statistical perspective. We’ll revisit these in Chapter 11 on (statistical) inference for regression after we’ve covered standard errors in Chapter 8 (std_error), confidence intervals in Chapter 9 (conf_low and conf_high), and hypothesis testing in Chapter 10 (statistic and p_value). For now, we’ll only focus on the term and estimate columns. Learning check (LC6.2) Fit a new simple linear regression using lm(score ~ age, data = evals) where age is the new explanatory variable \\(x\\). Get information about the “best-fitting” line from the regression table by applying the get_regression_table() function. How do the regression results match up with the results from your exploratory data analysis above? 6.1.3 Observed/fitted values and residuals We just saw how to get the value of the intercept and the slope of the regression line from the regression table generated by get_regression_table(). Now instead, say we want information on individual points, in this case one of the \\(n = 463\\) instructors in this dataset, one corresponding to each row of evals. For example, say we are interested in the 21st instructor in this dataset: Table 6.3: Data for 21st instructor score bty_avg age 4.9 7.33 31 What is the value on the blue line corresponding to this instructors bty_avg of 7.333? In Figure 6.6 we mark three values in particular corresponding to this instructor. Note we revert back to the geom_point() as the geom_jitter() has random noise added to teach point, making it difficult to identify points exactly. Red circle: This is the observed value \\(y\\) = 4.9 and corresponds to this instructor’s actual teaching score. Red square: This is the fitted value \\(\\widehat{y}\\) and corresponds to the value on the regression line for \\(x\\) = 7.333. This value is computed using the intercept and slope in the regression table above: \\(\\widehat{y} = b_0 + b_1 x\\) = 3.88 + 0.067 * 7.333 = 4.369 Blue arrow: The length of this arrow is the residual and is computed by subtracting the fitted value \\(\\widehat{y}\\) from the observed value \\(y\\). The residual can be thought of as the error or “lack of fit” of the regression line, In the case of this instructor, it is \\(y - \\widehat{y}\\) = 4.9 - 4.369 = 0.531. In other words, the model was off by 0.531 teaching score units for this instructor. Figure 6.6: Example of observed value, fitted value, and residual What if we want both the fitted value \\(\\widehat{y} = b_0 + b_1 \\times x\\) the residual \\(y - \\widehat{y}\\) not only the 21st instructor but for all 463 instructors in the study? Recall that each instructor corresponds to one of the 463 rows in the evals data frame and also one of the 463 points in regression plot in Figure 6.5. We could repeat the above calculations by hand 463 times, but that would be tedious and time consuming. Instead, let’s use the get_regression_points() function that we’ve included in the moderndive R package. 
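Before doing so, here is a quick sanity check of the by-hand arithmetic for the 21st instructor; this is only a sketch using the rounded estimates from Table 6.2, so the results match the values above only up to rounding error.
b0 <- 3.880                 # intercept estimate from the regression table
b1 <- 0.067                 # slope estimate for bty_avg
y_hat <- b0 + b1 * 7.333    # fitted value: approximately 4.371
4.9 - y_hat                 # residual: approximately 0.529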
Note that in the table below we only present the results for 21st through 24th instructors. regression_points <- get_regression_points(score_model) regression_points Table 6.4: Regression points (for only 21st through 24th instructor) ID score bty_avg score_hat residual 21 4.9 7.33 4.37 0.531 22 4.6 7.33 4.37 0.231 23 4.5 7.33 4.37 0.131 24 4.4 5.50 4.25 0.153 Just as with the get_regression_table() function, the inputs to the get_regression_points() function are the same, however the outputs are different. Let’s inspect the individual columns: The score column represents the observed value of the outcome variable \\(y\\) The bty_avg column represents the values of the explanatory variable \\(x\\) The score_hat column represents the fitted values \\(\\widehat{y}\\) The residual column represents the residuals \\(y - \\widehat{y}\\) Just as we did for the 21st instructor in the evals dataset (in the first row of the table above), let’s repeat the above calculations for the 24th instructor in the evals dataset (in the fourth row of the table above): score = 4.4 is the observed value \\(y\\) for this instructor. bty_avg = 5.50 is the value of the explanatory variable \\(x\\) for this instructor. score_hat = 4.25 = 3.88 + 0.067 * \\(x\\) = 3.88 + 0.067 * 5.50 is the fitted value \\(\\widehat{y}\\) for this instructor. residual = 0.153 = 4.4 - 4.25 is the value of the residual for this instructor. In other words, the model was off by 0.153 teaching score units for this instructor. At this point, we suggest you read Subsection 6.3.3, where we explicitly define how a regression line is a “best” fitting line. 6.1.4 Residual analysis Recall the residuals can be thought of as the error or the “lack-of-fit” between the observed value \\(y\\) and the fitted value \\(\\widehat{y}\\) on the blue regression line in Figure 6.5. Ideally when we fit a regression model, we’d like there to be no systematic pattern to these residuals. We’ll be more specific as to what we mean by no systematic pattern when we see Figure 6.8 below, but let’s keep this notion imprecise for now. Investigating any such patterns is known as residual analysis and is the theme of this section. We’ll perform our residual analysis in two ways: Creating a scatterplot with the residuals on the \\(y\\)-axis and the original explanatory variable \\(x\\) on the \\(x\\)-axis. Creating a histogram of the residuals, thereby showing the distribution of the residuals. First, recall in Figure 6.6 above we created a scatterplot where On the vertical axis we had the teaching score \\(y\\) On the horizontal axis we had the beauty score \\(x\\) The blue arrow represented the residual for one particular instructor. Instead, in Figure 6.7 below, let’s create a scatterplot where On the vertical axis we have the residual \\(y-\\widehat{y}\\) instead On the horizontal axis we have the beauty score \\(x\\) as before Figure 6.7: Plot of residuals over beauty score You can think of Figure 6.7 as Figure 6.6 but with the blue line flattened out to \\(y=0\\). Does it seem like there is no systematic pattern to the residuals? This question is rather qualitative and subjective in nature, thus different people may respond with different answers to the above question. However, it can be argued that there isn’t a drastic pattern in the residuals. Let’s now get a little more precise in our definition of no systematic pattern in the residuals. Ideally, the residuals should behave randomly and The residuals should be on average 0. 
In other words, sometimes the regression model will make a positive error in that \\(y - \\widehat{y} > 0\\), sometimes the regression model will make a negative error in that \\(y - \\widehat{y} < 0\\), but on average the error is 0. The value and spread of the residuals should not depend on the value of \\(x\\). In Figure 6.8 below, we display some hypothetical examples where there are drastic patterns to the residuals. In Example 1, the value of the residual seems to depend on \\(x\\): the residuals tend to be positive for small and large values of \\(x\\) in this range, whereas values of \\(x\\) more in the middle tend to have negative residuals. In Example 2, while the residuals seem to be on average 0 for each value of \\(x\\), the spread of the residuals varies for different values of \\(x\\); this situation is known as heteroskedasticity. Figure 6.8: Examples of less than ideal residual patterns The second way to perform a residual analysis is to look at the histogram of the residuals: ggplot(regression_points, aes(x = residual)) + geom_histogram(binwidth = 0.25, color = "white") + labs(x = "Residual") Figure: Histogram of residuals This histogram seems to indicate that we have more positive residuals than negative. Since residual = \\(y-\\widehat{y} > 0\\) when \\(y > \\widehat{y}\\), it seems our fitted teaching score from the regression model tends to underestimate the true teaching score. This histogram has a slight left-skew in that there is a long tail on the left. Another way to say this is that this data exhibits a negative skew. Is this a problem? Again, there is a certain amount of subjectivity in the response. In the authors’ opinion, while there is a slight skew/pattern to the residuals, it isn’t a large concern. On the other hand, others might disagree with our assessment. Here are examples of an ideal and less than ideal pattern to the residuals when viewed in a histogram: Figure 6.9: Examples of ideal and less than ideal residual patterns In fact, we’ll see later on that we would like the residuals to be normally distributed with mean 0. In other words, be bell-shaped and centered at 0! While this requirement and residual analysis in general may not seem overly critical to some of you at this point, we’ll see later when we cover inference for regression in Chapter 11 that for the last five columns of the regression table from earlier (std_error, statistic, p_value, conf_low, and conf_high) to have valid interpretations, the above three conditions should roughly hold. Learning check (LC6.3) Continuing with our regression using age as the explanatory variable and teaching score as the outcome variable, use the get_regression_points() function to get the observed values, fitted values, and residuals for all 463 instructors. Perform a residual analysis and look for any systematic patterns in the residuals. Ideally, there should be little to no pattern. 6.2 One categorical explanatory variable It’s an unfortunate truth that life expectancy is not the same across various countries in the world; there are a multitude of factors that are associated with how long people live. International development agencies are very interested in studying these differences in the hope of understanding where governments should allocate resources to address this problem.
In this section, we’ll explore differences in life expectancy in two ways: Differences between continents: Are there significant differences in life expectancy, on average, between the five continents of the world: Africa, the Americas, Asia, Europe, and Oceania? Differences within continents: How does life expectancy vary within the world’s five continents? For example, is the spread of life expectancy among the countries of Africa larger than the spread of life expectancy among the countries of Asia? To answer such questions, we’ll study the gapminder dataset in the gapminder package. Recall we introduced this dataset in Subsection 3.1.2 when we first studied the “Grammar of Graphics”; in particular Figure 3.1. This dataset has international development statistics such as life expectancy, GDP per capita, and population by country (\\(n\\) = 142) for 5-year intervals between 1952 and 2007. We’ll use this data for linear regression again, but note that our explanatory variable \\(x\\) is now categorical, and not numerical like when we covered simple linear regression in Section 6.1: A numerical outcome variable \\(y\\). In this case, life expectancy. A single categorical explanatory variable \\(x\\). In this case, the continent the country is part of. When the explanatory variable \\(x\\) is categorical, the concept of a “best-fitting” line is a little different from the one we saw previously in Section 6.1 where the explanatory variable \\(x\\) was numerical. We’ll study these differences shortly in Subsection 6.2.2, but first let’s perform our exploratory data analysis. 6.2.1 Exploratory data analysis Let’s load the gapminder data, filter() for only observations in 2007, and select() only the variables we’ll need, along with gdpPercap, which is each country’s gross domestic product per capita, a rough measure of that country’s economic performance (this will be used for the upcoming Learning Check). Save this in a data frame gapminder2007: library(gapminder) gapminder2007 <- gapminder %>% filter(year == 2007) %>% select(country, continent, lifeExp, gdpPercap) Let’s look at the raw data values both by bringing up RStudio’s spreadsheet viewer and by using the glimpse() function, although in Table 6.5 we only show 5 randomly selected countries out of 142: View(gapminder2007) Table 6.5: Random sample of 5 countries country continent lifeExp gdpPercap Slovak Republic Europe 74.7 18678 Israel Asia 80.7 25523 Bulgaria Europe 73.0 10681 Tanzania Africa 52.5 1107 Myanmar Asia 62.1 944 glimpse(gapminder2007) Observations: 142 Variables: 4 $ country <fct> Afghanistan, Albania, Algeria, Angola, Argentina, Austral... $ continent <fct> Asia, Europe, Africa, Africa, Americas, Oceania, Europe, ... $ lifeExp <dbl> 43.8, 76.4, 72.3, 42.7, 75.3, 81.2, 79.8, 75.6, 64.1, 79.... $ gdpPercap <dbl> 975, 5937, 6223, 4797, 12779, 34435, 36126, 29796, 1391, ... We see that the variable continent is indeed categorical, as it is encoded as fct, which stands for “factor”: R’s way of storing categorical variables. Let’s look at a summary of the explanatory variable continent: summary(gapminder2007$continent) Africa Americas Asia Europe Oceania 52 25 33 30 2 We observe that all other continents have 25 countries or more, but Oceania only has two: Australia and New Zealand.
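As an aside, the same per-continent counts could be obtained with the dplyr tools from Chapter 5; a minimal sketch, assuming the gapminder2007 data frame created above:
# Count the number of countries in each continent using dplyr
library(dplyr)
gapminder2007 %>%
  group_by(continent) %>%
  summarize(n_countries = n())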
Let’s now compute some summary statistics of the outcome variable lifeExp, in particular the worldwide median and mean life expectancy lifeExp_worldwide <- gapminder2007 %>% summarize(median = median(lifeExp), mean = mean(lifeExp)) Table 6.6: Worldwide life expectancy median mean 71.9 67 Given that the global median life expectancy is 71.935 half of the world’s countries (71 countries) will have a life expectancy less than 71.935, while half will have a life expectancy greater than this value. The mean life expectancy of 67.007 is lower however. Why are these two values different? Let’s look at a histogram of lifeExp to see why. ggplot(gapminder2007, aes(x = lifeExp)) + geom_histogram(binwidth = 5, color = "white") + labs(x = "Life expectancy", y = "Number of countries", title = "Worldwide life expectancy") We see that this data is left-skewed/negatively skewed: there are a few countries with very low life expectancies that are bringing down the mean life expectancy. However, the median is less sensitive to the effects of such outliers. Hence the median is greater than the mean in this case. Let’s proceed by comparing median and mean life expectancy between continents by adding a group_by(continent) to the above code: lifeExp_by_continent <- gapminder2007 %>% group_by(continent) %>% summarize(median = median(lifeExp), mean = mean(lifeExp)) Table 6.7: Life expectancy by continent continent median mean Africa 52.9 54.8 Americas 72.9 73.6 Asia 72.4 70.7 Europe 78.6 77.6 Oceania 80.7 80.7 We see now that there are differences in life expectancies between the continents. For example focusing on only medians, while the median life expectancy across all \\(n = 142\\) countries in 2007 was 71.935, the median life expectancy across the \\(n =52\\) countries in Africa was only 52.927. Let’s create a corresponding visualization. One way to compare the life expectancies of countries in different continents would be via a faceted histogram. Recall we saw back in the Data Visualization chapter, specifically Section 3.6, that facets allow us to split a visualization by the different levels of a categorical variable or factor variable. In Figure 6.10, the variable we facet by is continent, which is categorical with five levels, each corresponding to the five continents of the world. ggplot(gapminder2007, aes(x = lifeExp)) + geom_histogram(binwidth = 5, color = "white") + labs(x = "Life expectancy", y = "Number of countries", title = "Life expectancy by continent") + facet_wrap(~continent, nrow = 2) Figure 6.10: Life expectancy in 2007 Another way would be via a geom_boxplot where we map the categorical variable continent to the \\(x\\)-axis and the different life expectancies within each continent on the \\(y\\)-axis; we do this in Figure 6.11. ggplot(gapminder2007, aes(x = continent, y = lifeExp)) + geom_boxplot() + labs(x = "Continent", y = "Life expectancy (years)", title = "Life expectancy by continent") Figure 6.11: Life expectancy in 2007 Some people prefer comparing a numerical variable between different levels of a categorical variable, in this case comparing life expectancy between different continents, using a boxplot over a faceted histogram as we can make quick comparisons with single horizontal lines. For example, we can see that even the country with the highest life expectancy in Africa is still lower than all countries in Oceania. It’s important to remember however that the solid lines in the middle of the boxes correspond to the medians (i.e. 
the middle value) rather than the mean (the average). So, for example, if you look at Asia, the solid line denotes the median life expectancy of around 72 years, indicating to us that half of all countries in Asia have a life expectancy below 72 years whereas half of all countries in Asia have a life expectancy above 72 years. Furthermore, note that: Africa and Asia have much more spread/variation in life expectancy as indicated by the interquartile range (the height of the boxes). Oceania has almost no spread/variation, but this might in large part be due to the fact that there are only two countries in Oceania: Australia and New Zealand. Now, let’s start making comparisons of life expectancy between continents. Let’s use Africa as a baseline for comparison. Why Africa? Only because it happened to be first alphabetically; we could’ve just as appropriately used the Americas as the baseline for comparison. Using the “eyeball test” (just using our eyes to see if anything stands out), we make the following observations about differences in median life expectancy compared to the baseline of Africa: The median life expectancy of the Americas is roughly 20 years greater. The median life expectancy of Asia is roughly 20 years greater. The median life expectancy of Europe is roughly 25 years greater. The median life expectancy of Oceania is roughly 27.8 years greater. Let’s remember these four differences vs Africa corresponding to the Americas, Asia, Europe, and Oceania: 20, 20, 25, 27.8. Learning check (LC6.4) Conduct a new exploratory data analysis with the same explanatory variable \\(x\\) being continent but with gdpPercap as the new outcome variable \\(y\\). Remember, this involves three things: Looking at the raw values. Computing summary statistics of the variables of interest. Creating informative visualizations. What can you say about the differences in GDP per capita between continents based on this exploration? 6.2.2 Linear regression In Subsection 6.1.2 we introduced simple linear regression, which involves modeling a numerical outcome variable \\(y\\) as a function of a numerical explanatory variable \\(x\\). In our life expectancy example, we now have a categorical explanatory variable \\(x\\), continent. While we can still fit a regression model, given our categorical explanatory variable we no longer have a concept of a “best-fitting” line, but rather differences relative to a baseline for comparison. Before we fit our regression model, let’s create a table similar to Table 6.7, but: Report the mean life expectancy for each continent. Report the difference in mean life expectancy relative to Africa’s mean life expectancy of 54.806 in the column “mean vs Africa”; this column is simply the “mean” column minus 54.806. Think back to your observations from the eyeball test of Figure 6.11 at the end of the last subsection. The column “mean vs Africa” is the same idea of comparing a summary statistic to a baseline for comparison, in this case the countries of Africa, but using means instead of medians.
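A table like this could be computed with the dplyr tools from Chapter 5; here is a minimal sketch (the object and column names below are our own choices, and the values should match Table 6.8 up to rounding):
# Mean life expectancy by continent, plus the difference relative to Africa
mean_lifeExp_by_continent <- gapminder2007 %>%
  group_by(continent) %>%
  summarize(mean = mean(lifeExp)) %>%
  mutate(mean_vs_Africa = mean - mean[continent == "Africa"])
mean_lifeExp_by_continent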
Table 6.8: Mean life expectancy by continent continent mean mean vs Africa Africa 54.8 0.0 Americas 73.6 18.8 Asia 70.7 15.9 Europe 77.6 22.8 Oceania 80.7 25.9 Now, let’s use the get_regression_table() function we introduced in Section 6.1.2 to get the regression table for the gapminder2007 analysis: lifeExp_model <- lm(lifeExp ~ continent, data = gapminder2007) get_regression_table(lifeExp_model) Table 6.9: Linear regression table term estimate std_error statistic p_value conf_low conf_high intercept 54.8 1.02 53.45 0 52.8 56.8 continentAmericas 18.8 1.80 10.45 0 15.2 22.4 continentAsia 15.9 1.65 9.68 0 12.7 19.2 continentEurope 22.8 1.70 13.47 0 19.5 26.2 continentOceania 25.9 5.33 4.86 0 15.4 36.5 Just as before, we have the term and estimate columns of interest, but unlike before, we now have 5 rows corresponding to 5 outputs in our table: an intercept like before, but also continentAmericas, continentAsia, continentEurope, and continentOceania. What are these values? intercept = 54.8 corresponds to the mean life expectancy for Africa. This mean life expectancy is treated as a baseline for comparison for the other continents. continentAmericas = 18.8 is the difference in mean life expectancy of the Americas minus Africa. Note that \\(18.8 = 73.6 - 54.8\\) is the 2nd “mean vs Africa” value in Table 6.8. continentAsia = 15.9 is the difference in mean life expectancy of Asia minus Africa. Note that \\(15.9 = 70.7 - 54.8\\) is the 3rd “mean vs Africa” value in Table 6.8. continentEurope = 22.8 is the difference in mean life expectancy of Europe minus Africa. Note that \\(22.8 = 77.6 - 54.8\\) is the 4th “mean vs Africa” value in Table 6.8. continentOceania = 25.9 is the difference in mean life expectancy of Oceania minus Africa. Note that \\(25.9 = 80.7 - 54.8\\) is the 5th “mean vs Africa” value in Table 6.8. Let’s generalize this idea a bit. If we fit a linear regression model using a categorical explanatory variable \\(x\\) that has \\(k\\) levels, a regression model will return an intercept and \\(k - 1\\) “slope” coefficients. When \\(x\\) is a numerical explanatory variable the interpretation is of a “slope” coefficient, but when \\(x\\) is categorical the meaning is a little trickier: these coefficients are offsets relative to the baseline. In our case, since there are \\(k = 5\\) continents, the regression model returns an intercept corresponding to the baseline for comparison, Africa, and \\(k - 1 = 4\\) “slope” coefficients corresponding to the Americas, Asia, Europe, and Oceania. Africa was chosen as the baseline by R for no other reason than it comes first alphabetically of the 5 continents. You can manually specify which continent to use as the baseline instead of the default choice of whichever comes first alphabetically, but we leave that to a more advanced course. Learning check (LC6.5) Fit a new linear regression using lm(gdpPercap ~ continent, data = gapminder) where gdpPercap is the new outcome variable \\(y\\). Get information about the “best-fitting” line from the regression table by applying the get_regression_table() function. How do the regression results match up with the results from your exploratory data analysis above?
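One quick way to convince yourself of the “offsets relative to a baseline” interpretation described above is to add the intercept to each offset and compare against the group means in Table 6.8; a minimal sketch using base R’s coef() function, which is not otherwise used in this book:
# The underlying lm() coefficients: the intercept is Africa's mean life expectancy,
# and the remaining coefficients are offsets relative to that baseline
coef(lifeExp_model)
# For example, adding the Asia offset to the intercept recovers Asia's mean
# life expectancy: roughly 54.8 + 15.9 = 70.7
coef(lifeExp_model)["(Intercept)"] + coef(lifeExp_model)["continentAsia"]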
6.2.3 Observed/fitted values and residuals Recall in Subsection 6.1.3 when we had a numerical explanatory variable \\(x\\), we defined: Observed values \\(y\\), or the observed value of the outcome variable Fitted values \\(\\widehat{y}\\), or the value on the regression line for a given \\(x\\) value Residuals \\(y - \\widehat{y}\\), or the error between the observed value and the fitted value What do fitted values \\(\\widehat{y}\\) and residuals \\(y - \\widehat{y}\\) correspond to when the explanatory variable \\(x\\) is categorical? Let’s investigate these values for the first 10 countries in the gapminder2007 dataset: Table 6.10: First 10 out of 142 countries country continent lifeExp gdpPercap Afghanistan Asia 43.8 975 Albania Europe 76.4 5937 Algeria Africa 72.3 6223 Angola Africa 42.7 4797 Argentina Americas 75.3 12779 Australia Oceania 81.2 34435 Austria Europe 79.8 36126 Bahrain Asia 75.6 29796 Bangladesh Asia 64.1 1391 Belgium Europe 79.4 33693 Recall the get_regression_points() function we used in Subsection 6.1.3 to return the observed values of the outcome variable, all explanatory variables, fitted values, and residuals for all points in the regression. Recall that each “point” in this case corresponds to one of the 142 countries in the gapminder2007 dataset. They are also the 142 observations used to construct the boxplots in Figure 6.11. regression_points <- get_regression_points(lifeExp_model) regression_points Table 6.11: Regression points (First 10 out of 142 countries) ID lifeExp continent lifeExp_hat residual 1 43.8 Asia 70.7 -26.900 2 76.4 Europe 77.6 -1.226 3 72.3 Africa 54.8 17.495 4 42.7 Africa 54.8 -12.075 5 75.3 Americas 73.6 1.712 6 81.2 Oceania 80.7 0.515 7 79.8 Europe 77.6 2.180 8 75.6 Asia 70.7 4.907 9 64.1 Asia 70.7 -6.666 10 79.4 Europe 77.6 1.792 Notice: The fitted values are in the column lifeExp_hat \\(\\widehat{\\text{lifeExp}}\\). Countries in Africa have the same fitted value of 54.8, which is the mean life expectancy of Africa; countries in Asia have the same fitted value of 70.7, which is the mean life expectancy of Asia; this similarly holds for countries in the Americas, Europe, and Oceania. The residual column is simply \\(y - \\widehat{y}\\) = lifeExp - lifeExp_hat. These values can be interpreted as a particular country’s deviation from its continent’s mean life expectancy. For example, the first row of this dataset corresponds to Afghanistan, and the residual of \\(-26.9 = 43.8 - 70.7\\) is Afghanistan’s life expectancy minus the mean life expectancy of all Asian countries. 6.2.4 Residual analysis Recall our discussion on residuals from Section 6.1.4, where our goal was to investigate whether or not there was a systematic pattern to the residuals; ideally, since residuals can be thought of as error, there should be no such pattern. While there are many ways to do such residual analysis, we focused on two approaches based on visualizations.
A plot with residuals on the vertical axis and the predictor (in this case continent) on the horizontal axis A histogram of all residuals First, let’s plot the residuals vs continent in Figure 6.12, but also let’s plot all 142 points with a little horizontal random jitter by setting the width = 0.1 parameter in geom_jitter(): ggplot(regression_points, aes(x = continent, y = residual)) + geom_jitter(width = 0.1) + labs(x = "Continent", y = "Residual") + geom_hline(yintercept = 0, col = "blue") Figure 6.12: Plot of residuals over continent We observe: There seems to be a rough balance of both positive and negative residuals for all 5 continents. However, there is one clear outlier in Asia. It has the smallest residual, hence also the smallest life expectancy in Asia. Let’s investigate the 5 countries in Asia with the shortest life expectancy: gapminder2007 %>% filter(continent == "Asia") %>% arrange(lifeExp) Table 6.12: Countries in Asia with shortest life expectancy country continent lifeExp gdpPercap Afghanistan Asia 43.8 975 Iraq Asia 59.5 4471 Cambodia Asia 59.7 1714 Myanmar Asia 62.1 944 Yemen, Rep. Asia 62.7 2281 This confirms the outlier we identified earlier: Afghanistan, with a residual of -26.9. Unfortunately, given recent geopolitical turmoil, individuals who live in Afghanistan have a drastically lower life expectancy. Second, let’s look at a histogram of all 142 values of residuals in Figure 6.13. In this case, the residuals form a rather nice bell-shape, although there are a couple of very low and very high values at the tails. As we said previously, searching for patterns in residuals can be somewhat subjective, but ideally we hope there are no “drastic” patterns. ggplot(regression_points, aes(x = residual)) + geom_histogram(binwidth = 5, color = "white") + labs(x = "Residual") Figure 6.13: Histogram of residuals Learning check (LC6.6) Continuing with our regression using gdpPercap as the outcome variable and continent as the explanatory variable, use the get_regression_points() function to get the observed values, fitted values, and residuals for all 142 countries in 2007, perform a residual analysis, and look for any systematic patterns in the residuals. Is there a pattern? 6.3 Related topics 6.3.1 Correlation coefficient Let’s re-plot Figure 6.1, but now consider a broader range of correlation coefficient values in Figure 6.14. Figure 6.14: Different Correlation Coefficients As we suggested in Subsection 6.1.1, interpreting coefficients that are not close to the extreme values of -1 and 1 can be subjective. To develop your sense of correlation coefficients, we suggest you play the following 80’s-style video game called “Guess the correlation”! Click on the image below: 6.3.2 Correlation is not necessarily causation Causation is a tricky problem and frequently takes either carefully designed experiments or methods to control for the effects of potential confounding variables. Both these approaches attempt either to remove all confounding variables or take them into account as best they can, and only focus on the behavior of an outcome variable in the presence of the levels of the other variable(s). Be careful as you read studies to make sure that the writers aren’t falling into this fallacy of correlation implying causation. If you spot one, you may want to send them a link to Spurious Correlations. 6.3.3 Best fitting line Regression lines are also known as “best fitting lines”. But what do we mean by best? Let’s unpack the criterion used by regression to determine “best”.
Recall the plot in Figure 6.6 where for an instructor with an average beauty score of \\(x=7.333\\): The observed value \\(y=4.9\\) was marked with a red circle The fitted value \\(\\widehat{y} = 4.369\\) on the regression line was marked with a red square The residual \\(y-\\widehat{y} = 4.9-4.369 = 0.531\\) was the length of the blue arrow. Let’s do this for another arbitrarily chosen instructor whose beauty score was \\(x=2.333\\). The residual in this case is \\(2.7 - 4.036 = -1.336\\). Let’s do this for another arbitrarily chosen instructor whose beauty score was \\(x=3.667\\). The residual in this case is \\(4.4 - 4.125 = 0.2753\\). Let’s do this for another arbitrarily chosen instructor whose beauty score was \\(x = 6\\). The residual in this case is \\(3.8 - 4.28 = -0.4802\\). Now let’s say we repeated this process for all 463 instructors in our dataset. Regression minimizes the sum of all 463 arrow lengths squared. In other words, it minimizes the sum of the squared residuals: \\[ \\sum_{i=1}^{n}(y_i - \\widehat{y}_i)^2 \\] We square the arrow lengths so that positive and negative deviations of the same amount are treated equally. That’s why alternative names for the simple linear regression line are the least-squares line and the best fitting line. It can be proven via calculus and linear algebra that this line uniquely minimizes the sum of the squared arrow lengths. For the regression line in the plot, the sum of the squared residuals is 131.879. This is the lowest possible value of the sum of the squared residuals of all possible lines we could draw on this scatterplot. How do we know this? We can mathematically prove this fact, but this requires some calculus and linear algebra, so let’s leave this proof for another course! 6.3.4 How does get_regression_table() work? Note that this subsection is optional! What is going on behind the scenes with the get_regression_table() function from the moderndive package? Recall in Subsection 6.1.2 we noted that these are wrapper functions that take other pre-existing functions and “wrap” them in a single function. This wrapper function leverages the tidy() function in the broom package and the clean_names() function in the janitor package to generate clean-looking outputs. Here is what the regression table from Subsection 6.1.2 looks like: score_model <- lm(score ~ bty_avg, data = evals) get_regression_table(score_model, digits = 2) term estimate std_error statistic p_value conf_low conf_high intercept 3.88 0.08 50.96 0 3.73 4.03 bty_avg 0.07 0.02 4.09 0 0.03 0.10 The main idea behind the get_regression_table() function is to shield you from the following code running behind the scenes, which we’ve found to be confusing to students in the past: library(broom) library(janitor) score_model %>% tidy(conf.int = TRUE) %>% mutate_if(is.numeric, round, digits = 3) %>% clean_names() term estimate std_error statistic p_value conf_low conf_high (Intercept) 3.880 0.076 50.96 0 3.731 4.030 bty_avg 0.067 0.016 4.09 0 0.035 0.099 Note that the mutate_if() function is from the dplyr package and applies the round() function, rounding to 3 decimal places, only to those variables that are numerical. But oof, the second code block is long and messy! We felt it appropriate to hide all this from you and give the function an easy-to-remember name: get_regression_table(). 6.4 Conclusion In this chapter, you’ve seen what we call “basic regression” when you only have one explanatory variable.
In Chapter 7, we’ll study multiple regression where we have more than one explanatory variable! In particular, we’ll see why we’ve been conducting the residual analyses from Subsections 6.1.4 and 6.2.4; we are actually verifying some very important assumptions that must be met for the std_error (standard error), p_value, conf_low and conf_high (the end-points of the confidence intervals) columns in our regression tables to have valid interpretations. Again, don’t worry for now if you don’t understand what these terms mean. After the next chapter on multiple regression, we’ll dive in! 6.4.1 Script of R code An R script file of all R code used in this chapter is available here. "], +["index.html", "An Introduction to Statistical and Data Sciences via R 1 Introduction 1.1 Introduction for students 1.2 Introduction for instructors 1.3 Connect and contribute 1.4 About this book 1.5 About the authors", " An Introduction to Statistical and Data Sciences via R Chester Ismay and Albert Y. Kim February 3, 2018 1 Introduction Help! I’m new to R and RStudio and I need to learn about them! However, I’m completely new to coding! What do I do? If you’re asking yourself this question, then you’ve come to the right place! Start with our Introduction for Students. Are you an instructor hoping to use this book in your courses? Then click here for more information on how to teach with this book. Are you looking to connect with and contribute to ModernDive? Then click here for information on how. Are you curious about the publishing of this book? Then click here for more information on the open-source technology, in particular R Markdown and the bookdown package. This is version 0.3.0 of ModernDive published on February 3, 2018. For previous versions of ModernDive, see Section 1.4. 1.1 Introduction for students This book assumes no prerequisites: no algebra, no calculus, and no prior programming/coding experience. This is intended to be a gentle introduction to the practice of analyzing data and answering questions using data the way data scientists, statisticians, data journalists, and other researchers would. Here is a flowchart of what you’ll cover: We get started with data in Chapter 2: R vs RStudio, coding in R, R packages, and exploring your first real data: all domestic departure flights from a New York City airport in 2013. Then we build up your data science toolbox via the tidyverse, an opinionated collection of R packages designed for data science. Specifically: Chapter 3 on data visualization via the ggplot2 package Chapter 4 on the “tidy” data format Chapter 5 on data wrangling via the dplyr package. Equipped with your new data science toolbox, in Chapters 6 and 7 we’ll make your first forays into data modeling using one of the most commonly-used and easy-to-understand approaches: linear regression. We’ll use regression as a descriptive tool for now and leverage the moderndive package that accompanies this book to help digest the results. We then proceed to cover topics related to statistical inference, the bread and butter of statistics. To this end, we’ll leverage a new package for tidyverse-friendly inference called infer.
In particular: Chapter 8 on sampling theory Chapter 9 on confidence intervals (Under construction) Chapter 10 on hypothesis testing (Under construction) After studying simple instances of statistical inference, we revisit the data modeling topics from Chapters 6 and 7 and boost your abilities to interpret the results of regression in Chapter 11 on inference for regression (Under construction). We’ll end with a discussion on what it means to “think with data” in Chapter 12 (Under construction). 1.1.1 What you will learn from this book We hope that by the end of this book, you’ll have learned: How to use R to explore data. How to answer statistical questions using tools like confidence intervals and hypothesis tests. How to effectively create “data stories” using these tools. What do we mean by data stories? We mean any analysis involving data that engages the reader in answering questions with careful visuals and thoughtful discussion, such as How strong is the relationship between per capita income and crime in Chicago neighborhoods? and How many f**ks does Quentin Tarantino give (as measured by the amount of swearing in his films)?. Further discussions on data stories can be found in this Think With Google article. For other examples of data stories constructed by students like yourselves, look at the final projects for two courses that have previously used ModernDive: Middlebury College MATH 116 Introduction to Statistical and Data Sciences using student-collected data. Pacific University SOC 301 Social Statistics using data from the fivethirtyeight R package. This book will help you develop your “data science toolbox”, including tools such as data visualization, data formatting, data wrangling, and data modeling using regression. With these tools, you’ll be able to perform the entirety of the “data/science pipeline” while building data communication skills (see Subsection 1.1.2 for more details). In particular, this book will lean heavily on data visualization. In today’s world, we are bombarded with graphics that attempt to convey ideas. We will explore what makes a good graphic and what the standard ways are to convey relationships with data. You’ll also see the use of visualization to introduce concepts like mean, median, standard deviation, distributions, etc. In general, we’ll use visualization as a way of building almost all of the ideas in this book. To impart the statistical lessons in this book, we have intentionally minimized the number of mathematical formulas used and instead have focused on developing a conceptual understanding via data visualization, statistical computing, and simulations. We hope this is a more intuitive experience than the way statistics has traditionally been taught in the past and how it is commonly perceived. Finally, you’ll learn the importance of literate programming. By this we mean you’ll learn how to write code that is useful not just for a computer to execute but also for readers to understand exactly what your analysis is doing and how you did it. This is part of a greater effort to encourage reproducible research (see Subsection 1.1.3 for more details). Hal Abelson coined the phrase that we will follow throughout this book: “Programs must be written for people to read, and only incidentally for machines to execute.” We understand that there may be challenging moments as you learn to program. Both of us continue to struggle and find ourselves often using web searches to find answers and reach out to colleagues for help.
In the long run though, we all can solve problems faster and more elegantly via programming. We wrote this book as our way to help you get started and you should know that there is a huge community of R users that are always happy to help everyone along as well. This community exists in particular on the internet on various forums and websites such as stackoverflow.com. 1.1.2 Data/science pipeline You may think of statistics as just being a bunch of numbers. We commonly hear the phrase “statistician” when listening to broadcasts of sporting events. Statistics (in particular, data analysis), in addition to describing numbers like with baseball batting averages, plays a vital role in all of the sciences. You’ll commonly hear the phrase “statistically significant” thrown around in the media. You’ll see articles that say “Science now shows that chocolate is good for you.” Underpinning these claims is data analysis. By the end of this book, you’ll be able to better understand whether these claims should be trusted or whether we should be wary. Inside data analysis are many sub-fields that we will discuss throughout this book (though not necessarily in this order): data collection data wrangling data visualization data modeling inference correlation and regression interpretation of results data communication/storytelling These sub-fields are summarized in what Grolemund and Wickham term the “data/science pipeline” in Figure 1.1. Figure 1.1: Data/Science Pipeline We will begin by digging into the gray Understand portion of the cycle with data visualization, then with a discussion on what is meant by tidy data and data wrangling, and then conclude by talking about interpreting and discussing the results of our models via Communication. These steps are vital to any statistical analysis. But why should you care about statistics? “Why did they make me take this class?” There’s a reason so many fields require a statistics course. Scientific knowledge grows through an understanding of statistical significance and data analysis. You needn’t be intimidated by statistics. It’s not the beast that it used to be and, paired with computation, you’ll see how reproducible research in the sciences particularly increases scientific knowledge. 1.1.3 Reproducible research “The most important tool is the mindset, when starting, that the end product will be reproducible.” – Keith Baggerly Another goal of this book is to help readers understand the importance of reproducible analyses. The hope is to get readers into the habit of making their analyses reproducible from the very beginning. This means we’ll be trying to help you build new habits. This will take practice and be difficult at times. You’ll see just why it is so important for you to keep track of your code and well-document it to help yourself later and any potential collaborators as well. Copying and pasting results from one program into a word processor is not the way that efficient and effective scientific research is conducted. It’s much more important for time to be spent on data collection and data analysis and not on copying and pasting plots back and forth across a variety of programs. In a traditional analyses if an error was made with the original data, we’d need to step through the entire process again: recreate the plots and copy and paste all of the new plots and our statistical analysis into your document. This is error prone and a frustrating use of time. 
We’ll see how to use R Markdown to get away from this tedious activity so that we can spend more time doing science. “We are talking about computational reproducibility.” - Yihui Xie Reproducibility means a lot of things in terms of different scientific fields. Are experiments conducted in a way that another researcher could follow the steps and get similar results? In this book, we will focus on what is known as computational reproducibility. This refers to being able to pass all of one’s data analysis, data-sets, and conclusions to someone else and have them get exactly the same results on their machine. This allows for time to be spent interpreting results and considering assumptions instead of the more error prone way of starting from scratch or following a list of steps that may be different from machine to machine. 1.1.4 Final note for students At this point, if you are interested in instructor perspectives on this book, ways to contribute and collaborate, or the technical details of this book’s construction and publishing, then continue with the rest of the chapter below. Otherwise, let’s get started with R and RStudio in Chapter 2! 1.2 Introduction for instructors This book is inspired by the following books: “Mathematical Statistics with Resampling and R” (Chihara and Hesterberg 2011), “OpenIntro: Intro Stat with Randomization and Simulation” (Diez, Barr, and Çetinkaya-Rundel 2014), and “R for Data Science” (Grolemund and Wickham 2016). The first book, while designed for upper-level undergraduates and graduate students, provides an excellent resource on how to use resampling to impart statistical concepts like sampling distributions using computation instead of large-sample approximations and other mathematical formulas. The last two books are free options to learning introductory statistics and data science, providing an alternative to the many traditionally expensive introductory statistics textbooks. When looking over the large number of introductory statistics textbooks that currently exist, we found that there wasn’t one that incorporated many newly developed R packages directly into the text, in particular the many packages included in the tidyverse collection of packages, such as ggplot2, dplyr, tidyr, and broom. Additionally, there wasn’t an open-source and easily reproducible textbook available that exposed new learners all of three of the learning goals listed at the outset of Subsection 1.1.1. 1.2.1 Who is this book for? This book is intended for instructors of traditional introductory statistics classes using RStudio, either the desktop or server version, who would like to inject more data science topics into their syllabus. We assume that students taking the class will have no prior algebra, calculus, nor programming/coding experience. Here are some principles and beliefs we kept in mind while writing this text. If you agree with them, this might be the book for you. Blur the lines between lecture and lab With increased availability and accessibility of laptops and open-source non-proprietary statistical software, the strict dichotomy between lab and lecture can be loosened. It’s much harder for students to understand the importance of using software if they only use it once a week or less. They forget the syntax in much the same way someone learning a foreign language forgets the rules. Frequent reinforcement is key. Focus on the entire data/science research pipeline We believe that the entirety of Grolemund and Wickham’s data/science pipeline should be taught. 
We believe in “minimizing prerequisites to research”: students should be answering questions with data as soon as possible. It’s all about the data We leverage R packages for rich, real, and realistic data-sets that at the same time are easy-to-load into R, such as the nycflights13 and fivethirtyeight packages. We believe that data visualization is a gateway drug for statistics and that the Grammar of Graphics as implemented in the ggplot2 package is the best way to impart such lessons. However, we often hear: “You can’t teach ggplot2 for data visualization in intro stats!” We, like David Robinson, are much more optimistic. dplyr has made data wrangling much more accessible to novices, and hence much more interesting data-sets can be explored. Use simulation/resampling to introduce statistical inference, not probability/mathematical formulas Instead of using formulas, large-sample approximations, and probability tables, we teach statistical concepts using resampling-based inference. This allows for a de-emphasis of traditional probability topics, freeing up room in the syllabus for other topics. Don’t fence off students from the computation pool, throw them in! Computing skills are essential to working with data in the 21st century. Given this fact, we feel that to shield students from computing is to ultimately do them a disservice. We are not teaching a course on coding/programming per se, but rather just enough of the computational and algorithmic thinking necessary for data analysis. Complete reproducibility and customizability We are frustrated when textbooks give examples, but not the source code and the data itself. We give you the source code for all examples as well as the whole book! Ultimately the best textbook is one you’ve written yourself. You know best your audience, their background, and their priorities. You know best your own style and the types of examples and problems you like best. Customization is the ultimate end. For more about how to make this book your own, see About this Book. 1.3 Connect and contribute If you would like to connect with ModernDive, check out the following links: If you would like to receive periodic updates about ModernDive (roughly every 3 months), please sign up for our mailing list. Contact Albert at albert@moderndive.com and Chester chester@moderndive.com We’re on Twitter at ModernDive. If you would like to contribute to ModernDive, there are many ways! Let’s all work together to make this book as great as possible for as many students and instructors as possible! Please let us know if you find any errors, typos, or areas from improvement on our GitHub issues page. If you are familiar with GitHub and would like to contribute more, please see Section 1.4 below. The authors would like to thank Nina Sonneborn, Kristin Bott, and the participants of our USCOTS 2017 workshop for their feedback and suggestions. A special thanks goes to Prof. Yana Weinstein, cognitive psychological scientist and co-founder of The Learning Scientists, for her extensive contributions. 1.4 About this book This book was written using RStudio’s bookdown package by Yihui Xie (Xie 2017). This package simplifies the publishing of books by having all content written in R Markdown. The bookdown/R Markdown source code for all versions of ModernDive is available on GitHub: Latest published version The most up-to-date release: Version 0.3.0 released on February 3, 2018 (source code). 
Available at ModernDive.com Development version The working copy of the next version which is currently being edited: Preview of development version is available at http://moderndive.netlify.com/ Source code: Available on ModernDive’s GitHub repository page Previous versions Older versions that may be out of date: Version 0.2.0 released on August 02, 2017 (source code) Version 0.1.3 released on February 09, 2017 (source code) Version 0.1.2 released on January 22, 2017 (source code) Could this be a new paradigm for textbooks? Instead of the traditional model of textbook companies publishing updated editions of the textbook every few years, we apply a software design influenced model of publishing more easily updated versions. We can then leverage open-source communities of instructors and developers for ideas, tools, resources, and feedback. As such, we welcome your pull requests. Finally, feel free to modify the book as you wish for your own needs, but please list the authors at the top of index.Rmd as “Chester Ismay, Albert Y. Kim, and YOU!” 1.5 About the authors Who we are! Chester Ismay Albert Y. Kim Chester Ismay - Data Science Curriculum Lead, DataCamp. Portland, OR, USA. Email: chester@moderndive.com Webpage: http://ismayc.github.io/ Twitter: old_man_chester GitHub: https://github.com/ismayc Albert Y. Kim - Lecturer of Statistics, Amherst College. Amherst, MA, USA. Email: albert@moderndive.com Webpage: http://rudeboybert.rbind.io/ Twitter: rudeboybert GitHub: https://github.com/rudeboybert "], +["2-getting-started.html", "2 Getting Started with Data in R 2.1 What are R and RStudio? 2.2 How do I code in R? 2.3 What are R packages? 2.4 Explore your first dataset 2.5 Conclusion", " 2 Getting Started with Data in R Before we can start exploring data in R, there are some key concepts to understand first: What are R and RStudio? How do I code in R? What are R packages? If you are already familiar with these concepts, feel free to skip to Section 2.4 below introducing some of the datasets we will explore in depth in this book. Much of this chapter is based on two sources which you should feel free to use as references if you are looking for additional details: Ismay’s Getting used to R, RStudio, and R Markdown (Ismay 2016), which includes video screen recordings that you can follow along and pause as you learn. DataCamp’s online tutorials. DataCamp is a browser-based interactive platform for learning data science and their tutorials will help facilitate your learning of the above concepts (and other topics in this book). Go to DataCamp and create an account before continuing. 2.1 What are R and RStudio? For much of this book, we will assume that you are using R via RStudio. First time users often confuse the two. At its simplest: R is like a car’s engine RStudio is like a car’s dashboard R: Engine RStudio: Dashboard More precisely, R is a programming language that runs computations while RStudio is an integrated development environment (IDE) that provides an interface by adding many convenient features and tools. So the way of having access to a speedometer, rearview mirrors, and a navigation system makes driving much easier, using RStudio’s interface makes using R much easier as well. Optional: For a more in-depth discussion on the difference between R and RStudio IDE, watch this DataCamp video (2m52s). 2.1.1 Installing R and RStudio If your instructor has provided you with a link and access to RStudio Server, then you can skip this section. 
We do recommend though after a few months of working on the RStudio Server that you return to these instructions. If you don’t know what RStudio Server is, then please read this section. You will first need to download and install both R and RStudio (Desktop version) on your computer. Download and install R. Note: You must do this first. Click on the download link corresponding to your computer’s operating system. Download and install RStudio. Scroll down to “Installers for Supported Platforms” Click on the download link corresponding to your computer’s operating system. Optional: If you need more detailed instructions on how to install R and RStudio, watch this DataCamp video (1m22s). 2.1.2 Using R via RStudio Recall our car analogy from above. Much as we don’t drive a car by interacting directly with the engine but rather by using elements on the car’s dashboard, we won’t be using R directly but rather we will use RStudio’s interface. After you install R and RStudio on your computer, you’ll have two new programs AKA applications you can open. We will always work in RStudio and not R. In other words: R: Do not open this RStudio: Open this After you open RStudio, you should see the following: Watch the following DataCamp video (4m10s) to learn about the different panes in RStudio, in particular the Console pane where you will later run R code. 2.2 How do I code in R? Now that you’re set up with R and RStudio, you are probably asking yourself “OK. Now how do I use R?” The first thing to note as that unlike other software like Excel, STATA, or SAS that provide point and click interfaces, R is an interpreted language, meaning you have to enter in R commands written in R code i.e. you have to program in R (we use the terms “coding” and “programming” interchangeably in this book). While it is not required to be a seasoned coder/computer programmer to use R, there is still a set of basic programming concepts that R users need to understand. Consequently, while this book is not a book on programming, you will still learn just enough of these basic programming concepts needed to explore and analyze data effectively. 2.2.1 Basic programming concepts and terminology To introduce you to many of these basic programming concepts and terminology, we direct you to the following DataCamp online interactive tutorials. For each of the tutorials, we give a list of the basic programming concepts covered. Note that in this book, we will use a different font to distinguish regular font from computer_code. It is important to note that while these tutorials serve as excellent introductions, a single pass through them is insufficient for long-term learning and retention. The ultimate tools for long-term learning and retention are “learning by doing” and repetition, something we will have you do over the course of the entire book and we encourage this process as much as possible as you learn any new skill. From the Introduction to R course complete the following chapters. As you work through the chapters, carefully note the important terms and what they are used for. We recommend you do so in a notebook that you can easily refer back to. Chapter 1 Intro to basics: Console pane: where you enter in commands Objects: where values are saved, how to assign values to objects. Data types: integers, doubles/numerics, logicals, characters. Chapter 2 Vectors: Vectors: a series of values. Chapter 4 Factors: Categorical data (as opposed to numerical data) are represented in R as factors. 
Chapter 5 Data frames: Data frames are analogous to rectangular spreadsheets: they are representations of datasets in R where the rows correspond to observations and the columns correspond to variables that describe the observations. We will revisit this later in Section 2.4. From the Intermediate R course, complete the following chapters: Chapter 1 Conditionals and Control Flow: Testing for equality in R using == (and not = which is typically used for assignment). Ex: 2 + 1 == 3 compares 2 + 1 to 3 and is correct R syntax, while 2 + 1 = 3 is not and is incorrect R syntax. Boolean algebra: TRUE/FALSE statements and mathematical operators such as < (less than), <= (less than or equal), and != (not equal to). Logical operators: & representing “and”, | representing “or”. Ex: (2 + 1 == 3) & (2 + 1 == 4) returns FALSE while (2 + 1 == 3) | (2 + 1 == 4) returns TRUE. Chapter 3 Functions: Concept of functions: they take in inputs (called arguments) and return outputs. You either manually specify a function’s arguments or use the function’s defaults. This list is by no means an exhaustive list of all the programming concepts and terminology needed to become a savvy R user; such a list would be so large it wouldn’t be very useful, especially for novices. Rather, we feel this is the bare minimum you need to know before you get started; the rest we feel you can learn as you go. Remember that your knowledge of all of these concepts will build as you get better and better at “speaking R” and getting used to its syntax. 2.2.2 Tips on learning to code Learning to code/program is very much like learning a foreign language: it can be very daunting and frustrating at first. However, just as with learning a foreign language, if you put in the effort and are not afraid to make mistakes, anybody can learn. Lastly, there are a few useful things to keep in mind as you learn to program: Computers are stupid: You have to tell a computer everything it needs to do. Furthermore, your instructions can’t have any mistakes in them, nor can they be ambiguous in any way. Take the “copy/paste/tweak” approach: Especially when learning your first programming language, it is often much easier to take existing code that you know works and modify it to suit your ends, rather than trying to write new code from scratch. We call this the copy/paste/tweak approach. So early on, we suggest not trying to code from scratch, but please take the code we provide throughout this book and play around with it! Practice is key: Just as the only solution to improving your foreign language skills is practice, so also the only way to get better at R is through practice. Don’t worry however, we’ll give you plenty of opportunities to practice! 2.3 What are R packages? Another point of confusion with new R users is the notion of a package. R packages extend the functionality of R by providing additional functions, data, and documentation and can be downloaded for free from the internet. They are written by a world-wide community of R users. For example, among the many packages we will use in this book are: the ggplot2 package for data visualization in Chapter 3 the dplyr package for data wrangling in Chapter 5 There are two key things to remember about R packages: Installation: Most packages are not installed by default when you install R and RStudio. You need to install a package before you can use it. Once you’ve installed it, you likely don’t need to install it again unless you want to update it to a newer version of the package.
Loading: Packages are not loaded automatically when you open RStudio. You need to load them every time you open RStudio using the library() command. A good analogy for R packages is that they are like apps you can download onto a mobile phone: R: A new phone R Packages: Apps you can download So, expanding on this analogy a bit: R is like a new mobile phone. It has a certain amount of functionality when you use it for the first time, but it doesn’t have everything. R packages are like the apps you can download onto your phone, much like those offered in the App Store and Google Play. For example: Instagram. In order to use a package, just like in order to use Instagram, you must: First download it and install it. You do this only once. Load it, or in other words, “open” it, using the library() command. So just as you can only start sharing photos with your friends on Instagram if you first install the app and then open it, you can only access an R package’s data and functions if you first install the package and then load it with the library() command. Let’s cover these two steps: 2.3.1 Package installation (Note that if you are working on an RStudio Server, you probably will not need to install your own packages as that has been already done for you. Still, it is important that you know this process for later when you are not using the RStudio Server but rather your own installation of RStudio Desktop.) There are two ways to install an R package. For example, to install the ggplot2 package: Easy way: In the Files pane of RStudio: Click on the “Packages” tab Click on “Install” Type the name of the package under “Packages (separate multiple with space or comma):” In this case, type ggplot2 Click “Install” Alternative way: In the Console pane run install.packages("ggplot2") (you must include the quotation marks). Repeat this for the dplyr and nycflights13 packages. Note: You only have to install a package once, unless you want to update an already installed package to the latest version. If you want to update a package to the latest version, then re-install it by repeating the above steps. 2.3.2 Package loading After you’ve installed a package, you can now load it using the library() command. For example, to load the ggplot2 and dplyr packages, run the following code in the Console pane: library(ggplot2) library(dplyr) Note: You have to reload each package you want to use every time you open a new session of RStudio. This is a little annoying to get used to and will be your most common error as you begin. When you see an error such as Error: could not find function remember that this likely comes from you trying to use a function in a package that has not been loaded. Remember to run the library() function with the appropriate package to fix this error. 2.4 Explore your first dataset Let’s put everything we’ve learned so far into practice and start exploring some real data! Data comes to us in a variety of formats, from pictures to text to numbers. Throughout this book, we’ll focus on datasets that can be stored in a spreadsheet, as that is among the most common ways data is collected in many fields. Remember from Subsection 2.2.1 that these “spreadsheet”-type datasets are called data frames in R and we will focus on working with data frames throughout this book. Let’s first load all the packages needed for this chapter (This assumes you’ve already installed them. Read Section 2.3 for information on how to install and load R packages if you haven’t already.)
At the beginning of all subsequent chapters in this text, we’ll always have a list of packages similar to what follows that you should have installed and loaded to work with that chapter’s R code. library(dplyr) # Be sure to install these first! library(nycflights13) library(knitr) 2.4.1 nycflights13 package We likely have all flown on airplanes or know someone who has. Air travel has become an ever-present aspect in many people’s lives. If you live in or are visiting a relatively large city and you walk around that city’s airport, you see gates showing flight information from many different airlines. And you will frequently see that some flights are delayed because of a variety of conditions. Are there ways that we can avoid having to deal with these flight delays? We’d all like to arrive at our destinations on time whenever possible. (Unless you secretly love hanging out at airports. If you are one of these people, pretend for the moment that you are very much anticipating being at your final destination.) Throughout this book, we’re going to analyze data related to flights contained in the nycflights13 package (Wickham 2017). Specifically, this package contains five datasets saved as “data frames” (see Section 2.2) with information about all domestic flights departing from New York City in 2013, from either Newark Liberty International (EWR), John F. Kennedy International (JFK), or LaGuardia (LGA) airports: flights: information on all 336,776 flights airlines: translation between two letter IATA carrier codes and names (16 in total) planes: construction information about each of 3,322 planes used weather: hourly meteorological data (about 8710 observations) for each of the three NYC airports airports: airport names and locations 2.4.2 flights data frame We will begin by exploring the flights data frame that is included in the nycflights13 package and getting an idea of its structure. Run the following in your code in your console: it loads in the flights dataset into your Console. Note depending on the size of your monitor, the output may vary slightly. flights # A tibble: 336,776 x 19 year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time <int> <int> <int> <int> <int> <dbl> <int> <int> 1 2013 1 1 517 515 2 830 819 2 2013 1 1 533 529 4 850 830 3 2013 1 1 542 540 2 923 850 4 2013 1 1 544 545 -1 1004 1022 5 2013 1 1 554 600 -6 812 837 6 2013 1 1 554 558 -4 740 728 7 2013 1 1 555 600 -5 913 854 8 2013 1 1 557 600 -3 709 723 9 2013 1 1 557 600 -3 838 846 10 2013 1 1 558 600 -2 753 745 # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>, # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>, # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm> Let’s unpack this output: A tibble: 336,776 x 19: a tibble is a kind of data frame. This particular data frame has 336,776 rows 19 columns corresponding to 19 variables describing each observation year month day dep_time sched_dep_time dep_delay arr_time are different columns, in other words variables, of this data frame. We then have the first 10 rows of observations corresponding to 10 flights. ... with 336,766 more rows, and 11 more variables: indicating to us that 336,766 more rows of data and 11 more variables could not fit in this screen. Unfortunately, this output does not allow us to explore the data very well. Let’s look at different tools to explore data frames. 
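Before we do, one quick way to confirm the dimensions reported in that output is to ask for them directly; this small aside uses base R functions that work on any data frame:
dim(flights)    # number of rows and columns: 336776 and 19
nrow(flights)   # number of rows (observations)
ncol(flights)   # number of columns (variables)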
2.4.3 Exploring data frames Among the many ways of getting a feel for the data contained in a data frame such as flights, we present three functions that take as their argument the data frame in question: Using the View() function built for use in RStudio. We will use this the most. Using the glimpse() function loaded via dplyr package Using the kable() function in the knitr package Using the $ operator to view a single variable in a data frame 1. View(): Run View(flights) in your Console in RStudio and explore this data frame in the resulting pop-up viewer. You should get into the habit of always Viewing any data frames that come your way. Note the capital “V” in View. R is case-sensitive so you’ll receive an error is you run view(flights) instead of View(flights). Learning check (LC2.1) What does any ONE row in this flights dataset refer to? A. Data on an airline B. Data on a flight C. Data on an airport D. Data on multiple flights By running View(flights), we see the different variables listed in the columns and we see that there are different types of variables. Some of the variables like distance, day, and arr_delay are what we will call quantitative variables. These variables are numerical in nature. Other variables here are categorical. Note that if you look in the leftmost column of the View(flights) output, you will see a column of numbers. These are the row numbers of the dataset. If you glance across a row with the same number, say row 5, you can get an idea of what each row corresponds to. In other words, this will allow you to identify what object is being referred to in a given row. This is often called the observational unit. The observational unit in this example is an individual flight departing New York City in 2013. You can identify the observational unit by determining what the thing is that is being measured in each of the variables. 2. glimpse(): The second way to explore a data frame is using the glimpse() function that you can access after you’ve loaded the dplyr package. It provides us with much of the above information and more. glimpse(flights) Observations: 336,776 Variables: 19 $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013... $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1... $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1... $ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 55... $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 60... $ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2,... $ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 8... $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 8... $ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7,... $ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6"... $ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301... $ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N... $ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LG... $ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IA... $ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149... $ distance <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 73... $ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6... $ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59... $ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-0... 
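As an optional aside, if you have seen base R before, glimpse() is similar in spirit to the str() function; either gives a compact, transposed summary of a data frame, though we will stick with glimpse() in this book:
str(flights)     # base R near-equivalent of glimpse()
class(flights)   # a tibble is also a data frame: "tbl_df" "tbl" "data.frame"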
Learning check (LC2.2) What are some examples in this dataset of categorical variables? What makes them different from quantitative variables? (LC2.3) What do int, dbl, and chr mean in the output above? We see that glimpse will give you the first few entries of each variable in a row after the variable. In addition, the data type (see Subsection 2.2.1) of the variable is given immediately after each variable’s name inside < >. Here, int and dbl refer to quantitative variables. In contrast, chr refers to categorical variables. One more type of variable is given here with the time_hour variable: dttm. As you may suspect, this variable corresponds to a specific date and time of day. 3. kable(): The final way to explore the entirety of a data frame is using the kable() function from the knitr package. Let’s explore the different carrier codes for all the airlines in our dataset two ways. Run both of these in your Console: airlines kable(airlines) At first glance of both outputs, it may not appear that there is much difference. However, we’ll see later on, especially when using a tool for document production called R Markdown, that the latter produces output that is much more legible. 4. $ operator Lastly, the $ operator allows us to explore a single variable within a data frame. For example, run the following in your Console: airlines airlines$name We used the $ operator to extract only the name variable and return it as a vector of length 16. We will only be occasionally exploring data frames using this operator. 2.4.4 Help files Another nice feature of R is the help system. You can get help in R by entering a ? before the name of a function or data frame in question and you will be presented with a page showing the documentation. For example, let’s look at the help file for the flights data frame: ?flights A help file should pop up in the Help pane of RStudio. Note the content of this particular help file is also accessible on the web on page 3 of the PDF document. You should get in the habit of consulting the help file of any function or data frame in R about which you have questions. 2.5 Conclusion We’ve given you what we feel are the most essential concepts to know before you can start exploring data in R. Is this chapter exhaustive? Absolutely not. To try to include everything in this chapter would make the chapter so large it wouldn’t be useful! However, as we stated earlier, the best way to learn R is to learn by doing. Now let’s get into learning about how to create good stories about and with data. In Chapter 3, we start with what we feel is the most important tool in a data scientist’s toolbox: data visualization. 2.5.1 What’s to come? We’ll now start the “data science” portion of the book in Chapter 3, where we will further explore the datasets included in the nycflights13 package. We’ll see that data visualization is a powerful tool to add to our toolbox for exploring what is going on in a dataset beyond the View and glimpse functions we introduced in this chapter. "], +["3-viz.html", "3 Data Visualization via ggplot2 3.1 The Grammar of Graphics 3.2 Five Named Graphs - The 5NG 3.3 5NG#1: Scatterplots 3.4 5NG#2: Linegraphs 3.5 5NG#3: Histograms 3.6 Facets 3.7 5NG#4: Boxplots 3.8 5NG#5: Barplots 3.9 Conclusion", " 3 Data Visualization via ggplot2 We begin the development of your data science toolbox with data visualization. By visualizing our data, we will be able to gain valuable insights from our data that we couldn’t initially see from just looking at the raw data in spreadsheet form. 
We will use the ggplot2 package as it provides an easy way to customize your plots and is rooted in the data visualization theory known as The Grammar of Graphics (Wilkinson 2005). At the most basic level, graphics/plots/charts (we use these terms interchangeably in this book) provide a nice way for us to get a sense for how quantitative variables compare in terms of their center (where the values tend to be located) and their spread (how they vary around the center). The most important thing to know about graphics is that they should be created to make it obvious for your audience to understand the findings and insight you want to get across. This does, however, require a balancing act. On the one hand, you want to highlight as many meaningful relationships and interesting findings as possible, but on the other you don’t want to include so many as to overwhelm your audience. As we will see, plots/graphics also help us to identify patterns and outliers in our data. We will see that a common extension of these ideas is to compare the distribution of one quantitative variable (i.e., what the spread of a variable looks like or how the variable is distributed in terms of its values) as we go across the levels of a different categorical variable. Needed packages Let’s load all the packages needed for this chapter (this assumes you’ve already installed them). Read Section 2.3 for information on how to install and load R packages. library(nycflights13) library(ggplot2) library(dplyr) library(knitr) 3.1 The Grammar of Graphics We begin with a discussion of a theoretical framework for data visualization known as “The Grammar of Graphics,” which serves as the basis for the ggplot2 package. Much like how we construct sentences in any language by using a linguistic grammar (nouns, verbs, subjects, objects, etc.), the theoretical framework given by Leland Wilkinson (Wilkinson 2005) allows us to specify the components of a statistical graphic. 3.1.1 Components of the Grammar In short, the grammar tells us that: A statistical graphic is a mapping of data variables to aesthetic attributes of geometric objects. Specifically, we can break a graphic into the following three essential components: data: the data-set comprising the variables that we map. geom: the geometric object in question. This refers to the type of objects we can observe in our plot. For example, points, lines, bars, etc. aes: aesthetic attributes of the geometric object that we can perceive on a graphic. For example, x/y position, color, shape, and size. Each assigned aesthetic attribute can be mapped to a variable in our data-set. Let’s break down the grammar with an example. 3.1.2 Gapminder In February 2006, a statistician named Hans Rosling gave a TED talk titled “The best stats you’ve ever seen” where he presented global economic, health, and development data from the website gapminder.org. For example, of the 142 countries included for 2007, consider only the first 6 countries when listed alphabetically: Table 3.1: Gapminder 2007 Data: First 6 of 142 countries Country Continent Life Expectancy Population GDP per Capita Afghanistan Asia 43.83 31889923 974.6 Albania Europe 76.42 3600523 5937.0 Algeria Africa 72.30 33333216 6223.4 Angola Africa 42.73 12420476 4797.2 Argentina Americas 75.32 40301927 12779.4 Australia Oceania 81.23 20434176 34435.4 Each row in this table corresponds to a country in 2007. For each row, we have 5 columns: Country: Name of country. Continent: Which of the five continents the country is part of. 
(Note that Americas groups North and South America and that Antarctica is excluded here.) Life Expectancy: Life expectancy in years. Population: Number of people living in the country. GDP per Capita: Gross domestic product (in US dollars). Now consider Figure 3.1, which plots this data for all 142 countries in the data frame. Note that R will deal with large numbers using scientific notation. So in the legend for “Population”, 1.25e+09 = \\(1.25 \\times 10^{9}\\) = 1,250,000,000 = 1.25 billion. Figure 3.1: Life Expectancy over GDP per Capita in 2007 Let’s view this plot through the grammar of graphics: The data variable GDP per Capita gets mapped to the x-position aesthetic of the points. The data variable Life Expectancy gets mapped to the y-position aesthetic of the points. The data variable Population gets mapped to the size aesthetic of the points. The data variable Continent gets mapped to the color aesthetic of the points. Recall that data here corresponds to each of the variables being in the same data frame and the “data variable” corresponds to a column in a data frame. While in this example we are considering one type of geometric object (of type point), graphics are not limited to just points. Some plots involve lines while others involve bars. Let’s summarize the three essential components of the grammar in a table: Table 3.2: Summary of Grammar of Graphics for this plot data variable aes geom GDP per Capita x point Life Expectancy y point Population size point Continent color point 3.1.3 Other components of the Grammar There are other components of the Grammar of Graphics we can control. As you start to delve deeper into the Grammar of Graphics, you’ll start to encounter these topics more and more often. In this book, we’ll only work with the two other components below (The other components are left to a more advanced text such as R for Data Science (Grolemund and Wickham 2016)): faceting breaks up a plot into small multiples corresponding to the levels of another variable (Section 3.6) position adjustments for barplots (Section 3.8) In general, the Grammar of Graphics allows for a high degree of customization and also a consistent framework for easy updating/modification of plots. 3.1.4 The ggplot2 package In this book, we will be using the ggplot2 package for data visualization, which is an implementation of the Grammar of Graphics for R (Wickham and Chang 2017). You may have noticed that a lot of the previous text in this chapter is written in computer font. This is because the various components of the Grammar of Graphics are specified in the ggplot function, which expects at a bare minimum as arguments: The data frame where the variables exist: the data argument The mapping of the variables to aesthetic attributes: the mapping argument, which specifies the aesthetic attributes involved After we’ve specified these components, we then add layers to the plot using the + sign. The most essential layer to add to a plot is the specification of which type of geometric object we want the plot to involve; e.g. points, lines, bars. Other layers we can add include the specification of the plot title, axes labels, facets, and visual themes for the plot. Let’s now put the theory of the Grammar of Graphics into practice. 3.2 Five Named Graphs - The 5NG For our purposes, we will be limiting consideration to five different types of graphs. 
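Before turning to those, to make the mapping in Table 3.2 concrete in code, here is a rough sketch of how a plot like Figure 3.1 could be specified; the data frame name gapminder_2007 and its column names are placeholders for illustration, not the objects used to build the actual figure:
ggplot(data = gapminder_2007,
       mapping = aes(x = gdp_per_capita, y = life_expectancy,
                     size = population, color = continent)) +
  geom_point()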
We term these five named graphs the 5NG: scatterplots linegraphs boxplots histograms barplots We will discuss some variations of these plots, but with this basic repertoire in your toolbox you can visualize a wide array of different data variable types. Note that certain plots are only appropriate for categorical/logical variables and others only for quantitative variables. You’ll want to quiz yourself often as we go along on which plot makes sense given a particular problem or data-set. 3.3 5NG#1: Scatterplots The simplest of the 5NG are scatterplots (also called bivariate plots); they allow you to investigate the relationship between two numerical variables. While you may already be familiar with this type of plot, let’s view it through the lens of the Grammar of Graphics. Specifically, we will graphically investigate the relationship between the following two numerical variables in the flights data frame: dep_delay: departure delay on the horizontal “x” axis and arr_delay: arrival delay on the vertical “y” axis for Alaska Airlines flights leaving NYC in 2013. This requires paring down the flights data frame to a smaller data frame all_alaska_flights consisting of only Alaska Airlines (carrier code “AS”) flights. Don’t worry for now if you don’t fully understand what this code is doing; we’ll explain it in detail in Chapter 5. For now, just run it all and understand that we are taking all flights and only considering those corresponding to Alaska Airlines. all_alaska_flights <- flights %>% filter(carrier == "AS") This code snippet makes use of functions in the dplyr package for data wrangling to achieve our goal: it takes the flights data frame and filters it to only return the rows which meet the condition carrier == "AS". Recall from Section 2.2 that testing for equality is specified with == and not =. You will see many more examples of == and filter() in Chapter 5. Learning check (LC3.1) Take a look at both the flights and all_alaska_flights data frames by running View(flights) and View(all_alaska_flights) in the console. In what respect do these data frames differ? 3.3.1 Scatterplots via geom_point We proceed to create the scatterplot using the ggplot() function: ggplot(data = all_alaska_flights, mapping = aes(x = dep_delay, y = arr_delay)) + geom_point() Figure 3.2: Arrival Delays vs Departure Delays for Alaska Airlines flights from NYC in 2013 In Figure 3.2 we see that a positive relationship exists between dep_delay and arr_delay: as departure delays increase, arrival delays tend to also increase. We also note that the majority of points fall near the point (0, 0). There is a large mass of points clustered there. Let’s break this down, keeping in mind our discussion in Section 3.1: Within the ggplot() function call, we specify two of the components of the grammar: The data frame to be all_alaska_flights by setting data = all_alaska_flights The aesthetic mapping by setting aes(x = dep_delay, y = arr_delay). Specifically the variable dep_delay maps to the x position aesthetic the variable arr_delay maps to the y position aesthetic We add a layer to the ggplot() function call using the + sign. The layer in question specifies the third component of the grammar: the geometric object. In this case the geometric objects are points, set by specifying geom_point(). Some notes on layers: Note that the + sign comes at the end of lines, and not at the beginning. You’ll get an error in R if you put it at the beginning. 
When adding layers to a plot, you are encouraged to hit Return on your keyboard after entering the + so that the code for each layer is on a new line. As we add more and more layers to plots, you’ll see this will greatly improve the legibility of your code. To stress the importance of adding layers, in particular the layer specifying the geometric object, consider Figure 3.3 where no layers are added. A not very useful plot! ggplot(data = all_alaska_flights, mapping = aes(x = dep_delay, y = arr_delay)) Figure 3.3: Plot with No Layers Learning check (LC3.2) What are some practical reasons why dep_delay and arr_delay have a positive relationship? (LC3.3) What variables (not necessarily in the flights data frame) would you expect to have a negative correlation (i.e. a negative relationship) with dep_delay? Why? Remember that we are focusing on numerical variables here. (LC3.4) Why do you believe there is a cluster of points near (0, 0)? What does (0, 0) correspond to in terms of the Alaskan flights? (LC3.5) What are some other features of the plot that stand out to you? (LC3.6) Create a new scatterplot using different variables in the all_alaska_flights data frame by modifying the example above. 3.3.2 Over-plotting The large mass of points near (0, 0) in Figure 3.2 can cause some confusion. This is the result of a phenomenon called overplotting. As one may guess, this corresponds to values being plotted on top of each other over and over again. It is often difficult to know just how many values are plotted in this way when looking at a basic scatterplot as we have here. There are two ways to address this issue: By adjusting the transparency of the points via the alpha argument By jittering the points via geom_jitter() The first way of relieving overplotting is by changing the alpha argument in geom_point() which controls the transparency of the points. By default, this value is set to 1. We can change this to any value between 0 and 1 where 0 sets the points to be 100% transparent and 1 sets the points to be 100% opaque. Note how the following function call is identical to the one in Section 3.3, but with alpha = 0.2 added to the geom_point(). ggplot(data = all_alaska_flights, mapping = aes(x = dep_delay, y = arr_delay)) + geom_point(alpha = 0.2) Figure 3.4: Delay scatterplot with alpha=0.2 The key feature to note in Figure 3.4 is that the transparency of the points is cumulative: areas with a high-degree of overplotting are darker, whereas areas with a lower degree are less dark. Note that there is no aes() surrounding alpha = 0.2 here. Since we are NOT mapping a variable to an aesthetic but instead are just changing a setting, we don’t need to create a mapping with aes(). In fact, you’ll receive an error if you try to change the second line above to geom_point(aes(alpha = 0.2)). The second way of relieving overplotting is to jitter the points a bit. In other words, we are going to add just a bit of random noise to the points to better see them and remove some of the overplotting. You can think of “jittering” as shaking the points around a bit on the plot. Instead of using geom_point, we use geom_jitter to perform this shaking. To specify how much jitter to add, we adjust the width and height arguments. This corresponds to how hard you’d like to shake the plot in units corresponding to those for both the horizontal and vertical variables (in this case, minutes). 
ggplot(data = all_alaska_flights, mapping = aes(x = dep_delay, y = arr_delay)) + geom_jitter(width = 30, height = 30) Figure 3.5: Jittered delay scatterplot Note how this function call is identical to the one in Subsection 3.3.1, but with geom_point() replaced with geom_jitter(). The plot in Figure 3.5 helps us a little bit in getting a sense for the overplotting, but with a relatively large data-set like this one (714 flights), it can be argued that changing the transparency of the points by setting alpha proved more effective. We’ll see later on that the two following R commands will yield the exact same plot: ggplot(data = all_alaska_flights, mapping = aes(x = dep_delay, y = arr_delay)) + geom_jitter(width = 30, height = 30) ggplot(all_alaska_flights, aes(x = dep_delay, y = arr_delay)) + geom_jitter(width = 30, height = 30) In other words you can drop the data = and mapping = if you keep the order of the two arguments the same. Since the ggplot() function is expecting its first argument data to be a data frame and its second argument to correspond to mapping =, you can omit both and you’ll get the same plot. As you get more and more practice, you’ll likely find yourself not including the specification of the argument like this. But for now to keep things straightforward let’s make it a point to include the data = and mapping =. Learning check (LC3.7) Why is setting the alpha argument value useful with scatterplots? What further information does it give you that a regular scatterplot cannot? (LC3.8) After viewing the Figure 3.4 above, give an approximate range of arrival delays and departure delays that occur the most frequently. How has that region changed compared to when you observed the same plot without the alpha = 0.2 set in Figure 3.2? 3.3.3 Summary Scatterplots display the relationship between two numerical variables. They are among the most commonly used plots because they can provide an immediate way to see the trend in one variable versus another. However, if you try to create a scatterplot where either one of the two variables is not numerical, you will get strange results. Be careful! With medium to large data-sets, you may need to play with either geom_jitter() or the alpha argument in order to get a good feel for relationships in your data. This tweaking is often a fun part of data visualization since you’ll have the chance to see different relationships come about as you make subtle changes to your plots. 3.4 5NG#2: Linegraphs The next of the 5NG is a linegraph. They are most frequently used when the x-axis represents time and the y-axis represents some other numerical variable; such plots are known as time series. Time represents a variable that is connected together by each day following the previous day. In other words, time has a natural ordering. Linegraphs should be avoided when there is not a clear sequential ordering to the explanatory variable, i.e. the x-variable or the predictor variable. Our focus now turns to the temp variable in this weather data-set. By Looking over the weather data-set by typing View(weather) in the console. Running ?weather to bring up the help file. We can see that the temp variable corresponds to hourly temperature (in Fahrenheit) recordings at weather stations near airports in New York City. Instead of considering all hours in 2013 for all three airports in NYC, let’s focus on the hourly temperature at Newark airport (origin code “EWR”) for the first 15 days in January 2013. 
The weather data frame in the nycflights13 package contains this data, but we first need to filter it to only include those rows that correspond to Newark in the first 15 days of January. early_january_weather <- weather %>% filter(origin == "EWR" & month == 1 & day <= 15) This is similar to the previous use of the filter command in Section 3.3, however we now use the & operator. The above selects only those rows in weather where the originating airport is "EWR" and we are in the first month and the day is from 1 to 15 inclusive. Learning check (LC3.9) Take a look at both the weather and early_january_weather data frames by running View(weather) and View(early_january_weather) in the console. In what respect do these data frames differ? (LC3.10) View() the flights data frame again. Why does the time_hour variable uniquely identify the hour of the measurement whereas the hour variable does not? 3.4.1 Linegraphs via geom_line We plot a linegraph of hourly temperature using geom_line(): ggplot(data = early_january_weather, mapping = aes(x = time_hour, y = temp)) + geom_line() Figure 3.6: Hourly Temperature in Newark for January 1-15, 2013 Much as with the ggplot() call in Chapter 3.3.1, we describe the components of the Grammar of Graphics: Within the ggplot() function call, we specify two of the components of the grammar: The data frame to be early_january_weather by setting data = early_january_weather The aesthetic mapping by setting aes(x = time_hour, y = temp). Specifically time_hour (i.e. the time variable) maps to the x position temp maps to the y position We add a layer to the ggplot() function call using the + sign The layer in question specifies the third component of the grammar: the geometric object in question. In this case the geometric object is a line, set by specifying geom_line(). Learning check (LC3.11) Why should linegraphs be avoided when there is not a clear ordering of the horizontal axis? (LC3.12) Why are linegraphs frequently used when time is the explanatory variable? (LC3.13) Plot a time series of a variable other than temp for Newark Airport in the first 15 days of January 2013. 3.4.2 Summary Linegraphs, just like scatterplots, display the relationship between two numerical variables. However, the variable on the x-axis (i.e. the explanatory variable) should have a natural ordering, like some notion of time. We can mislead our audience if that isn’t the case. 3.5 5NG#3: Histograms Let’s consider the temp variable in the weather data frame once again, but now unlike with the linegraphs in Chapter 3.4, let’s say we don’t care about the relationship of temperature to time, but rather we care about the (statistical) distribution of temperatures. We could just produce points where each of the different values appear on something similar to a number line: Figure 3.7: Plot of Hourly Temperature Recordings from NYC in 2013 This gives us a general idea of how the values of temp differ. We see that temperatures vary from around 11 up to 100 degrees Fahrenheit. The area between 40 and 60 degrees appears to have more points plotted than outside that range. 3.5.1 Histograms via geom_histogram What is commonly produced instead of the above plot is a plot known as a histogram. The histogram shows how many elements of a single numerical variable fall in specified bins. In this case, these bins may correspond to between 0-10°F, 10-20°F, etc. 
We produce a histogram of the hour temperatures at all three NYC airports in 2013: ggplot(data = weather, mapping = aes(x = temp)) + geom_histogram() `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. Warning: Removed 1 rows containing non-finite values (stat_bin). Figure 3.8: Histogram of Hourly Temperature Recordings from NYC in 2013 Note here: There is only one variable being mapped in aes(): the single numerical variable temp. You don’t need to compute the y-aesthetic: it gets computed automatically. We set the geometric object to be geom_histogram() We got a warning message of 1 rows containing non-finite values being removed. This is due to one of the values of temperature being missing. R is alerting us that this happened. Another warning corresponds to an urge to specify the number of bins you’d like to create. 3.5.2 Adjusting the bins We can adjust characteristics of the bins in one of two ways: By adjusting the number of bins via the bins argument By adjusting the width of the bins via the binwidth argument First, we have the power to specify how many bins we would like to put the data into as an argument in the geom_histogram() function. By default, this is chosen to be 30 somewhat arbitrarily; we have received a warning above our plot that this was done. ggplot(data = weather, mapping = aes(x = temp)) + geom_histogram(bins = 60, color = "white") Figure 3.9: Histogram of Hourly Temperature Recordings from NYC in 2013 - 60 Bins Note the addition of the color argument. If you’d like to be able to more easily differentiate each of the bins, you can specify the color of the outline as done above. You can also adjust the color of the bars by setting the fill argument. Type colors() in your console to see all 657 available colors. ggplot(data = weather, mapping = aes(x = temp)) + geom_histogram(bins = 60, color = "white", fill = "steelblue") Figure 3.10: Histogram of Hourly Temperature Recordings from NYC in 2013 - 60 Colored Bins Second, instead of specifying the number of bins, we can also specify the width of the bins by using the binwidth argument in the geom_histogram function. ggplot(data = weather, mapping = aes(x = temp)) + geom_histogram(binwidth = 10, color = "white") Figure 3.11: Histogram of Hourly Temperature Recordings from NYC in 2013 - Binwidth = 10 Learning check (LC3.14) What does changing the number of bins from 30 to 60 tell us about the distribution of temperatures? (LC3.15) Would you classify the distribution of temperatures as symmetric or skewed? (LC3.16) What would you guess is the “center” value in this distribution? Why did you make that choice? (LC3.17) Is this data spread out greatly from the center or is it close? Why? 3.5.3 Summary Histograms, unlike scatterplots and linegraphs, present information on only a single numerical variable. In particular they are visualizations of the (statistical) distribution of values. 3.6 Facets Before continuing the 5NG, we briefly introduce a new concept called faceting. Faceting is used when we’d like to create small multiples of the same plot over a different categorical variable. By default, all of the small multiples will have the same vertical axis. For example, suppose we were interested in looking at how the temperature histograms we saw in Chapter 3.5 varied by month. This is what is meant by “the distribution of a variable over another variable”: temp is one variable and month is the other variable. In order to look at histograms of temp for each month, we add a layer facet_wrap(~ month). 
You can also specify how many rows you’d like the small multiple plots to be in using nrow or how many columns using ncol inside of facet_wrap. ggplot(data = weather, mapping = aes(x = temp)) + geom_histogram(binwidth = 5, color = "white") + facet_wrap(~ month, nrow = 4) Figure 3.12: Faceted histogram Note the use of the ~ before month in facet_wrap. The tilde (~) is required and you’ll receive the error Error in as.quoted(facets) : object 'month' not found if you don’t include it before month here. As we might expect, the temperature tends to increase as summer approaches and then decrease as winter approaches. Learning check (LC3.18) What other things do you notice about the faceted plot above? How does a faceted plot help us see relationships between two variables? (LC3.19) What do the numbers 1-12 correspond to in the plot above? What about 25, 50, 75, 100? (LC3.20) For which types of data-sets would these types of faceted plots not work well in comparing relationships between variables? Give an example describing the nature of these variables and other important characteristics. (LC3.21) Does the temp variable in the weather data-set have a lot of variability? Why do you say that? 3.7 5NG#4: Boxplots While using faceted histograms can provide a way to compare distributions of a numerical variable split by groups of a categorical variable as in Section 3.6, an alternative plot called a boxplot (also called a side-by-side boxplot) achieves the same task and is frequently preferred. The boxplot uses the information provided in the five-number summary referred to in Appendix A. It gives a way to compare this summary information across the different levels of a categorical variable. 3.7.1 Boxplots via geom_boxplot Let’s create a boxplot to compare the monthly temperatures as we did above with the faceted histograms. ggplot(data = weather, mapping = aes(x = month, y = temp)) + geom_boxplot() Figure 3.13: Invalid boxplot specification Warning messages: 1: Continuous x aesthetic -- did you forget aes(group=...)? 2: Removed 1 rows containing non-finite values (stat_boxplot). Note the set of warnings that is given here. The second warning corresponds to missing values in the data frame and it is turned off on subsequent plots. Let’s focus on the first warning. Observe that this plot does not look like what we were expecting. We were expecting to see the distribution of temperatures for each month (so 12 different boxplots). The first warning is letting us know that we are plotting a numerical, and not categorical variable, on the x-axis. This gives us the overall boxplot without any other groupings. We can get around this by introducing a new function for our x variable: ggplot(data = weather, mapping = aes(x = factor(month), y = temp)) + geom_boxplot() Figure 3.14: Month by temp boxplot We have introduced a new function called factor() here. One of the things this function does is to convert a discrete value like month (1, 2, …, 12) into a categorical variable. The “box” part of this plot represents the 25th percentile, the median (50th percentile), and the 75th percentile. The dots correspond to outliers. (The specific formulation for these outliers is discussed in Appendix A.) The lines show how the data varies that is not in the center 50% defined by the first and third quantiles. Longer lines correspond to more variability and shorter lines correspond to less variability. Looking at this plot we can see, as expected, that summer months (6 through 8) have higher median temperatures. 
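To attach numbers to what each box is showing, here is a minimal sketch that previews the dplyr functions introduced formally in Chapter 5; it computes the first quartile, median, and third quartile of temp for each month, with na.rm = TRUE dropping the one missing temperature recording noted earlier:
weather %>%
  group_by(month) %>%
  summarize(q1 = quantile(temp, probs = 0.25, na.rm = TRUE),
            median = median(temp, na.rm = TRUE),
            q3 = quantile(temp, probs = 0.75, na.rm = TRUE))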
We can easily compare temperatures across months by drawing imaginary horizontal lines across the plot. Furthermore, the heights of the 12 boxes are informative too; they tell us about the variability, or spread, of temperatures recorded in a given month. But to really bring home what boxplots show, let’s focus only on the month of November’s 2138 temperature recordings. Figure 3.15: November boxplot Now let’s plot all 2138 temperature recordings for November on top of the boxplot in Figure 3.16. Figure 3.16: November boxplot with points What the boxplot does is summarize the 2138 points for you, in particular: 25% of points (about 534 observations) fall below the bottom edge of the box, which is the first quartile of 35.96 degrees Fahrenheit (2.2 degrees Celsius). In other words, 25% of observations were colder than 35.96 degrees Fahrenheit. 25% of points fall between the bottom edge of the box and the solid middle line, which is the median of 46.04 degrees Fahrenheit (7.8 degrees Celsius). In other words, 25% of observations were between 35.96 and 46.04 degrees Fahrenheit. 25% of points fall between the solid middle line and the top edge of the box, which is the third quartile of 51.98 degrees Fahrenheit (11.1 degrees Celsius). In other words, 25% of observations were between 46.04 and 51.98 degrees Fahrenheit. 25% of points fall over the top edge of the box. In other words, 25% of observations were warmer than 51.98 degrees Fahrenheit. Learning check (LC3.22) What does the dot at the bottom of the plot for May correspond to? Explain what might have occurred in May to produce this point. (LC3.23) Which months have the highest variability in temperature? Why do you think this is? (LC3.24) We looked at the distribution of a numerical variable over a categorical variable here with this boxplot. Why can’t we look at the distribution of one numerical variable over the distribution of another numerical variable? Say, temperature across pressure, for example? (LC3.25) Boxplots provide a simple way to identify outliers. Why may outliers be easier to identify when looking at a boxplot instead of a faceted histogram? 3.7.2 Summary Boxplots provide a way to compare and contrast the distribution of one quantitative variable across multiple levels of one categorical variable. One can see where the median falls across the different groups by looking at the center line in the box. To see how spread out the variable is across the different groups, look at both the width of the box and also how far the lines stretch vertically from the box. (If the lines stretch far from the box but the box has a small width, the variability of the values closer to the center is much smaller than the variability of the outer ends of the variable.) Outliers are even more easily identified when looking at a boxplot than when looking at a histogram. 3.8 5NG#5: Barplots Both histograms and boxplots represent ways to visualize the variability of numerical variables. Another common task is to present the distribution of a categorical variable. This is a simpler task, focused on how many elements from the data fall into different categories of the categorical variable. Often the best way to visualize these different counts (also known as frequencies) is via a barplot, also known as a barchart. One complication, however, is how your counts are represented in your data. For example, run the following code in your Console. This code manually creates two data frames representing counts of fruit. 
fruits <- data_frame( fruit = c("apple", "apple", "apple", "orange", "orange") ) fruits_counted <- data_frame( fruit = c("apple", "orange"), number = c(3, 2) ) We see both the fruits and fruits_counted data frames represent the same collection of fruit: three apples and two oranges. However, whereas fruits just lists the fruit: Table 3.3: Fruits fruit apple apple apple orange orange fruits_counted has a variable count, where the counts are pre-tabulated. Table 3.4: Fruits (Pre-Counted) fruit number apple 3 orange 2 Compare the barcharts in Figures 3.17 and 3.18, which are identical, but are based on two different data frames: ggplot(data = fruits, mapping = aes(x = fruit)) + geom_bar() Figure 3.17: Barplot when counts are not pre-tabulated ggplot(data = fruits_counted, mapping = aes(x = fruit, y = number)) + geom_col() Figure 3.18: Barplot when counts are pre-tabulated Observe that: The code that generates Figure 3.17 based on fruits does not have an explicit y aesthetic and uses geom_bar() The code that generates Figure 3.18 based on fruits_counted has an explicit y aesthetic (to the variable number) and uses geom_col() (Please note that this one of ggplot2’s trickier aspects that causes the most confusion, and fortunately this is as complicated as our use of ggplot2 is going to get.) Stating the above differently: When the categorical variable you want to plot is not pre-tabulated in your data frame you need to use geom_bar(). When the categorical variable is pre-tabulated (in the above fruits_counted example in the variable number), you need to use geom_col() with the y aesthetic explicitly mapped. 3.8.1 Barplots via geom_bar/geom_col Consider the distribution of airlines that flew out of New York City in 2013. Here we explore the number of flights from each airline/carrier. This can be plotted by invoking the geom_bar function in ggplot2: ggplot(data = flights, mapping = aes(x = carrier)) + geom_bar() Figure 3.19: Number of flights departing NYC in 2013 by airline using geom_bar To get an understanding of what the names of these airlines are corresponding to these carrier codes, we can look at the airlines data frame in the nycflights13 package. airlines carrier name 9E Endeavor Air Inc. AA American Airlines Inc. AS Alaska Airlines Inc. B6 JetBlue Airways DL Delta Air Lines Inc. EV ExpressJet Airlines Inc. F9 Frontier Airlines Inc. FL AirTran Airways Corporation HA Hawaiian Airlines Inc. MQ Envoy Air OO SkyWest Airlines Inc. UA United Air Lines Inc. US US Airways Inc. VX Virgin America WN Southwest Airlines Co. YV Mesa Airlines Inc. Going back to our barplot, we see that United Air Lines, JetBlue Airways, and ExpressJet Airlines had the most flights depart New York City in 2013. To get the actual number of flights by each airline we can use the group_by(), summarize(), and n() functions in the dplyr package on the carrier variable in flights, which we will introduce formally in Chapter 5. flights_table <- flights %>% group_by(carrier) %>% summarize(number = n()) flights_table carrier number 9E 18460 AA 32729 AS 714 B6 54635 DL 48110 EV 54173 F9 685 FL 3260 HA 342 MQ 26397 OO 32 UA 58665 US 20536 VX 5162 WN 12275 YV 601 In this table, the counts of the carriers are pre-tabulated. To create a barchart using the data frame flights_table, we use geom_col and map the y aesthetic to the variable number. Compare this barplot using geom_col in Figure 3.20 with the earlier barplot using geom_bar in Figure 3.19. They are identical. 
ggplot(data = flights_table, mapping = aes(x = carrier, y = number)) + geom_col() Figure 3.20: Number of flights departing NYC in 2013 by airline using geom_col Learning check (LC3.26) Why are histograms inappropriate for visualizing categorical variables? (LC3.27) What is the difference between histograms and barplots? (LC3.28) How many Envoy Air flights departed NYC in 2013? (LC3.29) What was the seventh highest airline in terms of departed flights from NYC in 2013? How could we better present the table to get this answer quickly. 3.8.2 Must avoid pie charts! Unfortunately, one of the most common plots seen today for categorical data is the pie chart. While they may see harmless enough, they actually present a problem in that humans are unable to judge angles well. As Naomi Robbins describes in her book “Creating More Effective Graphs” (Robbins 2013), we overestimate angles greater than 90 degrees and we underestimate angles less than 90 degrees. In other words, it is difficult for us to determine relative size of one piece of the pie compared to another. Let’s examine our previous barplot example on the number of flights departing NYC by airline. This time we will use a pie chart. As you review this chart, try to identify how much larger the portion of the pie is for ExpressJet Airlines (EV) compared to US Airways (US), what the third largest carrier is in terms of departing flights, and how many carriers have fewer flights than United Airlines (UA)? Figure 3.21: The dreaded pie chart While it is quite easy to look back at the barplot to get the answer to these questions, it’s quite difficult to get the answers correct when looking at the pie graph. Barplots can always present the information in a way that is easier for the eye to determine relative position. There may be one exception from Nathan Yau at FlowingData.com but we will leave this for the reader to decide: Figure 3.22: The only good pie chart Learning check (LC3.30) Why should pie charts be avoided and replaced by barplots? (LC3.31) What is your opinion as to why pie charts continue to be used? 3.8.3 Using barplots to compare two categorical variables Barplots are the go-to way to visualize the frequency of different categories of a categorical variable. They make it easy to order the counts and to compare the frequencies of one group to another. Another use of barplots (unfortunately, sometimes inappropriately and confusingly) is to compare two categorical variables together. Let’s examine the distribution of outgoing flights from NYC by carrier and airport. We begin by getting the names of the airports in NYC that were included in the flights data-set. Here, we preview the inner_join() function from Chapter 5. This function will join the data frame flights with the data frame airports by matching rows that have the same airport code. However, in flights the airport code is included in the origin variable whereas in airports the airport code is included in the faa variable. We will revisit such examples in Section 5.8 on joining data-sets. flights_namedports <- flights %>% inner_join(airports, by = c("origin" = "faa")) After running View(flights_namedports), we see that name now corresponds to the name of the airport as referenced by the origin variable. We will now plot carrier as the horizontal variable. When we specify geom_bar, it will specify count as being the vertical variable. A new addition here is fill = name. Look over what was produced from the plot to get an idea of what this argument gives. 
ggplot(data = flights_namedports, mapping = aes(x = carrier, fill = name)) + geom_bar() Figure 3.23: Stacked barplot comparing the number of flights by carrier and airport This plot is what is known as a stacked barplot. While simple to make, it often leads to many problems. For example in this plot, it is difficult to compare the heights of the different colors (corresponding to the number of flights from each airport) between the bars (corresponding to the different carriers). Note that fill is an aesthetic just like x is an aesthetic, and thus must be included within the parentheses of the aes() mapping. The following code, where the fill aesthetic is specified on the outside will yield an error. This is a fairly common error that new ggplot users make: ggplot(data = flights_namedports, mapping = aes(x = carrier), fill = name) + geom_bar() Learning check (LC3.32) What kinds of questions are not easily answered by looking at the above figure? (LC3.33) What can you say, if anything, about the relationship between airline and airport in NYC in 2013 in regards to the number of departing flights? Another variation on the stacked barplot is the side-by-side barplot. ggplot(data = flights_namedports, mapping = aes(x = carrier, fill = name)) + geom_bar(position = "dodge") Figure 3.24: Side-by-side barplot comparing the number of flights by carrier and airport Learning check (LC3.34) Why might the side-by-side barplot be preferable to a stacked barplot in this case? (LC3.35) What are the disadvantages of using a side-by-side barplot, in general? Lastly, an often preferred type of barplot is the faceted barplot. We already saw this concept of faceting and small multiples in Section 3.6. This gives us a nicer way to compare the distributions across both carrier and airport/name. ggplot(data = flights_namedports, mapping = aes(x = carrier, fill = name)) + geom_bar() + facet_grid(name ~ .) Figure 3.25: Faceted barplot comparing the number of flights by carrier and airport Note how the facet_grid function arguments are written here. We are wanting the names of the airports vertically and the carrier listed horizontally. As you may have guessed, this argument and other formulas of this sort in R are in y ~ x order. We will see more examples of this in Chapter 6. If you’d like to create small multiples in a vertical direction, you’ll want to use facet_grid() with the name of the variable before the ~ as we did in Figure 3.25. This corresponds to vertical going with y in the formula. If instead you’d like the small multiples to be in the horizontal direction, you’d use facet_grid() with the name of the variable after the ~, corresponding to the x position in the formula. Further, you can use facet_wrap() if you would like the small multiples to wrap into multiple rows as we saw earlier in the faceted histogram example in Figure 3.12. Additionally, you could use facet_grid() with one variable in the y position and another variable in the x position creating a grid of all possible combinations of the two variables. Learning check (LC3.36) Why is the faceted barplot preferred to the side-by-side and stacked barplots in this case? (LC3.37) What information about the different carriers at different airports is more easily seen in the faceted barplot? 3.8.4 Summary Barplots are the preferred way of displaying categorical variables. They are easy-to-understand and make it easy to compare across groups of a categorical variable. 
When dealing with more than one categorical variable, faceted barplots are frequently preferred over side-by-side or stacked barplots. Stacked barplots are sometimes nice to look at, but it is quite difficult to compare across the levels since the sizes of the bars are all of different sizes. Side-by-side barplots can provide an improvement on this, but the issue about comparing across groups still must be dealt with. 3.9 Conclusion 3.9.1 Review questions Review questions have been designed using the fivethirtyeight R package (Kim, Ismay, and Chunn 2017) with links to the corresponding FiveThirtyEight.com articles in our free DataCamp course Effective Data Storytelling using the tidyverse. The material in this chapter is covered in the chapters of the DataCamp course available below: Scatterplots & Linegraphs Histograms & Boxplots Barplots ggplot2 Review 3.9.2 What’s to come? In Chapter 4, we’ll introduce the concept of “tidy data” and how it is used as a key data format for all the packages we use in this textbook. You’ll see that the concept appears to be simple, but actually can be a little challenging to decipher without careful practice. We’ll also investigate how to import CSV (comma-separated value) files into R using the readr package. 3.9.3 Resources An excellent resource as you begin to create plots using the ggplot2 package is a cheatsheet that RStudio has put together entitled “Data Visualization with ggplot2” available by clicking here or by clicking the RStudio Menu Bar -> Help -> Cheatsheets -> “Data Visualization with ggplot2” This cheatsheet covers more than what we’ve discussed in this chapter but provides nice visual descriptions of what each function produces. In addition, we’ve created a mind map to help you remember which types of plots are most appropriate in a given situation by identifying the types of variables involved in the problem. Figure 3.26: Mind map for Data Visualization 3.9.4 Script of R code An R script file of all R code used in this chapter is available here. "], +["4-tidy.html", "4 Tidy Data via tidyr 4.1 What is tidy data? 4.2 Back to nycflights13 4.3 Importing CSVs via readr 4.4 Converting from wide to long 4.5 Optional: Normal forms of data 4.6 Conclusion", " 4 Tidy Data via tidyr In Subsection 2.2.1 we introduced the concept of a data frame: a rectangular spreadsheet-like representation of data in R where the rows correspond to observations and the columns correspond to variables describing each observation. In Section 2.4, we started explorations of our first data frame flights included in the nycflights13 package. In Chapter 3 we made graphics using data contained in flights and other data frames. In this chapter, we extend some of these ideas by discussing a type of data formatting called “tidy” data. You will see that having data stored in “tidy” format is about more than what the colloquial definition of the term “tidy” might suggest of having your data “neatly organized” in a spreadsheet. Instead, we define the term “tidy” in a more rigorous fashion, outlining a set of rules by which data can be stored and the implications of these rules on analyses. Although knowledge of this type of data formatting was not necessary in our treatment of data visualization in Chapter 3 since all the data was already in tidy format, we’ll see going forward that having tidy data will allow you to more easily create data visualizations in a wide range of settings. 
Furthermore, it will also help you with data wrangling in Chapter 5 and in all subsequent chapters in this book when we cover regression and discuss statistical inference. Needed packages Let’s load all the packages needed for this chapter (this assumes you’ve already installed them). If needed, read Section 2.3 for information on how to install and load R packages. library(dplyr) library(ggplot2) library(nycflights13) library(tidyr) library(readr) 4.1 What is tidy data? You have surely heard the word “tidy” in your life: “Tidy up your room!” “Please write your homework in a tidy way so that it is easier to grade and to provide feedback.” Marie Kondo’s best-selling book The Life-Changing Magic of Tidying Up: The Japanese Art of Decluttering and Organizing “I am not by any stretch of the imagination a tidy person, and the piles of unread books on the coffee table and by my bed have a plaintive, pleading quality to me - ‘Read me, please!’” - Linda Grant What does it mean for your data to be “tidy”? Beyond just being organized, in the context of this book having “tidy” data means that your data follows a standardized format. This makes it easier for you and others to visualize your data, to wrangle/transform your data, and to model your data. We will follow Hadley Wickham’s definition of tidy data here (Wickham 2014): A dataset is a collection of values, usually either numbers (if quantitative) or strings AKA text data (if qualitative). Values are organised in two ways. Every value belongs to a variable and an observation. A variable contains all values that measure the same underlying attribute (like height, temperature, duration) across units. An observation contains all values measured on the same unit (like a person, or a day, or a city) across attributes. Tidy data is a standard way of mapping the meaning of a dataset to its structure. A dataset is messy or tidy depending on how rows, columns and tables are matched up with observations, variables and types. In tidy data: Each variable forms a column. Each observation forms a row. Each type of observational unit forms a table. Figure 4.1: Tidy data graphic from http://r4ds.had.co.nz/tidy-data.html For example, say the following table consists of stock prices: Table 4.1: Stock Prices (Non-Tidy Format) Date Boeing Stock Price Amazon Stock Price Google Stock Price 2009-01-01 $173.55 $174.90 $174.34 2009-01-02 $172.61 $171.42 $170.04 Although the data are neatly organized in a spreadsheet-type format, they are not in tidy format since there are three variables corresponding to three unique pieces of information (Date, Stock Name, and Stock Price), but there are not three columns. In tidy data format each variable should be its own column, as shown below. Notice that both tables present the same information, but in different formats. Table 4.2: Stock Prices (Tidy Format) Date Stock Name Stock Price 2009-01-01 Boeing $173.55 2009-01-02 Boeing $172.61 2009-01-01 Amazon $174.90 2009-01-02 Amazon $171.42 2009-01-01 Google $174.34 2009-01-02 Google $170.04 However, consider the following table Table 4.3: Date, Boeing Price, Weather Data Date Boeing Price Weather 2009-01-01 $173.55 Sunny 2009-01-02 $172.61 Overcast In this case, even though the variable “Boeing Price” occurs again, the data is tidy since there are three variables corresponding to three unique pieces of information (Date, Boeing stock price, and the weather that particular day). 
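As a preview of how such a conversion can be done in R, here is a minimal sketch that recreates Table 4.1 by hand and reshapes it into the three-column format of Table 4.2 using the gather() function from the tidyr package (discussed later in this chapter); the resulting column names stock_name and stock_price are our own choices:
library(dplyr)   # already loaded above
library(tidyr)   # already loaded above
stock_prices_wide <- data_frame(
  Date   = c("2009-01-01", "2009-01-02"),
  Boeing = c(173.55, 172.61),
  Amazon = c(174.90, 171.42),
  Google = c(174.34, 170.04)
)
stock_prices_tidy <- stock_prices_wide %>%
  gather(key = "stock_name", value = "stock_price", Boeing, Amazon, Google)
stock_prices_tidy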
The non-tidy data format in the original table is also known as “wide” format whereas the tidy data format in the second table is also known as “long”/“narrow” data format. In this book, we will work with work with datasets that are already in tidy format. But data isn’t always in this nice format that the tidyverse gets its name from. Data actually may come to you in a variety of different formats that require data cleaning and reshaping beyond the scope of this book. For a thorough example of the steps needed to take a messy dataset and turn it into a tidy one, check out the different functions available for data tidying and a case study using data from the World Health Organization in R for Data Science (Grolemund and Wickham 2016). Most frequently though, data that isn’t in long format and is instead in wide format can be converted into “tidy” format by using the tidyr package (Wickham and Henry 2017) in the tidyverse. We’ll now investigate how that can be done using the gather() function in tidyr. Before we proceed with reshaping our data, we will discuss how to read data stored in CSV format into R as a data frame. 4.2 Back to nycflights13 Recall the nycflights13 package with data about all domestic flights departing from New York City in 2013 that we introduced in Section 2.4 and used extensively in Chapter 3 to create visualizations. In particular, let’s revisit the flights data frame by running View(flights) in your console. We see that flights has a rectangular shape with each row corresponding to a different flight and each column corresponding to a characteristic of that flight. This matches exactly with how Hadley Wickham defined tidy data: Each variable forms a column. Each observation forms a row. But what about the third property? Each type of observational unit forms a table. 4.2.1 Observational units We identified earlier that the observational unit in the flights dataset is an individual flight. And we have shown that this dataset consists of 336,776 flights with 19 variables. In other words, rows of this dataset don’t refer to a measurement on an airline or on an airport; they refer to characteristics/measurements on a given flight from New York City in 2013. Also included in the nycflights13 package are datasets with different observational units (Wickham 2017): airlines: translation between two letter IATA carrier codes and names (16 in total) planes: construction information about each of 3,322 planes used weather: hourly meteorological data (about 8710 observations) for each of the three NYC airports airports: airport names and locations The organization of this data follows the third “tidy” data property: observations corresponding to the same observational unit should be saved in the same table/data frame. 4.2.2 Identification vs measurement variables There is a subtle difference between the kinds of variables that you will encounter in data frames: measurement variables and identification variables. The airports data frame you worked with above contains both these types of variables. Recall that in airports the observational unit is an airport, and thus each row corresponds to one particular airport. Let’s pull them apart using the glimpse function: glimpse(airports) Observations: 1,458 Variables: 8 $ faa <chr> "04G", "06A", "06C", "06N", "09J", "0A9", "0G6", "0G7", "0P2"... $ name <chr> "Lansdowne Airport", "Moton Field Municipal Airport", "Schaum... $ lat <dbl> 41.13, 32.46, 41.99, 41.43, 31.07, 36.37, 41.47, 42.88, 39.79... 
$ lon <dbl> -80.62, -85.68, -88.10, -74.39, -81.43, -82.17, -84.51, -76.7... $ alt <int> 1044, 264, 801, 523, 11, 1593, 730, 492, 1000, 108, 409, 875,... $ tz <dbl> -5, -6, -6, -5, -5, -5, -5, -5, -5, -8, -5, -6, -5, -5, -5, -... $ dst <chr> "A", "A", "A", "A", "A", "A", "A", "A", "U", "A", "A", "U", "... $ tzone <chr> "America/New_York", "America/Chicago", "America/Chicago", "Am... The variables faa and name are what we will call identification variables: variables that uniquely identify each observational unit. They are mainly used to provide a name to the observational unit. faa gives the code provided by the FAA for that airport while the name variable gives the longer more natural name of the airport. The remaining variables (lat, lon, alt, tz, dst, tzone) are often called measurement or characteristic variables: variables that describe properties of each observational unit, in other words each observation in each row. For example, lat and long describe the latitude and longitude of each airport. While it is not an absolute rule, for organizational purposes it considered good practice to have your identification variables in the far left-most columns of your data frame. Learning check (LC4.1) What properties of the observational unit do each of lat, lon, alt, tz, dst, and tzone describe for the airports data frame? Note that you may want to use ?airports to get more information. (LC4.2) Provide the names of variables in a data frame with at least three variables in which one of them is an identification variable and the other two are not. In other words, create your own tidy dataset that matches these conditions. 4.3 Importing CSVs via readr Up to this point, we’ve used data either stored inside of an R package or we’ve manually created the data such as the fruits and fruits_counted data in Subsection 3.8. Another common way to get data into R is via reading in data from a spreadsheet either stored on your computer or stored online. For our purposes here, we will work with downloading data stored online. First, let’s download a Comma Separated Values (CSV) file of ratings of the level of democracy in different countries spanning 1952 to 1992: http://ismayc.github.io/dem_score.csv. After downloading it open it and take a look. You can think of a CSV file as a bare-bones spreadsheet where: Each line in the file corresponds to a row of data/one observation. Values for each line are separated with commas. In other words, the values of different variables are separated by commas. The first line is usually a header row indicating the names of the columns/variables. As opposed to a bare-bones CSV file, Excel files contain a lot of metadata, or put more simply, data about the data. Examples include the used of bold and italic fonts, colored cells, different column widths, etc. However, going forward we will only avail ourselves of just the data, and not the metadata, as saved in a CSV file. There are many ways to read in this data into RStudio. Here are two of the simplest; for the purposes of practice, we suggest you try both. 
First, we can use the read_csv() function from the readr package to read in the data directly off the web: dem_score <- read_csv("http://ismayc.github.io/dem_score.csv") dem_score # A tibble: 96 x 10 country `1952` `1957` `1962` `1967` `1972` `1977` `1982` `1987` `1992` <chr> <int> <int> <int> <int> <int> <int> <int> <int> <int> 1 Albania -9 -9 -9 -9 -9 -9 -9 -9 5 2 Argentina -9 -1 -1 -9 -9 -9 -8 8 7 3 Armenia -9 -7 -7 -7 -7 -7 -7 -7 7 4 Australia 10 10 10 10 10 10 10 10 10 5 Austria 10 10 10 10 10 10 10 10 10 6 Azerbaijan -9 -7 -7 -7 -7 -7 -7 -7 1 7 Belarus -9 -7 -7 -7 -7 -7 -7 -7 7 8 Belgium 10 10 10 10 10 10 10 10 10 9 Bhutan -10 -10 -10 -10 -10 -10 -10 -10 -10 10 Bolivia -4 -3 -3 -4 -7 -7 8 9 9 # ... with 86 more rows Second, let’s read in the same data, but using the file you just downloaded on to your computer: Go to the Files pane of RStudio -> Navigate the directories to where your downloaded files are -> Right click dem_score.csv -> Click “Import Dataset…” -> Click “Import”. You’ll see two things happen: The RStudio Viewer will pop open with your data. In the console, the command that read-in the data will run. You can copy and paste this code to reload your data again later. In this dem_score data frame, the minimum value of -10 corresponds to a highly autocratic nation whereas a value of 10 corresponds to a highly democratic nation. Note also that backticks surround the different names of the columns here. Variable names are not allowed to start with a number but this can be worked around by surrounding the column name in backticks. Variable names also can’t include spaces so if you’d like to refer to the variable Stock Names above, for example, you’ll need to surround it in backticks: `Stock Names`. 4.4 Converting from wide to long Let’s focus on only the data corresponding to the country of Guatemala. guat_dem <- dem_score %>% filter(country == "Guatemala") guat_dem # A tibble: 1 x 10 country `1952` `1957` `1962` `1967` `1972` `1977` `1982` `1987` `1992` <chr> <int> <int> <int> <int> <int> <int> <int> <int> <int> 1 Guatemala 2 -6 -5 3 1 -3 -7 3 3 Now let’s produce a plot showing how the democracy scores have changed over the 40 years from 1952 to 1992 for Guatemala. Let’s start by laying out how we would map our aesthetics to variables in the data frame: The data frame is guat_dem by setting data = guat_dem What are the names of the variables to plot? We’d like to see how the democracy score has changed over the years. Now we are stuck in a predicament. We see that we have a variable named country but its only value is "Guatemala". We have other variables denoted by different year values. Unfortunately, we’ve run into a dataset that is not in the appropriate format to apply the Grammar of Graphics and ggplot2. Remember that ggplot2 is a package in the tidyverse and, thus, needs data to be in a tidy format. We’d like to finish off our mapping of aesthetics to variables by doing something like The aesthetic mapping is set by aes(x = year, y = democracy_score) but this is not possible with our wide-formatted data. We need to take the values of the current column names in guat_dem (aside from country) and convert them into a new variable that will act as a key called year. Then, we’d like to take the numbers on the inside of the table and turn them into a column that will act as values called democracy_score. Our resulting data frame will have three columns: country, year, and democracy_score. The gather() function in the tidyr package can complete this task for us. 
The first argument to gather(), just as with ggplot(), is the data argument where we specify which data frame we would like to tidy. The next two arguments to gather() are key and value, which specify what we’d like to call the new columns that convert our wide data into long format. Lastly, we include a specification for variables we’d like to NOT include in this tidying process using a -. guat_tidy <- gather(data = guat_dem, key = year, value = democracy_score, -country) guat_tidy # A tibble: 9 x 3 country year democracy_score <chr> <chr> <int> 1 Guatemala 1952 2 2 Guatemala 1957 -6 3 Guatemala 1962 -5 4 Guatemala 1967 3 5 Guatemala 1972 1 6 Guatemala 1977 -3 7 Guatemala 1982 -7 8 Guatemala 1987 3 9 Guatemala 1992 3 We can now create the plot to show how the democracy score of Guatemala changed from 1952 to 1992 using a linegraph and ggplot2. ggplot(data = guat_tidy, mapping = aes(x = year, y = democracy_score)) + geom_line() geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic? Observe that the year variable in guat_tidy is stored as a character vector since we had to circumvent the naming rules in R by adding backticks around the different year columns in guat_dem. This leads to ggplot not knowing exactly how to plot a line using a categorical variable. We can fix this by using the parse_number() function in the readr package and then specifying the horizontal axis label to be "year": ggplot(data = guat_tidy, mapping = aes(x = parse_number(year), y = democracy_score)) + geom_line() + labs(x = "year") Figure 4.2: Guatemala’s democracy score ratings from 1952 to 1992 We’ll see in Chapter 5 how we could use the mutate() function to change year to be a numeric variable instead after we have done our tidying. Notice now that the mappings of aesthetics to variables make sense in Figure 4.2: The data frame is guat_tidy by setting data = guat_tidy The x aesthetic is mapped to year The y aesthetic is mapped to democracy_score The geom_etry chosen is line Learning check (LC4.3) Convert the dem_score data frame into a tidy data frame and assign the name dem_tidy to the resulting long-formatted data frame. (LC4.4) Read in the life expectancy data stored at http://ismayc.github.io/le_mess.csv and convert it to a tidy data frame. 4.5 Optional: Normal forms of data The datasets included in the nycflights13 package are in a form that minimizes redundancy of data. We will see that there are ways to merge (or join) the different tables together easily. We are capable of doing so because each of the tables has keys in common to relate one to another. This is an important property of normal forms of data. The process of decomposing data frames into less redundant tables without losing information is called normalization. More information is available on Wikipedia. We saw an example of this above with the airlines dataset. While the flights data frame could also include a column with the names of the airlines instead of the carrier code, this would be repetitive since there is a unique mapping of the carrier code to the name of the airline/carrier. Below an example is given showing how to join the airlines data frame together with the flights data frame by linking the two datasets via the common key "carrier". Note that this “joined” data frame is assigned to a new data frame called joined_flights. The key variable that we frequently join by is one of the identification variables mentioned above. 
library(dplyr) joined_flights <- inner_join(x = flights, y = airlines, by = "carrier") View(joined_flights) If we View this dataset, we see a new variable has been created called name. (We will see in Subsection 5.9.2 ways to change name to a more descriptive variable name.) More discussion about joining data frames together will be given in Chapter 5. We will see there that the names of the columns to be linked need not match as they did here with "carrier". Learning check (LC4.5) What are common characteristics of “tidy” datasets? (LC4.6) What makes “tidy” datasets useful for organizing data? (LC4.7) What are some advantages of data in normal forms? What are some disadvantages? 4.6 Conclusion 4.6.1 Review questions Review questions have been designed using the fivethirtyeight R package (Kim, Ismay, and Chunn 2017) with links to the corresponding FiveThirtyEight.com articles in our free DataCamp course Effective Data Storytelling using the tidyverse. The material in this chapter is covered in the Tidy Data chapter of the DataCamp course available here. 4.6.2 What’s to come? In Chapter 5, we’ll further explore data in tidy format by grouping our data, creating summaries based on those groupings, filtering our data to match conditions, and performing other wranglings with our data including defining new columns/variables. These data wrangling procedures will go hand-in-hand with the data visualizations you’ve produced in Chapter 3. 4.6.3 Script of R code An R script file of all R code used in this chapter is available here. "], +["5-wrangling.html", "5 Data Wrangling via dplyr 5.1 The pipe %>% 5.2 Five Main Verbs - The 5MV 5.3 5MV#1: Filter observations using filter 5.4 5MV#2: Summarize variables using summarize 5.5 5MV#3: Group rows using group_by 5.6 5MV#4: Create new variables/change old variables using mutate 5.7 5MV#5: Reorder the data frame using arrange 5.8 Joining data frames 5.9 Optional: Other verbs 5.10 Conclusion", " 5 Data Wrangling via dplyr Let’s briefly recap where we have been so far and where we are headed. In Chapter 4, we discussed what it means for data to be tidy. We saw that this refers to observations corresponding to rows and variables being stored in columns (one variable for every column). The entries in the data frame correspond to different combinations of observations (specific instances of observational units) and variables. In the flights data frame, we saw that each row corresponds to a different flight leaving New York City. In other words, the observational unit of the flights tidy data frame is a flight. The variables are listed as columns, and for flights these columns include both quantitative variables like dep_delay and distance and also categorical variables like carrier and origin. An entry in the table corresponds to a particular flight on a given day and a particular value of a given variable representing that flight. Armed with this knowledge and looking back on Chapter 3, we see that organizing data in this tidy way makes it easy for us to produce graphics, specifically a set of 5 common graphics we termed the 5 Named Graphics (5NG): scatterplots linegraphs boxplots histograms barplots We can simply specify what variable/column we would like on one axis, (if applicable) what variable we’d like on the other axis, and what type of plot we’d like to make by specifying the geometric object in question. 
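As a quick reminder of what that looks like in code, here is a sketch (reusing the flights data and the ggplot2 syntax from Chapter 3; the particular variables chosen are just for illustration):
library(ggplot2)
library(nycflights13)
# One variable per axis, one geom for the type of plot (here a scatterplot):
ggplot(data = flights, mapping = aes(x = dep_delay, y = arr_delay)) +
  geom_point()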
We can also vary aesthetic attributes of the geometric objects in question (points, lines, bar), such as the size and color, along the values of another variable in this tidy dataset. Recall the Gapminder example from Figure 3.1. Lastly, in a few spots in Chapter 3 and Chapter 4, we hinted at some ways to summarize and wrangle data to suit your needs, using the filter() and inner_join() functions. This chapter expands on these functions by giving a variety of examples using what we term the Five Main Verbs (5MV) in the dplyr package (Wickham et al. 2017). Needed packages Let’s load all the packages needed for this chapter (this assumes you’ve already installed them). If needed, read Section 2.3 for information on how to install and load R packages. library(dplyr) library(ggplot2) library(nycflights13) library(knitr) 5.1 The pipe %>% Before we introduce the five main verbs, we first introduce the pipe operator (%>%). Just as the + sign was used to add layers to a plot created using ggplot(), the pipe operator allows us to chain together dplyr data wrangling functions. The pipe operator can be read as “then”. The %>% operator allows us to go from one step in dplyr to the next easily so we can, for example: filter our data frame to only focus on a few rows then group_by another variable to create groups then summarize this grouped data to calculate the mean for each level of the group. The piping syntax will be our major focus throughout the rest of this book and you’ll find that you’ll quickly be addicted to the chaining with some practice. 5.2 Five Main Verbs - The 5MV The d in dplyr stands for data frames, so the functions in dplyr are built for working with objects of the data frame type. For now, we focus on the 5MV: the five most commonly used functions that help wrangle and summarize data. A description of these verbs follows, with each section devoted to an example of that verb, or a combination of a few verbs, in action. filter(): Pick rows based on conditions about their values summarize(): Compute summary measures known as “summary statistics” of variables group_by(): Group rows of observations together mutate(): Create a new variable in the data frame by mutating existing ones arrange(): Arrange/sort the rows based on one or more variables Just as we had the Five Named Graphs for data visualization using ggplot2 in Chapter 3, we have the 5MV here (The Five Main Verbs in dplyr) for data wrangling. All of the 5MVs follow the same syntax, with the argument before the pipe %>% being the name of the data frame, then the name of the verb, followed with other arguments specifying which criteria you’d like the verb to work with in parentheses. Keep in mind, there are more advanced functions than just these five and you’ll see some examples of this near the end of this chapter in 5.9, but with the 5MV you’ll be able to perform a broad array of data wrangling tasks. 5.3 5MV#1: Filter observations using filter Figure 5.1: Filter diagram from Data Wrangling with dplyr and tidyr cheatsheet The filter function here works much like the “Filter” option in Microsoft Excel; it allows you to specify criteria about values of a variable in your dataset and then chooses only those rows that match that criteria. We begin by focusing only on flights from New York City to Portland, Oregon. The dest code (or airport code) for Portland, Oregon is "PDX". 
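(As an aside, and as a sketch that is not part of the text: if you ever want to confirm which airport a code refers to, you can filter the airports data frame on its faa column, using the filter() function introduced immediately below.)
airports %>% filter(faa == "PDX")   # look up the airport whose FAA code is PDX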
Run the following and look at the resulting spreadsheet to ensure that only flights heading to Portland are chosen here: portland_flights <- flights %>% filter(dest == "PDX") View(portland_flights) Note the following: The ordering of the commands: Take the data frame flights then filter the data frame so that only those where the dest equals "PDX" are included. The double equal sign == for testing for equality, and not =. You are almost guaranteed to make the mistake at least once of only including one equals sign. You can combine multiple criteria together using operators that make comparisons: | corresponds to “or” & corresponds to “and” We can often skip the use of & and just separate our conditions with a comma. You’ll see this in the example below. In addition, you can use other mathematical checks (similar to ==): > corresponds to “greater than” < corresponds to “less than” >= corresponds to “greater than or equal to” <= corresponds to “less than or equal to” != corresponds to “not equal to” To see many of these in action, let’s select all flights that left JFK airport heading to Burlington, Vermont ("BTV") or Seattle, Washington ("SEA") in the months of October, November, or December. Run the following: btv_sea_flights_fall <- flights %>% filter(origin == "JFK", (dest == "BTV" | dest == "SEA"), month >= 10) View(btv_sea_flights_fall) Note: even though colloquially speaking one might say “all flights going to Burlington, Vermont and Seattle, Washington,” in terms of computer logical operations, we really mean “all flights going to Burlington, Vermont or Seattle, Washington.” For a given row in the data, dest can be “BTV”, “SEA”, or something else, but not “BTV” and “SEA” at the same time. Another example uses the ! to pick rows that don’t match a condition. The ! can be read as “not”. Here we are selecting rows corresponding to flights that didn’t go to Burlington, VT or Seattle, WA. not_BTV_SEA <- flights %>% filter(!(dest == "BTV" | dest == "SEA")) View(not_BTV_SEA) As a final note we point out that filter() should often be the first verb you’ll apply to your data. This cleans your dataset to only those rows you care about, or put differently, it narrows down the scope to just the observations you care about. Learning check (LC5.1) What’s another way using the “not” operator ! we could filter only the rows that are not going to Burlington, VT nor Seattle, WA in the flights data frame? Test this out using the code above. 5.4 5MV#2: Summarize variables using summarize The next common task when working with data is to be able to summarize data: take a large number of values and summarize them with a single value. While this may seem like a very abstract idea, something as simple as the sum, the smallest value, and the largest value are all summaries of a large number of values. 
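For instance, here is a small sketch (not from the text) applying a few of those base R summary functions directly to the temp variable in the weather data frame; each call collapses thousands of hourly temperatures down to a single number, which is exactly the kind of computation summarize() packages up for us. (Note these may return NA because of missing values, an issue discussed just below.)
sum(weather$temp)   # the sum of all hourly temperatures
min(weather$temp)   # the smallest value
max(weather$temp)   # the largest value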
Figure 5.2: Summarize diagram from Data Wrangling with dplyr and tidyr cheatsheet Figure 5.3: Another summarize diagram from Data Wrangling with dplyr and tidyr cheatsheet We can calculate the standard deviation and mean of the temperature variable temp in the weather data frame of nycflights13 in one step using the summarize (or equivalently using the UK spelling summarise) function in dplyr (See Appendix A): summary_temp <- weather %>% summarize(mean = mean(temp), std_dev = sd(temp)) kable(summary_temp) mean std_dev NA NA In this chapter we’ll display the contents of certain data frames using the kable() function from the knitr package, first seen in Subsection 2.4.3; all it does is yield an alternative formatting of data frames. We’ve created a small data frame here called summary_temp that includes both the mean and the std_dev of the temp variable in weather. Notice, as shown in Figures 5.2 and 5.3, that the data frame weather went from many rows to a single row of just the summary values in the data frame summary_temp. But why are the values returned NA? This stands for “not available or not applicable” and is how R encodes missing values; if in a data frame for a particular row and column no value exists, NA is stored instead. Furthermore, by default any time you try to summarize a number of values (using mean() and sd() for example) that has one or more missing values, then NA is returned. Values can be missing for many reasons. Perhaps the data was collected but someone forgot to enter it? Perhaps the data was not collected at all because it was too difficult? Perhaps there was an erroneous value that someone entered that has been corrected to read as missing? You’ll often encounter issues with missing values. You can summarize all non-missing values by setting the na.rm argument to TRUE (rm is short for “remove”). This will remove any NA missing values and only return the summary value for all non-missing values. So the code below computes the mean and standard deviation of all non-missing values. Notice how na.rm = TRUE is set as an argument to the mean() and sd() functions, and not to the summarize() function. summary_temp <- weather %>% summarize(mean = mean(temp, na.rm = TRUE), std_dev = sd(temp, na.rm = TRUE)) kable(summary_temp) mean std_dev 55.2 17.78 It is not good practice to include na.rm = TRUE in your summary commands by default; you should attempt to run them without this argument. This is because removing missing data can have an impact on your analyses. In fact, an entire branch of the field of statistics deals with missing data. The take-away point is that na.rm = TRUE should only be used after you are aware of the implications of its use (see the Learning Checks below for an example). What other summary functions can we use inside the summarize() verb? Any function in R that takes a vector of values and returns just one. Here are just a few: mean(): the mean AKA the average sd(): the standard deviation, which is a measure of spread min() and max(): the minimum and maximum values respectively IQR(): Interquartile range sum(): the sum n(): a count of the number of rows/observations in each group. This particular summary function will make more sense when group_by() is covered in Section 5.5. Learning check (LC5.2) Say a doctor is studying the effect of smoking on lung cancer for a large number of patients who have records measured at five-year intervals. 
She notices that a large number of patients have missing data points because the patient has died, so she chooses to ignore these patients in her analysis. What is wrong with this doctor’s approach? (LC5.3) Modify the above summarize function to create summary_temp to also use the n() summary function: summarize(count = n()). What does the returned value correspond to? (LC5.4) Why doesn’t the following code work? Run the code line by line instead of all at once, and then look at the data. In other words, run summary_temp <- weather %>% summarize(mean = mean(temp, na.rm = TRUE)) first. summary_temp <- weather %>% summarize(mean = mean(temp, na.rm = TRUE)) %>% summarize(std_dev = sd(temp, na.rm = TRUE)) 5.5 5MV#3: Group rows using group_by Figure 5.4: Group by and summarize diagram from Data Wrangling with dplyr and tidyr cheatsheet It’s often more useful to summarize a variable based on the groupings of another variable. Let’s say we are interested in the mean and standard deviation of temperatures but grouped by month. To be more specific: we want the mean and standard deviation of temperatures split by month. sliced by month. aggregated by month. collapsed over month. Run the following code: summary_monthly_temp <- weather %>% group_by(month) %>% summarize(mean = mean(temp, na.rm = TRUE), std_dev = sd(temp, na.rm = TRUE)) kable(summary_monthly_temp) month mean std_dev 1 35.64 10.185 2 34.15 6.940 3 39.81 6.225 4 51.67 8.785 5 61.59 9.609 6 72.14 7.603 7 80.01 7.148 8 74.40 5.171 9 67.43 8.476 10 60.03 8.830 11 45.11 10.502 12 38.37 9.941 This code is identical to the previous code that created summary_temp, with an extra group_by(month) added. Grouping the weather dataset by month and then passing this new data frame into summarize yields a data frame that shows the mean and standard deviation of temperature for each month in New York City. Note: Since each row in summary_monthly_temp represents a summary of different rows in weather, the observational units have changed. It is important to note that group_by doesn’t change the data frame. It sets meta-data (data about the data), specifically the group structure of the data. It is only after we apply the summarize function that the data frame changes. If we would like to remove this group structure meta-data, we can pipe the resulting data frame into the ungroup() function. For example, if the group structure meta-data is set to be by month via group_by(month), all future summarizations will be reported on a month-by-month basis. If, however, we would no longer like this and instead want all summarizations to be computed on all the data as a single group (in this case over the entire year of 2013), then we pipe the data frame in question through ungroup() to remove this (a short sketch of this appears just below). We now revisit the n() counting summary function we introduced in the previous section. For example, suppose we’d like to get a sense for how many flights departed each of the three airports in New York City: by_origin <- flights %>% group_by(origin) %>% summarize(count = n()) kable(by_origin) origin count EWR 120835 JFK 111279 LGA 104662 We see that Newark ("EWR") had the most flights departing in 2013 followed by "JFK" and lastly by LaGuardia ("LGA"). Note there is a subtle but important difference between sum() and n(): while sum() adds up the values of a numerical variable, n() counts the number of rows/observations in each group. You are not limited to grouping by one variable! 
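Before looking at grouping by more than one variable, here is the promised short sketch of ungroup() (an illustration, not from the text): piping through ungroup() after group_by() removes the group structure meta-data again, so the subsequent summarize() reports a single overall value rather than one value per month.
weather %>%
  group_by(month) %>%   # group structure meta-data set to month
  ungroup() %>%         # group structure meta-data removed again
  summarize(mean = mean(temp, na.rm = TRUE))   # one overall mean, not 12 monthly means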
Say you wanted to know the number of flights leaving each of the three New York City airports for each month, we can also group by a second variable month: group_by(origin, month). by_monthly_origin <- flights %>% group_by(origin, month) %>% summarize(count = n()) by_monthly_origin # A tibble: 36 x 3 # Groups: origin [?] origin month count <chr> <int> <int> 1 EWR 1 9893 2 EWR 2 9107 3 EWR 3 10420 4 EWR 4 10531 5 EWR 5 10592 6 EWR 6 10175 7 EWR 7 10475 8 EWR 8 10359 9 EWR 9 9550 10 EWR 10 10104 # ... with 26 more rows Alternatively, you can use the shortcut count() function in dplyr to get the same result: by_monthly_origin <- flights %>% count(origin, month) by_monthly_origin Learning check (LC5.5) Recall from Chapter 3 when we looked at plots of temperatures by months in NYC. What does the standard deviation column in the summary_monthly_temp data frame tell us about temperatures in New York City throughout the year? (LC5.6) What code would be required to get the mean and standard deviation temperature for each day in 2013 for NYC? (LC5.7) Recreate by_monthly_origin, but instead of grouping via group_by(origin, month), group variables in a different order group_by(month, origin). What differs in the resulting dataset? (LC5.8) How could we identify how many flights left each of the three airports for each carrier? (LC5.9) How does the filter operation differ from a group_by followed by a summarize? 5.6 5MV#4: Create new variables/change old variables using mutate Figure 5.5: Mutate diagram from Data Wrangling with dplyr and tidyr cheatsheet When looking at the flights dataset, there are some clear additional variables that could be calculated based on the values of variables already in the dataset. Passengers are often frustrated when their flights departs late, but change their mood a bit if pilots can make up some time during the flight to get them to their destination close to when they expected to land. This is commonly referred to as “gain” and we will create this variable using the mutate function. Note that we have also overwritten the flights data frame with what it was before as well as an additional variable gain here, or put differently, the mutate() command outputs a new data frame which then gets saved over the original flights data frame. flights <- flights %>% mutate(gain = dep_delay - arr_delay) Why did we overwrite flights instead of assigning the resulting data frame to a new object, like flights_with_gain? As a rough rule of thumb, as long as you are not losing information that you might need later, it’s acceptable practice to overwrite data frames. However, if you overwrite existing variables and/or change the observational units, recovering the original information might prove difficult. In this case, it might make sense to create a new data object. Let’s look at summary measures of this gain variable and even plot it in the form of a histogram: gain_summary <- flights %>% summarize( min = min(gain, na.rm = TRUE), q1 = quantile(gain, 0.25, na.rm = TRUE), median = quantile(gain, 0.5, na.rm = TRUE), q3 = quantile(gain, 0.75, na.rm = TRUE), max = max(gain, na.rm = TRUE), mean = mean(gain, na.rm = TRUE), sd = sd(gain, na.rm = TRUE), missing = sum(is.na(gain)) ) kable(gain_summary) min q1 median q3 max mean sd missing -196 -3 7 17 109 5.66 18.04 9430 We’ve recreated the summary function we saw in Chapter 3 here using the summarize function in dplyr. 
ggplot(data = flights, mapping = aes(x = gain)) + geom_histogram(color = "white", bins = 20) Figure 5.6: Histogram of gain variable We can also create multiple columns at once and even refer to columns that were just created in a new column. Hadley and Garrett produce one such example in Chapter 5 of “R for Data Science” (Grolemund and Wickham 2016): flights <- flights %>% mutate( gain = dep_delay - arr_delay, hours = air_time / 60, gain_per_hour = gain / hours ) Learning check (LC5.10) What do positive values of the gain variable in flights correspond to? What about negative values? And what about a zero value? (LC5.11) Could we create the dep_delay and arr_delay columns by simply subtracting dep_time from sched_dep_time and similarly for arrivals? Try the code out and explain any differences between the result and what actually appears in flights. (LC5.12) What can we say about the distribution of gain? Describe it in a few sentences using the plot and the gain_summary data frame values. 5.7 5MV#5: Reorder the data frame using arrange One of the most common things people working with data would like to do is sort a data frame by the values of a specific variable/column. Have you ever been asked to calculate a median by hand? This requires you to put the data in order from smallest to largest in value. The dplyr package has a function called arrange that we will use to sort/reorder our data according to the values of the specified variable. This is often used after we have used the group_by and summarize functions as we will see. Let’s suppose we were interested in determining the most frequent destination airports from New York City in 2013: freq_dest <- flights %>% group_by(dest) %>% summarize(num_flights = n()) freq_dest # A tibble: 105 x 2 dest num_flights <chr> <int> 1 ABQ 254 2 ACK 265 3 ALB 439 4 ANC 8 5 ATL 17215 6 AUS 2439 7 AVL 275 8 BDL 443 9 BGR 375 10 BHM 297 # ... with 95 more rows You’ll see that by default the values of dest are displayed in alphabetical order here. We are interested in finding those airports that appear most: freq_dest %>% arrange(num_flights) # A tibble: 105 x 2 dest num_flights <chr> <int> 1 LEX 1 2 LGA 1 3 ANC 8 4 SBN 10 5 HDN 15 6 MTJ 15 7 EYW 17 8 PSP 19 9 JAC 25 10 BZN 36 # ... with 95 more rows This is actually giving us the opposite of what we are looking for. It tells us the least frequent destination airports first. To switch the ordering to be descending instead of ascending we use the desc (descending) function: freq_dest %>% arrange(desc(num_flights)) # A tibble: 105 x 2 dest num_flights <chr> <int> 1 ORD 17283 2 ATL 17215 3 LAX 16174 4 BOS 15508 5 MCO 14082 6 CLT 14064 7 SFO 13331 8 FLL 12055 9 MIA 11728 10 DCA 9705 # ... with 95 more rows 5.8 Joining data frames Another common task is joining/merging two different datasets. For example, in the flights data, the variable carrier lists the carrier code for the different flights. While "UA" and "AA" might be somewhat easy to guess for some (United and American Airlines), what are “VX”, “HA”, and “B6”? This information is provided in a separate data frame airlines. View(airlines) We see that in airlines, carrier is the carrier code while name is the full name of the airline. Using this table, we can see that “VX”, “HA”, and “B6” correspond to Virgin America, Hawaiian Airlines, and JetBlue respectively. However, will we have to continually look up the carrier’s name for each flight in the airlines dataset? No! 
Instead of having to do this manually, we can have R automatically do the “looking up” for us. Note that the values in the variable carrier in flights match the values in the variable carrier in airlines. In this case, we can use the variable carrier as a key variable to join/merge/match the two data frames by. Hadley and Garrett (Grolemund and Wickham 2016) created the following diagram to help us understand how the different datasets are linked: Figure 5.7: Data relationships in nycflights13 from R for Data Science 5.8.1 Joining by key variables In both flights and airlines, the key variable we want to join/merge/match the two data frames with has the same name in both datasets: carrier. We make use of the inner_join() function to join by the variable carrier. flights_joined <- flights %>% inner_join(airlines, by = "carrier") View(flights) View(flights_joined) We observe that flights and flights_joined are identical except that flights_joined has an additional variable name whose values were drawn from airlines. A visual representation of the inner_join is given below (Grolemund and Wickham 2016): Figure 5.8: Diagram of inner join from R for Data Science There are more complex joins available, but the inner_join will solve nearly all of the problems you’ll face in our experience. 5.8.2 Joining by key variables with different names Say instead you are interested in all the destinations of flights from NYC in 2013 and ask yourself: “What cities are these airports in?” “Is "ORD" Orlando?” “Where is "FLL"?” The airports data frame contains airport codes: View(airports) However, looking at both the airports and flights data frames and the visual representation of the relations between them in Figure 5.7, we see that in: airports the airport code is in the variable faa flights the destination airport code is in the variable dest So to join these two datasets, our inner_join operation involves a by argument that accounts for the different names: flights %>% inner_join(airports, by = c("dest" = "faa")) Let’s construct the sequence of commands that computes the number of flights from NYC to each destination, but also includes information about each destination airport: named_dests <- flights %>% group_by(dest) %>% summarize(num_flights = n()) %>% arrange(desc(num_flights)) %>% inner_join(airports, by = c("dest" = "faa")) %>% rename(airport_name = name) View(named_dests) In case you didn’t know, "ORD" is the airport code of Chicago O’Hare airport and "FLL" is the main airport in Fort Lauderdale, Florida, which we can now see in our named_dests data frame. Learning check (LC5.13) Looking at Figure 5.7, when joining flights and weather (or, in other words, matching the hourly weather values with each flight), why do we need to join by all of year, month, day, hour, and origin, and not just hour? (LC5.14) What surprises you about the top 10 destinations from NYC in 2013? 5.9 Optional: Other verbs Beyond the examples of other verbs that follow, if you’d like to see more examples of using dplyr, the 5MV, and %>% with the nycflights13 dataset, check out Chapter 5 of Hadley and Garrett’s book (Grolemund and Wickham 2016). 5.9.1 Select variables using select Figure 5.9: Select diagram from Data Wrangling with dplyr and tidyr cheatsheet We’ve seen that the flights data frame in the nycflights13 package contains many different variables. The names function gives a listing of all the columns in a data frame; in our case you would run names(flights). 
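For example, here is a tiny sketch (not from the text) that stores that listing and peeks at it with the base R functions head() and length():
flight_cols <- names(flights)   # character vector of column names
head(flight_cols)               # the first few column names
length(flight_cols)             # how many columns flights has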
You can also identify these variables by running the glimpse function in the dplyr package: glimpse(flights) However, say you only want to consider two of these variables, carrier and flight. You can select these: flights %>% select(carrier, flight) Another one of these variables is year. If you remember the original description of the flights data frame (or by running ?flights), you’ll remember that this data corresponds to flights in 2013 departing New York City. The year variable isn’t really a variable here in that it doesn’t vary… flights actually comes from a larger dataset that covers many years. We may want to remove the year variable from our dataset since it won’t be helpful for analysis in this case. We can deselect year by using the - sign: flights_no_year <- flights %>% select(-year) names(flights_no_year) Or we could specify a range of columns: flight_arr_times <- flights %>% select(month:day, arr_time:sched_arr_time) flight_arr_times The select function can also be used to reorder columns in combination with the everything helper function. Let’s suppose we’d like the hour, minute, and time_hour variables, which appear at the end of the flights dataset, to actually appear immediately after the day variable: flights_reorder <- flights %>% select(month:day, hour:time_hour, everything()) names(flights_reorder) In this case, everything() picks up all remaining variables. Lastly, the helper functions starts_with, ends_with, and contains can be used to choose column names that match those conditions: flights_begin_a <- flights %>% select(starts_with("a")) flights_begin_a flights_delays <- flights %>% select(ends_with("delay")) flights_delays flights_time <- flights %>% select(contains("time")) flights_time 5.9.2 Rename variables using rename Another useful function is rename, which as you may suspect renames one column to another name. Suppose we wanted dep_time and arr_time to be departure_time and arrival_time instead in the flights_time data frame: flights_time_new <- flights %>% select(contains("time")) %>% rename(departure_time = dep_time, arrival_time = arr_time) names(flights_time_new) Note that in this case we used a single = sign with the rename(). Ex: departure_time = dep_time. This is because we are not testing for equality like we would using ==, but instead we want to assign a new variable departure_time to have the same values as dep_time and then delete the variable dep_time. It’s easy to forget if the new name comes before or after the equals sign. I usually remember this as “New Before, Old After” or NBOA. You’ll receive an error if you try to do it the other way: Error: Unknown variables: departure_time, arrival_time. 5.9.3 Find the top number of values using top_n We can also use the top_n function, which automatically tells us the rows with the largest values of a chosen variable, here num_flights. We specify the top 10 airports here: named_dests %>% top_n(n = 10, wt = num_flights) We’ll still need to arrange this by num_flights though: named_dests %>% top_n(n = 10, wt = num_flights) %>% arrange(desc(num_flights)) Note: Remember that I didn’t pull the n and wt arguments out of thin air. They can be found by using the ? function on top_n. 
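Concretely, either of the following (a small sketch) opens that documentation page, where the n and wt arguments are described:
?top_n
help("top_n")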
We can go one stop further and tie together the group_by and summarize functions we used to find the most frequent flights: ten_freq_dests <- flights %>% group_by(dest) %>% summarize(num_flights = n()) %>% arrange(desc(num_flights)) %>% top_n(n = 10) View(ten_freq_dests) Learning check (LC5.15) What are some ways to select all three of the dest, air_time, and distance variables from flights? Give the code showing how to do this in at least three different ways. (LC5.16) How could one use starts_with, ends_with, and contains to select columns from the flights data frame? Provide three different examples in total: one for starts_with, one for ends_with, and one for contains. (LC5.17) Why might we want to use the select function on a data frame? (LC5.18) Create a new data frame that shows the top 5 airports with the largest arrival delays from NYC in 2013. 5.10 Conclusion 5.10.1 Review questions Review questions have been designed using the fivethirtyeight R package (Kim, Ismay, and Chunn 2017) with links to the corresponding FiveThirtyEight.com articles in our free DataCamp course Effective Data Storytelling using the tidyverse. The material in this chapter is covered in the chapters of the DataCamp course available below: Filtering, Grouping, & Summarizing dplyr Review 5.10.2 What’s to come? Congratulations! We’ve completed the “data science” portion of this book! We’ll now move to the “data modeling” portion in Chapters 6 and 7, where you’ll leverage your data visualization and wrangling skills to model the relationships between different variables of datasets. However, we’re going to leave “Inference for Regression” (Chapter 11) until later. 5.10.3 Resources As we saw with the RStudio cheatsheet on data visualization, RStudio has also created a cheatsheet for data wrangling entitled “Data Transformation with dplyr”. 5.10.4 Script of R code An R script file of all R code used in this chapter is available here. "], +["6-regression.html", "6 Basic Regression 6.1 One numerical explanatory variable 6.2 One categorical explanatory variable 6.3 Related topics 6.4 Conclusion", " 6 Basic Regression Now that we are equipped with data visualization skills from Chapter 3, data wrangling skills from Chapter 5, and an understanding of the “tidy” data format from Chapter 4, we now proceed with data modeling. The fundamental premise of data modeling is to make explicit the relationship between: An outcome variable \\(y\\), also called a dependent variable and An explanatory/predictor variable \\(x\\), also called an independent variable or covariate. Another way to state this is using mathematical terminology: we will model the outcome variable \\(y\\) as a function of the explanatory/predictor variable \\(x\\). Why do we have two different labels, explanatory and predictor, for the variable \\(x\\)? That’s because roughly speaking data modeling can be used for two purposes: Modeling for prediction: You want to predict an outcome variable \\(y\\) based on the information contained in a set of predictor variables. You don’t care so much about understanding how all the variables relate and interact, but so long as you can make good predictions about \\(y\\), you’re fine. For example, if we know many individuals’ risk factors for lung cancer, such as smoking habits and age, can we predict whether or not they will develop lung cancer? 
Here we wouldn’t care so much about distinguishing the degree to which the different risk factors contribute to lung cancer, but instead only on whether or not they could be put together to make reliable predictions. Modeling for explanation: You want to explicitly describe the relationship between an outcome variable \\(y\\) and a set of explanatory variables, determine the significance of any found relationships, and have measures summarizing these. Continuing our example from above, we would now be interested in describing the individual effects of the different risk factors and quantifying the magnitude of these effects. One reason could be to design an intervention to reduce lung cancer cases in a population, such as targeting smokers of a specific age group with an advertisement for smoking cessation programs. In this book, we’ll focus more on this latter purpose. Data modeling is used in a wide variety of fields, including statistical inference, causal inference, artificial intelligence, and machine learning. There are many techniques for data modeling, such as tree-based models, neural networks/deep learning, and more. However, we’ll focus on one particular technique: linear regression, one of the most commonly-used and easy-to-understand approaches to modeling. Recall our discussion in Subsection 2.4.3 on numerical and categorical variables. Linear regression involves: An outcome variable \\(y\\) that is numerical Explanatory variables \\(\\vec{x}\\) that are either numerical or categorical Whereas there is always only one numerical outcome variable \\(y\\), we have choices on both the number and the type of explanatory variables \\(\\vec{x}\\) to use. We’re going to cover the following regression scenarios: In this chapter, Chapter 6 on basic regression, where we’ll always have only one explanatory variable: A single numerical explanatory variable \\(x\\) in Section 6.1. This scenario is known as simple linear regression. A single categorical explanatory variable \\(x\\) in Section 6.2. In the next chapter: Chapter 7 on multiple regression, where we’ll have more than one explanatory variable: Two numerical explanatory variables \\(x_1\\) and \\(x_2\\) in Section 7.1. This can be denoted as \\(\\vec{x}\\) as well since we have more than one explanatory variable. One numerical and one categorical explanatory variable in Section 7.1. We’ll also introduce interaction models here; there the effect of one explanatory variable depends on the value of another. We’ll study all four of these regression scenarios using real data, all easily accessible via R packages! Needed packages In this chapter we introduce a new package, moderndive, that is an accompaniment package to this ModernDive book that includes useful functions for linear regression and other functions and data used later in the book. Let’s now load all the packages needed for this chapter. If needed, read Section 2.3 for information on how to install and load R packages. library(ggplot2) library(dplyr) library(moderndive) library(gapminder) 6.1 One numerical explanatory variable Why do some professors and instructors at universities and colleges get high teaching evaluations from students while others don’t? What factors can explain these differences? Are there biases? These are questions that are of interest to university/college administrators, as teaching evaluations are among the many criteria considered in determining which professors and instructors should get promotions. 
Researchers at the University of Texas in Austin tried to answer this question: what factors can explain differences in instructor’s teaching evaluation scores? To this end, they collected information on \\(n = 463\\) instructors. A full description of the study can be found at openintro.org. We’ll keep things simple for now and try to explain differences in instructor evaluation scores as a function of one numerical variable: their “beauty score” which we’ll describe shortly. Could it be that instructors with higher beauty scores also have higher teaching evaluations? Could it be instead that instructors with higher beauty scores tend to have lower teaching evaluations? Or could it be there is no relationship between beauty score and teaching evaluations? We’ll achieve this by modeling the relationship between these two variables with a particular kind of linear regression called simple linear regression. Simple linear regression is the most basic form of linear regression where we have A numerical outcome variable \\(y\\). In this case, their teaching score. A single numerical explanatory variable \\(x\\). In this case, their beauty score. 6.1.1 Exploratory data analysis A crucial step before doing any kind of modeling or analysis is performing an exploratory data analysis, or EDA, of all our data. Exploratory data analysis can give you a sense of the distribution of the data, whether there are outliers and/or missing values, but most importantly it can inform how to build your model. There are many approaches to exploratory data analysis, here are three: Most fundamentally: just looking at the raw values, in a spreadsheet for example. While this may seem trivial, many people ignore this crucial step! Computing summary statistics likes means, medians, and standard deviations. Creating data visualizations. Let’s load the data, select only a subset of the variables, and look at the raw values. Recall you can look at the raw values by running View(evals) in the console in RStudio to pop-up the spreadsheet viewer. Here, however, we present only a snapshot of 5 randomly chosen rows: load(url("http://www.openintro.org/stat/data/evals.RData")) evals <- evals %>% select(score, bty_avg, age) Table 6.1: Random sample of 5 instructors score bty_avg age 290 3.6 6.67 34 341 4.9 3.50 43 199 3.3 2.33 47 47 4.4 4.67 33 215 4.7 3.67 60 While a full description of each of these variables can be found at openintro.org, let’s summarize what each of these variables represent score: Numerical variable of the average teaching score based on students’ evaluations between 1 and 5. This is the outcome variable \\(y\\) of interest. bty_avg: Numerical variable of average “beauty” rating based on a panel of 6 students’ scores between 1 and 10. This is the numerical explanatory variable \\(x\\) of interest. age: A numerical variable of age. Another way to look at the raw values is using the glimpse() function, which gives us a slightly different view of the data. We see Observations: 463, indicating that there are 463 observations in evals, each corresponding to a particular instructor at UT Austin. Expressed differently, each row in the data frame evals corresponds to one of 463 instructors. glimpse(evals) Observations: 463 Variables: 3 $ score <dbl> 4.7, 4.1, 3.9, 4.8, 4.6, 4.3, 2.8, 4.1, 3.4, 4.5, 3.8, 4.5,... $ bty_avg <dbl> 5.00, 5.00, 5.00, 5.00, 3.00, 3.00, 3.00, 3.33, 3.33, 3.17,... $ age <int> 36, 36, 36, 36, 59, 59, 59, 51, 51, 40, 40, 40, 40, 40, 40,... 
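If you would like to draw a random snapshot of rows like the one in Table 6.1 yourself, one way (a sketch; your five rows will differ since the sampling is random) is dplyr’s sample_n() function:
evals %>% sample_n(size = 5)   # five randomly chosen rows of evals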
Since both the outcome variable score and the explanatory variable bty_avg are numerical, we can compute summary statistics about them such as the mean and median. Let’s take evals, then select only the two variables of interest for now, and pipe them into the summary() command which returns: the minimum (smallest) value, the first quartile, the median, the mean (average), the third quartile, and the maximum (largest) value. evals %>% select(score, bty_avg) %>% summary() score bty_avg Min. :2.30 Min. :1.67 1st Qu.:3.80 1st Qu.:3.17 Median :4.30 Median :4.33 Mean :4.17 Mean :4.42 3rd Qu.:4.60 3rd Qu.:5.50 Max. :5.00 Max. :8.17 We get an idea of how the values in both variables are distributed. For example, the mean teaching score was 4.17 out of 5 whereas the mean beauty score was 4.42 out of 10. Furthermore, the middle 50% of teaching scores were between 3.80 and 4.6 (the first and third quartiles) while the middle 50% of beauty scores were between 3.17 and 5.5 out of 10. The summary() function however only returns what are called univariate summaries, i.e. summaries about single variables at a time. Since we are considering the relationship between two numerical variables, it would be nice to have a summary statistic that simultaneously considers both variables. The correlation coefficient is a bivariate summary statistic that fits this bill. Coefficients in general are quantitative expressions of a specific property of a phenomenon. A correlation coefficient is a quantitative expression between -1 and 1 that summarizes the strength of the linear relationship between two numerical variables: -1 indicates a perfect negative relationship: as the value of one variable goes up, the value of the other variable tends to go down. 0 indicates no relationship: the values of both variables go up/down independently of each other. +1 indicates a perfect positive relationship: as the value of one variable goes up, the value of the other variable tends to go up as well. Figure 6.1 gives examples of different correlation coefficient values for hypothetical numerical variables \\(x\\) and \\(y\\). We see that while for a correlation coefficient of -0.75 there is still a negative relationship between \\(x\\) and \\(y\\), it is not as strong as the negative relationship between \\(x\\) and \\(y\\) when the correlation coefficient is -1. Figure 6.1: Different correlation coefficients The correlation coefficient is computed using the cor() function, where in this case the inputs to the function are the two numerical variables from which we want to calculate the correlation coefficient. Recall from Subsection 2.4.3 that the $ pulls out specific variables from a data frame: cor(evals$score, evals$bty_avg) [1] 0.187 In our case, the correlation coefficient of 0.187 indicates that the relationship between teaching evaluation score and beauty average is “weakly positive.” There is a certain amount of subjectivity in interpreting correlation coefficients, especially those that aren’t close to -1, 0, and 1. For help developing such intuition and more discussion on the correlation coefficient see Subsection 6.3.1 below. Let’s now proceed by visualizing this data. Since both the score and bty_avg variables are numerical, a scatterplot is an appropriate graph to visualize this data. Let’s do this using geom_point() and set informative axes labels and title. 
ggplot(evals, aes(x = bty_avg, y = score)) + geom_point() + labs(x = "Beauty Score", y = "Teaching Score", title = "Relationship of teaching and beauty scores") Figure 6.2: Instructor evaluation scores at UT Austin However, Figure 6.2 suffers from overplotting. Recall from the data visualization Subsection 3.3.2 that overplotting occurs when several points are stacked directly on top of each other, thereby obscuring the number of points. For example, let’s focus on the 6 points in the top-right of the plot with a beauty score of around 8 out of 10: are there truly only 6 points, or are there many more just stacked on top of each other? You can think of these as ties. Let’s break up these ties with a little random “jitter” added to the points in Figure 6.3. Jittering adds a little random bump to each of the points to break up these ties. Remember that geom_jitter() only alters the visual display of the points; the values in the data frame stay the same. ggplot(evals, aes(x = bty_avg, y = score)) + geom_jitter() + labs(x = "Beauty Score", y = "Teaching Score", title = "Relationship of teaching and beauty scores") Figure 6.3: Instructor evaluation scores at UT Austin: Jittered From Figure 6.3 we make several observations: Focusing our attention on the top-right of the plot again, we now see that those originally unjittered 6 points were actually 12! A further interesting trend is that the jittering revealed a large number of instructors with beauty scores between 3 and 4.5, towards the lower end of the beauty scale. Most beauty scores lie between 2 and 8. Most teaching scores lie between 3 and 5. Recall our earlier computation of the correlation coefficient, which describes the strength of the linear relationship between two numerical variables. Looking at Figure 6.3, it is not immediately apparent that these two variables are positively related. This is to be expected given the positive, but rather weak (close to 0), correlation coefficient of 0.187. Going back to the unjittered plot in Figure 6.2, let’s improve on it by adding a “regression line” in Figure 6.4. This is easily done by adding a new layer to the ggplot code that created Figure 6.2: + geom_smooth(method = "lm"). A regression line is a “best fitting” line in that of all possible lines you could draw on this plot, it is “best” in terms of some mathematical criteria. We discuss the criteria for “best” in Subsection 6.3.3 below, but we suggest you read this only after covering the concept of a residual coming up in Subsection 6.1.3. ggplot(evals, aes(x = bty_avg, y = score)) + geom_point() + labs(x = "Beauty Score", y = "Teaching Score", title = "Relationship of teaching and beauty scores") + geom_smooth(method = "lm") Figure 6.4: Regression line When viewed on this plot, the regression line is a visual summary of the relationship between two numerical variables, in our case the outcome variable score and the explanatory variable bty_avg. The positive slope of the blue line is consistent with our observed correlation coefficient of 0.187, suggesting that there is a positive relationship between score and bty_avg. We’ll see later, however, that while the correlation coefficient is not equal to the slope of this line, they always have the same sign: positive or negative. What are the grey bands surrounding the blue line? These are standard error bands, which can be thought of as error/uncertainty bands.
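As an aside, the grey band drawn by geom_smooth(method = "lm") is by default a 95% confidence band around the line; its width can be adjusted via the level argument. This is only a sketch for reference and is not part of the book’s code:
# Same scatterplot, but with a wider 99% band around the regression line
ggplot(evals, aes(x = bty_avg, y = score)) +
  geom_point() +
  geom_smooth(method = "lm", level = 0.99)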
Let’s skip this idea for now and suppress these grey bands by adding the argument se = FALSE to geom_smooth(method = "lm"). We’ll introduce standard errors in Chapter 8 on sampling, use them for constructing confidence intervals and conducting hypothesis tests in Chapters 9 and 10, and consider them when we revisit regression in Chapter 11. ggplot(evals, aes(x = bty_avg, y = score)) + geom_point() + labs(x = "Beauty Score", y = "Teaching Score", title = "Relationship of teaching and beauty scores") + geom_smooth(method = "lm", se = FALSE) Figure 6.5: Regression line without error bands Learning check (LC6.1) Conduct a new exploratory data analysis with the same outcome variable \\(y\\) being score but with age as the new explanatory variable \\(x\\). Remember, this involves three things: Looking at the raw values Computing summary statistics of the variables of interest. Creating informative visualizations What can you say about the relationship between age and teaching scores based on this exploration? 6.1.2 Simple linear regression In case you’ve forgotten from high school algebra, in general, the equation of a line is \\(y = a + bx\\), which is defined by two coefficients. Recall we defined these earlier as “quantitative expressions of a specific property of a phenomenon.” These two coefficients are: the intercept coefficient \\(a\\), or the value of \\(y\\) when \\(x = 0\\), and the slope coefficient \\(b\\), or the increase in \\(y\\) for every increase of one in \\(x\\). However, when defining a line specifically for regression, like the blue regression line in Figure 6.5, we use slightly different notation: the equation of the regression line is \\(\\widehat{y} = b_0 + b_1 x\\) where the intercept coefficient is \\(b_0\\), or the value of \\(\\widehat{y}\\) when \\(x=0\\), and the slope coefficient \\(b_1\\), or the increase in \\(\\widehat{y}\\) for every increase of one in \\(x\\). Why do we put a “hat” on top of the \\(y\\)? It’s a form of notation commonly used in regression, which we’ll introduce in the next Subsection 6.1.3 when we discuss fitted values. For now, let’s ignore the hat and treat the equation of the line as you would from high school algebra, recognizing the slope and the intercept. We know looking at Figure 6.5 that the slope coefficient corresponding to bty_avg should be positive. Why? Because as bty_avg increases, professors tend to have larger teaching evaluation scores. However, what are the specific values of the intercept and slope coefficients? Let’s not worry about computing these by hand, but instead let the computer do the work for us, specifically R! Let’s get the value of the intercept and slope coefficients by outputting something called the linear regression table. This is always done in a two-step process: First “fit” the linear regression model to the data using the lm() function and save this to score_model. lm stands for “linear model”, given that we are dealing with lines. When we say “fit”, we mean: find the best-fitting line for this data. Then apply the get_regression_table() function from the moderndive R package to score_model. score_model <- lm(score ~ bty_avg, data = evals) get_regression_table(score_model, digits = 2) Table 6.2: Linear regression table term estimate std_error statistic p_value conf_low conf_high intercept 3.880 0.076 50.96 0 3.731 4.030 bty_avg 0.067 0.016 4.09 0 0.035 0.099 Whoa! There is a lot going on, both in terms of the inputs and outputs! Let’s unpack this slowly.
First, the lm() function that “fits” the linear regression model is typically used as lm(y ~ x, data = DATA_FRAME_NAME) where: y is the outcome variable, followed by a tilde (~), the key to the left of “1” on your keyboard. In our case, y is set to score. x is the explanatory variable. In our case, x is set to bty_avg. We call the combination y ~ x a model formula. DATA_FRAME_NAME is the name of the data frame that contains the variables y and x. In our case, the evals data frame. Then we pipe this output to be the input of the get_regression_table() function, just as when we discussed piping in Section 5.1 in the data wrangling chapter. An additional argument to the get_regression_table() function is digits, where we specify the number of significant digits of precision (number of digits after the decimal point) we want the regression table to have. digits defaults to 3, meaning if you don’t specify this argument, digits = 3 is used by default. All the get_regression_table() function in the moderndive package does is generate regression table outputs that are clean and easy to read, while hiding a lot of the code necessary to do so. This is known as a wrapper function in computer programming, which takes other pre-existing functions and “wraps” them in a single function. While not necessary to understand regression, if you are curious to know what is going on under the hood of get_regression_table(), see Subsection 6.3.4 below. Now let’s consider the outputted regression table, which has two rows denoted by the first column term: one corresponding to the intercept coefficient \\(b_0\\) and one corresponding to the slope coefficient \\(b_1\\) for bty_avg. The second column estimate gives us the “fitted” (or computed) values for both these coefficients. Therefore the blue regression line in Figure 6.5 is \\(\\widehat{\\text{score}} = b_0 + b_{\\text{bty avg}} \\text{bty avg} = 3.88 + 0.067\\text{bty avg}\\) where The intercept coefficient \\(b_0\\) = 3.88, meaning instructors with a hypothetical beauty score of 0 would on average have a teaching score of 3.88. In this case however, while the intercept has a mathematical interpretation when defining the regression line, there is no practical interpretation: since bty_avg is an average of a panel of 6 students’ ratings from 1 to 10, a bty_avg of 0 would be impossible. Furthermore, no instructors had a beauty score anywhere near 0. Of more interest is the slope coefficient associated with bty_avg: \\(b_{\\text{bty avg}}\\) = 0.067. This is a numerical quantity that summarizes the relationship between the outcome and explanatory variables. It is interpreted as follows: for every increase of 1 unit in bty_avg, there is an associated increase of on average 0.067 units of score. We note in particular that the sign of this slope is positive, suggesting a positive relationship between beauty scores and teaching scores. We are very careful with our wording: we only stated that there is an associated increase, and not necessarily a causal increase. For example, perhaps it’s not that beauty directly affects teaching scores, but instead individuals from wealthier backgrounds tend to have had better education and training, and hence have higher teaching scores, but these same individuals also have higher beauty scores. Avoiding such reasoning can be summarized by the adage “correlation is not necessarily causation”. In other words, just because two variables are correlated, it doesn’t mean one directly causes the other.
We discuss these ideas more in Subsection 6.3.2. We say that this associated increase is on average 0.067 units of teaching score and not that the associated increase is exactly 0.067 units of score across all values of bty_avg. This is because the slope is the average increase across all points as shown by the regression line in Figure 6.5. But what about the remaining 5 columns: std_error, statistic, p_value, conf_low and conf_high? They give you information on the statistical significance of these results, or their “meaningfulness” from a statistical perspective. We’ll revisit these in Chapter 11 on (statistical) inference for regression after we’ve covered standard errors in Chapter 8 (std_error), confidence intervals in Chapter 9 (conf_low and conf_high), and hypothesis testing in Chapter 10 (statistic and p_value). For now, we’ll only focus on the term and estimate columns. Learning check (LC6.2) Fit a new simple linear regression using lm(score ~ age, data = evals) where age is the new explanatory variable \\(x\\). Get information about the “best-fitting” line from the regression table by applying the get_regression_table() function. How do the regression results match up with the results from your exploratory data analysis above? 6.1.3 Observed/fitted values and residuals We just saw how to get the value of the intercept and the slope of the regression line from the regression table generated by get_regression_table(). Now instead, say we want information on individual points, in this case one of the \\(n = 463\\) instructors in this dataset, one corresponding to each row of evals. For example, say we are interested in the 21st instructor in this dataset: Table 6.3: Data for 21st instructor score bty_avg age 4.9 7.33 31 What is the value on the blue line corresponding to this instructor’s bty_avg of 7.333? In Figure 6.6 we mark three values in particular corresponding to this instructor. Note we revert back to geom_point() since geom_jitter() adds random noise to each point, making it difficult to identify points exactly. Red circle: This is the observed value \\(y\\) = 4.9 and corresponds to this instructor’s actual teaching score. Red square: This is the fitted value \\(\\widehat{y}\\) and corresponds to the value on the regression line for \\(x\\) = 7.333. This value is computed using the intercept and slope in the regression table above: \\(\\widehat{y} = b_0 + b_1 x\\) = 3.88 + 0.067 * 7.333 = 4.369 Blue arrow: The length of this arrow is the residual and is computed by subtracting the fitted value \\(\\widehat{y}\\) from the observed value \\(y\\). The residual can be thought of as the error or “lack of fit” of the regression line. In the case of this instructor, it is \\(y - \\widehat{y}\\) = 4.9 - 4.369 = 0.531. In other words, the model was off by 0.531 teaching score units for this instructor. Figure 6.6: Example of observed value, fitted value, and residual What if we want both the fitted value \\(\\widehat{y} = b_0 + b_1 \\times x\\) and the residual \\(y - \\widehat{y}\\), not only for the 21st instructor but for all 463 instructors in the study? Recall that each instructor corresponds to one of the 463 rows in the evals data frame and also to one of the 463 points in the regression plot in Figure 6.5. We could repeat the above calculations by hand 463 times, but that would be tedious and time-consuming. Instead, let’s use the get_regression_points() function that we’ve included in the moderndive R package.
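For a single instructor, the by-hand computation is short. Here is a minimal sketch in R, assuming score_model has been fit as above; the helper object b is ours and not part of the book’s code:
# Extract the estimated coefficients: b[1] is the intercept, b[2] the slope for bty_avg
b <- coef(score_model)
# Fitted value for the 21st instructor, whose bty_avg is 7.333
y_hat_21 <- b[1] + b[2] * 7.333
# Residual: observed score minus fitted score, roughly 0.531
4.9 - y_hat_21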
Note that in the table below we only present the results for the 21st through 24th instructors. regression_points <- get_regression_points(score_model) regression_points Table 6.4: Regression points (for only 21st through 24th instructor) ID score bty_avg score_hat residual 21 4.9 7.33 4.37 0.531 22 4.6 7.33 4.37 0.231 23 4.5 7.33 4.37 0.131 24 4.4 5.50 4.25 0.153 Just as with the get_regression_table() function, the inputs to the get_regression_points() function are the same; however, the outputs are different. Let’s inspect the individual columns: The score column represents the observed value of the outcome variable \\(y\\) The bty_avg column represents the values of the explanatory variable \\(x\\) The score_hat column represents the fitted values \\(\\widehat{y}\\) The residual column represents the residuals \\(y - \\widehat{y}\\) Just as we did for the 21st instructor in the evals dataset (in the first row of the table above), let’s repeat the above calculations for the 24th instructor in the evals dataset (in the fourth row of the table above): score = 4.4 is the observed value \\(y\\) for this instructor. bty_avg = 5.50 is the value of the explanatory variable \\(x\\) for this instructor. score_hat = 4.25 = 3.88 + 0.067 * \\(x\\) = 3.88 + 0.067 * 5.50 is the fitted value \\(\\widehat{y}\\) for this instructor. residual = 0.153 = 4.4 - 4.25 is the value of the residual for this instructor. In other words, the model was off by 0.153 teaching score units for this instructor. At this point, we suggest you read Subsection 6.3.3, where we explicitly define how a regression line is a “best” fitting line. 6.1.4 Residual analysis Recall the residuals can be thought of as the error or the “lack-of-fit” between the observed value \\(y\\) and the fitted value \\(\\widehat{y}\\) on the blue regression line in Figure 6.5. Ideally, when we fit a regression model, we’d like there to be no systematic pattern to these residuals. We’ll be more specific as to what we mean by no systematic pattern when we see Figure 6.8 below, but let’s keep this notion imprecise for now. Investigating any such patterns is known as residual analysis and is the theme of this section. We’ll perform our residual analysis in two ways: Creating a scatterplot with the residuals on the \\(y\\)-axis and the original explanatory variable \\(x\\) on the \\(x\\)-axis. Creating a histogram of the residuals, thereby showing the distribution of the residuals. First, recall in Figure 6.6 above we created a scatterplot where On the vertical axis we had the teaching score \\(y\\) On the horizontal axis we had the beauty score \\(x\\) The blue arrow represented the residual for one particular instructor. Instead, in Figure 6.7 below, let’s create a scatterplot where On the vertical axis we have the residual \\(y-\\widehat{y}\\) instead On the horizontal axis we have the beauty score \\(x\\) as before Figure 6.7: Plot of residuals over beauty score You can think of Figure 6.7 as Figure 6.6 but with the blue line flattened out to \\(y=0\\). Does it seem like there is no systematic pattern to the residuals? This question is rather qualitative and subjective in nature; thus different people may respond with different answers to the above question. However, it can be argued that there isn’t a drastic pattern in the residuals. Let’s now get a little more precise in our definition of no systematic pattern in the residuals. Ideally, the residuals should behave randomly and The residuals should be on average 0.
In other words, sometimes the regression model will make a positive error in that \\(y - \\widehat{y} > 0\\), sometimes the regression model will make a negative error in that \\(y - \\widehat{y} < 0\\), but on average the error is 0. The value and spread of the residuals should not depend on the value of \\(x\\). In Figure 6.8 below, we display some hypothetical examples where there are drastic patterns to the residuals. In Example 1, the value of the residual seems to depend on \\(x\\): the residuals tend to be positive for small and large values of \\(x\\) in this range, whereas values of \\(x\\) more in the middle tend to have negative residuals. In Example 2, while the residuals seem to be on average 0 for each value of \\(x\\), the spread of the residuals varies for different values of \\(x\\); this situation is known as heteroskedasticity. Figure 6.8: Examples of less than ideal residual patterns The second way to perform a residual analysis is to look at the histogram of the residuals: ggplot(regression_points, aes(x = residual)) + geom_histogram(binwidth = 0.25, color = "white") + labs(x = "Residual") Figure: Histogram of residuals This histogram seems to indicate that we have more positive residuals than negative. Since residual = \\(y-\\widehat{y} > 0\\) when \\(y > \\widehat{y}\\), it seems our fitted teaching score from the regression model tends to underestimate the true teaching score. This histogram has a slight left-skew in that there is a long tail on the left. Another way to say this is that the data exhibits a negative skew. Is this a problem? Again, there is a certain amount of subjectivity in the response. In the authors’ opinion, while there is a slight skew/pattern to the residuals, it isn’t a large concern. On the other hand, others might disagree with our assessment. Here are examples of an ideal and less than ideal pattern to the residuals when viewed in a histogram: Figure 6.9: Examples of ideal and less than ideal residual patterns In fact, we’ll see later on that we would like the residuals to be normally distributed with mean 0. In other words, they should be bell-shaped and centered at 0! While this requirement and residual analysis in general may seem to some of you as not being overly critical at this point, we’ll see later when we cover inference for regression in Chapter 11 that for the last five columns of the regression table from earlier (std_error, statistic, p_value, conf_low, and conf_high) to have valid interpretations, the above three conditions should roughly hold. Learning check (LC6.3) Continuing with our regression using age as the explanatory variable and teaching score as the outcome variable, use the get_regression_points() function to get the observed values, fitted values, and residuals for all 463 instructors. Perform a residual analysis and look for any systematic patterns in the residuals. Ideally, there should be little to no pattern. 6.2 One categorical explanatory variable It’s an unfortunate truth that life expectancy is not the same across various countries in the world; there are a multitude of factors that are associated with how long people live. International development agencies are very interested in studying these differences in the hope of understanding where governments should allocate resources to address this problem.
In this section, we’ll explore differences in life expectancy in two ways: Differences between continents: Are there significant differences in life expectancy, on average, between the five continents of the world: Africa, the Americas, Asia, Europe, and Oceania? Differences within continents: How does life expectancy vary within the world’s five continents? For example, is the spread of life expectancy among the countries of Africa larger than the spread of life expectancy among the countries of Asia? To answer such questions, we’ll study the gapminder dataset in the gapminder package. Recall we introduced this dataset in Subsection 3.1.2 when we first studied the “Grammar of Graphics”; in particular Figure 3.1. This dataset has international development statistics such as life expectancy, GDP per capita, and population by country (\\(n\\) = 142) for 5-year intervals between 1952 and 2007. We’ll use this data for linear regression again, but note that our explanatory variable \\(x\\) is now categorical, and not numerical like when we covered simple linear regression in Section 6.1: A numerical outcome variable \\(y\\). In this case, life expectancy. A single categorical explanatory variable \\(x\\). In this case, the continent the country is part of. When the explanatory variable \\(x\\) is categorical, the concept of a “best-fitting” line is a little different from the one we saw previously in Section 6.1 where the explanatory variable \\(x\\) was numerical. We’ll study these differences shortly in Subsection 6.2.2, but first our exploratory data analysis. 6.2.1 Exploratory data analysis Let’s load the gapminder data, filter() for only observations in 2007, select() only the variables we’ll need, along with gdpPercap, which is each country’s gross domestic product per capita, a rough measure of that country’s economic performance (this will be used for the upcoming Learning Check). Save this in a data frame gapminder2007: library(gapminder) gapminder2007 <- gapminder %>% filter(year == 2007) %>% select(country, continent, lifeExp, gdpPercap) Let’s look at the raw data values both by bringing up RStudio’s spreadsheet viewer and by using the glimpse() function, although in Table 6.5 we only show 5 randomly selected countries out of 142: View(gapminder2007) Table 6.5: Random sample of 5 countries country continent lifeExp gdpPercap Slovak Republic Europe 74.7 18678 Israel Asia 80.7 25523 Bulgaria Europe 73.0 10681 Tanzania Africa 52.5 1107 Myanmar Asia 62.1 944 glimpse(gapminder2007) Observations: 142 Variables: 4 $ country <fctr> Afghanistan, Albania, Algeria, Angola, Argentina, Austra... $ continent <fctr> Asia, Europe, Africa, Africa, Americas, Oceania, Europe,... $ lifeExp <dbl> 43.8, 76.4, 72.3, 42.7, 75.3, 81.2, 79.8, 75.6, 64.1, 79.... $ gdpPercap <dbl> 975, 5937, 6223, 4797, 12779, 34435, 36126, 29796, 1391, ... We see that the variable continent is indeed categorical, as it is encoded as fctr which stands for “factor”: R’s way of storing categorical variables. Let’s look at a summary of the explanatory variable continent: summary(gapminder2007$continent) Africa Americas Asia Europe Oceania 52 25 33 30 2 We observe that all continents except Oceania have 25 countries or more; Oceania only has two: Australia and New Zealand.
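The same continent counts can also be obtained with dplyr’s count() function; a minimal sketch, assuming gapminder2007 as created above:
# Number of countries in each continent in 2007
gapminder2007 %>%
  count(continent)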
Let’s now compute some summary statistics of the outcome variable lifeExp, in particular the worldwide median and mean life expectancy: lifeExp_worldwide <- gapminder2007 %>% summarize(median = median(lifeExp), mean = mean(lifeExp)) Table 6.6: Worldwide life expectancy median mean 71.9 67 Given that the global median life expectancy is 71.935, half of the world’s countries (71 countries) have a life expectancy less than 71.935, while half have a life expectancy greater than this value. The mean life expectancy of 67.007 is lower, however. Why are these two values different? Let’s look at a histogram of lifeExp to see why. ggplot(gapminder2007, aes(x = lifeExp)) + geom_histogram(binwidth = 5, color = "white") + labs(x = "Life expectancy", y = "Number of countries", title = "Worldwide life expectancy") We see that this data is left-skewed/negatively skewed: there are a few countries with very low life expectancies that are bringing down the mean life expectancy. However, the median is less sensitive to the effects of such outliers. Hence the median is greater than the mean in this case. Let’s proceed by comparing median and mean life expectancy between continents by adding a group_by(continent) to the above code: lifeExp_by_continent <- gapminder2007 %>% group_by(continent) %>% summarize(median = median(lifeExp), mean = mean(lifeExp)) Table 6.7: Life expectancy by continent continent median mean Africa 52.9 54.8 Americas 72.9 73.6 Asia 72.4 70.7 Europe 78.6 77.6 Oceania 80.7 80.7 We see now that there are differences in life expectancies between the continents. For example, focusing only on medians, while the median life expectancy across all \\(n = 142\\) countries in 2007 was 71.935, the median life expectancy across the \\(n = 52\\) countries in Africa was only 52.927. Let’s create a corresponding visualization. One way to compare the life expectancies of countries in different continents would be via a faceted histogram. Recall we saw back in the Data Visualization chapter, specifically Section 3.6, that facets allow us to split a visualization by the different levels of a categorical variable or factor variable. In Figure 6.10, the variable we facet by is continent, which is categorical with five levels, each corresponding to the five continents of the world. ggplot(gapminder2007, aes(x = lifeExp)) + geom_histogram(binwidth = 5, color = "white") + labs(x = "Life expectancy", y = "Number of countries", title = "Life expectancy by continent") + facet_wrap(~continent, nrow = 2) Figure 6.10: Life expectancy in 2007 Another way would be via a geom_boxplot where we map the categorical variable continent to the \\(x\\)-axis and the different life expectancies within each continent on the \\(y\\)-axis; we do this in Figure 6.11. ggplot(gapminder2007, aes(x = continent, y = lifeExp)) + geom_boxplot() + labs(x = "Continent", y = "Life expectancy (years)", title = "Life expectancy by continent") Figure 6.11: Life expectancy in 2007 Some people prefer comparing a numerical variable between different levels of a categorical variable, in this case comparing life expectancy between different continents, using a boxplot over a faceted histogram, as we can make quick comparisons with single horizontal lines. For example, we can see that even the country with the highest life expectancy in Africa is still lower than that of all countries in Oceania. It’s important to remember, however, that the solid lines in the middle of the boxes correspond to the medians (i.e.
the middle value) rather than the mean (the average). So, for example, if you look at Asia, the solid line denotes the median life expectancy of around 72 years, indicating to us that half of all countries in Asia have a life expectancy below 72 years whereas half of all countries in Asia have a life expectancy above 72 years. Furthermore, note that: Africa and Asia have much more spread/variation in life expectancy, as indicated by the interquartile range (the height of the boxes). Oceania has almost no spread/variation, but this might in large part be due to the fact that there are only two countries in Oceania: Australia and New Zealand. Now, let’s start making comparisons of life expectancy between continents. Let’s use Africa as a baseline for comparison. Why Africa? Only because it happens to be first alphabetically; we could’ve just as appropriately used the Americas as the baseline for comparison. Using the “eyeball test” (just using our eyes to see if anything stands out), we make the following observations about differences in median life expectancy compared to the baseline of Africa: The median life expectancy of the Americas is roughly 20 years greater. The median life expectancy of Asia is roughly 20 years greater. The median life expectancy of Europe is roughly 25 years greater. The median life expectancy of Oceania is roughly 27.8 years greater. Let’s remember these four differences vs Africa corresponding to the Americas, Asia, Europe, and Oceania: 20, 20, 25, 27.8. Learning check (LC6.4) Conduct a new exploratory data analysis with the same explanatory variable \\(x\\) being continent but with gdpPercap as the new outcome variable \\(y\\). Remember, this involves three things: Looking at the raw values Computing summary statistics of the variables of interest. Creating informative visualizations What can you say about the differences in GDP per capita between continents based on this exploration? 6.2.2 Linear regression In Subsection 6.1.2 we introduced simple linear regression, which involves modeling a numerical outcome variable \\(y\\) as a function of a numerical explanatory variable \\(x\\). In our life expectancy example, we now have a categorical explanatory variable \\(x\\): continent. While we still can fit a regression model, given our categorical explanatory variable we no longer have the concept of a “best-fitting” line, but rather differences relative to a baseline for comparison. Before we fit our regression model, let’s create a table similar to Table 6.7, but: Report the mean life expectancy for each continent. Report the difference in mean life expectancy relative to Africa’s mean life expectancy of 54.806 in the column “mean vs Africa”; this column is simply the “mean” column minus 54.806. Think back to your observations from the eyeball test of Figure 6.11 at the end of the last subsection. The column “mean vs Africa” is the same idea of comparing a summary statistic to a baseline for comparison, in this case the countries of Africa, but using means instead of medians.
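A table like the one below can be computed directly with dplyr; the following is only a sketch, assuming gapminder2007 as created above (the column name mean_vs_Africa is ours):
# Mean life expectancy by continent, plus the difference relative to Africa
gapminder2007 %>%
  group_by(continent) %>%
  summarize(mean = mean(lifeExp)) %>%
  mutate(mean_vs_Africa = mean - mean[continent == "Africa"])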
Table 6.8: Mean life expectancy by continent continent mean mean vs Africa Africa 54.8 0.0 Americas 73.6 18.8 Asia 70.7 15.9 Europe 77.6 22.8 Oceania 80.7 25.9 Now, let’s use the get_regression_table() function we introduced in Section 6.1.2 to get the regression table for our gapminder2007 analysis: lifeExp_model <- lm(lifeExp ~ continent, data = gapminder2007) get_regression_table(lifeExp_model) Table 6.9: Linear regression table term estimate std_error statistic p_value conf_low conf_high intercept 54.8 1.02 53.45 0 52.8 56.8 continentAmericas 18.8 1.80 10.45 0 15.2 22.4 continentAsia 15.9 1.65 9.68 0 12.7 19.2 continentEurope 22.8 1.70 13.47 0 19.5 26.2 continentOceania 25.9 5.33 4.86 0 15.4 36.5 Just as before, we have the term and estimate columns of interest, but unlike before, we now have 5 rows corresponding to 5 outputs in our table: an intercept like before, but also continentAmericas, continentAsia, continentEurope, and continentOceania. What are these values? intercept = 54.8 corresponds to the mean life expectancy for Africa. This mean life expectancy is treated as a baseline for comparison for the other continents. continentAmericas = 18.8 is the difference in mean life expectancies of the Americas minus Africa. Note that \\(18.80 = 73.6 - 54.8\\) is the 2nd “mean vs Africa” value in Table 6.8. continentAsia = 15.9 is the difference in mean life expectancy of Asia minus Africa. Note that \\(15.9 = 70.7 - 54.8\\) is the 3rd “mean vs Africa” value in Table 6.8. continentEurope = 22.8 is the difference in mean life expectancy of Europe minus Africa. Note that \\(22.8 = 77.6 - 54.8\\) is the 4th “mean vs Africa” value in Table 6.8. continentOceania = 25.9 is the difference in mean life expectancy of Oceania minus Africa. Note that \\(25.9 = 80.7 - 54.8\\) is the 5th “mean vs Africa” value in Table 6.8. Let’s generalize this idea a bit. If we fit a linear regression model using a categorical explanatory variable \\(x\\) that has \\(k\\) levels, a regression model will return an intercept and \\(k - 1\\) “slope” coefficients. When \\(x\\) is a numerical explanatory variable the interpretation is of a “slope” coefficient, but when \\(x\\) is categorical the meaning is a little trickier: they are offsets relative to the baseline. In our case, since there are \\(k = 5\\) continents, the regression model returns an intercept corresponding to the baseline for comparison, Africa, and \\(k - 1 = 4\\) slope coefficients corresponding to the Americas, Asia, Europe, and Oceania. Africa was chosen as the baseline by R for no other reason than that it is first alphabetically of the 5 continents. You can manually specify which continent to use as baseline instead of the default choice of whichever comes first alphabetically, but we leave that to a more advanced course (a brief sketch is given after the learning check below). Learning check (LC6.5) Fit a new linear regression using lm(gdpPercap ~ continent, data = gapminder2007) where gdpPercap is the new outcome variable \\(y\\). Get information about the “best-fitting” line from the regression table by applying the get_regression_table() function. How do the regression results match up with the results from your exploratory data analysis above?
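For the curious, here is one way the baseline could be changed, using base R’s relevel() function on the continent factor; this is only a sketch with hypothetical object names, not something the book itself covers:
# Refit the model with Europe (instead of Africa) as the baseline level
gapminder2007_europe_baseline <- gapminder2007 %>%
  mutate(continent = relevel(continent, ref = "Europe"))
lifeExp_model_europe <- lm(lifeExp ~ continent, data = gapminder2007_europe_baseline)
get_regression_table(lifeExp_model_europe)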
6.2.3 Observed/fitted values and residuals Recall that in Subsection 6.1.3, when we had a numerical explanatory variable \\(x\\), we defined: Observed values \\(y\\), or the observed value of the outcome variable Fitted values \\(\\widehat{y}\\), or the value on the regression line for a given \\(x\\) value Residuals \\(y - \\widehat{y}\\), or the error between the observed value and the fitted value What do fitted values \\(\\widehat{y}\\) and residuals \\(y - \\widehat{y}\\) correspond to when the explanatory variable \\(x\\) is categorical? Let’s investigate these values for the first 10 countries in the gapminder2007 dataset: Table 6.10: First 10 out of 142 countries country continent lifeExp gdpPercap Afghanistan Asia 43.8 975 Albania Europe 76.4 5937 Algeria Africa 72.3 6223 Angola Africa 42.7 4797 Argentina Americas 75.3 12779 Australia Oceania 81.2 34435 Austria Europe 79.8 36126 Bahrain Asia 75.6 29796 Bangladesh Asia 64.1 1391 Belgium Europe 79.4 33693 Recall the get_regression_points() function we used in Subsection 6.1.3 to return the observed values of the outcome variable, all explanatory variables, fitted values, and residuals for all points in the regression. Recall that each “point” in this case corresponds to one of 142 countries in the gapminder2007 dataset. They are also the 142 observations used to construct the boxplots in Figure 6.11. regression_points <- get_regression_points(lifeExp_model) regression_points Table 6.11: Regression points (First 10 out of 142 countries) ID lifeExp continent lifeExp_hat residual 1 43.8 Asia 70.7 -26.900 2 76.4 Europe 77.6 -1.226 3 72.3 Africa 54.8 17.495 4 42.7 Africa 54.8 -12.075 5 75.3 Americas 73.6 1.712 6 81.2 Oceania 80.7 0.515 7 79.8 Europe 77.6 2.180 8 75.6 Asia 70.7 4.907 9 64.1 Asia 70.7 -6.666 10 79.4 Europe 77.6 1.792 Notice: The fitted values lifeExp_hat \\(\\widehat{\\text{lifeExp}}\\). Countries in Africa have the same fitted value of 54.8, which is the mean life expectancy of Africa; countries in Asia have the same fitted value of 70.7, which is the mean life expectancy of Asia; this similarly holds for countries in the Americas, Europe, and Oceania. The residual column is simply \\(y - \\widehat{y}\\) = lifeExp - lifeExp_hat. These values can be interpreted as that particular country’s deviation from its continent’s mean life expectancy. For example, the first row of this dataset corresponds to Afghanistan, and the residual of \\(-26.9 = 43.8 - 70.7\\) is Afghanistan’s life expectancy minus the mean life expectancy of all Asian countries. 6.2.4 Residual analysis Recall our discussion on residuals from Section 6.1.4, where our goal was to investigate whether or not there was a systematic pattern to the residuals; ideally, since residuals can be thought of as error, there should be no such pattern. While there are many ways to do such residual analysis, we focused on two approaches based on visualizations.
A plot with residuals on the vertical axis and the predictor (in this case continent) on the horizontal axis A histogram of all residuals First, let’s plot the residuals vs continent in Figure 6.12, but also let’s plot all 142 points with a little horizontal random jitter by setting the width = 0.1 parameter in geom_jitter(): ggplot(regression_points, aes(x = continent, y = residual)) + geom_jitter(width = 0.1) + labs(x = "Continent", y = "Residual") + geom_hline(yintercept = 0, col = "blue") Figure 6.12: Plot of residuals over continent We observe: There seems to be a rough balance of both positive and negative residuals for all 5 continents. However, there is one clear outlier in Asia. It has the smallest (most negative) residual, and hence also the smallest life expectancy in Asia. Let’s investigate the 5 countries in Asia with the shortest life expectancy: gapminder2007 %>% filter(continent == "Asia") %>% arrange(lifeExp) Table 6.12: Countries in Asia with shortest life expectancy country continent lifeExp gdpPercap Afghanistan Asia 43.8 975 Iraq Asia 59.5 4471 Cambodia Asia 59.7 1714 Myanmar Asia 62.1 944 Yemen, Rep. Asia 62.7 2281 This matches the residual of -26.9 for Afghanistan that we identified earlier. Unfortunately, given recent geopolitical turmoil, individuals who live in Afghanistan have a drastically lower life expectancy. Second, let’s look at a histogram of all 142 values of residuals in Figure 6.13. In this case, the residuals form a rather nice bell shape, although there are a couple of very low and very high values at the tails. As we said previously, searching for patterns in residuals can be somewhat subjective, but ideally we hope there are no “drastic” patterns. ggplot(regression_points, aes(x = residual)) + geom_histogram(binwidth = 5, color = "white") + labs(x = "Residual") Figure 6.13: Histogram of residuals Learning check (LC6.6) Continuing with our regression using gdpPercap as the outcome variable and continent as the explanatory variable, use the get_regression_points() function to get the observed values, fitted values, and residuals for all 142 countries in 2007. Perform a residual analysis and look for any systematic patterns in the residuals. Is there a pattern? 6.3 Related topics 6.3.1 Correlation coefficient Let’s re-plot Figure 6.1, but now consider a broader range of correlation coefficient values in Figure 6.14. Figure 6.14: Different Correlation Coefficients As we suggested in Subsection 6.1.1, interpreting coefficients that are not close to the extreme values of -1 and 1 can be subjective. To develop your sense of correlation coefficients, we suggest you play the following 80s-style video game called “Guess the correlation”! Click on the image below: 6.3.2 Correlation is not necessarily causation Establishing causation is a tricky problem and frequently requires either carefully designed experiments or methods that control for the effects of potential confounding variables. Both these approaches attempt either to remove all confounding variables or take them into account as best they can, and only focus on the behavior of an outcome variable in the presence of the levels of the other variable(s). Be careful as you read studies to make sure that the writers aren’t falling into this fallacy of correlation implying causation. If you spot one, you may want to send them a link to Spurious Correlations. 6.3.3 Best fitting line Regression lines are also known as “best fitting lines”. But what do we mean by best? Let’s unpack the criterion that regression uses to determine “best”.
Recall the plot in Figure 6.6 where for an instructor with an average beauty score of \\(x=7.333\\) The observed value \\(y=4.9\\) was marked with a red circle The fitted value \\(\\widehat{y} = 4.369\\) on the regression line was marked with a red square The residual \\(y-\\widehat{y} = 4.9-4.369 = 0.531\\) was the length of the blue arrow. Let’s do this for another arbitrarily chosen instructor whose beauty score was \\(x=2.333\\). The residual in this case is \\(2.7 - 4.036 = -1.336\\). Let’s do this for another arbitrarily chosen instructor whose beauty score was \\(x=3.667\\). The residual in this case is \\(4.4 - 4.125 = 0.2753\\). Let’s do this for another arbitrarily chosen instructor whose beauty score was \\(x = 6\\). The residual in this case is \\(3.8 - 4.28 = -0.4802\\). Now let’s say we repeated this process for all 463 instructors in our dataset. Regression minimizes the sum of all 463 arrow lengths squared. In other words, it minimizes the sum of the squared residuals: \\[ \\sum_{i=1}^{n}(y_i - \\widehat{y}_i)^2 \\] We square the arrow lengths so that positive and negative deviations of the same amount are treated equally. That’s why alternative names for the simple linear regression line are the least-squares line and the best fitting line. It can be proven via calculus and linear algebra that this line uniquely minimizes the sum of the squared arrow lengths. For the regression line in the plot, the sum of the squared residuals is 131.879. This is the lowest possible value of the sum of the squared residuals of all possible lines we could draw on this scatterplot. How do we know this? We can mathematically prove this fact, but the proof requires some calculus and linear algebra, so let’s leave it for another course! 6.3.4 How does get_regression_table() work? Note that this subsection is optional! What is going on behind the scenes with the get_regression_table() function from the moderndive package? Recall in Subsection 6.1.2 we noted that these are wrapper functions that take other pre-existing functions and “wrap” them in a single function. This wrapper function leverages the tidy() function in the broom package and the clean_names() function in the janitor package to generate clean-looking outputs. Here is what the regression table from Subsection 6.1.2 looks like: score_model <- lm(score ~ bty_avg, data = evals) get_regression_table(score_model, digits = 2) term estimate std_error statistic p_value conf_low conf_high intercept 3.88 0.08 50.96 0 3.73 4.03 bty_avg 0.07 0.02 4.09 0 0.03 0.10 The main idea behind the get_regression_table() function is to hide the following code running behind the scenes from you, which we’ve found to be confusing to students in the past: library(broom) library(janitor) score_model %>% tidy(conf.int = TRUE) %>% mutate_if(is.numeric, round, digits = 3) %>% clean_names() term estimate std_error statistic p_value conf_low conf_high (Intercept) 3.880 0.076 50.96 0 3.731 4.030 bty_avg 0.067 0.016 4.09 0 0.035 0.099 Note that the mutate_if() function is from the dplyr package and applies the round() function with 3 decimal places of precision only to those variables that are numerical. But oof, the second code block is long and messy! We felt it appropriate to hide all this from you and give the function an easy-to-remember name: get_regression_table(). 6.4 Conclusion In this chapter, you’ve seen what we call “basic regression” when you only have one explanatory variable.
In Chapter 7, we’ll study multiple regression where we have more than one explanatory variable! In particular, we’ll see why we’ve been conducting the residual analyses from Subsections 6.1.4 and 6.2.4; we are actually verifying some very important assumptions that must be met for the std_error (standard error), p_value, conf_low and conf_high (the end-points of the confidence intervals) columns in our regression tables to have valid interpretations. Again, don’t worry for now if you don’t understand what these terms mean. After the next chapter on multiple regression, we’ll dive in! 6.4.1 Script of R code An R script file of all R code used in this chapter is available here. 7 Multiple Regression 7.1 Two numerical explanatory variables 7.2 One numerical & one categorical explanatory variable 7.3 Related topics 7.4 Conclusion In Chapter 6 we introduced ideas related to modeling, in particular that the fundamental premise of modeling is to make explicit the relationship between an outcome variable \\(y\\) and an explanatory/predictor variable \\(x\\). Recall further the synonyms that we used to also denote \\(y\\) as the dependent variable and \\(x\\) as an independent variable or covariate. There are many modeling approaches one could take, among the most well-known being linear regression, which was the focus of the last chapter. Whereas the last chapter focused on regression scenarios where there is only one explanatory/predictor variable, in this chapter, we now focus on modeling scenarios where there is more than one; this is known as multiple regression. You can imagine that when trying to model a particular outcome variable, like teaching evaluation score as in Section 6.1 or life expectancy as in Section 6.2, it would be very useful to incorporate more than one explanatory variable. Since our regression models will now consider more than one explanatory/predictor variable, the interpretation of the associated effect of any one explanatory/predictor variable must be made in conjunction with the others. For example, say we are modeling individuals’ incomes as a function of their number of years of education and their parents’ wealth. When interpreting the effect of education on income, one has to consider the effect of their parents’ wealth at the same time, as these two variables are almost certainly related. Make note of this throughout this chapter and as you work on interpreting the results of multiple regression models into the future. Needed packages Let’s load all the packages needed for this chapter (this assumes you’ve already installed them). Read Section 2.3 for information on how to install and load R packages. library(ggplot2) library(dplyr) library(moderndive) library(ISLR) 7.1 Two numerical explanatory variables Let’s now attempt to identify factors that are associated with how much credit card debt an individual will have. An Introduction to Statistical Learning with Applications in R by Gareth James, Daniela Witten, Trevor Hastie, and Robert Tibshirani is an intermediate-level textbook on statistical and machine learning methods, freely available here. It has an accompanying R package called ISLR with datasets that the authors use to demonstrate various machine learning methods.
One dataset that is frequently used by the authors is the Credit dataset, where predictions are made on the credit card balance held by \\(n = 400\\) credit card holders based on information about them like income, credit limit, and education level. Since no information was provided as to who these \\(n\\) = 400 individuals are and how they came to be included in this dataset, it will be hard to make any scientific claims based on this data. Recall our discussion from the previous chapter that correlation does not necessarily imply causation. That being said, we’ll still use Credit to demonstrate multiple regression with: A numerical outcome variable \\(y\\), in this case credit card balance. Two explanatory variables: A first numerical explanatory variable \\(x_1\\). In this case, their credit limit. A second numerical explanatory variable \\(x_2\\). In this case, their income (in thousands of dollars). In the forthcoming Learning Checks, we’ll consider a different scenario: The same numerical outcome variable \\(y\\): credit card balance. Two new explanatory variables: A first numerical explanatory variable \\(x_1\\): their credit rating. A second numerical explanatory variable \\(x_2\\): their age. 7.1.1 Exploratory data analysis Let’s load the Credit data and select() only the needed subset of variables. library(ISLR) Credit <- Credit %>% select(Balance, Limit, Income, Rating, Age) Let’s look at the raw data values both by bringing up RStudio’s spreadsheet viewer and by using the glimpse() function, although in Table 7.1 we only show 5 randomly selected credit card holders out of 400: View(Credit) Table 7.1: Random sample of 5 credit card holders Balance Limit Income Rating Age 141 1425 6045 39.8 459 32 9 279 3300 15.1 266 66 13 204 5308 80.6 394 57 262 1050 9310 180.4 665 67 267 15 4952 88.8 360 86 glimpse(Credit) Observations: 400 Variables: 5 $ Balance <int> 333, 903, 580, 964, 331, 1151, 203, 872, 279, 1350, 1407, 0... $ Limit <int> 3606, 6645, 7075, 9504, 4897, 8047, 3388, 7114, 3300, 6819,... $ Income <dbl> 14.9, 106.0, 104.6, 148.9, 55.9, 80.2, 21.0, 71.4, 15.1, 71... $ Rating <int> 283, 483, 514, 681, 357, 569, 259, 512, 266, 491, 589, 138,... $ Age <int> 34, 82, 71, 36, 68, 77, 37, 87, 66, 41, 30, 64, 57, 49, 75,... Let’s look at some summary statistics: Credit %>% select(Balance, Limit, Income) %>% summary() Balance Limit Income Min. : 0 Min. : 855 Min. : 10.4 1st Qu.: 69 1st Qu.: 3088 1st Qu.: 21.0 Median : 460 Median : 4622 Median : 33.1 Mean : 520 Mean : 4736 Mean : 45.2 3rd Qu.: 863 3rd Qu.: 5873 3rd Qu.: 57.5 Max. :1999 Max. :13913 Max. :186.6 We observe, for example: The mean and median credit card balance is around $500. 25% of card holders had debts of 69 dollars or less. The mean and median credit card limit is just under $5000. 75% of these card holders had incomes of $57,500 or less. Since our outcome variable Balance and the explanatory variables Limit and Income are numerical, we can compute the correlation coefficient between pairs of these variables. There are two ways of doing this. First, we could run the cor() command as seen in Subsection 6.1.1 twice, once for each explanatory variable: cor(Credit$Balance, Credit$Limit) cor(Credit$Balance, Credit$Income) Or we can simultaneously compute them by returning a correlation matrix in Table 7.2. We can read off the correlation coefficient for any pair of variables by looking them up in the appropriate row/column combination.
Credit %>% select(Balance, Limit, Income) %>% cor() Table 7.2: Correlations between credit card balance, credit limit, and income Balance Limit Income Balance 1.000 0.862 0.464 Limit 0.862 1.000 0.792 Income 0.464 0.792 1.000 For example, the correlation coefficient of: Balance with itself is 1 as we would expect based on the definition of the correlation coefficient. Balance with Limit is 0.862. This indicates a strong positive linear relationship, which makes sense as only individuals with large credit limits can accrue large credit card balances. Balance with Income is 0.464. This is suggestive of another positive linear relationship, although not as strong as the relationship between Balance and Limit. As an added bonus, we can read off the correlation coefficient of the two explanatory variables, Limit and Income, of 0.792. In this case, we say there is a high degree of collinearity between these two explanatory variables. Collinearity (or multicollinearity) is a phenomenon in which one explanatory variable in a multiple regression model can be linearly predicted from the others with a substantial degree of accuracy. So in this case, since Limit and Income are highly correlated, if we knew someone’s credit card Limit, we could make a fairly accurate guess as to that person’s Income. Or put loosely, these two variables provide redundant information. For now let’s ignore any issues related to collinearity and press on. Let’s visualize the relationship of the outcome variable with each of the two explanatory variables in two separate plots: ggplot(Credit, aes(x = Limit, y = Balance)) + geom_point() + labs(x = "Credit limit (in $)", y = "Credit card balance (in $)", title = "Relationship between balance and credit limit") + geom_smooth(method = "lm", se = FALSE) ggplot(Credit, aes(x = Income, y = Balance)) + geom_point() + labs(x = "Income (in $1000)", y = "Credit card balance (in $)", title = "Relationship between balance and income") + geom_smooth(method = "lm", se = FALSE) Figure 7.1: Relationship between credit card balance and credit limit/income First, there is a positive relationship between credit limit and balance, since as credit limit increases so also does credit card balance; this is to be expected given the strongly positive correlation coefficient of 0.862. In the case of income, the positive relationship doesn’t appear as strong, given the weakly positive correlation coefficient of 0.464. However, the two plots in Figure 7.1 only focus on the relationship of the outcome variable with each of the explanatory variables independently. To get a sense of the joint relationship of all three variables simultaneously through a visualization, let’s display the data in a 3-dimensional (3D) scatterplot, where The numerical outcome variable \\(y\\) Balance is on the z-axis (vertical axis) The two numerical explanatory variables form the “floor” axes. In this case: The first numerical explanatory variable \\(x_1\\) Income is on one of the floor axes. The second numerical explanatory variable \\(x_2\\) Limit is on the other floor axis. Click on the following image to open an interactive 3D scatterplot in your browser: Previously in Figure 6.5, we plotted a “best-fitting” regression line through a set of points where the numerical outcome variable \\(y\\) was teaching score and the single numerical explanatory variable \\(x\\) was bty_avg. What is the analogous concept when we have two numerical predictor variables?
Instead of a best-fitting line, we now have a best-fitting plane, which is the 3D generalization of a line, which exists in 2D. Click on the following image to open an interactive plot of the regression plane in your browser. Move the image around, zoom in, and think about how this plane generalizes the concept of a linear regression line to three dimensions. Learning check (LC7.1) Conduct a new exploratory data analysis with the same outcome variable \\(y\\) being Balance but with Rating and Age as the new explanatory variables \\(x_1\\) and \\(x_2\\). Remember, this involves three things: Looking at the raw values Computing summary statistics of the variables of interest. Creating informative visualizations What can you say about the relationship between a credit card holder’s balance and their credit rating and age? 7.1.2 Multiple regression Just as we did when we had a single numerical explanatory variable \\(x\\) in Subsection 6.1.2 and when we had a single categorical explanatory variable \\(x\\) in Subsection 6.2.2, we fit a regression model and get the regression table in our two numerical explanatory variable scenario. To fit a regression model and get a table using get_regression_table(), we now use a + to consider multiple explanatory variables. In this case, since we want to perform a regression on Limit and Income simultaneously, we input Balance ~ Limit + Income. Balance_model <- lm(Balance ~ Limit + Income, data = Credit) get_regression_table(Balance_model) Table 7.3: Multiple regression table term estimate std_error statistic p_value conf_low conf_high intercept -385.179 19.465 -19.8 0 -423.446 -346.912 Limit 0.264 0.006 45.0 0 0.253 0.276 Income -7.663 0.385 -19.9 0 -8.420 -6.906 How do we interpret these three values that define the regression plane? Intercept: -$385.18 (rounded to two decimal places to represent cents). The intercept in our case represents the credit card balance for an individual who has both a credit Limit of $0 and Income of $0. In our data, however, the intercept has limited practical interpretation, as no individuals had Limit or Income values of $0 and furthermore the smallest credit card balance was $0. Rather, it is used to situate the regression plane in 3D space. Limit: $0.26. Now that we have multiple variables to consider, we have to add a caveat to our interpretation: all other things being equal, for every increase of one unit in credit Limit (dollars), there is an associated increase of on average $0.26 in credit card balance. Note: Just as we did in Subsection 6.1.2, we are not making any causal statements, only statements relating to the association between credit limit and balance. The “all other things being equal” clause is making a statement about all other explanatory variables, in this case only one: Income. This is equivalent to saying “holding Income constant, we observed an associated increase of $0.26 in credit card balance for every dollar increase in credit limit.” Income: -$7.66. Similarly, all other things being equal, for every increase of one unit in Income (in other words, $1000 in income), there is an associated decrease of on average $7.66 in credit card balance. However, recall in Figure 7.1 that when considered separately, both Limit and Income had positive relationships with the outcome variable Balance: as card holders’ credit limits increased, their credit card balances tended to increase as well, and a similar relationship held for incomes and balances.
In the above multiple regression, however, the slope for Income is now -7.66, suggesting a negative relationship between income and credit card balance. What explains these contradictory results? This is known as Simpson’s Paradox, a phenomenon in which a trend appears in several different groups of data but disappears or reverses when these groups are combined. We expand on this in Subsection 7.3.2, where we’ll look at the relationship between income and credit card balance, but split by different credit limit bracket groups. Learning check (LC7.2) Fit a new multiple regression using lm(Balance ~ Rating + Age, data = Credit) where Rating and Age are the new numerical explanatory variables \\(x_1\\) and \\(x_2\\). Get information about the “best-fitting” regression plane from the regression table by applying the get_regression_table() function. How do the regression results match up with the results from your exploratory data analysis above? 7.1.3 Observed/fitted values and residuals As we did previously, in Table 7.4 let’s unpack the output of the get_regression_points() function for our model for credit card balance for all 400 card holders in the dataset. Recall that each card holder corresponds to one of the 400 rows in the Credit data frame and also to one of the 400 points in the 3D scatterplots in Subsection 7.1.1. regression_points <- get_regression_points(Balance_model) regression_points Table 7.4: Regression points (first 5 rows of 400) ID Balance Limit Income Balance_hat residual 1 333 3606 14.9 454 -120.8 2 903 6645 106.0 559 344.3 3 580 7075 104.6 683 -103.4 4 964 9504 148.9 986 -21.7 5 331 4897 55.9 481 -150.0 Recall the format of the output: Balance corresponds to \\(y\\) (the observed value) Balance_hat corresponds to \\(\\widehat{y}\\) (the fitted value) residual corresponds to \\(y - \\widehat{y}\\) (the residual) 7.1.4 Residual analysis Recall in Section 6.1.4, our first residual analysis plot investigated the presence of any systematic pattern in the residuals when we had a single numerical predictor: bty_avg. For the Credit card dataset, since we have two numerical predictors, Limit and Income, we must perform this twice: ggplot(regression_points, aes(x = Limit, y = residual)) + geom_point() + labs(x = "Credit limit (in $)", y = "Residual", title = "Residuals vs credit limit") ggplot(regression_points, aes(x = Income, y = residual)) + geom_point() + labs(x = "Income (in $1000)", y = "Residual", title = "Residuals vs income") Figure 7.2: Residuals vs credit limit and income In this case, there does appear to be a systematic pattern to the residuals, as the scatter of the residuals around the line \\(y=0\\) is definitely not consistent. This behavior of the residuals is further evidenced by the histogram of residuals in Figure 7.3. We observe that the residuals have a slight right-skew (recall we say that data is right-skewed, or positively-skewed, if there is a tail to the right). Ideally, these residuals should be bell-shaped around a residual value of 0. ggplot(regression_points, aes(x = residual)) + geom_histogram(color = "white") + labs(x = "Residual") Figure 7.3: Histogram of residuals Another way to interpret this histogram is that since the residual is computed as \\(y - \\widehat{y}\\) = balance - balance_hat, we have some values where the fitted value \\(\\widehat{y}\\) is very much lower than the observed value \\(y\\). In other words, we are underestimating certain credit card holders’ balances by a very large amount.
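To see these underestimated balances concretely, one could sort the regression points by residual. The following is a sketch, not code from the original text; it assumes the dplyr package is loaded along with the regression_points data frame created above.
# Card holders with the largest residuals, i.e. whose balances the model
# underestimates the most
regression_points %>%
  arrange(desc(residual)) %>%
  head(5)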
Learning check (LC7.3) Continuing with our regression using Rating and Age as the explanatory variables and credit card Balance as the outcome variable, use the get_regression_points() function to get the observed values, fitted values, and residuals for all 400 credit card holders. Perform a residual analysis and look for any systematic patterns in the residuals. 7.2 One numerical & one categorical explanatory variable Let’s revisit the instructor evaluation data introduced in Section 6.1, where we studied the relationship between instructor evaluation scores and their beauty scores. This analysis suggested that there is a positive relationship between bty_avg and score; in other words, as instructors had higher beauty scores, they also tended to have higher teaching evaluation scores. Now let’s say instead of bty_avg we are interested in the numerical explanatory variable \\(x_1\\) age, and furthermore we want to use a second explanatory variable \\(x_2\\), the (binary) categorical variable gender. Our modeling scenario now becomes: A numerical outcome variable \\(y\\): as before, instructor evaluation score. Two explanatory variables: A numerical explanatory variable \\(x_1\\): in this case, their age. A categorical explanatory variable \\(x_2\\): in this case, their binary gender. 7.2.1 Exploratory data analysis Let’s reload the evals data and select() only the needed subset of variables. load(url("http://www.openintro.org/stat/data/evals.RData")) evals <- evals %>% select(score, age, gender) Let’s look at the raw data values, either by bringing up RStudio’s spreadsheet viewer with View() or by using the glimpse() function, although in Table 7.5 we only show 5 randomly selected instructors out of 463: View(evals) Table 7.5: Random sample of 5 instructors score bty_avg age gender 290 3.6 6.67 34 male 341 4.9 3.50 43 male 199 3.3 2.33 47 male 47 4.4 4.67 33 female 215 4.7 3.67 60 male Let’s look at some summary statistics: summary(evals) score bty_avg age gender Min. :2.30 Min. :1.67 Min. :29.0 female:195 1st Qu.:3.80 1st Qu.:3.17 1st Qu.:42.0 male :268 Median :4.30 Median :4.33 Median :48.0 Mean :4.17 Mean :4.42 Mean :48.4 3rd Qu.:4.60 3rd Qu.:5.50 3rd Qu.:57.0 Max. :5.00 Max. :8.17 Max. :73.0 In Figure 7.4, we plot a scatterplot of score over age. Given that gender is a binary categorical variable, we can assign a color to the points from each of the two levels of gender: female and male. Furthermore, the geom_smooth(method = "lm", se = FALSE) layer automatically fits a different regression line for each level. ggplot(evals, aes(x = age, y = score, col = gender)) + geom_jitter() + labs(x = "Age", y = "Teaching Score", color = "Gender") + geom_smooth(method = "lm", se = FALSE) Figure 7.4: Instructor evaluation scores at UT Austin split by gender (jittered) We notice some interesting trends: There are almost no women faculty over the age of 60. Fitting separate regression lines for men and women, we see they have different slopes. The associated effect of increasing age seems to be much harsher for women than for men. In other words, as women age, the drop in their teaching score appears to be steeper. 7.2.2 Multiple regression Much like we started to consider multiple explanatory variables using the + sign in Subsection 7.1.2, let’s fit a regression model and get the regression table, this time saving our regression model fit in score_model_2 so as not to overwrite the model score_model from Section 6.1.2.
score_model_2 <- lm(score ~ age + gender, data = evals) get_regression_table(score_model_2) Table 7.6: Regression table term estimate std_error statistic p_value conf_low conf_high intercept 4.484 0.125 35.79 0.000 4.238 4.730 age -0.009 0.003 -3.28 0.001 -0.014 -0.003 gendermale 0.191 0.052 3.63 0.000 0.087 0.294 The modeling equation for this scenario is: \\[ \\begin{align} \\widehat{y} &= b_0 + b_1 * x_1 + b_2 * x_2 \\\\ \\widehat{\\mbox{score}} &= b_0 + b_{\\mbox{age}} * \\mbox{age} + b_{\\mbox{male}} * \\mathbb{1}[\\mbox{is male}] \\\\ \\end{align} \\] where \\(\\mathbb{1}[\\mbox{is male}]\\) is an indicator function for gender == male. In other words, \\(\\mathbb{1}[\\mbox{is male}]\\) equals 1 if the current observation corresponds to a male professor, and 0 if the current observation corresponds to a female professor. This model can be visualized in Figure 7.5. Figure 7.5: Instructor evaluation scores at UT Austin by gender: same slope We see that: Females are treated as the baseline for comparison for no other reason than “female” is alphabetically earlier than “male.” The value \\(b_{male} = 0.1906\\) is the vertical “bump” that men get in their teaching evaluation scores. Or more precisely, it is the average difference in teaching score that men get relative to the baseline of women. Accordingly, the intercepts are (which in this case make no practical sense since no instructor can have age 0): for women: \\(b_0\\) = 4.484; for men: \\(b_0 + b_{male}\\) = 4.484 + 0.191 = 4.675. Both men and women have the same slope. In other words, in this model the associated effect of age is the same for men and women. All other things being equal, for every increase of 1 in age, there is on average an associated decrease of 0.0086 in teaching score (\\(b_{age}\\) = -0.0086). But wait, why is Figure 7.5 different from Figure 7.4? What is going on? What we have in the original plot is known as an interaction effect between age and gender. Fitting a separate model for men and for women, we see that the resulting regression lines are different. Thus, the associated effect of age appears to differ for men and women; in other words, age and gender interact. 7.2.3 Multiple regression with interaction effects We say a model has an interaction effect if the associated effect of one variable depends on the value of another variable. These types of models usually prove to be tricky to interpret at first glance because of their complexity. In this case, the effect of age will depend on the value of gender. Put differently, the effect of age on teaching scores will differ for men and for women, as was suggested by the different slopes for men and women in our visual exploratory data analysis in Figure 7.4. Let’s fit a regression with an interaction term. Instead of using the + sign in the enumeration of explanatory variables, we use the * sign. Let’s fit this regression and save it in score_model_interaction, then get the regression table using the get_regression_table() function as before.
score_model_interaction <- lm(score ~ age * gender, data = evals) get_regression_table(score_model_interaction) Table 7.7: Regression table term estimate std_error statistic p_value conf_low conf_high intercept 4.883 0.205 23.80 0.000 4.480 5.286 age -0.018 0.004 -3.92 0.000 -0.026 -0.009 gendermale -0.446 0.265 -1.68 0.094 -0.968 0.076 age:gendermale 0.014 0.006 2.45 0.015 0.003 0.024 The modeling equation for this scenario is: \\[ \\begin{align} \\widehat{y} &= b_0 + b_1*x_1 + b_2*x_2 + b_3*x_1*x_2\\\\ \\widehat{\\mbox{score}} &= b_0 + b_{\\mbox{age}}*\\mbox{age} + b_{\\mbox{male}}*\\mathbb{1}[\\mbox{is male}] + b_{\\mbox{age,male}}*\\mbox{age}*\\mathbb{1}[\\mbox{is male}] \\\\ \\end{align} \\] Oof, that’s a lot of rows in the regression table output and a lot of terms in the model equation. The fourth term being added on the right-hand side of the equation corresponds to the interaction term. Let’s simplify things by considering men and women separately. First, recall that \\(\\mathbb{1}[\\mbox{is male}]\\) equals 1 if a particular observation (or row in evals) corresponds to a male instructor. In this case, using the values from the regression table, the fitted value of \\(\\widehat{\\mbox{score}}\\) is: \\[ \\begin{align} \\widehat{\\mbox{score}} &= b_0 + b_{\\mbox{age}}*\\mbox{age} + b_{\\mbox{male}}*\\mathbb{1}[\\mbox{is male}] + b_{\\mbox{age,male}}*\\mbox{age}*\\mathbb{1}[\\mbox{is male}] \\\\ &= b_0 + b_{\\mbox{age}}*\\mbox{age} + b_{\\mbox{male}}*1 + b_{\\mbox{age,male}}*\\mbox{age}*1 \\\\ &= \\left(b_0 + b_{\\mbox{male}}\\right) + \\left(b_{\\mbox{age}} + b_{\\mbox{age,male}} \\right)*\\mbox{age} \\\\ &= \\left(4.883 - 0.446\\right) + \\left(-0.018 + 0.014 \\right)*\\mbox{age} \\\\ &= 4.437 - 0.004*\\mbox{age} \\end{align} \\] Second, recall that \\(\\mathbb{1}[\\mbox{is male}]\\) equals 0 if a particular observation corresponds to a female instructor. Again, using the values from the regression table, the fitted value of \\(\\widehat{\\mbox{score}}\\) is: \\[ \\begin{align} \\widehat{\\mbox{score}} &= b_0 + b_{\\mbox{age}}*\\mbox{age} + b_{\\mbox{male}}*\\mathbb{1}[\\mbox{is male}] + b_{\\mbox{age,male}}*\\mbox{age}*\\mathbb{1}[\\mbox{is male}] \\\\ &= b_0 + b_{\\mbox{age}}*\\mbox{age} + b_{\\mbox{male}}*0 + b_{\\mbox{age,male}}*\\mbox{age}*0 \\\\ &= b_0 + b_{\\mbox{age}}*\\mbox{age}\\\\ &= 4.883 - 0.018*\\mbox{age} \\end{align} \\] Let’s summarize these values in a table: Table 7.8: Comparison of male and female intercepts and age slopes Gender Intercept Slope for age Male instructors 4.44 -0.004 Female instructors 4.88 -0.018 We see that while male instructors have a lower intercept, as they age they have a less steep associated average decrease in teaching score: 0.004 teaching score units per year as opposed to 0.018 for female instructors. This is consistent with the different slopes and intercepts of the red and blue regression lines fit in Figure 7.4. Recall our definition of a model having an interaction effect: when the associated effect of one variable, in this case age, depends on the value of another variable, in this case gender. But how do we know when it’s appropriate to include an interaction effect? For example, which is the more appropriate model? The regular multiple regression model without an interaction term we saw in Section 7.2.2 or the multiple regression model with the interaction term we just saw?
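(We’ll come back to that question in a moment.) As a quick check of Table 7.8, the group-specific intercepts and slopes can also be recomputed directly from the fitted coefficients. This is a sketch, not code from the original text; the helper names below are just for illustration, and the coefficient labels are the ones R assigns by default to score_model_interaction.
# Recompute the intercepts and slopes of Table 7.8 from the interaction model
b <- coef(score_model_interaction)
female_intercept <- b["(Intercept)"]                    # about 4.88
female_slope     <- b["age"]                            # about -0.018
male_intercept   <- b["(Intercept)"] + b["gendermale"]  # about 4.44
male_slope       <- b["age"] + b["age:gendermale"]      # about -0.004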
We’ll revisit the question of which model is more appropriate in Chapter 11 on “inference for regression.” 7.2.4 Observed/fitted values and residuals Now say we want to apply the above calculations for male and female instructors to all 463 instructors in the evals dataset. As our multiple regression models get more and more complex, computing such values by hand gets more and more tedious. The get_regression_points() function spares us this tedium and returns all fitted values and all residuals. For simplicity, let’s focus only on the fitted interaction model, which is saved in score_model_interaction. regression_points <- get_regression_points(score_model_interaction) regression_points Table 7.9: Regression points (first 5 rows of 463) ID score age gender score_hat residual 1 4.7 36 female 4.25 0.448 2 4.1 36 female 4.25 -0.152 3 3.9 36 female 4.25 -0.352 4 4.8 36 female 4.25 0.548 5 4.6 59 male 4.20 0.399 Recall the format of the output: score corresponds to \\(y\\) (the observed value) score_hat corresponds to \\(\\widehat{y} = \\widehat{\\mbox{score}}\\) (the fitted value) residual corresponds to the residual \\(y - \\widehat{y}\\) 7.2.5 Residual analysis As always, let’s perform a residual analysis first with a histogram: ggplot(regression_points, aes(x = residual)) + geom_histogram(binwidth = 0.25, color = "white") + labs(x = "Residual") Figure 7.6: Interaction model histogram of residuals Second, let’s plot the residuals against the predictor variables: \\(x_1\\): numerical explanatory/predictor variable of age \\(x_2\\): categorical explanatory/predictor variable of gender ggplot(regression_points, aes(x = age, y = residual)) + geom_point() + labs(x = "age", y = "Residual") + geom_hline(yintercept = 0, col = "blue", size = 1) + facet_wrap(~gender) Figure 7.7: Interaction model residuals vs predictor 7.3 Related topics 7.3.1 More on the correlation coefficient Recall in Table 7.2, we saw that the correlation coefficient between Income in thousands of dollars and credit card Balance was 0.464. What if instead we looked at the correlation coefficient between Income and credit card Balance, but where Income was in dollars and not thousands of dollars? This can be done by multiplying Income by 1000. library(ISLR) data(Credit) Credit %>% select(Balance, Income) %>% mutate(Income = Income * 1000) %>% cor() Table 7.10: Correlation between income (in $) and credit card balance Balance Income Balance 1.000 0.464 Income 0.464 1.000 We see it is the same! We say that the correlation coefficient is invariant to such linear transformations! In other words, the correlation between \\(x\\) and \\(y\\) will be the same as the correlation between \\(a\\times x + b\\) and \\(y\\), where \\(a, b\\) are numerical values (real numbers in mathematical terms) and \\(a\\) is positive; a negative \\(a\\) would flip the sign of the correlation. 7.3.2 Simpson’s Paradox Recall in Section 7.1, we saw the two following seemingly contradictory results when studying the relationship between credit card balance, credit limit, and income. On the one hand, the right-hand plot of Figure 7.1 suggested that credit card balance and income were positively related: Figure 7.8: Relationship between credit card balance and income On the other hand, the multiple regression in Table 7.3 suggested that when modeling credit card balance as a function of both credit limit and income at the same time, income has a negative relationship with balance, as evidenced by the slope of -7.66. How can this be? First, let’s dive a little deeper into the explanatory variable Limit.
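As a sketch of how the quartile cut points used below can be computed (this is not code from the original text, and it assumes the Credit data frame from the ISLR package is loaded as above):
# Quartiles of credit limit, which define the four credit limit brackets below
quantile(Credit$Limit, probs = c(0.25, 0.5, 0.75))
The three values returned should be close to the $3088, $4622, and $5873 cut points described next.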
Figure 7.9 shows a histogram of all 400 values of Limit, along with vertical red lines that cut up the data into quartiles, meaning: 25% of credit limits were between $0 and $3088. Let’s call this the “low” credit limit bracket. 25% of credit limits were between $3088 and $4622. Let’s call this the “medium-low” credit limit bracket. 25% of credit limits were between $4622 and $5873. Let’s call this the “medium-high” credit limit bracket. 25% of credit limits were over $5873. Let’s call this the “high” credit limit bracket. Figure 7.9: Histogram of credit limits and quartiles Let’s now display: The scatterplot showing the relationship between credit card balance and income (the right-hand plot of Figure 7.1). The same scatterplot of credit card balance and income, now with a color aesthetic added corresponding to the credit limit bracket. Figure 7.10: Relationship between credit card balance and income for different credit limit brackets In the right-hand plot: Red points (bottom-left) correspond to the low credit limit bracket. Green points correspond to the medium-low credit limit bracket. Blue points correspond to the medium-high credit limit bracket. Purple points (top-right) correspond to the high credit limit bracket. The left-hand plot focuses on the relationship between balance and income in aggregate, while the right-hand plot focuses on the relationship between balance and income broken down by credit limit bracket. Whereas in aggregate there is an overall positive relationship, when broken down we now see that for the low (red points), medium-low (green points), and medium-high (blue points) credit limit bracket groups, the strong positive relationship between credit card balance and income disappears! Only for the high credit limit bracket does the relationship stay somewhat positive. In this example, credit limit is a confounding variable for the relationship between credit card balance and income. 7.4 Conclusion 7.4.1 What’s to come? Congratulations! We’re ready to proceed to the third portion of this book: “statistical inference” using a new package called infer. Once we’ve covered Chapters 8 on sampling, 9 on confidence intervals, and 10 on hypothesis testing, we’ll come back to the models we’ve seen in “data modeling” in Chapter 11 on inference for regression. As we said at the end of Chapter 6, we’ll see why we’ve been conducting the residual analyses from the preceding subsections; we are actually verifying some very important assumptions that must be met for the std_error (standard error), p_value, conf_low, and conf_high (the end-points of the confidence intervals) columns in our regression tables to have a valid interpretation. 7.4.2 Script of R code An R script file of all R code used in this chapter is available here. "], ["8-sampling.html", "8 Sampling 8.1 Terminology 8.2 “In real life” sampling 8.3 Virtual sampling 8.4 Repeated virtual sampling 8.5 Central Limit Theorem 8.6 Conclusion", " 8 Sampling In this chapter we kick off the third segment of this book, statistical inference, by learning about sampling. The concepts behind sampling form the basis of confidence intervals and hypothesis testing, which we’ll cover in Chapters 9 and 10 respectively. We will see that the tools that you learned in the data science segment of this book (data visualization, “tidy” data format, and data wrangling) will also play an important role here in the development of your understanding.
As mentioned before, the concepts throughout this text all build into a culmination allowing you to “think with data.” Needed packages Let’s load all the packages needed for this chapter (this assumes you’ve already installed them). If needed, read Section 2.3 for information on how to install and load R packages. library(dplyr) library(ggplot2) library(moderndive) 8.1 Terminology Before we can start studying sampling, we need to define some terminology. Population: The population is the (usually) large pool of observational units that we are interested in. Population parameter: A population parameter is a numerical quantity of interest about a population, such as a proportion or a mean. Census: An enumeration of every member of a population. Ex: the Decennial United States census. Sample: A sample is a smaller collection of observational units that is selected from the population. We would like to make inferences about the population based on this sample. Sampling: Sampling refers to the process of selecting observations from a population. There are both random and non-random ways this can be done. Representative sampling: A sample is said to be a representative sample if the characteristics of the observational units selected are a good approximation of the characteristics of the original population. Generalizability: Generalizability refers to the largest group about which it makes sense to make inferences from the sample collected. This is directly related to how the sample was selected. Bias: Bias corresponds to a favoring of one group in a population over another group. Or put differently, bias occurs when certain members of a population have a higher chance of being included in a sample than others. Statistic: A statistic is a calculation based on one or more variables measured in the sample. Point estimates/sample statistics: These are statistics, computed based on a sample, that estimate an unknown population parameter. 8.2 “In real life” sampling Consider the following “sampling bowl” consisting of 2400 balls, which are either red, white, or green. We are interested in knowing the proportion of balls in the sampling bowl that are red, but do not wish to manually count the number of balls out of 2400 that are red. In other words, we’re not interested in conducting a census. So instead we attempt to estimate the proportion red by using the sampling “shovel” to extract a sample of size \\(n = 50\\) balls, and count the proportion of these that are red. However, before we extracted a sample using this shovel, we made sure to give the balls a good stir, ensuring we have random sampling. Figure 8.1: Sampling from a sampling bowl We put students to the task of estimating the proportion of balls in the tub that are red, because frankly, we’re too lazy to do so ourselves! Groups of students “in real life” took random samples of size \\(n = 50\\); some groups did so twice (thank you Niko, Sophie, Caitlin, Yaw, and Drew for doing double duty!). In other words, we have 10 samples of size \\(n = 50\\): bowl_samples Table 8.1: In real life: 10 samples of size 50 group red white green n Kathleen and Max 18 32 0 50 Sean, Jack, and CJ 18 32 0 50 X and Judy 22 28 0 50 James and Jacob 21 29 0 50 Hannah and Siya 16 34 0 50 Niko, Sophie, and Caitlin 14 36 0 50 Niko, Sophie, and Caitlin 19 31 0 50 Aleja and Ray 20 30 0 50 Yaw and Drew 16 34 0 50 Yaw and Drew 21 29 0 50 For each sample of size \\(n\\) = 50, what is the sample proportion of balls that are red?
In other words, what are the point estimates \\(\\widehat{p}\\), based on a sample of size \\(n = 50\\), of \\(p\\), the true proportion of balls in the tub that are red? We can compute this using the mutate() function from the dplyr package we studied extensively in Chapter 5: bowl_samples <- bowl_samples %>% mutate(prop_red = red / n) %>% select(group, prop_red) bowl_samples Table 8.2: In real life: 10 sample proportions red based on samples of size 50 group prop_red Kathleen and Max 0.36 Sean, Jack, and CJ 0.36 X and Judy 0.44 James and Jacob 0.42 Hannah and Siya 0.32 Niko, Sophie, and Caitlin 0.28 Niko, Sophie, and Caitlin 0.38 Aleja and Ray 0.40 Yaw and Drew 0.32 Yaw and Drew 0.42 We see that one group got a sample proportion \\(\\widehat{p}\\) as low as 0.28 while another got a sample proportion \\(\\widehat{p}\\) as high as 0.44. Why are these different? Why is there this variation? Because of sampling variability! Sampling is inherently random, so for samples of \\(n = 50\\) balls, we won’t in general get the same number of red balls each time. Let’s visualize this using our data visualization skills that you honed in Chapter 3! Let’s investigate the distribution of these 10 sample proportions red \\(\\widehat{p}\\), each based on a random sample of size \\(n = 50\\), using a histogram, an appropriate visualization since prop_red is numerical: Figure 8.2: In real life: 10 sample proportions red based on 10 samples of size 50 Let’s ask ourselves some questions: Where is the histogram centered? What is the spread of this histogram? Recall from Section 5.4 that the mean and the standard deviation are two summary statistics that would answer these questions: bowl_samples %>% summarize(mean = mean(prop_red), sd = sd(prop_red)) mean sd 0.37 0.052 What you have just unpacked are some very deep and very subtle concepts in statistical inference: The histogram in Figure 8.2 is called the sampling distribution of \\(\\widehat{p}\\) based on samples of size \\(n=50\\). It describes how values of the sample proportion red will vary from sample to sample due to the aforementioned sampling variability. If the sampling is done in an unbiased and random fashion, in other words we made sure to stir the bowl before we sampled, then the sampling distribution will be centered at the true unknown population proportion red, in other words the true proportion of the 2400 balls that are red. In this case, these 10 values of \\(\\widehat{p}\\) are centered at 0.37. The spread of this histogram, as quantified by the standard deviation of 0.052, is called the standard error. It quantifies the variability of our estimates \\(\\widehat{p}\\) of \\(p\\). 8.3 Virtual sampling In the moderndive package, we’ve included a data frame called bowl that actually is a virtual version of the above sampling bowl in Figure 8.1 with all 2400 balls! While we present a snapshot of the first 10 rows of bowl below, you should View() it in RStudio to convince yourselves that bowl is indeed a virtual version of the image above. View(bowl) Table 8.3: First 10 balls in virtual sampling bowl ball_ID color 1 white 2 white 3 white 4 red 5 white 6 white 7 red 8 white 9 red 10 white Note that the balls are not actually marked with numbers; the variable ball_ID is merely used as an identification variable for each row of bowl. Recall our previous discussion on identification variables in Subsection 4.2.2 of the “Data Tidying” Chapter 4. Let’s replicate what the groups of students did above, but virtually.
We are going to now simulate using a computer what our students did by hand in Table 8.1 using the rep_sample_n() function. The rep_sample_n() function takes the following arguments: tbl: a data frame representing the population you wish to infer about. We’ll set this to bowl, since this is the (virtual) population of interest. size: the sample size \\(n\\) in question. We’ll set this to 50, mimicking the number of slots in the sampling “shovel” in the image in Figure 8.1. replace: A logical TRUE/FALSE value indicating whether or not to put each ball back into the bowl after we’ve sampled it. In our case, we’ll set this to FALSE since we are sampling 50 balls at once, not 50 single balls individually. reps: the number of samples of size \\(n =\\) size to extract. We’ll set this to 10, mimicking the data we have in Table 8.1. Let’s apply this function to mimic our situation above and View() the data. The output is rather large, so we won’t display it below. all_samples <- rep_sample_n(bowl, size = 50, reps = 10) View(all_samples) Scrolling through the spreadsheet viewer, you’ll notice The values of replicate (1 through 10) come in bunches of 50, representing the 10 groups of respective samples of size \\(n\\) = 50. The ball_ID identification variable is all over the place, suggesting we really are (virtually) randomly sampling balls. color represents the color of each of the virtually sampled balls. What is the proportion red for each group as denoted by the replicate variable? Again, let’s leverage your data ninja skills from Chapter 5. bowl_samples_virtual <- all_samples %>% mutate(is_red = color == "red") %>% group_by(replicate) %>% summarize(prop_red = mean(is_red)) bowl_samples_virtual Table 8.4: Virtual simulation: 10 sample proportions red based on samples of size 50 replicate prop_red 1 0.34 2 0.36 3 0.40 4 0.38 5 0.36 6 0.30 7 0.36 8 0.38 9 0.26 10 0.46 Compare Tables 8.2 and Table 8.4; they are similar in output format and also the resulting prop_red are similar in values. Let’s plot this using the same histogram code as in Figure 8.2, but switching out bowl_samples for bowl_samples_virtual: Figure 8.3: Virtual simulation: 10 sample proportions red based on 10 samples of size 50 We’ve replicated the sampling distribution, but using simulated random samples, instead of the “in real life” random samples that our students collected in Table 8.1. Let’s compute the center of this histogram and its standard deviation, which has a specific name: the standard error. bowl_samples_virtual %>% summarize(mean = mean(prop_red), sd = sd(prop_red)) mean sd 0.37 0.052 8.4 Repeated virtual sampling Say we were feeling particularly unkind to Yaw and Drew and made them draw not 10 samples of size \\(n = 50\\), but TEN THOUSAND such samples. They would probably be at work for days! This is where computer simulations really come in handy: doing repetitive and boring tasks repeatedly. 
To achieve this virtually, we just use the same code as above but set reps = 10000: # Draw ten thousand samples of size n = 50 all_samples <- rep_sample_n(bowl, size = 50, reps = 10000) # For each sample, as marked by the variable `replicate`, compute the proportion red bowl_samples_virtual <- all_samples %>% mutate(is_red = (color == "red")) %>% group_by(replicate) %>% summarize(prop_red = mean(is_red)) # Plot the histogram ggplot(bowl_samples_virtual, aes(x = prop_red)) + geom_histogram(binwidth = 0.02, color = "white") + labs(x = "Sample proportion red in sample of size n=50", y="Number of samples", title = "Sample proportion red in ten thousand samples of size n=50") Figure 8.4: Virtual simulation: Ten thousand sample proportions red based on ten thousand samples of size 50 This distribution looks an awful lot like the bell-shaped normal distribution. That’s because it is, to a very close approximation, the normal distribution! Let’s compute the center of this sampling distribution and the standard error again: bowl_samples_virtual %>% summarize(mean = mean(prop_red), sd = sd(prop_red)) mean sd 0.37 0.052 Learning check (LC8.1) Repeat the above repeated virtual sampling exercise for 10,000 samples of size \\(n\\) = 100. What do you notice is different about the histogram, i.e. the sampling distribution? (LC8.2) Repeat the above repeated virtual sampling exercise for 10,000 samples of size \\(n\\) = 25. What do you notice is different about the histogram, i.e. the sampling distribution, when compared to the instances when the samples were of size \\(n\\) = 50 and \\(n\\) = 100? (LC8.3) Repeat the above repeated virtual sampling exercise for 10,000 samples of size \\(n\\) = 50, but where the population is the pennies dataset in the moderndive package representing 800 pennies and where the population parameter of interest is the mean year of minting of the 800 pennies. See the help file ?pennies for more information about this dataset. 8.5 Central Limit Theorem What you have just shown in the previous section is a very famous theorem, or mathematically proven truth, called the Central Limit Theorem. It loosely states that when sample means and sample proportions are based on larger and larger samples, the sampling distributions corresponding to these point estimates get: More and more normal. More and more narrow. Shuyi Chiou, Casey Dunn, and Pathikrit Bhattacharyya created the following 3m38s video explaining this theorem, which is crucial to statistics, using as examples (what else?) the average weight of wild bunny rabbits and the average wingspan of dragons! 8.6 Conclusion 8.6.1 What’s to come? This chapter serves as an introduction to the theoretical underpinning of the statistical inference techniques that will be discussed in greater detail in Chapter 9 for confidence intervals and Chapter 10 for hypothesis testing. 8.6.2 Script of R code An R script file of all R code used in this chapter is available here. "], -["9-ci.html", "9 Confidence Intervals 9.1 Sneak peek of infer 9.2 Under construction… 9.3 Bootstrapping 9.4 Relation to hypothesis testing 9.5 Effect size 9.6 Conclusion", " 9 Confidence Intervals Note: This chapter is still under construction. If you would like to contribute, please check us out on GitHub at https://github.com/moderndive/moderndive_book. Please check out our sneak peek of infer below in the meantime. For more details on infer visit https://infer.netlify.com/. 9.1 Sneak peek of infer Question: Of all the cars in the mtcars dataset, do automatic cars get better gas mileage than manual cars?
Approach: 95% confidence interval for difference in means. library(dplyr) library(ggplot2) library(infer) # Clean data mtcars <- mtcars %>% as_tibble() %>% mutate(am = factor(am)) # Simulate sampling distribution of two-sample difference in means: sampling_distribution <- mtcars %>% specify(mpg ~ am) %>% generate(reps = 1000, type = "bootstrap") %>% calculate(stat = "diff in means", order = c("1", "0")) # Compute 95% confidence interval: conf_int <- sampling_distribution %>% pull(stat) %>% quantile(probs = c(0.025, 0.975)) # Visualize: plot <- sampling_distribution %>% visualize() plot + geom_vline(xintercept = conf_int, col = "red", size = 1) 9.2 Under construction… The content here will be deprecated with a shift to using the infer package in the months to come. Definition: Confidence Interval A confidence interval gives a range of plausible values for a parameter. It depends on a specified confidence level with higher confidence levels corresponding to wider confidence intervals and lower confidence levels corresponding to narrower confidence intervals. Common confidence levels include 90%, 95%, and 99%. Usually we don’t just begin chapters with a definition, but confidence intervals are simple to define and play an important role in the sciences and any field that uses data. You can think of a confidence interval as playing the role of a net when fishing. Instead of just trying to catch a fish with a single spear (estimating an unknown parameter by using a single point estimate/statistic), we can use a net to try to provide a range of possible locations for the fish (use a range of possible values based around our statistic to make a plausible guess as to the location of the parameter). Needed packages Let’s load all the packages needed for this chapter (this assumes you’ve already installed them). If needed, read Section 2.3 for information on how to install and load R packages. library(dplyr) library(ggplot2) library(infer) library(mosaic) library(knitr) library(ggplot2movies) 9.3 Bootstrapping Just as we did in Chapter 10 with the Lady Tasting Tea when making hypotheses about a population total with which we would like to test which one is more plausible, we can also use computation to infer conclusions about a population quantitative statistic such as the mean. In this case, we will focus on constructing confidence intervals to produce plausible values for a population mean. (We can do a similar analysis for a population median or other summary measure as well.) Traditionally, the way to construct confidence intervals for a mean is to assume a normal distribution for the population or to invoke the Central Limit Theorem and get, what often appears to be magic, results. (This is similar to what was done in Section 10.10.) These methods are often not intuitive, especially for those that lack a strong mathematical background. They also come with their fair share of assumptions and often turn Statistics, a field that is full of tons of useful applications to many different fields and disciplines, into a robotic procedural-based topic. It doesn’t have to be that way! In this section, we will introduce the concept of bootstrapping. It will be a useful tool that will allow us to estimate the variability of our statistic from sample to sample. One neat feature of bootstrapping is that it enables us to approximate the sampling distribution and estimate the distribution’s standard deviation using ONLY the information in the one selected (original) sample. 
It sounds just as plagued with the magical type qualities of traditional theory-based inference on initial glance but we will see that it provides an intuitive and useful way to make inferences, especially when the samples are of medium to large size. To introduce the concept of bootstrapping, we again will use the movies dataset in the ggplot2movies data frame. Recall that you can also glance at this data frame using the View function and look at the help documentation for movies using the ? function. We will explore many other features of this dataset in the chapters to come, but here we will be focusing on the rating variable corresponding to the average IMDB user rating. You may notice that this dataset is quite large: 58,788 movies have data collected about them here. This will correspond to our population of ALL movies. Remember from Chapter 8 that our population is rarely known. We use this dataset as our population here to show you the power of bootstrapping in estimating population parameters. We’ll see how confidence intervals built using the bootstrap distribution perform at including our population parameter of interest. Here we can actually calculate these values since our population is known, but remember that in general this isn’t the case. Let’s take a look at what the distribution of our population ratings looks like. We’ll see that we will use the distribution of our sample(s) as an estimate of this population histogram. movies %>% ggplot(aes(x = rating)) + geom_histogram(color = "white", bins = 20) Figure 9.1: Population ratings histogram Learning check (LC9.1) Why was a histogram chosen as the plot to make for the rating variable above? (LC9.2) What does the shape of the rating histogram tell us about how IMDB users rate movies? What stands out about the plot? It’s important to think about what our goal is here. We would like to produce a confidence interval for the population mean rating. We will have to pretend for a moment that we don’t have all 58,788 movies. Let’s say that we only have a random sample of 50 movies from this dataset instead. In order to get a random sample, we can use the resample function in the mosaic package with replace = FALSE. We could also use the sample_n function from dplyr. set.seed(2017) movies_sample <- movies %>% sample_n(50) The sample_n function has filtered the data frame movies “at random” to choose only 50 rows from the larger movies data frame. We store information on these 50 movies in the movies_sample data frame. Let’s now explore what the rating variable looks like for these 50 movies: ggplot(data = movies_sample, aes(x = rating)) + geom_histogram(color = "white", bins = 20) Figure 9.2: Sample ratings histogram Remember that we can think of this histogram as an estimate of our population distribution histogram that we saw above. We are interested in the population mean rating and trying to find a range of plausible values for that value. A good start in guessing the population mean is to use the mean of our sample rating from the movies_sample data: (movies_sample_mean <- movies_sample %>% summarize(mean = mean(rating))) # A tibble: 1 x 1 mean <dbl> 1 5.89 Note the use of the ( ) at the beginning and the end of this creation of the movies_sample_mean object. If you’d like to print out your newly created object, you can enclose it in the parentheses as we have here. This value of 5.894 is just one guess at the population mean. 
The idea behind bootstrapping is to sample with replacement from the original sample to create new resamples of the same size as our original sample. Returning to our example, let’s investigate what one such resample of the movies_sample dataset accomplishes. We can create one resample/bootstrap sample by using the resample function in the mosaic package. boot1 <- resample(movies_sample) %>% arrange(orig.id) The important thing to note here is the far-right column, called orig.id, which records the original row numbers from the movies_sample data frame. Since we are sampling with replacement, there is a strong likelihood that some of the 50 observational units are going to be selected more than once. You may be asking yourself what this means and how it leads us to creating a distribution for the sample mean. Recall that the original sample mean of our data was calculated using the summarize function above. Learning check (LC9.3) What happens if we change the seed of our pseudo-random number generation? Try it above where we used resample and describe the resulting resample of movies_sample. (LC9.4) Why is sampling at random from the movies data frame important? Why don’t we just pick Action movies and do bootstrapping with this Action movies subset? (LC9.5) What was the purpose of assuming we didn’t have access to the full movies dataset here? Earlier we calculated a mean of 5.894 in our original sample. Let’s calculate the mean of ratings in our bootstrapped sample: (movies_boot1_mean <- boot1 %>% summarize(mean = mean(rating))) # A tibble: 1 x 1 mean <dbl> 1 5.69 More than likely the calculated bootstrap sample mean is different from the original sample mean. This is what was meant earlier by the sample means having some variability. What we are trying to do is replicate many different samples being taken from a larger population. Our best guess at what the population looks like is multiple copies of the sample we collected. We can then sample from that larger “created” population by generating bootstrap samples. Similar to what we did in the previous section, we can repeat this process using the do function followed by an asterisk. Let’s look at 10 different bootstrap means for ratings from movies_sample. Note the use of the resample function here. do(10) * (resample(movies_sample) %>% summarize(mean = mean(rating))) mean 1 5.94 2 5.57 3 5.83 4 6.29 5 6.03 6 5.92 7 6.00 8 5.85 9 6.10 10 5.61 You should see some variability begin to tease its way out here. Many of the simulated means will be close to our original sample mean but many will stray pretty far away. This occurs because outliers may have been selected a couple of times in the resampling, or small values were selected more often than larger ones. There are myriad reasons why this might be the case. So what’s the next step now? Just as we repeated the process thousands of times with the “Lady Tasting Tea” example, we can do a similar thing here: trials <- do(5000) * summarize(resample(movies_sample), mean = mean(rating)) ggplot(data = trials, mapping = aes(x = mean)) + geom_histogram(bins = 30, color = "white") Figure 9.3: Bootstrapped means histogram The shape of this resulting distribution may look familiar to you. It resembles the well-known normal (bell-shaped) curve. At this point, we can easily calculate a confidence interval. In fact, we have a couple of different options. We will first use the percentiles of the distribution we just created to isolate the middle 95% of values.
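Before using the confint() shortcut shown next, here is a sketch of that percentile idea done directly with base R’s quantile() function. This is not code from the original text; it assumes the trials data frame created above, with its mean column of 5000 bootstrapped sample means.
# Isolate the middle 95% of the bootstrapped sample means
quantile(trials$mean, probs = c(0.025, 0.975))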
This will correspond to our 95% confidence interval for the population mean rating, denoted by \\(\\mu\\). (ciq_mean_rating <- confint(trials, level = 0.95, method = "quantile")) name lower upper level method estimate 1 mean 5.46 6.3 0.95 percentile 5.89 It’s always important at this point to interpret the results of this confidence interval calculation. In this context, we can say something like the following: Based on the sample data and bootstrapping techniques, we can be 95% confident that the true mean rating of ALL IMDB ratings is between 5.46 and 6.3. This statement may seem a little confusing to you. Another way to think about this is that this confidence interval was constructed using the sample data by a procedure that is 95% reliable in that of 100 generated confidence intervals based on 100 different random samples, we expect on average that 95 of them will capture the true unknown parameter. This also means that we will get invalid results 5% of the time. Just as we had a trade-off with \\(\\alpha\\) and \\(\\beta\\) with hypothesis tests, we have a similar trade-off here with setting the confidence level. To further reiterate this point, the graphic below from Diez, Barr, and Çetinkaya-Rundel (2014) shows us that if we repeated a confidence interval process 25 times with 25 different samples, we would expect about 95% of them to actually contain the population parameter of interest. This parameter is marked with a dotted vertical line. We can see that only one confidence interval does not overlap with this value. (The one marked in red.) Therefore 24 in 25 (96%), which is quite close to our 95% reliability, do include the population parameter. Figure 9.4: Confidence interval coverage plot from OpenIntro Remember that we are pretending like we don’t know what the mean IMDB rating for ALL movies is. Our population here is all of the movies listed in the movies data frame from ggplot2movies. So does our bootstrapped confidence interval here contain the actual mean value? movies %>% summarize(mean_rating = mean(rating)) # A tibble: 1 x 1 mean_rating <dbl> 1 5.93 We see here that the population mean does fall in our range of plausible values generated from the bootstrapped samples. We can also get an idea of how the theory-based inference techniques would have approximated this confidence interval by using the formula \\[\\bar{x} \\pm (2 * SE),\\] where \\(\\bar{x}\\) is our original sample mean and \\(SE\\) stands for standard error and corresponds to the standard deviation of the bootstrap distribution. The value of 2 here corresponds to it being a 95% confidence interval. (95% of the values in a normal distribution fall within 2 standard deviations of the mean.) This formula assumes that the bootstrap distribution is symmetric and bell-shaped. This is often the case with bootstrap distributions, especially those in which the original distribution of the sample is not highly skewed. Definition: standard error The standard error is the standard deviation of the sampling distribution. The sampling distribution may be approximated by the bootstrap distribution or the null distribution depending on the context. Traditional theory-based methodologies for inference also have formulas for standard errors, assuming some conditions are met. To compute this type of confidence interval, we only need to make a slight modification to the confint function seen above. (The expression after the \\(\\pm\\) sign is known as the margin of error.) 
(cise_mean_rating <- confint(trials, level = 0.95, method = "stderr")) name lower upper level method estimate margin.of.error 1 mean 5.47 6.32 0.95 stderr 5.89 0.425 Based on the sample data and bootstrapping techniques, we can be 95% confident that the true mean rating of ALL IMDB ratings is between 5.467 and 6.316. Learning check (LC9.6) Reproduce the bootstrapping above using a sample of size 50 instead of 25. What changes do you see? (LC9.7) Reproduce the bootstrapping above using a sample of size 5 instead of 25. What changes do you see? (LC9.8) How does the sample size affect the analysis above? (LC9.9) Why must bootstrap samples be the same size as the original sample? 9.3.1 Review of bootstrapping We can summarize the process to generate a bootstrap distribution here in a series of steps that clearly identify the terminology we will use (R. Lock et al. 2012). Generate bootstrap samples by sampling with replacement from the original sample, using the same sample size. Compute the statistic of interest, called a bootstrap statistic, for each of the bootstrap samples. Collect the statistics for many bootstrap samples to create a bootstrap distribution. Visually, we can represent this process in the following diagram. Figure 9.5: Bootstrapping diagram from Lock5 textbook 9.4 Relation to hypothesis testing Recall that we found a statistically significant difference in the sample mean of romance movie ratings compared to the sample mean of action movie ratings. We concluded Chapter 10 by attempting to understand just how much greater we could expect the population mean romance movie rating to be compared to the population mean action movie rating. In order to do so, we will calculate a confidence interval for the difference \\(\\mu_r - \\mu_a\\). We’ll then go back to our population parameter values and see if our confidence interval contains our parameter value. We could use bootstrapping in a way similar to that done above, except now on a difference in sample means, to create a distribution and then use the confint function with the option of quantile to determine a confidence interval for the plausible values of the difference in population means. This is an excellent programming activity and the reader is urged to try to do so. Recall what the randomization/null distribution looked like for our simulated shuffled sample means: Note all this code was moved over from hypothesis testing (movies_trimmed <- movies %>% select(title, year, rating, Action, Romance)) # A tibble: 58,788 x 5 title year rating Action Romance <chr> <int> <dbl> <int> <int> 1 $ 1971 6.40 0 0 2 $1000 a Touchdown 1939 6.00 0 0 3 $21 a Day Once a Month 1941 8.20 0 0 4 $40,000 1996 8.20 0 0 5 $50,000 Climax Show, The 1975 3.40 0 0 6 $pent 2000 4.30 0 0 7 $windle 2002 5.30 1 0 8 '15' 2002 6.70 0 0 9 '38 1987 6.60 0 0 10 '49-'17 1917 6.00 0 0 # ... 
with 58,778 more rows movies_trimmed <- movies_trimmed %>% filter(!(Action == 1 & Romance == 1)) movies_trimmed <- movies_trimmed %>% mutate(genre = ifelse(Action == 1, "Action", ifelse(Romance == 1, "Romance", "Neither"))) %>% filter(genre != "Neither") %>% select(-Action, -Romance) set.seed(2017) movies_genre_sample <- movies_trimmed %>% group_by(genre) %>% sample_n(34) %>% ungroup() mean_ratings <- movies_genre_sample %>% group_by(genre) %>% summarize(mean = mean(rating)) obs_diff <- diff(mean_ratings$mean) shuffled_ratings <- #movies_trimmed %>% movies_genre_sample %>% mutate(genre = shuffle(genre)) %>% group_by(genre) %>% summarize(mean = mean(rating)) diff(shuffled_ratings$mean) [1] -0.132 set.seed(2017) many_shuffles <- do(5000) * (movies_genre_sample %>% mutate(genre = shuffle(genre)) %>% group_by(genre) %>% summarize(mean = mean(rating)) ) rand_distn <- many_shuffles %>% group_by(.index) %>% summarize(diffmean = diff(mean)) head(rand_distn, 10) # A tibble: 10 x 2 .index diffmean <dbl> <dbl> 1 1.00 -0.132 2 2.00 -0.197 3 3.00 -0.0265 4 4.00 0.715 5 5.00 -0.474 6 6.00 -0.121 7 7.00 -0.174 8 8.00 -0.209 9 9.00 -0.00882 10 10.0 -0.332 ggplot(data = rand_distn, mapping = aes(x = diffmean)) + geom_histogram(color = "white", bins = 20) Figure 9.6: Simulated shuffled sample means histogram With this null distribution being quite symmetric and bell-shaped, the standard error method introduced above likely provides a good estimate of a range of plausible values for \\(\\mu_r - \\mu_a\\). Another nice option here is that we can use the standard deviation of the null/randomization distribution we just found with our hypothesis test. (std_err <- rand_distn %>% summarize(se = sd(diffmean))) # A tibble: 1 x 1 se <dbl> 1 0.340 We can use the general formula of \\(statistic \\pm (2 * SE)\\) for a confidence interval to obtain the following result for plausible values of the difference in population means at the 95% level. (lower <- obs_diff - (2 * std_err)) se 1 0.269 (upper <- obs_diff + (2 * std_err)) se 1 1.63 We can, therefore, say that we are 95% confident that the population mean rating for romance movies is between 0.269 and 1.631 points higher than for that of action movies. The important thing to check here is whether 0 is contained in the confidence interval. If it is, it is plausible that the difference in the two population means between the two groups is 0. This means that the null hypothesis is plausible. The results of the hypothesis test and the confidence interval should match as they do here. We rejected the null hypothesis with hypothesis testing and we have evidence here that the mean rating for romance movies is higher than for action movies. 9.5 Effect size The phrase effect size has been thrown around recently as an alternative to \\(p\\)-values. In combination with the confidence interval, it can be often more valuable than just looking at the results of a hypothesis test. It depends on the scientific discipline exactly what is meant by “effect size” but, in general, it refers to the magnitude of the difference between group measurements. For our two sample problem involving movies, it is the observed difference in sample means obs_diff. It’s worthy of mention here that confidence intervals are always centered at the observed statistic. In other words, if you are looking at a confidence interval and someone asks you what the “effect size” is you can simply find the midpoint of the stated confidence interval. 
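As a quick numerical illustration of this last point (a sketch based on the interval computed above, not part of the original text): because the standard-error-based interval is built as statistic ± (2 * SE), its midpoint recovers the observed statistic.
# Midpoint of the interval (0.269, 1.631) computed above
(0.269 + 1.631) / 2
# = 0.95, which matches obs_diff, the observed difference in sample means, up to rounding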
Learning check (LC9.10) Check to see whether the difference in population mean ratings for the two genres falls in the confidence interval we found here. Are we guaranteed that it will fall in the range of plausible values? (LC9.11) Why do you think many scientific fields are shifting to preferring inclusion of confidence intervals in articles over just \\(p\\)-values and hypothesis tests? (LC9.12) Why is 95% related to a value of 2 in the margin of error? What would approximate values be for 90% and for 99%? (LC9.13) Why is a 95% confidence interval wider than a 90% confidence interval? Explain by using a concrete example from everyday life about what is meant by “confidence.” (LC9.14) How would confidence intervals correspond to one-sided hypothesis tests? (LC9.15) There is a relationship between the significance level and the confidence level. What do you think it is? (LC9.16) The moment the phrase “standard error” is mentioned, there seems to be someone who says “The standard error is \\(s\\) divided by the square root of \\(n\\).” This standard error formula is used in the theory-based procedure for an inference on one mean. But… does it always work? For samp1, samp2, and samp3 below, do the following: produce a bootstrap distribution based on the sample; calculate the standard deviation of the bootstrap distribution; and compare this value of the standard error to what you obtain when you calculate the standard deviation of the sample \\(s\\) divided by \\(\\sqrt{n}\\). df1 <- data_frame(samp1 = rexp(50)) df2 <- data_frame(samp2 = rnorm(100)) df3 <- data_frame(samp3 = rbeta(20, 5, 5)) Describe how \\(s / \\sqrt{n}\\) does in approximating the standard error for these three samples and their corresponding bootstrap distributions. 9.6 Conclusion 9.6.1 What’s to come? This concludes the Inference unit of this book. You should now have a thorough introduction to topics in both data science and statistics. In the last chapter of the textbook, we’ll summarize the purpose of this book as well as present an excellent example of what goes into making an effective story via data. 9.6.2 Script of R code An R script file of all R code used in this chapter is available here. "], -["10-hypo.html", "10 Hypothesis Testing 10.1 Sneak peek of infer 10.2 Under construction… 10.3 When inference is not needed 10.4 Basics of hypothesis testing 10.5 Criminal trial analogy 10.6 Types of errors in hypothesis testing 10.7 Statistical significance 10.8 Example: Revisiting the Lady Tasting Tea 10.9 Example: Comparing two means 10.10 Building theory-based methods using computation 10.11 Resampling-based inference for regression 10.12 Theory-based inference for regression 10.13 Conclusion", " 10 Hypothesis Testing Note: This chapter is still under construction. If you would like to contribute, please check us out on GitHub at https://github.com/moderndive/moderndive_book. Please check out our sneak peek of infer below in the meantime. For more details on infer visit https://infer.netlify.com/. 10.1 Sneak peek of infer Question: Of all the cars in the mtcars dataset, do automatic cars get better gas mileage than manual cars? Approach: Two-sample test for difference in means.
library(dplyr) library(ggplot2) library(infer) # Clean data mtcars <- mtcars %>% as_tibble() %>% mutate(am = factor(am)) # Observed test statistic obs_stat <- mtcars %>% group_by(am) %>% summarize(mean = mean(mpg)) %>% summarize(obs_stat = diff(mean)) %>% pull(obs_stat) # Simulate null distribution of two-sample difference in means: null_distribution <- mtcars %>% specify(mpg ~ am) %>% hypothesize(null = "independence") %>% generate(reps = 1000, type = "permute") %>% calculate(stat = "diff in means", order = c("1", "0")) # Visualize: plot <- null_distribution %>% visualize() plot + geom_vline(xintercept = obs_stat, col = "red", size = 1) 10.2 Under construction… The content here will be deprecated with a shift to using the infer package in the months to come. We saw some of the main concepts of hypothesis testing introduced in Chapter 8. We will expand further on these ideas here and also provide a framework for understanding hypothesis tests in general. Instead of presenting you with lots of different formulas and scenarios, we hope to build a way to think about all hypothesis tests. You can then adapt to different scenarios as needed down the road when you encounter different statistical situations. The same can be said for confidence intervals. There is one general framework that applies to all confidence intervals and we will elaborate on this further in Chapter 9. The specifics may change slightly for each variation, but the important idea is to understand the general framework so that you can apply it to more specific problems. We believe that this approach is much better in the long-term than teaching you specific tests and confidence intervals rigorously. You can find fully-worked out examples for five common hypothesis tests and their corresponding confidence intervals in Appendix B. We recommend that you carefully review these examples as they also cover how the general frameworks apply to traditional normal-based methodologies like the \\(t\\)-test and normal-theory confidence intervals. You’ll see there that these methods are just approximations for the general computational frameworks, but require conditions to be met for their results to be valid. The general frameworks using randomization, simulation, and bootstrapping do not hold the same sorts of restrictions and further advance computational thinking, which is one big reason for their emphasis throughout this textbook. Needed packages Let’s load all the packages needed for this chapter (this assumes you’ve already installed them). If needed, read Section 2.3 for information on how to install and load R packages. library(dplyr) library(ggplot2) library(mosaic) library(knitr) library(nycflights13) library(ggplot2movies) library(broom) 10.3 When inference is not needed Before we delve into the two techniques of inference (hypothesis testing and confidence intervals), it’s good to remember that there are cases where you need not perform a rigorous statistical inference. An important and time-saving skill is to ALWAYS do exploratory data analysis using dplyr and ggplot2 before thinking about running a hypothesis test. Let’s look at such an example selecting a sample of flights traveling to Boston and to San Francisco from New York City in the flights data frame in the nycflights13 package. (We will remove flights with missing data first using na.omit and then sample 100 flights going to each of the two airports.) 
bos_sfo <- flights %>% na.omit() %>% filter(dest %in% c("BOS", "SFO")) %>% group_by(dest) %>% sample_n(100) Suppose we were interested in seeing if the air_time to SFO in San Francisco was statistically greater than the air_time to BOS in Boston. As suggested, let’s begin with some exploratory data analysis to get a sense for how the two variables of air_time and dest relate for these two destination airports: bos_sfo_summary <- bos_sfo %>% group_by(dest) %>% summarize(mean_time = mean(air_time), sd_time = sd(air_time)) kable(bos_sfo_summary) dest mean_time sd_time BOS 38.7 3.91 SFO 346.2 16.71 Looking at these results, we can clearly see that SFO air_time is much larger than BOS air_time. The standard deviation is also extremely informative here. Learning check (LC10.1) Could we make the same type of immediate conclusion that SFO had a statistically greater air_time if, say, its corresponding standard deviation was 200 minutes? What about 100 minutes? Explain. To further understand just how different the air_time variable is for BOS and SFO, let’s look at a boxplot: ggplot(data = bos_sfo, mapping = aes(x = dest, y = air_time)) + geom_boxplot() Since there is no overlap at all, we can conclude that the air_time for San Francisco flights is statistically greater (at any level of significance) than the air_time for Boston flights. This is a clear example of not needing to do anything more than some simple descriptive statistics to get an appropriate inferential conclusion. This is one reason why you should ALWAYS investigate the sample data first using dplyr and ggplot2 via exploratory data analysis. As you get more and more practice with hypothesis testing, you’ll be better able to determine in many cases whether or not the results will be statistically significant. There are circumstances where it is difficult to tell, but you should always try to make a guess FIRST about significance after you have completed your data exploration and before you actually begin the inferential techniques. 10.4 Basics of hypothesis testing In a hypothesis test, we will use data from a sample to help us decide between two competing hypotheses about a population. We make these hypotheses more concrete by specifying them in terms of at least one population parameter of interest. We refer to the competing claims about the population as the null hypothesis, denoted by \\(H_0\\), and the alternative (or research) hypothesis, denoted by \\(H_a\\). The roles of these two hypotheses are NOT interchangeable. The claim for which we seek significant evidence is assigned to the alternative hypothesis. The alternative is usually what the experimenter or researcher wants to establish or find evidence for. Usually, the null hypothesis is a claim that there really is “no effect” or “no difference.” In many cases, the null hypothesis represents the status quo or that nothing interesting is happening. We assess the strength of evidence by assuming the null hypothesis is true and determining how unlikely it would be to see sample results/statistics as extreme (or more extreme) as those in the original sample. Hypothesis testing brings about many weird and incorrect notions in the scientific community and society at large. One reason for this is that statistics has traditionally been thought of as this magic box of algorithms and procedures to get to results and this has been readily apparent if you do a Google search of “flowchart statistics hypothesis tests”. 
There are so many different complex ways to determine which test is appropriate. You’ll see that we don’t need to rely on this complicated series of assumptions and procedures to conduct a hypothesis test any longer. These methods were introduced in a time when computers weren’t powerful. Your cellphone (in 2016) has more power than the computers that sent NASA astronauts to the moon after all. We’ll see that ALL hypothesis tests can be broken down into the following framework given by Allen Downey here: Figure 10.1: Hypothesis Testing Framework Before we hop into this framework, we will provide another way to think about hypothesis testing that may be useful. 10.5 Criminal trial analogy We can think of hypothesis testing in the same context as a criminal trial in the United States. A criminal trial in the United States is a familiar situation in which a choice between two contradictory claims must be made. The person accused of the crime must be judged either guilty or not guilty. Under the U.S. system of justice, the individual on trial is initially presumed not guilty. Only STRONG EVIDENCE to the contrary causes the not guilty claim to be rejected in favor of a guilty verdict. The phrase “beyond a reasonable doubt” is often used to set the cutoff value for when enough evidence has been given to convict. Theoretically, we should never say “The person is innocent.” but instead “There is not sufficient evidence to show that the person is guilty.” Now let’s compare that to how we look at a hypothesis test. The decision about the population parameter(s) must be judged to follow one of two hypotheses. We initially assume that \\(H_0\\) is true. The null hypothesis \\(H_0\\) will be rejected (in favor of \\(H_a\\)) only if the sample evidence strongly suggests that \\(H_0\\) is false. If the sample does not provide such evidence, \\(H_0\\) will not be rejected. The analogy to “beyond a reasonable doubt” in hypothesis testing is what is known as the significance level. This will be set before conducting the hypothesis test and is denoted as \\(\\alpha\\). Common values for \\(\\alpha\\) are 0.1, 0.01, and 0.05. 10.5.1 Two possible conclusions Therefore, we have two possible conclusions with hypothesis testing: Reject \\(H_0\\) Fail to reject \\(H_0\\) Gut instinct says that “Fail to reject \\(H_0\\)” should say “Accept \\(H_0\\)” but this technically is not correct. Accepting \\(H_0\\) is the same as saying that a person is innocent. We cannot show that a person is innocent; we can only say that there was not enough substantial evidence to find the person guilty. When you run a hypothesis test, you are the jury of the trial. You decide whether there is enough evidence to convince yourself that \\(H_a\\) is true (“the person is guilty”) or that there was not enough evidence to convince yourself \\(H_a\\) is true (“the person is not guilty”). You must convince yourself (using statistical arguments) which hypothesis is the correct one given the sample information. Important note: Therefore, DO NOT WRITE “Accept \\(H_0\\)” any time you conduct a hypothesis test. Instead write “Fail to reject \\(H_0\\).” 10.6 Types of errors in hypothesis testing Unfortunately, just as a jury or a judge can make an incorrect decision in regard to a criminal trial by reaching the wrong verdict, there is some chance we will reach the wrong conclusion via a hypothesis test about a population parameter.
As with criminal trials, this comes from the fact that we don’t have complete information, but rather a sample from which to try to infer about a population. The possible erroneous conclusions in a criminal trial are an innocent person is convicted (found guilty) or a guilty person is set free (found not guilty). The possible errors in a hypothesis test are rejecting \\(H_0\\) when in fact \\(H_0\\) is true (Type I Error) or failing to reject \\(H_0\\) when in fact \\(H_0\\) is false (Type II Error). The risk of error is the price researchers pay for basing an inference about a population on a sample. With any reasonable sample-based procedure, there is some chance that a Type I error will be made and some chance that a Type II error will occur. To help understand the concepts of Type I error and Type II error, observe the following table: Figure 10.2: Type I and Type II errors If we are using sample data to make inferences about a parameter, we run the risk of making a mistake. Obviously, we want to minimize our chance of error; we want a small probability of drawing an incorrect conclusion. The probability of a Type I Error occurring is denoted by \\(\\alpha\\) and is called the significance level of a hypothesis test The probability of a Type II Error is denoted by \\(\\beta\\). Formally, we can define \\(\\alpha\\) and \\(\\beta\\) in regards to the table above, but for hypothesis tests instead of a criminal trial. \\(\\alpha\\) corresponds to the probability of rejecting \\(H_0\\) when, in fact, \\(H_0\\) is true. \\(\\beta\\) corresponds to the probability of failing to reject \\(H_0\\) when, in fact, \\(H_0\\) is false. Ideally, we want \\(\\alpha = 0\\) and \\(\\beta = 0\\), meaning that the chance of making an error does not exist. When we have to use incomplete information (sample data), it is not possible to have both \\(\\alpha = 0\\) and \\(\\beta = 0\\). We will always have the possibility of at least one error existing when we use sample data. Usually, what is done is that \\(\\alpha\\) is set before the hypothesis test is conducted and then the evidence is judged against that significance level. Common values for \\(\\alpha\\) are 0.05, 0.01, and 0.10. If \\(\\alpha = 0.05\\), we are using a testing procedure that, used over and over with different samples, rejects a TRUE null hypothesis five percent of the time. So if we can set \\(\\alpha\\) to be whatever we want, why choose 0.05 instead of 0.01 or even better 0.0000000000000001? Well, a small \\(\\alpha\\) means the test procedure requires the evidence against \\(H_0\\) to be very strong before we can reject \\(H_0\\). This means we will almost never reject \\(H_0\\) if \\(\\alpha\\) is very small. If we almost never reject \\(H_0\\), the probability of a Type II Error – failing to reject \\(H_0\\) when we should – will increase! Thus, as \\(\\alpha\\) decreases, \\(\\beta\\) increases and as \\(\\alpha\\) increases, \\(\\beta\\) decreases. We, therefore, need to strike a balance in \\(\\alpha\\) and \\(\\beta\\) and the common values for \\(\\alpha\\) of 0.05, 0.01, and 0.10 usually lead to a nice balance. Learning check (LC10.2) Reproduce the table above about errors, but for a hypothesis test, instead of the one provided for a criminal trial. 10.6.1 Logic of hypothesis testing Take a random sample (or samples) from a population (or multiple populations) If the sample data are consistent with the null hypothesis, do not reject the null hypothesis. 
If the sample data are inconsistent with the null hypothesis (in the direction of the alternative hypothesis), reject the null hypothesis and conclude that there is evidence the alternative hypothesis is true (based on the particular sample collected). 10.7 Statistical significance The idea that sample results are more extreme than we would reasonably expect to see by random chance if the null hypothesis were true is the fundamental idea behind statistical hypothesis tests. If data at least as extreme would be very unlikely if the null hypothesis were true, we say the data are statistically significant. Statistically significant data provide convincing evidence against the null hypothesis in favor of the alternative, and allow us to generalize our sample results to the claim about the population. Learning check (LC10.3) What is wrong about saying “The defendant is innocent.” based on the US system of criminal trials? (LC10.4) What is the purpose of hypothesis testing? (LC10.5) What are some flaws with hypothesis testing? How could we alleviate them? 10.8 Example: Revisiting the Lady Tasting Tea Recall the “There is Only One Test” diagram from earlier: Figure 10.3: Hypothesis Testing Framework We will now walk through how each of the steps to the diagram apply to determining whether the lady tasting tea was actually better than chance at determining whether or not milk was added first. We will see that the process of creating a null distribution is a statistical way to quantifying surprise. 10.8.1 Data Let’s assume as we did in Chapter 8 that the lady is correct in determining whether milk was added first or not in 9 out of 10 trials. Our data, therefore, may look something like x Correct Correct Correct Incorrect Correct Correct Correct Correct Correct Correct 10.8.2 Test statistic \\(\\delta\\) We are interested in the number of Correct out of our 10 trials. We can denote this number of successes using the symbol \\(t\\), where \\(t\\) corresponds to total. This is our test statistic \\(\\delta\\) in this case. 10.8.3 Observed effect \\(\\delta^*\\) The actual observed value of the test statistic from our observed sample is \\(\\hat{t}_{obs} = 9\\). Thus, \\(\\delta^* = 9\\). 10.8.4 Model of \\(H_0\\) Our null hypothesis is that the lady is only as good as chance at guessing correctly. Hypotheses always correspond to parameters and are denoted with Greek letters. Thus, symbolically, we have \\(H_0: \\tau = 5\\). Since we are assuming chance and we have 10 flips with 0.5 probability of success of each flip, we have \\(\\tau = 10 \\times 0.5 = 5\\). 10.8.5 Simulated data We now want to use this null hypothesis to simulate the test statistic assuming that the null hypothesis is true. Therefore, we want to figure out a way to simulate 10 trials, getting either the choice Correct or Incorrect, assuming that the probability of success (getting it Correct) in any given trial is 0.5. Tactile simulation When you are presented with a hypothesis testing problem, frequently the most challenging portion is setting up how to simulate the data assuming the null hypothesis is true. To facilitate with this, setting up a tactile, hands on experiment can help. In this case, flipping a fair coin is a great way to simulate this process. This simulates how the sample could be collected assuming the null hypothesis is true. To simulate 10 trials, we could flip the fair coin and record Heads as Correct and Tails as Incorrect. Some simulated data using this coin flipping procedure may look like the following. 
Note that this data frame is not tidy, but is a convenient way to look at the results of the simulation in this wide format. The numbers on the far left correspond to the number of the trial. Table 10.1: A table of three sets of 10 coin flips sample1 sample2 sample3 1 Correct Correct Correct 2 Correct Incorrect Incorrect 3 Incorrect Incorrect Correct 4 Incorrect Incorrect Correct 5 Correct Incorrect Incorrect 6 Correct Incorrect Correct 7 Incorrect Incorrect Correct 8 Incorrect Correct Incorrect 9 Incorrect Correct Incorrect 10 Incorrect Correct Incorrect We then use the formula for the Test Statistic to determine the simulated test statistic for each of these simulated samples. So in this case we have \\(t_1 = 4\\), \\(t_2 = 4\\), \\(t_3 = 5\\) 10.8.6 Distribution of \\(\\delta\\) under \\(H_0\\) We could continue this process, say, 5000 times by flipping a coin in sets of 10 for 5000 repetitions and counting and taking note of how many heads out of 10 we have for each set. It’s at this point that you surely realize that a computer can do this procedure much faster and more efficient than the tactile experiment with a coin. Recall that we’ve already created the distribution of 5000 such coin flips and we’ve stored these values in the heads variable in the simGuesses data frame: simGuesses <- do(5000) * rflip(10) ggplot(data = simGuesses, aes(x = factor(heads))) + geom_bar() 10.8.7 The p-value Definition: \\(p\\)-value: The p-value is the probability of observing a sample statistic as extreme or more extreme than what was observed, assuming that the null hypothesis of a by chance operation is true. This definition may be a little intimidating the first time you read it, but it’s important to come back to this “The Lady Tasting Tea” problem whenever you encounter \\(p\\)-values as you begin to learn about the concept. Here the \\(p\\)-value corresponds to how many times in our null distribution of heads 9 or more heads occurred. We can use another neat feature of R to calculate the \\(p\\)-value for this problem. Note that “more extreme” in this case corresponds to looking at values of 9 or greater since our alternative hypothesis invokes a right-tail test corresponding to a “greater than” hypothesis of \\(H_a: \\tau > 5\\). In other words, we are looking to see how likely it is for the lady to pick 9 or more correct instead of 9 or less correct. We’d like to go in the right direction. pvalue_tea <- simGuesses %>% filter(heads >= 9) %>% nrow() / nrow(simGuesses) Let’s walk through each step of this calculation: First, pvalue_tea will be the name of our calculated \\(p\\)-value and the assignment operator <- directs us to this naming. We are working with the simGuesses data frame here so that comes immediately before the pipe operator. We would like to only focus on the rows in our simGuesses data frame that have heads values of 9 or 10. This represents simulated statistics “as extreme or more extreme” than what we observed (9 correct guesses out of 10). To get a glimpse of what we have up to this point, run simGuesses %>% filter(heads >= 9) %>% View(). Now that we have changed the focus to only those rows that have number of heads out of 10 flips corresponding to 9 or more, we count how many of those there are. The function nrow gives how many entries are in this filtered data frame and lastly we calculate the proportion that are at least as extreme as our observed value of 9 by dividing by the number of total simulations (5,000). 
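Before moving on, it can be reassuring to check this simulation-based calculation against the exact binomial probability of seeing 9 or more heads in 10 flips of a fair coin. This small sketch assumes only base R and the simGuesses data frame created above: mean(simGuesses$heads >= 9) # proportion of the 5000 simulations with 9 or more heads
pbinom(8, size = 10, prob = 0.5, lower.tail = FALSE) # exact P(9 or more heads) = 11/1024, roughly 0.011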
We can see that the observed statistic of 9 correct guesses is not a likely outcome assuming the null hypothesis is true. Only around 1% of the outcomes in our 5000 simulations fall at or above 9 successes. We have evidence supporting the conclusion that the person is actually better than just guessing at random at determining whether milk has been added first or not. To better visualize this we can also make use of blue shading on the histogram corresponding to the \\(p\\)-value: ggplot(data = simGuesses, aes(x = factor(heads), fill = (heads >= 9))) + geom_bar() + labs(x = "heads") Figure 10.4: Barplot of heads with p-value highlighted This helps us better see just how few of the values of heads are at our observed value or more extreme. This idea of a \\(p\\)-value can be extended to the more traditional methods using normal and \\(t\\) distributions in the traditional way that introductory statistics has been presented. These traditional methods were used because statisticians haven’t always been able to do 5000 simulations on the computer within seconds. We’ll elaborate on this more in a few sections. Learning check (LC10.6) How could we make Table 10.1 into a tidy data frame? (LC10.7) What is meant by “pseudo-random number generation?” (LC10.8) How can simulation be used to help us address the question of whether or not an observed result is statistically significant? (LC10.9) In Chapter 3, we noted that barplots should be used when creating a plot of categorical variables. Why are we using barplots to make a plot of a numerical variable heads in this chapter? 10.9 Example: Comparing two means 10.9.1 Randomization/permutation We will now focus on building hypotheses looking at the difference between two population means in an example. We will denote population means using the Greek symbol \\(\\mu\\) (pronounced “mu”). Thus, we will be looking to see if one group “out-performs” another group. This is quite possibly the most common type of statistical inference and serves as a basis for many other types of analyses when comparing the relationship between two variables. Our null hypothesis will be of the form \\(H_0: \\mu_1 = \\mu_2\\), which can also be written as \\(H_0: \\mu_1 - \\mu_2 = 0\\). Our alternative hypothesis will be of the form \\(H_0: \\mu_1 \\star \\mu_2\\) (or \\(H_a: \\mu_1 - \\mu_2 \\, \\star \\, 0\\)) where \\(\\star\\) = \\(<\\), \\(\\ne\\), or \\(>\\) depending on the context of the problem. You needn’t focus on these new symbols too much at this point. It will just be a shortcut way for us to describe our hypotheses. As we saw earlier, simulation is a valuable tool when conducting inferences based on one population variable. We will see that the process of randomization (also known as permutation) will be valuable in conducting tests comparing quantitative values from two groups. 10.9.2 Comparing action and romance movies The movies dataset in the ggplot2movies package contains information on a large number of movies that have been rated by users of IMDB.com (Wickham 2015). We are interested in the question here of whether Action movies are rated higher on IMDB than Romance movies. 
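Before wrangling, it can help to take a quick look at the raw data. The following is a small sketch using the glimpse() function exported by dplyr and assumes the ggplot2movies package has been loaded as above: library(dplyr)
library(ggplot2movies)
glimpse(movies) # one row per movie; includes rating plus binary genre indicators such as Action and Romance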
We will first need to do a little bit of data wrangling using the ideas from Chapter 5 to get the data in the form that we would like: (movies_trimmed <- movies %>% select(title, year, rating, Action, Romance)) # A tibble: 58,788 x 5 title year rating Action Romance <chr> <int> <dbl> <int> <int> 1 $ 1971 6.40 0 0 2 $1000 a Touchdown 1939 6.00 0 0 3 $21 a Day Once a Month 1941 8.20 0 0 4 $40,000 1996 8.20 0 0 5 $50,000 Climax Show, The 1975 3.40 0 0 6 $pent 2000 4.30 0 0 7 $windle 2002 5.30 1 0 8 '15' 2002 6.70 0 0 9 '38 1987 6.60 0 0 10 '49-'17 1917 6.00 0 0 # ... with 58,778 more rows Note that Action and Romance are binary variables here. To remove any overlap of movies (and potential confusion) that are both Action and Romance, we will remove them from our population: movies_trimmed <- movies_trimmed %>% filter(!(Action == 1 & Romance == 1)) We will now create a new variable called genre that specifies whether a movie in our movies_trimmed data frame is an "Action" movie, a "Romance" movie, or "Neither". We aren’t really interested in the "Neither" category here so we will exclude those rows as well. Lastly, the Action and Romance columns are not needed anymore since they are encoded in the genre column. movies_trimmed <- movies_trimmed %>% mutate(genre = ifelse(Action == 1, "Action", ifelse(Romance == 1, "Romance", "Neither"))) %>% filter(genre != "Neither") %>% select(-Action, -Romance) We are left with 8878 movies in our population dataset that focuses on only "Action" and "Romance" movies. Learning check (LC10.10) Why are the different genre variables stored as binary variables (1s and 0s) instead of just listing the genre as a column of values like “Action”, “Comedy”, etc.? (LC10.11) What complications could come above with us excluding action romance movies? Should we question the results of our hypothesis test? Explain. Let’s now visualize the distributions of rating across both levels of genre. Think about what type(s) of plot is/are appropriate here before you proceed: ggplot(data = movies_trimmed, aes(x = genre, y = rating)) + geom_boxplot() Figure 10.5: Rating vs genre in the population We can see that the middle 50% of ratings for "Action" movies is more spread out than that of "Romance" movies in the population. "Romance" has outliers at both the top and bottoms of the scale though. We are initially interested in comparing the mean rating across these two groups so a faceted histogram may also be useful: ggplot(data = movies_trimmed, mapping = aes(x = rating)) + geom_histogram(binwidth = 1, color = "white", fill = "dodgerblue") + facet_grid(genre ~ .) Figure 10.6: Faceted histogram of genre vs rating Important note: Remember that we hardly ever have access to the population values as we do here. This example and the nycflights13 dataset were used to create a common flow from chapter to chapter. In nearly all circumstances, we’ll be needing to use only a sample of the population to try to infer conclusions about the unknown population parameter values. These examples do show a nice relationship between statistics (where data is usually small and more focused on experimental settings) and data science (where data is frequently large and collected without experimental conditions). 10.9.3 Sampling \\(\\rightarrow\\) randomization We can use hypothesis testing to investigate ways to determine, for example, whether a treatment has an effect over a control and other ways to statistically analyze if one group performs better than, worse than, or different than another. 
We will also use confidence intervals to determine the size of the effect, if it exists. You’ll see more on this in Chapter 9. We are interested here in seeing how we can use a random sample of action movies and a random sample of romance movies from movies to determine if a statistical difference exists in the mean ratings of each group. Learning check (LC10.12) Define the relevant parameters here in terms of the populations of movies. 10.9.4 Data Let’s select a random sample of 34 action movies and a random sample of 34 romance movies. (The number 34 was chosen somewhat arbitrarily here.) set.seed(2017) movies_genre_sample <- movies_trimmed %>% group_by(genre) %>% sample_n(34) %>% ungroup() Note the addition of the ungroup() function here. This will be useful shortly in allowing us to shuffle the values of rating across genre. Our analysis does not work without this ungroup() function since the data stays grouped by the levels of genre without it. We can now observe the distributions of our two sample ratings for both groups. Remember that these plots should be rough approximations of our population distributions of movie ratings for "Action" and "Romance" in our population of all movies in the movies data frame. ggplot(data = movies_genre_sample, aes(x = genre, y = rating)) + geom_boxplot() Figure 10.7: Genre vs rating for our sample ggplot(data = movies_genre_sample, mapping = aes(x = rating)) + geom_histogram(binwidth = 1, color = "white", fill = "dodgerblue") + facet_grid(genre ~ .) Figure 10.8: Genre vs rating for our sample as faceted histogram Learning check (LC10.13) What single value could we change to improve the approximation using the sample distribution on the population distribution? Do we have reason to believe, based on the sample distributions of rating over the two groups of genre, that there is a significant difference between the mean rating for action movies compared to romance movies? It’s hard to say just based on the plots. The boxplot does show that the median sample rating is higher for romance movies, but the histogram isn’t as clear. The two groups have somewhat differently shaped distributions but they are both over similar values of rating. It’s often useful to calculate the mean and standard deviation as well, conditioned on the two levels. summary_ratings <- movies_genre_sample %>% group_by(genre) %>% summarize(mean = mean(rating), std_dev = sd(rating), n = n()) summary_ratings %>% kable() genre mean std_dev n Action 5.11 1.49 34 Romance 6.06 1.15 34 Learning check (LC10.14) Why did we not specify na.rm = TRUE here as we did in Chapter 5? We see that the sample mean rating for romance movies, \\(\\bar{x}_{r}\\), is greater than the similar measure for action movies, \\(\\bar{x}_a\\). But is it statistically significantly greater (thus, leading us to conclude that the means are statistically different)? The standard deviation can provide some insight here but with these standard deviations being so similar it’s still hard to say for sure. Learning check (LC10.15) Why might the standard deviation provide some insight about the means being statistically different or not? 10.9.5 Model of \\(H_0\\) The hypotheses we specified can also be written in another form to better give us an idea of what we will be simulating to create our null distribution. 
\\(H_0: \\mu_r - \\mu_a = 0\\) \\(H_a: \\mu_r - \\mu_a \\ne 0\\) 10.9.6 Test statistic \\(\\delta\\) We are, therefore, interested in seeing whether the difference in the sample means, \\(\\bar{x}_r - \\bar{x}_a\\), is statistically different than 0. R has a built-in command that can calculate the difference in these two sample means. 10.9.7 Observed effect \\(\\delta^*\\) mean_ratings <- movies_genre_sample %>% group_by(genre) %>% summarize(mean = mean(rating)) obs_diff <- diff(mean_ratings$mean) We see here that the diff function calculates \\(\\bar{x}_r - \\bar{x}_a = 6.062 - 5.112 = 0.95\\). We will now proceed similarly to how we conducted the hypothesis test above for the Lady Tasting Tea using simulation. Our goal is figure out a random process with which to simulate the null hypothesis being true. Earlier in this chapter, we used flipping of a fair coin as the random process we were simulating with the null hypothesis being true (\\(H_0: \\tau = 5\\)). 10.9.8 Simulated data Tactile simulation Here, with us assuming the two population means are equal (\\(H_0: \\mu_r - \\mu_a = 0\\)), we can look at this from a tactile point of view by using index cards. There are \\(n_r = 34\\) data elements corresponding to romance movies and \\(n_a = 34\\) for action movies. We can write the 34 ratings from our sample for romance movies on one set of 34 index cards and the 34 ratings for action movies on another set of 34 index cards. (Note that the sample sizes need not be the same.) The next step is to put the two stacks of index cards together, creating a new set of 68 cards. If we assume that the two population means are equal, we are saying that there is no association between ratings and genre (romance vs action). We can use the index cards to create two new stacks for romance and action movies. First, we must shuffle all the cards thoroughly. After doing so, in this case with equal values of sample sizes, we split the deck in half. We then calculate the new sample mean rating of the romance deck, and also the new sample mean rating of the action deck. This creates one simulation of the samples that were collected originally. We next want to calculate a statistic from these two samples. Instead of actually doing the calculation using index cards, we can use R as we have before to simulate this process. shuffled_ratings <- #movies_trimmed %>% movies_genre_sample %>% mutate(genre = shuffle(genre)) %>% group_by(genre) %>% summarize(mean = mean(rating)) diff(shuffled_ratings$mean) [1] -0.132 Learning check (LC10.16) How would the tactile shuffling of index cards change if we had different samples of say 20 action movies and 60 romance movies? Describe each step that would change. (LC10.17) Why are we taking the difference in the means of the cards in the new shuffled decks? 10.9.9 Distribution of \\(\\delta\\) under \\(H_0\\) The only new command here is shuffle from the mosaic package, which does what we would expect it to do. It simulates a shuffling of the ratings between the two levels of genre just as we could have done with index cards. We can now proceed in a similar way to what we have done previously with the Lady Tasting Tea example by repeating this process many times to create a null distribution of simulated differences in sample means. set.seed(2017) many_shuffles <- do(5000) * (movies_genre_sample %>% mutate(genre = shuffle(genre)) %>% group_by(genre) %>% summarize(mean = mean(rating)) ) It is a good idea here to View the many_shuffles data frame via View(many_shuffles). 
We need to figure out a way to subtract the first value of mean from the second value of mean for each of the 5000 simulations. This is a little tricky but the group_by function comes to our rescue here: rand_distn <- many_shuffles %>% group_by(.index) %>% summarize(diffmean = diff(mean)) head(rand_distn, 10) # A tibble: 10 x 2 .index diffmean <dbl> <dbl> 1 1.00 -0.132 2 2.00 -0.197 3 3.00 -0.0265 4 4.00 0.715 5 5.00 -0.474 6 6.00 -0.121 7 7.00 -0.174 8 8.00 -0.209 9 9.00 -0.00882 10 10.0 -0.332 We can now plot the distribution of these simulated differences in means: ggplot(data = rand_distn, aes(x = diffmean)) + geom_histogram(color = "white", bins = 20) Figure 10.9: Simulated differences in means histogram 10.9.10 The p-value Remember that we are interested in seeing where our observed sample mean difference of 0.95 falls on this null/randomization distribution. We are interested in simply a difference here so “more extreme” corresponds to values in both tails on the distribution. Let’s shade our null distribution to show a visual representation of our \\(p\\)-value: ggplot(data = rand_distn, aes(x = diffmean, fill = (abs(diffmean) >= obs_diff))) + geom_histogram(color = "white", bins = 20) Figure 10.10: Shaded histogram to show p-value Remember that the observed difference in means was 0.95. We have shaded green all values at or above that value and also shaded green those values at or below its negative value (since this is a two-tailed test). We can add a vertical line to represent both the observed difference and its negative instead. To better estimate how large the \\(p\\)-value will be, we also increase the number of bins to 100 here from 20: ggplot(data = rand_distn, aes(x = diffmean)) + geom_histogram(color = "white", bins = 100) + geom_vline(xintercept = obs_diff, color = "red") + geom_vline(xintercept = -obs_diff, color = "red") Figure 10.11: Histogram with vertical lines corresponding to observed statistic At this point, it is important to take a guess as to what the \\(p\\)-value may be. We can see that there are only a few shuffled differences as extreme or more extreme than our observed effect (in both directions). Maybe we guess that this \\(p\\)-value is somewhere around 2%, or maybe 3%, but certainly not 30% or more. **You’ll find yourself getting very strange results if you’ve messed up the signs in your calculation of the \\(p\\)-value so you should always check first that you have a range of reasonable values after looking at the histogram for the \\(p\\)-value. Lastly, we calculate the \\(p\\)-value directly using dplyr: (pvalue_movies <- rand_distn %>% filter(abs(diffmean) >= obs_diff) %>% nrow() / nrow(rand_distn)) [1] 0.0042 We have around 0.42% of values as extreme or more extreme than our observed effect in both directions. Assuming we are using a 5% significance level for \\(\\alpha\\), we have evidence supporting the conclusion that the mean rating for romance movies is different from that of action movies. The next important idea is to better understand just how much higher of a mean rating can we expect the romance movies to have compared to that of action movies. This can be addressed by creating a 95% confidence interval as we will explore in Chapter 9. Learning check (LC10.18) Conduct the same analysis comparing action movies versus romantic movies using the median rating instead of the mean rating? Make sure to use the %>% as much as possible. What was different and what was the same? 
(LC10.19) What conclusions can you make from viewing the faceted histogram looking at rating versus genre that you couldn’t see when looking at the boxplot? (LC10.20) Describe in a paragraph how we used Allen Downey’s diagram to conclude if a statistical difference existed between mean movie ratings for action and romance movies. (LC10.21) Why are we relatively confident that the distributions of the sample ratings will be good approximations of the population distributions of ratings for the two genres? (LC10.22) Using the definition of “\\(p\\)-value”, write in words what the \\(p\\)-value represents for the hypothesis test above comparing the mean rating of romance to action movies. (LC10.23) What is the value of the \\(p\\)-value for the hypothesis test comparing the mean rating of romance to action movies? (LC10.24) Do the results of the hypothesis test match up with the original plots we made looking at the population of movies? Why or why not? 10.9.11 Summary To review, these are the steps one would take whenever you’d like to do a hypothesis test comparing values from the distributions of two groups: Simulate many samples using a random process that matches the way the original data were collected and that assumes the null hypothesis is true. Collect the values of a sample statistic for each sample created using this random process to build a randomization distribution. Assess the significance of the original sample by determining where its sample statistic lies in the randomization distribution. If the proportion of values as extreme or more extreme than the observed statistic in the randomization distribution is smaller than the pre-determined significance level \\(\\alpha\\), we reject \\(H_0\\). Otherwise, we fail to reject \\(H_0\\). (If no significance level is given, one can assume \\(\\alpha = 0.05\\).) 10.10 Building theory-based methods using computation As a point of reference, we will now discuss the traditional theory-based way to conduct the hypothesis test for determining if there is a statistically significant difference in the sample mean rating of Action movies versus Romance movies. This method and ones like it work very well when the assumptions are met in order to run the test. They are based on probability models and distributions such as the normal and \\(t\\)-distributions. These traditional methods have been used for many decades back to the time when researchers didn’t have access to computers that could run 5000 simulations in under a minute. They had to base their methods on probability theory instead. Many fields and researchers continue to use these methods and that is the biggest reason for their inclusion here. It’s important to remember that a \\(t\\)-test or a \\(z\\)-test is really just an approximation of what you have seen in this chapter already using simulation and randomization. The focus here is on understanding how the shape of the \\(t\\)-curve comes about without digging big into the mathematical underpinnings. 10.10.1 Example: \\(t\\)-test for two independent samples What is commonly done in statistics is the process of normalization. What this entails is calculating the mean and standard deviation of a variable. Then you subtract the mean from each value of your variable and divide by the standard deviation. The most common normalization is known as the \\(z\\)-score. 
The formula for a \\(z\\)-score is \\[Z = \\frac{x - \\mu}{\\sigma},\\] where \\(x\\) represent the value of a variable, \\(\\mu\\) represents the mean of the variable, and \\(\\sigma\\) represents the standard deviation of the variable. Thus, if your variable has 10 elements, each one has a corresponding \\(z\\)-score that gives how many standard deviations away that value is from its mean. \\(z\\)-scores are normally distributed with mean 0 and standard deviation 1. They have the common, bell-shaped pattern seen below. Recall, that we hardly ever know the mean and standard deviation of the population of interest. This is almost always the case when considering the means of two independent groups. To help account for us not knowing the population parameter values, we can use the sample statistics instead, but this comes with a bit of a price in terms of complexity. Another form of normalization occurs when we need to use the sample standard deviations as estimates for the unknown population standard deviations. This normalization is often called the \\(t\\)-score. For the two independent samples case like what we have for comparing action movies to romance movies, the formula is \\[T =\\dfrac{ (\\bar{x}_1 - \\bar{x}_2) - (\\mu_1 - \\mu_2)}{ \\sqrt{\\dfrac{{s_1}^2}{n_1} + \\dfrac{{s_2}^2}{n_2}} }\\] There is a lot to try to unpack here. \\(\\bar{x}_1\\) is the sample mean response of the first group \\(\\bar{x}_2\\) is the sample mean response of the second group \\(\\mu_1\\) is the population mean response of the first group \\(\\mu_2\\) is the population mean response of the second group \\(s_1\\) is the sample standard deviation of the response of the first group \\(s_2\\) is the sample standard deviation of the response of the second group \\(n_1\\) is the sample size of the first group \\(n_2\\) is the sample size of the second group Assuming that the null hypothesis is true (\\(H_0: \\mu_1 - \\mu_2 = 0\\)), \\(T\\) is said to be distributed following a \\(t\\) distribution with degrees of freedom equal to the smaller value of \\(n_1 - 1\\) and \\(n_2 - 1\\). The “degrees of freedom” can be thought of measuring how different the \\(t\\) distribution will be as compared to a normal distribution. Small sample sizes lead to small degrees of freedom and, thus, \\(t\\) distributions that have more values in the tails of their distributions. Large sample sizes lead to large degrees of freedom and, thus, \\(t\\) distributions that closely align with the standard normal, bell-shaped curve. So, assuming \\(H_0\\) is true, our formula simplifies a bit: \\[T =\\dfrac{ \\bar{x}_1 - \\bar{x}_2}{ \\sqrt{\\dfrac{{s_1}^2}{n_1} + \\dfrac{{s_2}^2}{n_2}} }.\\] We have already built an approximation for what we think the distribution of \\(\\delta = \\bar{x}_1 - \\bar{x}_2\\) looks like using randomization above. Recall this distribution: ggplot(data = rand_distn, aes(x = diffmean)) + geom_histogram(color = "white", bins = 20) Figure 10.12: Simulated differences in means histogram If we’d like to have a guess as to what the distribution of \\(T\\) might look like instead, we need only to divide every value in rand_distn by \\[\\sqrt{\\dfrac{{s_1}^2}{n_1} + \\dfrac{{s_2}^2}{n_2}}.\\] As we did before, we will assign Romance to be group 1 and Action to be group 2. (This was done since Romance comes second alphabetically and the reason why we have a number mismatch below with 1 and 2.) 
Remember that we’ve already calculated these values: kable(summary_ratings) genre mean std_dev n Action 5.11 1.49 34 Romance 6.06 1.15 34 We will create some shortcuts here so you can see the value being calculated for the denominator of \\(T\\). s1 <- summary_ratings$std_dev[2] s2 <- summary_ratings$std_dev[1] n1 <- summary_ratings$n[2] n2 <- summary_ratings$n[1] Here, we have \\(s_1 = 1.149\\), \\(s_2 = 1.489\\), \\(n_1 = 34\\), and \\(n_2 = 34\\). We can calculate the denominator via (denom_T <- sqrt( (s1^2 / n1) + (s2^2 / n2) )) [1] 0.323 Now if we divide all of the values of diffmean in rand_distn by denom_T we can have a simulated distribution of \\(T\\) test statistics instead: rand_distn <- rand_distn %>% mutate(t_stat = diffmean / denom_T) ggplot(data = rand_distn, aes(x = t_stat)) + geom_histogram(color = "white", bins = 20) Figure 10.13: Simulated T statistics histogram We see that the shape of this distribution is the same as that of diffmean. The scale has changed, though, with t_stat having less spread than diffmean. A traditional \\(t\\)-test doesn’t look at this simulated distribution, but instead it looks at the \\(t\\)-curve with degrees of freedom equal to 33 (the minimum of \\(n_1 - 1 = 34 - 1 = 33\\) and \\(n_2 - 1 = 34 - 1 = 33\\)). This curve is frequently called a density curve and this is the reason why we specify the use of y = ..density.. here in the geom_histogram. We now overlay what this \\(t\\)-curve looks like on top of the histogram showing the simulated \\(T\\) statistics. ggplot(data = rand_distn, mapping = aes(x = t_stat)) + geom_histogram(aes(y = ..density..), color = "white", binwidth = 0.3) + stat_function(fun = dt, args = list(df = min(n1 - 1, n2 - 1)), color = "royalblue", size = 2) We can see that the curve does a good job of approximating the randomization distribution here. (More on when to expect this to be the case when we discuss conditions for the \\(t\\)-test in a bit.) To calculate the \\(p\\)-value in this case, we need to figure out how much of the total area under the \\(t\\)-curve is at our observed \\(T\\)-statistic or more, plus also adding the area under the curve at the negative value of the observed \\(T\\)-statistic or below. (Remember this is a two-tailed test so we are looking for a difference: values in the tails of either direction.) Just as we converted all of the simulated values to \\(T\\)-statistics, we must also do so for our observed effect \\(\\delta^*\\): (t_obs <- obs_diff / denom_T) [1] 2.95 So graphically we are interested in finding the percentage of values that are at or above 2.945 or at or below -2.945. ggplot(data = rand_distn, mapping = aes(x = t_stat)) + stat_function(fun = dt, args = list(df = min(n1 - 1, n2 - 1)), color = "royalblue", size = 2) + geom_vline(xintercept = t_obs, color = "red") + geom_vline(xintercept = -t_obs, color = "red") At this point, you should make a guess as to what a reasonable value may be for the \\(p\\)-value. Let’s say the \\(p\\)-value is 0.01 or so. To actually perform this calculation by hand, you’d need to do some calculus. Let’s have R do it for us instead using the pt function.
pt(t_obs, df = min(n1 - 1, n2 - 1), lower.tail = FALSE) + pt(-t_obs, df = min(n1 - 1, n2 - 1), lower.tail = TRUE) [1] 0.00588 10.10.2 Conditions for t-test In order for the results of the \\(t\\)-test to be valid, three conditions must be met: Independent observations in both samples Nearly normal populations OR large sample sizes (\\(n \\ge 30\\)) Independently selected samples Condition 1: This is met since we sampled at random using R from our population. Condition 2: Recall from Figure 10.6 that we know how the populations are distributed. Both of them are close to normally distributed. If we are a little concerned about this assumption, we also do have samples of size larger than 30 (\\(n_1 = n_2 = 34\\)). Condition 3: This is met since there is no natural pairing of a movie in the Action group to a movie in the Romance group. Since all three conditions are met, we can be reasonably certain that the theory-based test will match the results of the randomization-based test using shuffling. Remember that theory-based tests can produce some incorrect results if these assumptions are not carefully checked. The only assumption for randomization and computational-based methods is that the sample is selected at random. They are our preference and we strongly believe they should be yours as well, but it’s important to also see how the theory-based tests can be done and used as an approximation for the computational techniques, at least until more researchers adopt the techniques that utilize the power of computers. 10.11 Resampling-based inference for regression We can also use the concept of shuffling to determine the standard error of our null distribution and conduct a hypothesis test for a population slope. Let’s go back to our example on Alaskan flights that represent a sample of all Alaskan flights departing NYC in 2013 from Section 3.3. Let’s test to see if we have evidence that a positive relationship exists between the departure delay and arrival delay for Alaskan flights. We will set up this hypothesis testing process as we have each time before via the “There is Only One Test” diagram in Figure 10.1. 10.11.1 Data Our data is stored in alaska_flights and we are focused on the 50 measurements of dep_delay and arr_delay there. # To ensure the random sample of 50 flights is the same for # anyone using this code set.seed(2017) # Load Alaska data, deleting rows that have missing departure delay # or arrival delay data alaska_flights <- flights %>% filter(carrier == "AS") %>% filter(!is.na(dep_delay) & !is.na(arr_delay)) %>% # Select 50 flights that don't have missing delay data sample_n(50) 10.11.2 Test statistic \\(\\delta\\) Our test statistic here is the sample slope coefficient that we denote with \\(b_1\\). 10.11.3 Observed effect \\(\\delta^*\\) delay_fit <- lm(formula = arr_delay ~ dep_delay, data = alaska_flights) (b1_obs <- tidy(delay_fit)$estimate[2]) [1] 1.22 The calculated slope value from our observed sample is \\(b_1 = 1.218\\). 10.11.4 Model of \\(H_0\\) We are looking to see if a positive relationship exists so \\(H_a: \\beta_1 > 0\\). Our null hypothesis is always in terms of equality so we have \\(H_0: \\beta_1 = 0\\). 10.11.5 Simulated data Now to simulate the null hypothesis being true and recreating how our sample was created, we need to think about what it means for \\(\\beta_1\\) to be zero. If \\(\\beta_1 = 0\\), we said above that there is no relationship between the departure delay and arrival delay.
If there is no relationship, then any one of the arrival delay values could have just as likely occurred with any of the other departure delay values instead of the one that it actually did fall with. We, therefore, have another example of shuffling in our simulation of data. Tactile simulation We could use a deck of 100 note cards to create a tactile simulation of this shuffling process. We would write the 50 different values of departure delays on each of the 50 cards, one per card. We would then do the same thing for the 50 arrival delays, putting one per card. Next, we would lay out each of the 50 departure delay cards and we would shuffle the arrival delay deck. Then, after shuffling the deck well, we would deal the cards out, one on top of each of the departure delay cards. We would then enter these new values in for arrival delay and compute a sample slope based on this shuffling. We could repeat this process many times, keeping track of our sample slope after each shuffle. 10.11.6 Distribution of \\(\\delta\\) under \\(H_0\\) We can build our randomization distribution in much the same way we did before using the do and shuffle functions. Here we will take advantage of the coef function we saw earlier to extract the slope and intercept coefficients. (Our focus will be on the slope here though.) rand_slope_distn <- do(5000) * (lm(formula = arr_delay ~ shuffle(dep_delay), data = alaska_flights) %>% coef()) names(rand_slope_distn) [1] "Intercept" "dep_delay" We see that the names of our columns are Intercept and dep_delay. We want to look at dep_delay since that corresponds to the slope coefficients. ggplot(data = rand_slope_distn, mapping = aes(x = dep_delay)) + geom_histogram(color = "white", bins = 20) 10.11.7 The p-value Recall that we want to see where our observed sample slope \\(\\delta^* = 1.218\\) falls on this distribution and then count all of the values to the right of it corresponding to \\(H_a: \\beta_1 > 0\\). To get a sense for where our value falls, we can shade all values at least as big as \\(\\delta^*\\). ggplot(data = rand_slope_distn, aes(x = dep_delay, fill = (dep_delay >= b1_obs))) + geom_histogram(color = "white", bins = 20) Figure 10.14: Shaded histogram to show p-value Since 1.218 falls far to the right of this plot, we can say that we have a \\(p\\)-value of essentially 0. We, thus, have evidence to reject the null hypothesis in support of there being a positive association between the departure delay and arrival delay of all Alaskan flights from NYC in 2013. Learning check (LC10.25) Repeat the inference above but this time for the correlation coefficient instead of the slope. 10.12 Theory-based inference for regression Recall the regression output table from Section ?? with delay_fit being a least squares linear regression fit with arr_delay as the response and dep_delay as the predictor in the alaska_flights data frame created in Section ??. term estimate std.error statistic p.value (Intercept) -14.15 2.809 -5.04 0 dep_delay 1.22 0.136 8.95 0 We saw in Section ?? that random samples have variability and, thus, statistics from those samples have variability as defined by the sampling distribution. Recall from Section ?? that alaska_flights represents only a random sample of 50 Alaska Airlines flights and not all flights. Hence if we repeated the analysis but with another random sample of 50 flights, the fitted line would likely change slightly due to sampling variability.
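To see this sampling variability for yourself, one approach is to draw a different random sample of 50 Alaska Airlines flights and refit the model. This sketch reuses only functions already introduced in this chapter; the seed value 2018 is an arbitrary choice and the resulting output is not shown here: set.seed(2018) # any seed other than 2017 will give a different random sample
another_sample <- flights %>% filter(carrier == "AS") %>% filter(!is.na(dep_delay) & !is.na(arr_delay)) %>% sample_n(50)
another_fit <- lm(formula = arr_delay ~ dep_delay, data = another_sample)
tidy(another_fit) # the intercept and slope estimates will typically differ somewhat from -14.15 and 1.22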
In this case, the true population least squares line is defined by the formula \\(y = \\beta_0 + \\beta_1 x + \\epsilon\\) where \\(\\beta_0\\) is the true population intercept parameter \\(\\beta_1\\) is the true population slope parameter \\(\\epsilon\\) represents the error term \\(\\epsilon\\) corresponds to the part of the response variable \\(y\\) that remains unexplained after considering the predictor variable \\(x\\). We will see in Section 10.12.2 that ideally the errors should exhibit no systematic pattern: they are normally distributed, have mean 0, and have constant variance. The values \\(b_0 = -14.155\\) and \\(b_1 = 1.218\\) are point estimates of \\(\\beta_0\\) and \\(\\beta_1\\), and thus the second column of the regression output table that has their values is called estimate. The third column std.error represents the standard errors for each estimate using a theory-based approach. The rows of the fourth and fifth columns correspond to theory-based hypothesis tests testing \\(H_0: \\beta_0 = 0 \\mbox{ vs. } H_1: \\beta_0 \\neq 0\\) and \\(H_0: \\beta_1 = 0 \\mbox{ vs. } H_1: \\beta_1 \\neq 0\\). Of particular interest is the second hypothesis test because if \\(\\beta_1 = 0\\) then \\(y = \\beta_0 + \\epsilon\\). Hence the value of \\(y\\) does not depend on the value of \\(x\\) at all; in other words, there is no relationship between them. Recall that any hypothesis test involves 1) an observed test statistic and 2) a \\(p\\)-value resulting from the comparison of the observed test statistic to a null distribution. The columns “statistic” and “p.value” correspond to these values. In our example, since the \\(p\\)-value corresponding to the hypothesis test \\(H_0: \\beta_1 = 0 \\mbox{ vs. } H_1: \\beta_1 \\neq 0\\) is essentially 0, for any reasonable value of \\(\\alpha\\) we would reject \\(H_0\\) in favor of \\(H_1\\) and declare that there is a significant relationship between arrival delay and departure delay. For the conclusions of the hypothesis tests for regression to be valid, there are certain conditions that must be met, in particular relating to the behavior of the residuals. We will address these assumptions in the upcoming Subsection 10.12.1. 10.12.1 Conditions for regression In order for all inferences from regression to be valid (in particular the hypothesis tests from Subsection 10.12), certain conditions must roughly hold. Nearly normal residuals with mean 0 and constant variance. (Check quantile-quantile plot of standardized residuals.) Equal variances across the explanatory variable. (Check residual plot for non-uniform patterns.) Independent observations. (Check residual plot for no time series-like patterns.) As you can see, the residuals will play a large role in determining whether the conditions are met. In particular, the first two conditions can be roughly interpreted as requiring that there be no systematic pattern to the residuals. The residuals \\(\\widehat{\\epsilon}_i\\) are estimates for the error term \\(\\epsilon\\) we discussed with the true population regression line, and this is a big reason why they play an important role in validating regression assumptions.
Consider the observation marked by the blue dot: Recall that \\(y_i\\) is the observed value of the arr_delay variable (the y-position of the blue dot), \\(\\widehat{y}_i\\) is the fitted value of arr_delay (the value being pointed to on the red line), and the residual is \\(\\widehat{\\epsilon}_i = y_i - \\hat{y}_i\\). We can quickly extract the values of all 50 residuals by using the augment() function in the broom package. Specifically, we are interested in the .fitted and .resid variables. Let’s look at the residuals corresponding to the first six rows of data. regression_points <- augment(delay_fit) %>% select(arr_delay, dep_delay, .fitted, .resid) regression_points %>% head() %>% kable() arr_delay dep_delay .fitted .resid -38 -3 -17.808 -20.19 86 69 69.864 16.14 -38 3 -10.502 -27.50 61 53 50.381 10.62 3 12 0.457 2.54 21 2 -11.720 32.72 Let’s begin by analyzing the distribution of the residuals. We would expect the shape of the distribution to be symmetric and roughly bell-shaped with a peak near zero and fewer and fewer values going into the tails on both the left and right sides. ggplot(data = regression_points, mapping = aes(x = .resid)) + geom_histogram(binwidth = 10, color = "white") + geom_vline(xintercept = 0, color = "blue") Next, we create a scatterplot looking at how the fitted values relate to the residual values. ggplot(data = regression_points, mapping = aes(x = .fitted, y = .resid)) + geom_point() + geom_abline(intercept = 0, slope = 0, color = "blue") Figure 10.15: Fitted versus Residuals plot Lastly, we create a quantile-quantile plot that compares the residual values to what would be expected from a bell-shaped distribution (in particular, the normal distribution). ggplot(data = regression_points, mapping = aes(sample = .resid)) + stat_qq() Figure 10.16: QQ Plot of residuals Checking conditions: For the second condition (equal variances), we are looking to see if the points are scattered about the blue line at 0 relatively evenly as we look from left to right in Figure 10.15. We have some reason for concern here as the large lump of values on the left is much more dispersed than those on the right. The third condition (independence) is called into question if there is an oscillating up-and-down pattern throughout the fitted versus residuals plot in Figure 10.15. That is not the case here. We look at the quantile-quantile plot (“Q-Q plot” for short) for the first condition (nearly normal residuals) in Figure 10.16. We are looking to see if the residuals fall on a straight line, which is what we would expect if they were normally distributed. We see some curvature here as well. With both condition 1 and condition 2 in question, we have reason to doubt whether a linear regression is valid here. Unfortunately, all too frequently regressions are run without checking these assumptions carefully. While small deviations from the assumptions can be OK, larger violations can completely invalidate the results and make any inferences unreliable and questionable. 10.13 Conclusion 10.13.1 What’s to come? This chapter examined the basics and terminology of hypothesis testing, and showed how to apply the “There is Only One Test” diagram both to the Lady Tasting Tea example presented in Chapter 8 and to an example comparing the IMDB ratings of action movies and romance movies. Lastly, we looked at how to use resampling and theory-based methods on regression. 
We’ll see in Chapter 9 how we can provide a range of possible values for an unknown population parameter instead of just running a Yes/No decision from a hypothesis test. 10.13.2 Script of R code An R script file of all R code used in this chapter is available here. "], +["9-ci.html", "9 Confidence Intervals 9.1 Sneak peak of infer 9.2 Under construction… 9.3 Bootstrapping 9.4 Relation to hypothesis testing 9.5 Effect size 9.6 Conclusion", " 9 Confidence Intervals Note: This chapter is still under construction. If you would like to contribute, please check us out on GitHub at https://github.com/moderndive/moderndive_book. Please check out our sneak peak of infer below in the meanwhile. For more details on infer visit https://infer.netlify.com/. 9.1 Sneak peak of infer Question: Of all the cars in the mtcars dataset, do automatic cars get better gas mileage than manual cars? Approach: 95% confidence interval for difference in means. library(dplyr) library(ggplot2) library(infer) # Clean data mtcars <- mtcars %>% as_tibble() %>% mutate(am = factor(am)) # Simulate sampling distribution of two-sample difference in means: sampling_distribution <- mtcars %>% specify(mpg ~ am) %>% generate(reps = 1000, type = "bootstrap") %>% calculate(stat = "diff in means", order = c("1", "0")) # Compute 95% confidence interval: conf_int <- sampling_distribution %>% pull(stat) %>% quantile(probs = c(0.025, 0.975)) # Visualize: plot <- sampling_distribution %>% visualize() plot + geom_vline(xintercept = conf_int, col = "red", size = 1) 9.2 Under construction… The content here will be deprecated with a shift to using the infer package in the months to come. Definition: Confidence Interval A confidence interval gives a range of plausible values for a parameter. It depends on a specified confidence level with higher confidence levels corresponding to wider confidence intervals and lower confidence levels corresponding to narrower confidence intervals. Common confidence levels include 90%, 95%, and 99%. Usually we don’t just begin chapters with a definition, but confidence intervals are simple to define and play an important role in the sciences and any field that uses data. You can think of a confidence interval as playing the role of a net when fishing. Instead of just trying to catch a fish with a single spear (estimating an unknown parameter by using a single point estimate/statistic), we can use a net to try to provide a range of possible locations for the fish (use a range of possible values based around our statistic to make a plausible guess as to the location of the parameter). Needed packages Let’s load all the packages needed for this chapter (this assumes you’ve already installed them). If needed, read Section 2.3 for information on how to install and load R packages. library(dplyr) library(ggplot2) library(infer) library(mosaic) library(knitr) library(ggplot2movies) 9.3 Bootstrapping Just as we did in Chapter 10 with the Lady Tasting Tea when making hypotheses about a population total with which we would like to test which one is more plausible, we can also use computation to infer conclusions about a population quantitative statistic such as the mean. In this case, we will focus on constructing confidence intervals to produce plausible values for a population mean. (We can do a similar analysis for a population median or other summary measure as well.) 
Traditionally, the way to construct confidence intervals for a mean is to assume a normal distribution for the population or to invoke the Central Limit Theorem and get, what often appears to be magic, results. (This is similar to what was done in Section 10.10.) These methods are often not intuitive, especially for those that lack a strong mathematical background. They also come with their fair share of assumptions and often turn Statistics, a field that is full of tons of useful applications to many different fields and disciplines, into a robotic procedural-based topic. It doesn’t have to be that way! In this section, we will introduce the concept of bootstrapping. It will be a useful tool that will allow us to estimate the variability of our statistic from sample to sample. One neat feature of bootstrapping is that it enables us to approximate the sampling distribution and estimate the distribution’s standard deviation using ONLY the information in the one selected (original) sample. It sounds just as plagued with the magical type qualities of traditional theory-based inference on initial glance but we will see that it provides an intuitive and useful way to make inferences, especially when the samples are of medium to large size. To introduce the concept of bootstrapping, we again will use the movies dataset in the ggplot2movies data frame. Recall that you can also glance at this data frame using the View function and look at the help documentation for movies using the ? function. We will explore many other features of this dataset in the chapters to come, but here we will be focusing on the rating variable corresponding to the average IMDB user rating. You may notice that this dataset is quite large: 58,788 movies have data collected about them here. This will correspond to our population of ALL movies. Remember from Chapter 8 that our population is rarely known. We use this dataset as our population here to show you the power of bootstrapping in estimating population parameters. We’ll see how confidence intervals built using the bootstrap distribution perform at including our population parameter of interest. Here we can actually calculate these values since our population is known, but remember that in general this isn’t the case. Let’s take a look at what the distribution of our population ratings looks like. We’ll see that we will use the distribution of our sample(s) as an estimate of this population histogram. movies %>% ggplot(aes(x = rating)) + geom_histogram(color = "white", bins = 20) Figure 9.1: Population ratings histogram Learning check (LC9.1) Why was a histogram chosen as the plot to make for the rating variable above? (LC9.2) What does the shape of the rating histogram tell us about how IMDB users rate movies? What stands out about the plot? It’s important to think about what our goal is here. We would like to produce a confidence interval for the population mean rating. We will have to pretend for a moment that we don’t have all 58,788 movies. Let’s say that we only have a random sample of 50 movies from this dataset instead. In order to get a random sample, we can use the resample function in the mosaic package with replace = FALSE. We could also use the sample_n function from dplyr. set.seed(2017) movies_sample <- movies %>% sample_n(50) The sample_n function has filtered the data frame movies “at random” to choose only 50 rows from the larger movies data frame. We store information on these 50 movies in the movies_sample data frame. 
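Before plotting, a quick numerical check along these lines (a small sketch using the dplyr functions loaded earlier) confirms that movies_sample contains 50 rows and shows the range of its ratings:

movies_sample %>%
  summarize(n = n(),
            min_rating = min(rating),
            max_rating = max(rating))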
Let’s now explore what the rating variable looks like for these 50 movies: ggplot(data = movies_sample, aes(x = rating)) + geom_histogram(color = "white", bins = 20) Figure 9.2: Sample ratings histogram Remember that we can think of this histogram as an estimate of our population distribution histogram that we saw above. We are interested in the population mean rating and trying to find a range of plausible values for that value. A good start in guessing the population mean is to use the mean of our sample rating from the movies_sample data: (movies_sample_mean <- movies_sample %>% summarize(mean = mean(rating))) # A tibble: 1 x 1 mean <dbl> 1 5.89 Note the use of the ( ) at the beginning and the end of this creation of the movies_sample_mean object. If you’d like to print out your newly created object, you can enclose it in the parentheses as we have here. This value of 5.894 is just one guess at the population mean. The idea behind bootstrapping is to sample with replacement from the original sample to create new resamples of the same size as our original sample. Returning to our example, let’s investigate what one such resample of the movies_sample dataset accomplishes. We can create one resample/bootstrap sample by using the resample function in the mosaic package. boot1 <- resample(movies_sample) %>% arrange(orig.id) The important thing to note here is the original row numbers from the movies_sample data frame in the far right column called orig.ids. Since we are sampling with replacement, there is a strong likelihood that some of the 50 observational units are going to be selected again. You may be asking yourself what does this mean and how does this lead us to creating a distribution for the sample mean. Recall that the original sample mean of our data was calculated using the summarize function above. Learning check (LC9.3) What happens if we change the seed to our pseudo-random generation? Try it above when we used resample to describe the resulting movies_sample. (LC9.4) Why is sampling at random important from the movies data frame? Why don’t we just pick Action movies and do bootstrapping with this Action movies subset? (LC9.5) What was the purpose of assuming we didn’t have access to the full movies dataset here? Before we had a calculated mean in our original sample of 5.894. Let’s calculate the mean of ratings in our bootstrapped sample: (movies_boot1_mean <- boot1 %>% summarize(mean = mean(rating))) # A tibble: 1 x 1 mean <dbl> 1 5.69 More than likely the calculated bootstrap sample mean is different than the original sample mean. This is what was meant earlier by the sample means having some variability. What we are trying to do is replicate many different samples being taken from a larger population. Our best guess at what the population looks like is multiple copies of the sample we collected. We then can sample from that larger “created” population by generating bootstrap samples. Similar to what we did in the previous section, we can repeat this process using the do function followed by an asterisk. Let’s look at 10 different bootstrap means for ratings from movies_sample. Note the use of the resample function here. do(10) * (resample(movies_sample) %>% summarize(mean = mean(rating))) mean 1 5.94 2 5.57 3 5.83 4 6.29 5 6.03 6 5.92 7 6.00 8 5.85 9 6.10 10 5.61 You should see some variability begin to tease its way out here. Many of the simulated means will be close to our original sample mean but many will stray pretty far away. 
This occurs because outliers may have been selected a couple of times in the resampling or small values were selected more than larger. There are myriad reasons why this might be the case. So what’s the next step now? Just as we repeated the repetitions thousands of times with the “Lady Tasting Tea” example, we can do a similar thing here: trials <- do(5000) * summarize(resample(movies_sample), mean = mean(rating)) ggplot(data = trials, mapping = aes(x = mean)) + geom_histogram(bins = 30, color = "white") Figure 9.3: Bootstrapped means histogram The shape of this resulting distribution may look familiar to you. It resembles the well-known normal (bell-shaped) curve. At this point, we can easily calculate a confidence interval. In fact, we have a couple different options. We will first use the percentiles of the distribution we just created to isolate the middle 95% of values. This will correspond to our 95% confidence interval for the population mean rating, denoted by \\(\\mu\\). (ciq_mean_rating <- confint(trials, level = 0.95, method = "quantile")) name lower upper level method estimate 1 mean 5.46 6.3 0.95 percentile 5.89 It’s always important at this point to interpret the results of this confidence interval calculation. In this context, we can say something like the following: Based on the sample data and bootstrapping techniques, we can be 95% confident that the true mean rating of ALL IMDB ratings is between 5.46 and 6.3. This statement may seem a little confusing to you. Another way to think about this is that this confidence interval was constructed using the sample data by a procedure that is 95% reliable in that of 100 generated confidence intervals based on 100 different random samples, we expect on average that 95 of them will capture the true unknown parameter. This also means that we will get invalid results 5% of the time. Just as we had a trade-off with \\(\\alpha\\) and \\(\\beta\\) with hypothesis tests, we have a similar trade-off here with setting the confidence level. To further reiterate this point, the graphic below from Diez, Barr, and Çetinkaya-Rundel (2014) shows us that if we repeated a confidence interval process 25 times with 25 different samples, we would expect about 95% of them to actually contain the population parameter of interest. This parameter is marked with a dotted vertical line. We can see that only one confidence interval does not overlap with this value. (The one marked in red.) Therefore 24 in 25 (96%), which is quite close to our 95% reliability, do include the population parameter. Figure 9.4: Confidence interval coverage plot from OpenIntro Remember that we are pretending like we don’t know what the mean IMDB rating for ALL movies is. Our population here is all of the movies listed in the movies data frame from ggplot2movies. So does our bootstrapped confidence interval here contain the actual mean value? movies %>% summarize(mean_rating = mean(rating)) # A tibble: 1 x 1 mean_rating <dbl> 1 5.93 We see here that the population mean does fall in our range of plausible values generated from the bootstrapped samples. We can also get an idea of how the theory-based inference techniques would have approximated this confidence interval by using the formula \\[\\bar{x} \\pm (2 * SE),\\] where \\(\\bar{x}\\) is our original sample mean and \\(SE\\) stands for standard error and corresponds to the standard deviation of the bootstrap distribution. The value of 2 here corresponds to it being a 95% confidence interval. 
(95% of the values in a normal distribution fall within 2 standard deviations of the mean.) This formula assumes that the bootstrap distribution is symmetric and bell-shaped. This is often the case with bootstrap distributions, especially those in which the original distribution of the sample is not highly skewed. Definition: standard error The standard error is the standard deviation of the sampling distribution. The sampling distribution may be approximated by the bootstrap distribution or the null distribution depending on the context. Traditional theory-based methodologies for inference also have formulas for standard errors, assuming some conditions are met. To compute this type of confidence interval, we only need to make a slight modification to the confint function seen above. (The expression after the \\(\\pm\\) sign is known as the margin of error.) (cise_mean_rating <- confint(trials, level = 0.95, method = "stderr")) name lower upper level method estimate margin.of.error 1 mean 5.47 6.32 0.95 stderr 5.89 0.425 Based on the sample data and bootstrapping techniques, we can be 95% confident that the true mean rating of ALL IMDB ratings is between 5.467 and 6.316. Learning check (LC9.6) Reproduce the bootstrapping above using a sample of size 50 instead of 25. What changes do you see? (LC9.7) Reproduce the bootstrapping above using a sample of size 5 instead of 25. What changes do you see? (LC9.8) How does the sample size affect the analysis above? (LC9.9) Why must bootstrap samples be the same size as the original sample? 9.3.1 Review of bootstrapping We can summarize the process to generate a bootstrap distribution here in a series of steps that clearly identify the terminology we will use (R. Lock et al. 2012). Generate bootstrap samples by sampling with replacement from the original sample, using the same sample size. Compute the statistic of interest, called a bootstrap statistic, for each of the bootstrap samples. Collect the statistics for many bootstrap samples to create a bootstrap distribution. Visually, we can represent this process in the following diagram. Figure 9.5: Bootstrapping diagram from Lock5 textbook 9.4 Relation to hypothesis testing Recall that we found a statistically significant difference in the sample mean of romance movie ratings compared to the sample mean of action movie ratings. We concluded Chapter 10 by attempting to understand just how much greater we could expect the population mean romance movie rating to be compared to the population mean action movie rating. In order to do so, we will calculate a confidence interval for the difference \\(\\mu_r - \\mu_a\\). We’ll then go back to our population parameter values and see if our confidence interval contains our parameter value. We could use bootstrapping in a way similar to that done above, except now on a difference in sample means, to create a distribution and then use the confint function with the option of quantile to determine a confidence interval for the plausible values of the difference in population means. This is an excellent programming activity and the reader is urged to try to do so. 
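Here is one possible sketch of that programming activity. It assumes that mosaic’s resample() accepts a groups argument so that we resample within each genre, keeping 34 action and 34 romance ratings in every bootstrap sample; treat the exact argument names as part of the sketch rather than a recipe.

# Bootstrap distribution of the difference in sample means (sketch)
boot_diffs <- do(5000) * (resample(movies_genre_sample, groups = genre) %>%
  group_by(genre) %>%
  summarize(mean = mean(rating)) %>%
  summarize(diffmean = diff(mean)))
# Percentile-based 95% confidence interval for the difference in population means
confint(boot_diffs, level = 0.95, method = "quantile")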
Recall what the randomization/null distribution looked like for our simulated shuffled sample means: Note all this code was moved over from hypothesis testing (movies_trimmed <- movies %>% select(title, year, rating, Action, Romance)) # A tibble: 58,788 x 5 title year rating Action Romance <chr> <int> <dbl> <int> <int> 1 $ 1971 6.4 0 0 2 $1000 a Touchdown 1939 6.0 0 0 3 $21 a Day Once a Month 1941 8.2 0 0 4 $40,000 1996 8.2 0 0 5 $50,000 Climax Show, The 1975 3.4 0 0 6 $pent 2000 4.3 0 0 7 $windle 2002 5.3 1 0 8 '15' 2002 6.7 0 0 9 '38 1987 6.6 0 0 10 '49-'17 1917 6.0 0 0 # ... with 58,778 more rows movies_trimmed <- movies_trimmed %>% filter(!(Action == 1 & Romance == 1)) movies_trimmed <- movies_trimmed %>% mutate(genre = ifelse(Action == 1, "Action", ifelse(Romance == 1, "Romance", "Neither"))) %>% filter(genre != "Neither") %>% select(-Action, -Romance) set.seed(2017) movies_genre_sample <- movies_trimmed %>% group_by(genre) %>% sample_n(34) %>% ungroup() mean_ratings <- movies_genre_sample %>% group_by(genre) %>% summarize(mean = mean(rating)) obs_diff <- diff(mean_ratings$mean) shuffled_ratings <- #movies_trimmed %>% movies_genre_sample %>% mutate(genre = shuffle(genre)) %>% group_by(genre) %>% summarize(mean = mean(rating)) diff(shuffled_ratings$mean) [1] -0.132 set.seed(2017) many_shuffles <- do(5000) * (movies_genre_sample %>% mutate(genre = shuffle(genre)) %>% group_by(genre) %>% summarize(mean = mean(rating)) ) rand_distn <- many_shuffles %>% group_by(.index) %>% summarize(diffmean = diff(mean)) head(rand_distn, 10) # A tibble: 10 x 2 .index diffmean <dbl> <dbl> 1 1 -0.13235 2 2 -0.19706 3 3 -0.02647 4 4 0.71471 5 5 -0.47353 6 6 -0.12059 7 7 -0.17353 8 8 -0.20882 9 9 -0.00882 10 10 -0.33235 ggplot(data = rand_distn, mapping = aes(x = diffmean)) + geom_histogram(color = "white", bins = 20) Figure 9.6: Simulated shuffled sample means histogram With this null distribution being quite symmetric and bell-shaped, the standard error method introduced above likely provides a good estimate of a range of plausible values for \\(\\mu_r - \\mu_a\\). Another nice option here is that we can use the standard deviation of the null/randomization distribution we just found with our hypothesis test. (std_err <- rand_distn %>% summarize(se = sd(diffmean))) # A tibble: 1 x 1 se <dbl> 1 0.34 We can use the general formula of \\(statistic \\pm (2 * SE)\\) for a confidence interval to obtain the following result for plausible values of the difference in population means at the 95% level. (lower <- obs_diff - (2 * std_err)) se 1 0.269 (upper <- obs_diff + (2 * std_err)) se 1 1.63 We can, therefore, say that we are 95% confident that the population mean rating for romance movies is between 0.269 and 1.631 points higher than for that of action movies. The important thing to check here is whether 0 is contained in the confidence interval. If it is, it is plausible that the difference in the two population means between the two groups is 0. This means that the null hypothesis is plausible. The results of the hypothesis test and the confidence interval should match as they do here. We rejected the null hypothesis with hypothesis testing and we have evidence here that the mean rating for romance movies is higher than for action movies. 9.5 Effect size The phrase effect size has been thrown around recently as an alternative to \\(p\\)-values. In combination with the confidence interval, it can be often more valuable than just looking at the results of a hypothesis test. 
It depends on the scientific discipline exactly what is meant by “effect size” but, in general, it refers to the magnitude of the difference between group measurements. For our two sample problem involving movies, it is the observed difference in sample means obs_diff. It’s worthy of mention here that confidence intervals are always centered at the observed statistic. In other words, if you are looking at a confidence interval and someone asks you what the “effect size” is you can simply find the midpoint of the stated confidence interval. Learning check (LC9.10) Check to see whether the difference in population mean ratings for the two genres falls in the confidence interval we found here. Are we guaranteed that it will fall in the range of plausible values? (LC9.11) Why do you think many scientific fields are shifting to preferring inclusion of confidence intervals in articles over just \\(p\\)-values and hypothesis tests? (LC9.12) Why is 95% related to a value of 2 in the margin of error? What would approximate values be for 90% and for 99%? (LC9.13) Why is a 95% confidence interval wider than a 90% confidence interval? Explain by using a concrete example from everyday life about what is meant by “confidence.” (LC9.14) How would confidence intervals correspond to one-sided hypothesis tests? (LC9.15) There is a relationship between the significance level and the confidence level. What do you think it is? (LC9.16) The moment the phrase “standard error” is mentioned, there seems to be someone that says “The standard error is \\(s\\) divided by the square root of \\(n\\).” This standard error formula is used in the theory-based procedure for an inference on one mean. But… does it always work? For samp1, samp2, and samp3 below, do the following: produce a bootstrap distribution based on the sample calculate the standard deviation of the bootstrap distribution compare this value of the standard error to what you obtain when you calculate the standard deviation of the sample \\(s\\) divided by \\(\\sqrt{n}\\). df1 <- data_frame(samp1 = rexp(50)) df2 <- data_frame(samp2 = rnorm(100)) df3 <- data_frame(samp3 = rbeta(20, 5, 5)) Describe how \\(s / \\sqrt{n}\\) does in approximating the standard error for these three samples and their corresponding bootstrap distributions. 9.6 Conclusion 9.6.1 What’s to come? This concludes the Inference unit of this book. You should now have a thorough introduction into topics in both data science and statistics. In the last chapter of the textbook, we’ll summarize the purpose of this book as well as present an excellent example of what goes into making an effective story via data. 9.6.2 Script of R code An R script file of all R code used in this chapter is available here. "], +["10-hypo.html", "10 Hypothesis Testing 10.1 Sneak peak of infer 10.2 Under construction… 10.3 When inference is not needed 10.4 Basics of hypothesis testing 10.5 Criminal trial analogy 10.6 Types of errors in hypothesis testing 10.7 Statistical significance 10.8 Example: Revisiting the Lady Tasting Tea 10.9 Example: Comparing two means 10.10 Building theory-based methods using computation 10.11 Resampling-based inference for regression 10.12 Theory-based inference for regression 10.13 Conclusion", " 10 Hypothesis Testing Note: This chapter is still under construction. If you would like to contribute, please check us out on GitHub at https://github.com/moderndive/moderndive_book. Please check out our sneak peak of infer below in the meanwhile. 
For more details on infer visit https://infer.netlify.com/. 10.1 Sneak peak of infer Question: Of all the cars in the mtcars dataset, do automatic cars get better gas mileage than manual cars? Approach: Two-sample test for difference in means. library(dplyr) library(ggplot2) library(infer) # Clean data mtcars <- mtcars %>% as_tibble() %>% mutate(am = factor(am)) # Observed test statistic obs_stat <- mtcars %>% group_by(am) %>% summarize(mean = mean(mpg)) %>% summarize(obs_stat = diff(mean)) %>% pull(obs_stat) # Simulate null distribution of two-sample difference in means: null_distribution <- mtcars %>% specify(mpg ~ am) %>% hypothesize(null = "independence") %>% generate(reps = 1000, type = "permute") %>% calculate(stat = "diff in means", order = c("1", "0")) # Visualize: plot <- null_distribution %>% visualize() plot + geom_vline(xintercept = obs_stat, col = "red", size = 1) 10.2 Under construction… The content here will be deprecated with a shift to using the infer package in the months to come. We saw some of the main concepts of hypothesis testing introduced in Chapter 8. We will expand further on these ideas here and also provide a framework for understanding hypothesis tests in general. Instead of presenting you with lots of different formulas and scenarios, we hope to build a way to think about all hypothesis tests. You can then adapt to different scenarios as needed down the road when you encounter different statistical situations. The same can be said for confidence intervals. There is one general framework that applies to all confidence intervals and we will elaborate on this further in Chapter 9. The specifics may change slightly for each variation, but the important idea is to understand the general framework so that you can apply it to more specific problems. We believe that this approach is much better in the long-term than teaching you specific tests and confidence intervals rigorously. You can find fully-worked out examples for five common hypothesis tests and their corresponding confidence intervals in Appendix B. We recommend that you carefully review these examples as they also cover how the general frameworks apply to traditional normal-based methodologies like the \\(t\\)-test and normal-theory confidence intervals. You’ll see there that these methods are just approximations for the general computational frameworks, but require conditions to be met for their results to be valid. The general frameworks using randomization, simulation, and bootstrapping do not hold the same sorts of restrictions and further advance computational thinking, which is one big reason for their emphasis throughout this textbook. Needed packages Let’s load all the packages needed for this chapter (this assumes you’ve already installed them). If needed, read Section 2.3 for information on how to install and load R packages. library(dplyr) library(ggplot2) library(mosaic) library(knitr) library(nycflights13) library(ggplot2movies) library(broom) 10.3 When inference is not needed Before we delve into the two techniques of inference (hypothesis testing and confidence intervals), it’s good to remember that there are cases where you need not perform a rigorous statistical inference. An important and time-saving skill is to ALWAYS do exploratory data analysis using dplyr and ggplot2 before thinking about running a hypothesis test. 
Let’s look at such an example selecting a sample of flights traveling to Boston and to San Francisco from New York City in the flights data frame in the nycflights13 package. (We will remove flights with missing data first using na.omit and then sample 100 flights going to each of the two airports.) bos_sfo <- flights %>% na.omit() %>% filter(dest %in% c("BOS", "SFO")) %>% group_by(dest) %>% sample_n(100) Suppose we were interested in seeing if the air_time to SFO in San Francisco was statistically greater than the air_time to BOS in Boston. As suggested, let’s begin with some exploratory data analysis to get a sense for how the two variables of air_time and dest relate for these two destination airports: bos_sfo_summary <- bos_sfo %>% group_by(dest) %>% summarize(mean_time = mean(air_time), sd_time = sd(air_time)) kable(bos_sfo_summary) dest mean_time sd_time BOS 38.7 3.91 SFO 346.2 16.71 Looking at these results, we can clearly see that SFO air_time is much larger than BOS air_time. The standard deviation is also extremely informative here. Learning check (LC10.1) Could we make the same type of immediate conclusion that SFO had a statistically greater air_time if, say, its corresponding standard deviation was 200 minutes? What about 100 minutes? Explain. To further understand just how different the air_time variable is for BOS and SFO, let’s look at a boxplot: ggplot(data = bos_sfo, mapping = aes(x = dest, y = air_time)) + geom_boxplot() Since there is no overlap at all, we can conclude that the air_time for San Francisco flights is statistically greater (at any level of significance) than the air_time for Boston flights. This is a clear example of not needing to do anything more than some simple descriptive statistics to get an appropriate inferential conclusion. This is one reason why you should ALWAYS investigate the sample data first using dplyr and ggplot2 via exploratory data analysis. As you get more and more practice with hypothesis testing, you’ll be better able to determine in many cases whether or not the results will be statistically significant. There are circumstances where it is difficult to tell, but you should always try to make a guess FIRST about significance after you have completed your data exploration and before you actually begin the inferential techniques. 10.4 Basics of hypothesis testing In a hypothesis test, we will use data from a sample to help us decide between two competing hypotheses about a population. We make these hypotheses more concrete by specifying them in terms of at least one population parameter of interest. We refer to the competing claims about the population as the null hypothesis, denoted by \\(H_0\\), and the alternative (or research) hypothesis, denoted by \\(H_a\\). The roles of these two hypotheses are NOT interchangeable. The claim for which we seek significant evidence is assigned to the alternative hypothesis. The alternative is usually what the experimenter or researcher wants to establish or find evidence for. Usually, the null hypothesis is a claim that there really is “no effect” or “no difference.” In many cases, the null hypothesis represents the status quo or that nothing interesting is happening. We assess the strength of evidence by assuming the null hypothesis is true and determining how unlikely it would be to see sample results/statistics as extreme (or more extreme) as those in the original sample. Hypothesis testing brings about many weird and incorrect notions in the scientific community and society at large. 
One reason for this is that statistics has traditionally been thought of as a magic box of algorithms and procedures for getting to results, as is readily apparent if you do a Google search of “flowchart statistics hypothesis tests”. There are so many different complex ways to determine which test is appropriate. You’ll see that we don’t need to rely on this complicated series of assumptions and procedures to conduct a hypothesis test any longer. These methods were introduced in a time when computers weren’t powerful. Your cellphone (in 2016) has more power than the computers that sent NASA astronauts to the moon after all. We’ll see that ALL hypothesis tests can be broken down into the following framework given by Allen Downey here: Figure 10.1: Hypothesis Testing Framework Before we hop into this framework, we will provide another way to think about hypothesis testing that may be useful. 10.5 Criminal trial analogy We can think of hypothesis testing in the same context as a criminal trial in the United States. A criminal trial in the United States is a familiar situation in which a choice between two contradictory claims must be made. The person accused of the crime must be judged either guilty or not guilty. Under the U.S. system of justice, the individual on trial is initially presumed not guilty. Only STRONG EVIDENCE to the contrary causes the not guilty claim to be rejected in favor of a guilty verdict. The phrase “beyond a reasonable doubt” is often used to set the cutoff value for when enough evidence has been given to convict. Theoretically, we should never say “The person is innocent.” but instead “There is not sufficient evidence to show that the person is guilty.” Now let’s compare that to how we look at a hypothesis test. The decision about the population parameter(s) must be judged to follow one of two hypotheses. We initially assume that \\(H_0\\) is true. The null hypothesis \\(H_0\\) will be rejected (in favor of \\(H_a\\)) only if the sample evidence strongly suggests that \\(H_0\\) is false. If the sample does not provide such evidence, \\(H_0\\) will not be rejected. The analogy to “beyond a reasonable doubt” in hypothesis testing is what is known as the significance level. This will be set before conducting the hypothesis test and is denoted as \\(\\alpha\\). Common values for \\(\\alpha\\) are 0.1, 0.01, and 0.05. 10.5.1 Two possible conclusions Therefore, we have two possible conclusions with hypothesis testing: Reject \\(H_0\\) Fail to reject \\(H_0\\) Gut instinct says that “Fail to reject \\(H_0\\)” should say “Accept \\(H_0\\)” but this technically is not correct. Accepting \\(H_0\\) is the same as saying that a person is innocent. We cannot show that a person is innocent; we can only say that there was not enough substantial evidence to find the person guilty. When you run a hypothesis test, you are the jury of the trial. You decide whether there is enough evidence to convince yourself that \\(H_a\\) is true (“the person is guilty”) or that there was not enough evidence to convince yourself \\(H_a\\) is true (“the person is not guilty”). You must convince yourself (using statistical arguments) which hypothesis is the correct one given the sample information. Important note: Therefore, DO NOT WRITE “Accept \\(H_0\\)” any time you conduct a hypothesis test. 
Instead write “Fail to reject \\(H_0\\).” 10.6 Types of errors in hypothesis testing Unfortunately, just as a jury or a judge can make an incorrect decision in regards to a criminal trial by reaching the wrong verdict, there is some chance we will reach the wrong conclusion via a hypothesis test about a population parameter. As with criminal trials, this comes from the fact that we don’t have complete information, but rather a sample from which to try to infer about a population. The possible erroneous conclusions in a criminal trial are an innocent person is convicted (found guilty) or a guilty person is set free (found not guilty). The possible errors in a hypothesis test are rejecting \\(H_0\\) when in fact \\(H_0\\) is true (Type I Error) or failing to reject \\(H_0\\) when in fact \\(H_0\\) is false (Type II Error). The risk of error is the price researchers pay for basing an inference about a population on a sample. With any reasonable sample-based procedure, there is some chance that a Type I error will be made and some chance that a Type II error will occur. To help understand the concepts of Type I error and Type II error, observe the following table: Figure 10.2: Type I and Type II errors If we are using sample data to make inferences about a parameter, we run the risk of making a mistake. Obviously, we want to minimize our chance of error; we want a small probability of drawing an incorrect conclusion. The probability of a Type I Error occurring is denoted by \\(\\alpha\\) and is called the significance level of a hypothesis test The probability of a Type II Error is denoted by \\(\\beta\\). Formally, we can define \\(\\alpha\\) and \\(\\beta\\) in regards to the table above, but for hypothesis tests instead of a criminal trial. \\(\\alpha\\) corresponds to the probability of rejecting \\(H_0\\) when, in fact, \\(H_0\\) is true. \\(\\beta\\) corresponds to the probability of failing to reject \\(H_0\\) when, in fact, \\(H_0\\) is false. Ideally, we want \\(\\alpha = 0\\) and \\(\\beta = 0\\), meaning that the chance of making an error does not exist. When we have to use incomplete information (sample data), it is not possible to have both \\(\\alpha = 0\\) and \\(\\beta = 0\\). We will always have the possibility of at least one error existing when we use sample data. Usually, what is done is that \\(\\alpha\\) is set before the hypothesis test is conducted and then the evidence is judged against that significance level. Common values for \\(\\alpha\\) are 0.05, 0.01, and 0.10. If \\(\\alpha = 0.05\\), we are using a testing procedure that, used over and over with different samples, rejects a TRUE null hypothesis five percent of the time. So if we can set \\(\\alpha\\) to be whatever we want, why choose 0.05 instead of 0.01 or even better 0.0000000000000001? Well, a small \\(\\alpha\\) means the test procedure requires the evidence against \\(H_0\\) to be very strong before we can reject \\(H_0\\). This means we will almost never reject \\(H_0\\) if \\(\\alpha\\) is very small. If we almost never reject \\(H_0\\), the probability of a Type II Error – failing to reject \\(H_0\\) when we should – will increase! Thus, as \\(\\alpha\\) decreases, \\(\\beta\\) increases and as \\(\\alpha\\) increases, \\(\\beta\\) decreases. We, therefore, need to strike a balance in \\(\\alpha\\) and \\(\\beta\\) and the common values for \\(\\alpha\\) of 0.05, 0.01, and 0.10 usually lead to a nice balance. 
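To see the “rejects a TRUE null hypothesis five percent of the time” statement in action, here is a small stand-alone sketch. It uses base R’s t.test purely as a convenient stand-in for any test conducted at \\(\\alpha = 0.05\\); the sample sizes and number of repetitions are arbitrary choices made for illustration.

set.seed(2018)
# Both groups are drawn from the same population, so H0 is true in every repetition
p_values <- replicate(10000, t.test(rnorm(30), rnorm(30))$p.value)
# Proportion of (incorrect) rejections; should be close to alpha = 0.05
mean(p_values < 0.05)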
Learning check (LC10.2) Reproduce the table above about errors, but for a hypothesis test, instead of the one provided for a criminal trial. 10.6.1 Logic of hypothesis testing Take a random sample (or samples) from a population (or multiple populations) If the sample data are consistent with the null hypothesis, do not reject the null hypothesis. If the sample data are inconsistent with the null hypothesis (in the direction of the alternative hypothesis), reject the null hypothesis and conclude that there is evidence the alternative hypothesis is true (based on the particular sample collected). 10.7 Statistical significance The idea that sample results are more extreme than we would reasonably expect to see by random chance if the null hypothesis were true is the fundamental idea behind statistical hypothesis tests. If data at least as extreme would be very unlikely if the null hypothesis were true, we say the data are statistically significant. Statistically significant data provide convincing evidence against the null hypothesis in favor of the alternative, and allow us to generalize our sample results to the claim about the population. Learning check (LC10.3) What is wrong about saying “The defendant is innocent.” based on the US system of criminal trials? (LC10.4) What is the purpose of hypothesis testing? (LC10.5) What are some flaws with hypothesis testing? How could we alleviate them? 10.8 Example: Revisiting the Lady Tasting Tea Recall the “There is Only One Test” diagram from earlier: Figure 10.3: Hypothesis Testing Framework We will now walk through how each of the steps to the diagram apply to determining whether the lady tasting tea was actually better than chance at determining whether or not milk was added first. We will see that the process of creating a null distribution is a statistical way to quantifying surprise. 10.8.1 Data Let’s assume as we did in Chapter 8 that the lady is correct in determining whether milk was added first or not in 9 out of 10 trials. Our data, therefore, may look something like x Correct Correct Correct Incorrect Correct Correct Correct Correct Correct Correct 10.8.2 Test statistic \\(\\delta\\) We are interested in the number of Correct out of our 10 trials. We can denote this number of successes using the symbol \\(t\\), where \\(t\\) corresponds to total. This is our test statistic \\(\\delta\\) in this case. 10.8.3 Observed effect \\(\\delta^*\\) The actual observed value of the test statistic from our observed sample is \\(\\hat{t}_{obs} = 9\\). Thus, \\(\\delta^* = 9\\). 10.8.4 Model of \\(H_0\\) Our null hypothesis is that the lady is only as good as chance at guessing correctly. Hypotheses always correspond to parameters and are denoted with Greek letters. Thus, symbolically, we have \\(H_0: \\tau = 5\\). Since we are assuming chance and we have 10 flips with 0.5 probability of success of each flip, we have \\(\\tau = 10 \\times 0.5 = 5\\). 10.8.5 Simulated data We now want to use this null hypothesis to simulate the test statistic assuming that the null hypothesis is true. Therefore, we want to figure out a way to simulate 10 trials, getting either the choice Correct or Incorrect, assuming that the probability of success (getting it Correct) in any given trial is 0.5. Tactile simulation When you are presented with a hypothesis testing problem, frequently the most challenging portion is setting up how to simulate the data assuming the null hypothesis is true. To facilitate with this, setting up a tactile, hands on experiment can help. 
In this case, flipping a fair coin is a great way to simulate this process. This simulates how the sample could be collected assuming the null hypothesis is true. To simulate 10 trials, we could flip the fair coin and record Heads as Correct and Tails as Incorrect. Some simulated data using this coin flipping procedure may look like the following. Note that this data frame is not tidy, but is a convenient way to look at the results of the simulation in this wide format. The numbers on the far left correspond to the number of the trial. Table 10.1: A table of three sets of 10 coin flips sample1 sample2 sample3 1 Correct Correct Correct 2 Correct Incorrect Incorrect 3 Incorrect Incorrect Correct 4 Incorrect Incorrect Correct 5 Correct Incorrect Incorrect 6 Correct Incorrect Correct 7 Incorrect Incorrect Correct 8 Incorrect Correct Incorrect 9 Incorrect Correct Incorrect 10 Incorrect Correct Incorrect We then use the formula for the Test Statistic to determine the simulated test statistic for each of these simulated samples. So in this case we have \\(t_1 = 4\\), \\(t_2 = 4\\), \\(t_3 = 5\\) 10.8.6 Distribution of \\(\\delta\\) under \\(H_0\\) We could continue this process, say, 5000 times by flipping a coin in sets of 10 for 5000 repetitions and counting and taking note of how many heads out of 10 we have for each set. It’s at this point that you surely realize that a computer can do this procedure much faster and more efficient than the tactile experiment with a coin. Recall that we’ve already created the distribution of 5000 such coin flips and we’ve stored these values in the heads variable in the simGuesses data frame: simGuesses <- do(5000) * rflip(10) ggplot(data = simGuesses, aes(x = factor(heads))) + geom_bar() 10.8.7 The p-value Definition: \\(p\\)-value: The p-value is the probability of observing a sample statistic as extreme or more extreme than what was observed, assuming that the null hypothesis of a by chance operation is true. This definition may be a little intimidating the first time you read it, but it’s important to come back to this “The Lady Tasting Tea” problem whenever you encounter \\(p\\)-values as you begin to learn about the concept. Here the \\(p\\)-value corresponds to how many times in our null distribution of heads 9 or more heads occurred. We can use another neat feature of R to calculate the \\(p\\)-value for this problem. Note that “more extreme” in this case corresponds to looking at values of 9 or greater since our alternative hypothesis invokes a right-tail test corresponding to a “greater than” hypothesis of \\(H_a: \\tau > 5\\). In other words, we are looking to see how likely it is for the lady to pick 9 or more correct instead of 9 or less correct. We’d like to go in the right direction. pvalue_tea <- simGuesses %>% filter(heads >= 9) %>% nrow() / nrow(simGuesses) Let’s walk through each step of this calculation: First, pvalue_tea will be the name of our calculated \\(p\\)-value and the assignment operator <- directs us to this naming. We are working with the simGuesses data frame here so that comes immediately before the pipe operator. We would like to only focus on the rows in our simGuesses data frame that have heads values of 9 or 10. This represents simulated statistics “as extreme or more extreme” than what we observed (9 correct guesses out of 10). To get a glimpse of what we have up to this point, run simGuesses %>% filter(heads >= 9) %>% View(). 
Now that we have changed the focus to only those rows that have number of heads out of 10 flips corresponding to 9 or more, we count how many of those there are. The function nrow gives how many entries are in this filtered data frame and lastly we calculate the proportion that are at least as extreme as our observed value of 9 by dividing by the number of total simulations (5,000). We can see that the observed statistic of 9 correct guesses is not a likely outcome assuming the null hypothesis is true. Only around 1% of the outcomes in our 5000 simulations fall at or above 9 successes. We have evidence supporting the conclusion that the person is actually better than just guessing at random at determining whether milk has been added first or not. To better visualize this we can also make use of blue shading on the histogram corresponding to the \\(p\\)-value: ggplot(data = simGuesses, aes(x = factor(heads), fill = (heads >= 9))) + geom_bar() + labs(x = "heads") Figure 10.4: Barplot of heads with p-value highlighted This helps us better see just how few of the values of heads are at our observed value or more extreme. This idea of a \\(p\\)-value can be extended to the more traditional methods using normal and \\(t\\) distributions in the traditional way that introductory statistics has been presented. These traditional methods were used because statisticians haven’t always been able to do 5000 simulations on the computer within seconds. We’ll elaborate on this more in a few sections. Learning check (LC10.6) How could we make Table 10.1 into a tidy data frame? (LC10.7) What is meant by “pseudo-random number generation?” (LC10.8) How can simulation be used to help us address the question of whether or not an observed result is statistically significant? (LC10.9) In Chapter 3, we noted that barplots should be used when creating a plot of categorical variables. Why are we using barplots to make a plot of a numerical variable heads in this chapter? 10.9 Example: Comparing two means 10.9.1 Randomization/permutation We will now focus on building hypotheses looking at the difference between two population means in an example. We will denote population means using the Greek symbol \\(\\mu\\) (pronounced “mu”). Thus, we will be looking to see if one group “out-performs” another group. This is quite possibly the most common type of statistical inference and serves as a basis for many other types of analyses when comparing the relationship between two variables. Our null hypothesis will be of the form \\(H_0: \\mu_1 = \\mu_2\\), which can also be written as \\(H_0: \\mu_1 - \\mu_2 = 0\\). Our alternative hypothesis will be of the form \\(H_0: \\mu_1 \\star \\mu_2\\) (or \\(H_a: \\mu_1 - \\mu_2 \\, \\star \\, 0\\)) where \\(\\star\\) = \\(<\\), \\(\\ne\\), or \\(>\\) depending on the context of the problem. You needn’t focus on these new symbols too much at this point. It will just be a shortcut way for us to describe our hypotheses. As we saw earlier, simulation is a valuable tool when conducting inferences based on one population variable. We will see that the process of randomization (also known as permutation) will be valuable in conducting tests comparing quantitative values from two groups. 10.9.2 Comparing action and romance movies The movies dataset in the ggplot2movies package contains information on a large number of movies that have been rated by users of IMDB.com (Wickham 2015). We are interested in the question here of whether Action movies are rated higher on IMDB than Romance movies. 
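Since Action and Romance are stored as 0/1 indicator columns in movies, a quick count along these lines (a sketch using only dplyr) shows how many movies fall into each genre and how many are flagged as both, which motivates the wrangling that follows:

movies %>%
  summarize(n_action = sum(Action == 1),
            n_romance = sum(Romance == 1),
            n_both = sum(Action == 1 & Romance == 1))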
We will first need to do a little bit of data wrangling using the ideas from Chapter 5 to get the data in the form that we would like: (movies_trimmed <- movies %>% select(title, year, rating, Action, Romance)) # A tibble: 58,788 x 5 title year rating Action Romance <chr> <int> <dbl> <int> <int> 1 $ 1971 6.4 0 0 2 $1000 a Touchdown 1939 6.0 0 0 3 $21 a Day Once a Month 1941 8.2 0 0 4 $40,000 1996 8.2 0 0 5 $50,000 Climax Show, The 1975 3.4 0 0 6 $pent 2000 4.3 0 0 7 $windle 2002 5.3 1 0 8 '15' 2002 6.7 0 0 9 '38 1987 6.6 0 0 10 '49-'17 1917 6.0 0 0 # ... with 58,778 more rows Note that Action and Romance are binary variables here. To avoid any overlap (and potential confusion), we will remove movies that are both Action and Romance from our population: movies_trimmed <- movies_trimmed %>% filter(!(Action == 1 & Romance == 1)) We will now create a new variable called genre that specifies whether a movie in our movies_trimmed data frame is an "Action" movie, a "Romance" movie, or "Neither". We aren’t really interested in the "Neither" category here so we will exclude those rows as well. Lastly, the Action and Romance columns are not needed anymore since they are encoded in the genre column. movies_trimmed <- movies_trimmed %>% mutate(genre = ifelse(Action == 1, "Action", ifelse(Romance == 1, "Romance", "Neither"))) %>% filter(genre != "Neither") %>% select(-Action, -Romance) We are left with 8878 movies in our population dataset that focuses on only "Action" and "Romance" movies. Learning check (LC10.10) Why are the different genre variables stored as binary variables (1s and 0s) instead of just listing the genre as a column of values like “Action”, “Comedy”, etc.? (LC10.11) What complications could come about from excluding movies that are both action and romance? Should we question the results of our hypothesis test? Explain. Let’s now visualize the distributions of rating across both levels of genre. Think about what type(s) of plot is/are appropriate here before you proceed: ggplot(data = movies_trimmed, aes(x = genre, y = rating)) + geom_boxplot() Figure 10.5: Rating vs genre in the population We can see that the middle 50% of ratings for "Action" movies is more spread out than that of "Romance" movies in the population. "Romance" has outliers at both the top and bottom of the scale though. We are initially interested in comparing the mean rating across these two groups so a faceted histogram may also be useful: ggplot(data = movies_trimmed, mapping = aes(x = rating)) + geom_histogram(binwidth = 1, color = "white", fill = "dodgerblue") + facet_grid(genre ~ .) Figure 10.6: Faceted histogram of genre vs rating Important note: Remember that we hardly ever have access to the population values as we do here. This example and the nycflights13 dataset were used to create a common flow from chapter to chapter. In nearly all circumstances, we’ll need to use only a sample of the population to try to infer conclusions about the unknown population parameter values. These examples do show a nice relationship between statistics (where data is usually small and more focused on experimental settings) and data science (where data is frequently large and collected without experimental conditions). 10.9.3 Sampling \\(\\rightarrow\\) randomization We can use hypothesis testing to investigate ways to determine, for example, whether a treatment has an effect over a control and other ways to statistically analyze if one group performs better than, worse than, or different than another. 
We will also use confidence intervals to determine the size of the effect, if it exists. You’ll see more on this in Chapter 9. We are interested here in seeing how we can use a random sample of action movies and a random sample of romance movies from movies to determine if a statistical difference exists in the mean ratings of each group. Learning check (LC10.12) Define the relevant parameters here in terms of the populations of movies. 10.9.4 Data Let’s select a random sample of 34 action movies and a random sample of 34 romance movies. (The number 34 was chosen somewhat arbitrarily here.) set.seed(2017) movies_genre_sample <- movies_trimmed %>% group_by(genre) %>% sample_n(34) %>% ungroup() Note the addition of the ungroup() function here. This will be useful shortly in allowing us to shuffle the values of rating across genre. Our analysis does not work without this ungroup() function since the data stays grouped by the levels of genre without it. We can now observe the distributions of our two sample ratings for both groups. Remember that these plots should be rough approximations of our population distributions of movie ratings for "Action" and "Romance" in our population of all movies in the movies data frame. ggplot(data = movies_genre_sample, aes(x = genre, y = rating)) + geom_boxplot() Figure 10.7: Genre vs rating for our sample ggplot(data = movies_genre_sample, mapping = aes(x = rating)) + geom_histogram(binwidth = 1, color = "white", fill = "dodgerblue") + facet_grid(genre ~ .) Figure 10.8: Genre vs rating for our sample as faceted histogram Learning check (LC10.13) What single value could we change to improve the approximation using the sample distribution on the population distribution? Do we have reason to believe, based on the sample distributions of rating over the two groups of genre, that there is a significant difference between the mean rating for action movies compared to romance movies? It’s hard to say just based on the plots. The boxplot does show that the median sample rating is higher for romance movies, but the histogram isn’t as clear. The two groups have somewhat differently shaped distributions but they are both over similar values of rating. It’s often useful to calculate the mean and standard deviation as well, conditioned on the two levels. summary_ratings <- movies_genre_sample %>% group_by(genre) %>% summarize(mean = mean(rating), std_dev = sd(rating), n = n()) summary_ratings %>% kable() genre mean std_dev n Action 5.11 1.49 34 Romance 6.06 1.15 34 Learning check (LC10.14) Why did we not specify na.rm = TRUE here as we did in Chapter 5? We see that the sample mean rating for romance movies, \\(\\bar{x}_{r}\\), is greater than the similar measure for action movies, \\(\\bar{x}_a\\). But is it statistically significantly greater (thus, leading us to conclude that the means are statistically different)? The standard deviation can provide some insight here but with these standard deviations being so similar it’s still hard to say for sure. Learning check (LC10.15) Why might the standard deviation provide some insight about the means being statistically different or not? 10.9.5 Model of \\(H_0\\) The hypotheses we specified can also be written in another form to better give us an idea of what we will be simulating to create our null distribution. 
\\(H_0: \\mu_r - \\mu_a = 0\\) \\(H_a: \\mu_r - \\mu_a \\ne 0\\) 10.9.6 Test statistic \\(\\delta\\) We are, therefore, interested in seeing whether the difference in the sample means, \\(\\bar{x}_r - \\bar{x}_a\\), is statistically different from 0. R has a built-in command that can calculate the difference in these two sample means. 10.9.7 Observed effect \\(\\delta^*\\) mean_ratings <- movies_genre_sample %>% group_by(genre) %>% summarize(mean = mean(rating)) obs_diff <- diff(mean_ratings$mean) We see here that the diff function calculates \\(\\bar{x}_r - \\bar{x}_a = 6.062 - 5.112 = 0.95\\). We will now proceed similarly to how we conducted the hypothesis test above for the Lady Tasting Tea using simulation. Our goal is to figure out a random process with which to simulate the null hypothesis being true. Earlier in this chapter, we used the flipping of a fair coin as the random process we were simulating with the null hypothesis being true (\\(H_0: \\tau = 5\\)). 10.9.8 Simulated data Tactile simulation Here, since we are assuming the two population means are equal (\\(H_0: \\mu_r - \\mu_a = 0\\)), we can look at this from a tactile point of view by using index cards. There are \\(n_r = 34\\) data elements corresponding to romance movies and \\(n_a = 34\\) for action movies. We can write the 34 ratings from our sample for romance movies on one set of 34 index cards and the 34 ratings for action movies on another set of 34 index cards. (Note that the sample sizes need not be the same.) The next step is to put the two stacks of index cards together, creating a new set of 68 cards. If we assume that the two population means are equal, we are saying that there is no association between ratings and genre (romance vs action). We can use the index cards to create two new stacks for romance and action movies. First, we must shuffle all the cards thoroughly. After doing so, in this case with equal sample sizes, we split the deck in half. We then calculate the new sample mean rating of the romance deck, and also the new sample mean rating of the action deck. This creates one simulation of the samples that were collected originally. We next want to calculate a statistic from these two samples. Instead of actually doing the calculation using index cards, we can use R as we have before to simulate this process. shuffled_ratings <- #movies_trimmed %>% movies_genre_sample %>% mutate(genre = shuffle(genre)) %>% group_by(genre) %>% summarize(mean = mean(rating)) diff(shuffled_ratings$mean) [1] -0.132 Learning check (LC10.16) How would the tactile shuffling of index cards change if we had different samples of say 20 action movies and 60 romance movies? Describe each step that would change. (LC10.17) Why are we taking the difference in the means of the cards in the new shuffled decks? 10.9.9 Distribution of \\(\\delta\\) under \\(H_0\\) The only new command here is shuffle from the mosaic package, which does what we would expect it to do. It simulates a shuffling of the ratings between the two levels of genre just as we could have done with index cards. We can now proceed in a similar way to what we have done previously with the Lady Tasting Tea example by repeating this process many times to create a null distribution of simulated differences in sample means. set.seed(2017) many_shuffles <- do(5000) * (movies_genre_sample %>% mutate(genre = shuffle(genre)) %>% group_by(genre) %>% summarize(mean = mean(rating)) ) It is a good idea here to View the many_shuffles data frame via View(many_shuffles). 
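Before automating that subtraction across all 5000 shuffles, it can help to see the same calculation written out explicitly for the observed group means, with the direction of subtraction (Romance minus Action) spelled out. This is a small sketch that is not in the original text; it assumes the mean_ratings data frame created above and that dplyr is loaded.
# Observed difference written out explicitly as Romance minus Action,
# which matches what diff() returns since the genres sort alphabetically.
mean_ratings %>%
  summarize(diffmean = mean[genre == "Romance"] - mean[genre == "Action"])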
We need to figure out a way to subtract the first value of mean from the second value of mean for each of the 5000 simulations. This is a little tricky but the group_by function comes to our rescue here: rand_distn <- many_shuffles %>% group_by(.index) %>% summarize(diffmean = diff(mean)) head(rand_distn, 10) # A tibble: 10 x 2 .index diffmean <dbl> <dbl> 1 1 -0.13235 2 2 -0.19706 3 3 -0.02647 4 4 0.71471 5 5 -0.47353 6 6 -0.12059 7 7 -0.17353 8 8 -0.20882 9 9 -0.00882 10 10 -0.33235 We can now plot the distribution of these simulated differences in means: ggplot(data = rand_distn, aes(x = diffmean)) + geom_histogram(color = "white", bins = 20) Figure 10.9: Simulated differences in means histogram 10.9.10 The p-value Remember that we are interested in seeing where our observed sample mean difference of 0.95 falls on this null/randomization distribution. We are simply interested in a difference here, so “more extreme” corresponds to values in both tails of the distribution. Let's shade our null distribution to show a visual representation of our \\(p\\)-value: ggplot(data = rand_distn, aes(x = diffmean, fill = (abs(diffmean) >= obs_diff))) + geom_histogram(color = "white", bins = 20) Figure 10.10: Shaded histogram to show p-value Remember that the observed difference in means was 0.95. We have shaded green all values at or above that value and also shaded green those values at or below its negative value (since this is a two-tailed test). We can add a vertical line to represent both the observed difference and its negative instead. To better estimate how large the \\(p\\)-value will be, we also increase the number of bins to 100 here from 20: ggplot(data = rand_distn, aes(x = diffmean)) + geom_histogram(color = "white", bins = 100) + geom_vline(xintercept = obs_diff, color = "red") + geom_vline(xintercept = -obs_diff, color = "red") Figure 10.11: Histogram with vertical lines corresponding to observed statistic At this point, it is important to take a guess as to what the \\(p\\)-value may be. We can see that there are only a few shuffled differences as extreme or more extreme than our observed effect (in both directions). Maybe we guess that this \\(p\\)-value is somewhere around 2%, or maybe 3%, but certainly not 30% or more. You'll find yourself getting very strange results if you've messed up the signs in your calculation of the \\(p\\)-value, so you should always check, after looking at the histogram, that your \\(p\\)-value falls in a reasonable range of values. Lastly, we calculate the \\(p\\)-value directly using dplyr: (pvalue_movies <- rand_distn %>% filter(abs(diffmean) >= obs_diff) %>% nrow() / nrow(rand_distn)) [1] 0.0042 We have around 0.42% of values as extreme or more extreme than our observed effect in both directions. Assuming we are using a 5% significance level for \\(\\alpha\\), we have evidence supporting the conclusion that the mean rating for romance movies is different from that of action movies. The next important idea is to better understand just how much higher a mean rating we can expect romance movies to have compared to action movies. This can be addressed by creating a 95% confidence interval as we will explore in Chapter 9. Learning check (LC10.18) Conduct the same analysis comparing action movies versus romance movies using the median rating instead of the mean rating. Make sure to use the %>% as much as possible. What was different and what was the same? 
(LC10.19) What conclusions can you make from viewing the faceted histogram looking at rating versus genre that you couldn't see when looking at the boxplot? (LC10.20) Describe in a paragraph how we used Allen Downey's diagram to conclude if a statistical difference existed between mean movie ratings for action and romance movies. (LC10.21) Why are we relatively confident that the distributions of the sample ratings will be good approximations of the population distributions of ratings for the two genres? (LC10.22) Using the definition of “\\(p\\)-value”, write in words what the \\(p\\)-value represents for the hypothesis test above comparing the mean rating of romance to action movies. (LC10.23) What is the value of the \\(p\\)-value for the hypothesis test comparing the mean rating of romance to action movies? (LC10.24) Do the results of the hypothesis test match up with the original plots we made looking at the population of movies? Why or why not? 10.9.11 Summary To review, these are the steps to take whenever you'd like to do a hypothesis test comparing values from the distributions of two groups: Simulate many samples using a random process that matches the way the original data were collected and that assumes the null hypothesis is true. Collect the values of a sample statistic for each sample created using this random process to build a randomization distribution. Assess the significance of the original sample by determining where its sample statistic lies in the randomization distribution. If the proportion of values as extreme or more extreme than the observed statistic in the randomization distribution is smaller than the pre-determined significance level \\(\\alpha\\), we reject \\(H_0\\). Otherwise, we fail to reject \\(H_0\\). (If no significance level is given, one can assume \\(\\alpha = 0.05\\).) 10.10 Building theory-based methods using computation As a point of reference, we will now discuss the traditional theory-based way to conduct the hypothesis test for determining if there is a statistically significant difference in the sample mean rating of Action movies versus Romance movies. This method and ones like it work very well when the assumptions needed to run the test are met. They are based on probability models and distributions such as the normal and \\(t\\)-distributions. These traditional methods have been used for many decades, dating back to a time when researchers didn't have access to computers that could run 5000 simulations in under a minute. They had to base their methods on probability theory instead. Many fields and researchers continue to use these methods and that is the biggest reason for their inclusion here. It's important to remember that a \\(t\\)-test or a \\(z\\)-test is really just an approximation of what you have seen in this chapter already using simulation and randomization. The focus here is on understanding how the shape of the \\(t\\)-curve comes about without digging too deeply into the mathematical underpinnings. 10.10.1 Example: \\(t\\)-test for two independent samples What is commonly done in statistics is the process of normalization. What this entails is calculating the mean and standard deviation of a variable. Then you subtract the mean from each value of your variable and divide by the standard deviation. The most common normalization is known as the \\(z\\)-score. 
The formula for a \\(z\\)-score is \\[Z = \\frac{x - \\mu}{\\sigma},\\] where \\(x\\) represents the value of a variable, \\(\\mu\\) represents the mean of the variable, and \\(\\sigma\\) represents the standard deviation of the variable. Thus, if your variable has 10 elements, each one has a corresponding \\(z\\)-score that gives how many standard deviations away that value is from its mean. \\(z\\)-scores are normally distributed with mean 0 and standard deviation 1. They have the common, bell-shaped pattern seen below. Recall that we hardly ever know the mean and standard deviation of the population of interest. This is almost always the case when considering the means of two independent groups. To help account for us not knowing the population parameter values, we can use the sample statistics instead, but this comes with a bit of a price in terms of complexity. Another form of normalization occurs when we need to use the sample standard deviations as estimates for the unknown population standard deviations. This normalization is often called the \\(t\\)-score. For the two independent samples case like what we have for comparing action movies to romance movies, the formula is \\[T =\\dfrac{ (\\bar{x}_1 - \\bar{x}_2) - (\\mu_1 - \\mu_2)}{ \\sqrt{\\dfrac{{s_1}^2}{n_1} + \\dfrac{{s_2}^2}{n_2}} }\\] There is a lot to try to unpack here. \\(\\bar{x}_1\\) is the sample mean response of the first group \\(\\bar{x}_2\\) is the sample mean response of the second group \\(\\mu_1\\) is the population mean response of the first group \\(\\mu_2\\) is the population mean response of the second group \\(s_1\\) is the sample standard deviation of the response of the first group \\(s_2\\) is the sample standard deviation of the response of the second group \\(n_1\\) is the sample size of the first group \\(n_2\\) is the sample size of the second group Assuming that the null hypothesis is true (\\(H_0: \\mu_1 - \\mu_2 = 0\\)), \\(T\\) is said to be distributed following a \\(t\\) distribution with degrees of freedom equal to the smaller value of \\(n_1 - 1\\) and \\(n_2 - 1\\). The “degrees of freedom” can be thought of as measuring how different the \\(t\\) distribution will be as compared to a normal distribution. Small sample sizes lead to small degrees of freedom and, thus, \\(t\\) distributions that have more values in the tails of their distributions. Large sample sizes lead to large degrees of freedom and, thus, \\(t\\) distributions that closely align with the standard normal, bell-shaped curve. So, assuming \\(H_0\\) is true, our formula simplifies a bit: \\[T =\\dfrac{ \\bar{x}_1 - \\bar{x}_2}{ \\sqrt{\\dfrac{{s_1}^2}{n_1} + \\dfrac{{s_2}^2}{n_2}} }.\\] We have already built an approximation for what we think the distribution of \\(\\delta = \\bar{x}_1 - \\bar{x}_2\\) looks like using randomization above. Recall this distribution: ggplot(data = rand_distn, aes(x = diffmean)) + geom_histogram(color = "white", bins = 20) Figure 10.12: Simulated differences in means histogram If we'd like to have a guess as to what the distribution of \\(T\\) might look like instead, we need only to divide every value in rand_distn by \\[\\sqrt{\\dfrac{{s_1}^2}{n_1} + \\dfrac{{s_2}^2}{n_2}}.\\] As we did before, we will assign Romance to be group 1 and Action to be group 2. (This was done since Romance comes second alphabetically, which is why the group numbers 1 and 2 appear mismatched below.) 
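To make the simplified formula concrete before plugging in numbers, it can be written as a small helper function. This is a sketch that is not in the original text, and the name t_stat_two_sample is hypothetical.
# A direct translation of the simplified formula, assuming H0 is true:
# the difference in sample means divided by the estimated standard error.
t_stat_two_sample <- function(xbar1, xbar2, s1, s2, n1, n2) {
  (xbar1 - xbar2) / sqrt(s1^2 / n1 + s2^2 / n2)
}
The denominator of this function is exactly the quantity we compute next.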
Remember that we've already calculated these values: kable(summary_ratings) genre mean std_dev n Action 5.11 1.49 34 Romance 6.06 1.15 34 We will create some shortcuts here so you can see the value being calculated for the denominator of \\(T\\). s1 <- summary_ratings$std_dev[2] s2 <- summary_ratings$std_dev[1] n1 <- summary_ratings$n[2] n2 <- summary_ratings$n[1] Here, we have \\(s_1 = 1.149\\), \\(s_2 = 1.489\\), \\(n_1 = 34\\), and \\(n_2 = 34\\). We can calculate the denominator via (denom_T <- sqrt( (s1^2 / n1) + (s2^2 / n2) )) [1] 0.323 Now if we divide all of the values of diffmean in rand_distn by denom_T we can have a simulated distribution of \\(T\\) test statistics instead: rand_distn <- rand_distn %>% mutate(t_stat = diffmean / denom_T) ggplot(data = rand_distn, aes(x = t_stat)) + geom_histogram(color = "white", bins = 20) Figure 10.13: Simulated T statistics histogram We see that the shape of this distribution is the same as that of diffmean. The scale has changed though, with t_stat having less spread than diffmean. A traditional \\(t\\)-test doesn't look at this simulated distribution, but instead it looks at the \\(t\\)-curve with degrees of freedom equal to 33 (the minimum of \\(n_1 - 1 = 34 - 1 = 33\\) and \\(n_2 - 1 = 34 - 1 = 33\\)). This curve is frequently called a density curve, which is why we specify the use of y = ..density.. here in the geom_histogram. We now overlay what this \\(t\\)-curve looks like on top of the histogram showing the simulated \\(T\\) statistics. ggplot(data = rand_distn, mapping = aes(x = t_stat)) + geom_histogram(aes(y = ..density..), color = "white", binwidth = 0.3) + stat_function(fun = dt, args = list(df = min(n1 - 1, n2 - 1)), color = "royalblue", size = 2) We can see that the curve does a good job of approximating the randomization distribution here. (More on when to expect this to be the case when we discuss conditions for the \\(t\\)-test in a bit.) To calculate the \\(p\\)-value in this case, we need to figure out how much of the total area under the \\(t\\)-curve is at our observed \\(T\\)-statistic or more, plus also adding the area under the curve at the negative value of the observed \\(T\\)-statistic or below. (Remember this is a two-tailed test, so we are looking for a difference: values in the tails of either direction.) Just as we converted all of the simulated values to \\(T\\)-statistics, we must also do so for our observed effect \\(\\delta^*\\): (t_obs <- obs_diff / denom_T) [1] 2.95 So graphically we are interested in finding the percentage of values that are at or above 2.945 or at or below -2.945. ggplot(data = rand_distn, mapping = aes(x = t_stat)) + stat_function(fun = dt, args = list(df = min(n1 - 1, n2 - 1)), color = "royalblue", size = 2) + geom_vline(xintercept = t_obs, color = "red") + geom_vline(xintercept = -t_obs, color = "red") At this point, you should make a guess as to what a reasonable value may be for the \\(p\\)-value. Let's say the \\(p\\)-value is 0.01 or so. To actually perform this calculation by hand, you'd need to do some calculus. Let's have R do it for us instead using the pt function. 
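As a quick orientation to pt() (an aside that is not in the original text): pt(q, df) returns the area under a \\(t\\)-curve to the left of q, and setting lower.tail = FALSE returns the area to the right instead. For example, with the 33 degrees of freedom used here:
# Half of the area under the t-curve lies below its center of 0...
pt(0, df = 33)
# ...and the other half lies above it.
pt(0, df = 33, lower.tail = FALSE)
With that in mind, the two-tailed area we want is the sum of the upper tail beyond t_obs and the lower tail beyond -t_obs.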
pt(t_obs, df = min(n1 - 1, n2 - 1), lower.tail = FALSE) + pt(-t_obs, df = min(n1 - 1, n2 - 1), lower.tail = TRUE) [1] 0.00588 10.10.2 Conditions for t-test In order for the results of the \\(t\\)-test to be valid, three conditions must be met: Independent observations in both samples Nearly normal populations OR large sample sizes (\\(n \\ge 30\\)) Independently selected samples Condition 1: This is met since we sampled at random using R from our population. Condition 2: Recall from Figure 10.6 that we know how the populations are distributed. Both of them are close to normally distributed. If we are a little concerned about this assumption, we also have samples of size larger than 30 (\\(n_1 = n_2 = 34\\)). Condition 3: This is met since there is no natural pairing of a movie in the Action group to a movie in the Romance group. Since all three conditions are met, we can be reasonably certain that the theory-based test will match the results of the randomization-based test using shuffling. Remember that theory-based tests can produce some incorrect results if these assumptions are not carefully checked. The only assumption for randomization and computation-based methods is that the sample is selected at random. They are our preference and we strongly believe they should be yours as well, but it's also important to see how theory-based tests can be done and used as an approximation for the computational techniques, at least until more researchers adopt these techniques that utilize the power of computers. 10.11 Resampling-based inference for regression We can also use the concept of shuffling to determine the standard error of our null distribution and conduct a hypothesis test for a population slope. Let's go back to our example on Alaskan flights that represents a sample of all Alaskan flights departing NYC in 2013 from Section 3.3. Let's test to see if we have evidence that a positive relationship exists between the departure delay and arrival delay for Alaskan flights. We will set up this hypothesis testing process as we have done before via the “There is Only One Test” diagram in Figure 10.1. 10.11.1 Data Our data is stored in alaska_flights and we are focused on the 50 measurements of dep_delay and arr_delay there. # To ensure the random sample of 50 flights is the same for # anyone using this code set.seed(2017) # Load Alaska data, deleting rows that have missing departure delay # or arrival delay data alaska_flights <- flights %>% filter(carrier == "AS") %>% filter(!is.na(dep_delay) & !is.na(arr_delay)) %>% # Select 50 flights that don't have missing delay data sample_n(50) 10.11.2 Test statistic \\(\\delta\\) Our test statistic here is the sample slope coefficient that we denote with \\(b_1\\). 10.11.3 Observed effect \\(\\delta^*\\) delay_fit <- lm(formula = arr_delay ~ dep_delay, data = alaska_flights) (b1_obs <- tidy(delay_fit)$estimate[2]) [1] 1.22 The calculated slope value from our observed sample is \\(b_1 = 1.218\\). 10.11.4 Model of \\(H_0\\) We are looking to see if a positive relationship exists so \\(H_a: \\beta_1 > 0\\). Our null hypothesis is always in terms of equality so we have \\(H_0: \\beta_1 = 0\\). 10.11.5 Simulated data Now, to simulate the null hypothesis being true and to recreate how our sample was created, we need to think about what it means for \\(\\beta_1\\) to be zero. If \\(\\beta_1 = 0\\), we said above that there is no relationship between the departure delay and arrival delay. 
If there is no relationship, then any one of the arrival delay values could have just as likely occurred with any of the other departure delay values instead of the one that it actually did fall with. We, therefore, have another example of shuffling in our simulation of data. Tactile simulation We could use a deck of 100 note cards to create a tactile simulation of this shuffling process. We would write the 50 different values of departure delay on 50 cards, one value per card. We would then do the same thing for the 50 arrival delays, putting one value per card. Next, we would lay out each of the 50 departure delay cards and we would shuffle the arrival delay deck. Then, after shuffling the deck well, we would deal the arrival delay cards out, one onto each of the departure delay cards. We would then enter these new values in for arrival delay and compute a sample slope based on this shuffling. We could repeat this process many times, keeping track of our sample slope after each shuffle. 10.11.6 Distribution of \\(\\delta\\) under \\(H_0\\) We can build our randomization distribution in much the same way we did before using the do and shuffle functions. Here we will take advantage of the coef function we saw earlier to extract the slope and intercept coefficients. (Our focus will be on the slope here though.) rand_slope_distn <- do(5000) * (lm(formula = arr_delay ~ shuffle(dep_delay), data = alaska_flights) %>% coef()) names(rand_slope_distn) [1] "Intercept" "dep_delay" We see that the names of our columns are Intercept and dep_delay. We want to look at dep_delay since that corresponds to the slope coefficients. ggplot(data = rand_slope_distn, mapping = aes(x = dep_delay)) + geom_histogram(color = "white", bins = 20) 10.11.7 The p-value Recall that we want to see where our observed sample slope \\(\\delta^* = 1.218\\) falls on this distribution and then count all of the values to the right of it corresponding to \\(H_a: \\beta_1 > 0\\). To get a sense for where our value falls, we can shade all values at least as big as \\(\\delta^*\\). ggplot(data = rand_slope_distn, aes(x = dep_delay, fill = (dep_delay >= b1_obs))) + geom_histogram(color = "white", bins = 20) Figure 10.14: Shaded histogram to show p-value Since 1.218 falls far to the right of this plot, we can say that we have a \\(p\\)-value of 0. We, thus, have evidence to reject the null hypothesis in support of there being a positive association between the departure delay and arrival delay of all Alaskan flights from NYC in 2013. Learning check (LC10.25) Repeat the inference above but this time for the correlation coefficient instead of the slope. 10.12 Theory-based inference for regression Recall the regression output table from Section ?? with delay_fit being a least squares linear regression fit with arr_delay as the response and dep_delay as the predictor in the alaska_flights data frame created in Section ??. term estimate std.error statistic p.value (Intercept) -14.15 2.809 -5.04 0 dep_delay 1.22 0.136 8.95 0 We saw in Section ?? that random samples have variability and, thus, statistics from those samples have variability as defined by the sampling distribution. Recall from Section ?? that alaska_flights represents only a random sample of 50 Alaska Airlines flights and not all flights. Hence if we repeated the analysis but with another random sample of 50 flights, the fitted line would likely change slightly due to sampling variability. 
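As a quick illustration of that sampling variability (a sketch that is not in the original text; the seed 2018 is arbitrary), we could draw a different random sample of 50 Alaska Airlines flights and refit the line:
# Draw a second, different sample of 50 Alaska flights and refit the
# regression; the intercept and slope will differ slightly from before.
set.seed(2018)
flights %>%
  filter(carrier == "AS") %>%
  filter(!is.na(dep_delay) & !is.na(arr_delay)) %>%
  sample_n(50) %>%
  lm(formula = arr_delay ~ dep_delay, data = .) %>%
  coef()
The point estimates move around from sample to sample, and this is exactly the variability that the standard errors in the table are quantifying.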
In this case, the true population least squares line is defined by the formula \\(y = \\beta_0 + \\beta_1 x + \\epsilon\\) where \\(\\beta_0\\) is the true population intercept parameter, \\(\\beta_1\\) is the true population slope parameter, and \\(\\epsilon\\) represents the error term. \\(\\epsilon\\) corresponds to the part of the response variable \\(y\\) that remains unexplained after considering the predictor variable \\(x\\). We will see in Section 10.12.2 that ideally the errors should exhibit no systematic pattern in that they are normally distributed, have mean 0, and have constant variance. The values \\(b_0 = -14.155\\) and \\(b_1 = 1.218\\) are point estimates of \\(\\beta_0\\) and \\(\\beta_1\\), and thus the second column of the regression output table that has their values is called estimate. The third column std.error represents the standard errors for each estimate using a theory-based approach. The rows of the fourth and fifth columns correspond to theory-based hypothesis tests testing \\(H_0: \\beta_0 = 0 \\mbox{ vs. } H_1: \\beta_0 \\neq 0\\) and \\(H_0: \\beta_1 = 0 \\mbox{ vs. } H_1: \\beta_1 \\neq 0\\). Of particular interest is the second hypothesis test because if \\(\\beta_1 = 0\\) then \\(y = \\beta_0 + \\epsilon\\). Hence the value of \\(y\\) does not depend on the value of \\(x\\) at all; in other words, there is no relationship between them. Recall that any hypothesis test involves 1) an observed test statistic and 2) a \\(p\\)-value resulting from the comparison of the observed test statistic to a null distribution. The columns “statistic” and “p.value” correspond to these values. In our example, since the \\(p\\)-value corresponding to the hypothesis test \\(H_0: \\beta_1 = 0 \\mbox{ vs. } H_1: \\beta_1 \\neq 0\\) is 0, for any value of \\(\\alpha\\) we would reject \\(H_0\\) in favor of \\(H_1\\) and declare that there is a significant relationship between arrival delay and departure delay. For the conclusions of the hypothesis tests for regression to be valid, there are certain conditions that must be met, in particular relating to the behavior of the residuals. We will address these assumptions in the upcoming Subsection 10.12.1. 10.12.1 Conditions for regression In order for all inferences from regression to be valid (in particular the hypothesis tests from Subsection 10.12), certain conditions must roughly hold. Nearly normal residuals with mean 0 and constant variance. (Check quantile-quantile plot of standardized residuals.) Equal variances across explanatory variable. (Check residual plot for non-uniform patterns.) Independent observations. (Check residual plot for no time series-like patterns.) As you can see, the residuals will play a large role in determining whether the conditions are met. In particular, the first two conditions can be roughly interpreted as requiring that there be no systematic pattern to the residuals. The residuals \\(\\widehat{\\epsilon}_i\\) are estimates for the error term \\(\\epsilon\\) we discussed with the true population regression line, and this is a big reason why they play an important role in validating regression assumptions. 10.12.2 Residual analysis The following diagram will help you to keep track of what is meant by a residual. 
Consider the observation marked by the blue dot: Recall that \\(y_i\\) is the observed value of the arr_delay variable (y-position of blue dot), \\(\\widehat{y}_i\\) is the fitted value of arr_delay (the value being pointed to on the red line), and the residual is \\(\\widehat{\\epsilon}_i = y_i - \\hat{y}_i\\). We can quickly extract the values of all 50 residuals by using the augment() function in the broom package. Specifically, we are interested in the .fitted and .resid variables. Let's look at the residuals corresponding to the first six rows of data. regression_points <- augment(delay_fit) %>% select(arr_delay, dep_delay, .fitted, .resid) regression_points %>% head() %>% kable() arr_delay dep_delay .fitted .resid -38 -3 -17.808 -20.19 86 69 69.864 16.14 -38 3 -10.502 -27.50 61 53 50.381 10.62 3 12 0.457 2.54 21 2 -11.720 32.72 Let's begin by analyzing the distribution of the residuals. We would expect the shape of the distribution to be symmetric and roughly bell-shaped with a peak near zero and fewer and fewer values going into the tails on both the left and right sides. ggplot(data = regression_points, mapping = aes(x = .resid)) + geom_histogram(binwidth = 10, color = "white") + geom_vline(xintercept = 0, color = "blue") Next, we create a scatterplot looking at how the fitted values relate to the residual values. ggplot(data = regression_points, mapping = aes(x = .fitted, y = .resid)) + geom_point() + geom_abline(intercept = 0, slope = 0, color = "blue") Figure 10.15: Fitted versus Residuals plot Lastly, we create a quantile-quantile plot that compares the residual values to what would be expected from a bell-shaped distribution (in particular, the normal distribution). ggplot(data = regression_points, mapping = aes(sample = .resid)) + stat_qq() Figure 10.16: QQ Plot of residuals Checking conditions: For the second condition, we are looking to see if the points are scattered about the blue line at 0 relatively evenly as we look from left to right in Figure 10.15. We have some reason for concern here, as the large lump of values on the left is much more dispersed than those on the right. The third condition is invalidated if there is a periodic, up-and-down pattern throughout the fitted versus residual plot in Figure 10.15. That is not the case here. We look at the quantile-quantile plot (“Q-Q plot” for short) for the first condition in Figure 10.16. We are looking to see if the residuals fall on a straight line, as we would expect if they were normally distributed. We see some curvature here as well. We should begin to wonder if regression was valid here with both condition 1 and condition 2 in question. We have reason to doubt whether a linear regression is valid here. Unfortunately, all too frequently regressions are run without checking these assumptions carefully. While small deviations from the assumptions can be OK, larger violations can completely invalidate the results and make any inferences unreliable and questionable. 10.13 Conclusion 10.13.1 What's to come? This chapter examined the basics of hypothesis testing with terminology and also an example of how to apply the “There is Only One Test” diagram to the Lady Tasting Tea example presented in Chapter 8 and to an example on comparing the IMDB ratings of action movies and romance movies. Lastly, we looked at how to use resampling and theory-based methods on regression. 
We’ll see in Chapter 9 how we can provide a range of possible values for an unknown population parameter instead of just running a Yes/No decision from a hypothesis test. 10.13.2 Script of R code An R script file of all R code used in this chapter is available here. "], ["11-inference-for-regression.html", "11 Inference for Regression 11.1 Refresher: Professor evaluations data", " 11 Inference for Regression Note: This chapter is still under construction. If you would like to contribute, please check us out on GitHub at https://github.com/moderndive/moderndive_book. 11.1 Refresher: Professor evaluations data Let’s revisit the professor evaluations data that we analyzed using multiple regression with one numerical and one categorical predictor. In particular \\(y\\): outcome variable of instructor evaluation score predictor variables \\(x_1\\): numerical explanatory/predictor variable of age \\(x_2\\): categorical explanatory/predictor variable of gender library(ggplot2) library(dplyr) library(moderndive) load(url("http://www.openintro.org/stat/data/evals.RData")) evals <- evals %>% select(score, ethnicity, gender, language, age, bty_avg, rank) First, recall that we had two competing potential models to explain professors’ teaching scores: Model 1: No interaction term. i.e. both male and female profs have the same slope describing the associated effect of age on teaching score Model 2: Includes an interaction term. i.e. we allow for male and female profs to have different slopes describing the associated effect of age on teaching score 11.1.1 Refresher: Visualizations Recall the plots we made for both these models: Figure 11.1: Model 1: no interaction effect included Figure 11.2: Model 2: interaction effect included 11.1.2 Refresher: Regression tables Last, let’s recall the regressions we fit. First, the regression with no interaction effect: note the use of + in the formula. score_model_2 <- lm(score ~ age + gender, data = evals) get_regression_table(score_model_2) Table 11.1: Model 1: Regression table with no interaction effect included term estimate std_error statistic p_value conf_low conf_high intercept 4.484 0.125 35.79 0.000 4.238 4.730 age -0.009 0.003 -3.28 0.001 -0.014 -0.003 gendermale 0.191 0.052 3.63 0.000 0.087 0.294 Second, the regression with an interaction effect: note the use of * in the formula. score_model_3 <- lm(score ~ age * gender, data = evals) get_regression_table(score_model_3) Table 11.2: Model 2: Regression table with interaction effect included term estimate std_error statistic p_value conf_low conf_high intercept 4.883 0.205 23.80 0.000 4.480 5.286 age -0.018 0.004 -3.92 0.000 -0.026 -0.009 gendermale -0.446 0.265 -1.68 0.094 -0.968 0.076 age:gendermale 0.014 0.006 2.45 0.015 0.003 0.024 11.1.3 Script of R code An R script file of all R code used in this chapter is available here. "], -["12-thinking-with-data.html", "12 Thinking with Data 12.1 Effective Data Storytelling 12.2 Examples Concluding remarks", " 12 Thinking with Data Note: This chapter is still under construction. If you would like to contribute, please check us out on GitHub at https://github.com/moderndive/moderndive_book. 12.1 Effective Data Storytelling As we’ve progressed throughout this book, you’ve seen how to work with data in a variety of ways. You’ve learned effective strategies for plotting data by understanding which types of plots work best for which combinations of variable types. 
You’ve summarized data in table form and calculated summary statistics for a variety of different variables. Further, you’ve seen the value of inference as a process to come to conclusions about a population by using a random sample. Lastly, you’ve explored how to use linear regression and the importance of checking the conditions required to make it a valid procedure. All throughout, you’ve learned many computational techniques and focused on reproducible research in writing R code and keeping track of your work in R Markdown. All of these steps go into making a great story using data. As the textbook comes to a close, we thought it best that you explore what stellar work is being produced by data journalists throughout the world that specialize in effective data storytelling. We recommend you read and analyze this article by Walt Hickey entitled The Dollar-And-Cents Case Against Hollywood’s Exclusion of Women. As you read over it, think carefully about how Walt is using data, graphics, and analyses to paint the picture for the reader of what the story is he wants to tell. In the spirit of reproducibility, the members of FiveThirtyEight have also shared the data that they used to create this story and some R code here. A vignette showing how to reproduce one of the plots at the end of the article using dplyr, ggplot2, and other packages in Hadley’s tidyverse is available here as part of the fivethirtyeight R package (Ismay and Chunn 2017). Great data stories don’t mislead the reader, but rather engulf them in understanding the importance that data plays in our lives through the captivation of storytelling. 12.2 Examples Concluding remarks If you’ve come to this point in the book, I’d suspect that you know a thing or two about how to work with data in R. You’ve also gained a lot of knowledge about how to use simulation techniques to determine statistical significance and how these techniques build an intuition about traditional inferential methods like the \\(t\\)-test. The hope is that you’ve come to appreciate data wrangling, tidy datasets, and the power of data visualization. Actually, the data visualization part may be the most important thing here. If you can create truly beautiful graphics that display information in ways that the reader can clearly decipher, you’ve picked up a great skill. Let’s hope that that skill keeps you creating great stories with data into the near and far distant future. Thanks for coming along for the ride as we dove into modern data analysis using R! "], +["12-thinking-with-data.html", "12 Thinking with Data 12.1 Effective Data Storytelling 12.2 Examples Concluding remarks", " 12 Thinking with Data Note: This chapter is still under construction. If you would like to contribute, please check us out on GitHub at https://github.com/moderndive/moderndive_book. 12.1 Effective Data Storytelling As we’ve progressed throughout this book, you’ve seen how to work with data in a variety of ways. You’ve learned effective strategies for plotting data by understanding which types of plots work best for which combinations of variable types. You’ve summarized data in table form and calculated summary statistics for a variety of different variables. Further, you’ve seen the value of inference as a process to come to conclusions about a population by using a random sample. Lastly, you’ve explored how to use linear regression and the importance of checking the conditions required to make it a valid procedure. 
All throughout, you’ve learned many computational techniques and focused on reproducible research in writing R code and keeping track of your work in R Markdown. All of these steps go into making a great story using data. As the textbook comes to a close, we thought it best that you explore what stellar work is being produced by data journalists throughout the world that specialize in effective data storytelling. We recommend you read and analyze this article by Walt Hickey entitled The Dollar-And-Cents Case Against Hollywood’s Exclusion of Women. As you read over it, think carefully about how Walt is using data, graphics, and analyses to paint the picture for the reader of what the story is he wants to tell. In the spirit of reproducibility, the members of FiveThirtyEight have also shared the data that they used to create this story and some R code here. A vignette showing how to reproduce one of the plots at the end of the article using dplyr, ggplot2, and other packages in Hadley’s tidyverse is available here as part of the fivethirtyeight R package (Kim, Ismay, and Chunn 2017). Great data stories don’t mislead the reader, but rather engulf them in understanding the importance that data plays in our lives through the captivation of storytelling. 12.2 Examples Concluding remarks If you’ve come to this point in the book, I’d suspect that you know a thing or two about how to work with data in R. You’ve also gained a lot of knowledge about how to use simulation techniques to determine statistical significance and how these techniques build an intuition about traditional inferential methods like the \\(t\\)-test. The hope is that you’ve come to appreciate data wrangling, tidy datasets, and the power of data visualization. Actually, the data visualization part may be the most important thing here. If you can create truly beautiful graphics that display information in ways that the reader can clearly decipher, you’ve picked up a great skill. Let’s hope that that skill keeps you creating great stories with data into the near and far distant future. Thanks for coming along for the ride as we dove into modern data analysis using R! "], ["A-appendixA.html", "A Statistical Background A.1 Basic statistical terms", " A Statistical Background A.1 Basic statistical terms A.1.1 Mean The mean is the most commonly reported measure of center. It is commonly called the “average” though this term can be a little ambiguous. The mean is the sum of all of the data elements divided by how many elements there are. If we have \\(n\\) data points, the mean is given by: \\[Mean = \\frac{x_1 + x_2 + \\cdots + x_n}{n}\\] A.1.2 Median The median is calculated by first sorting a variable’s data from smallest to largest. After sorting the data, the middle element in the list is the median. If the middle falls between two values, then the median is the mean of those two values. A.1.3 Standard deviation We will next discuss the standard deviation of a sample dataset pertaining to one variable. The formula can be a little intimidating at first but it is important to remember that it is essentially a measure of how far to expect a given data value is from its mean: \\[Standard \\, deviation = \\sqrt{\\frac{(x_1 - Mean)^2 + (x_2 - Mean)^2 + \\cdots + (x_n - Mean)^2}{n - 1}}\\] A.1.4 Five-number summary The five-number summary consists of five values: minimum, first quantile (25th percentile), median (50th percentile), third quantile (75th) quantile, and maximum. 
The quantiles are calculated as first quantile (\\(Q_1\\)): the median of the first half of the sorted data third quantile (\\(Q_3\\)): the median of the second half of the sorted data The interquartile range is defined as \\(Q_3 - Q_1\\) and is a measure of how spread out the middle 50% of values is. The five-number summary is not influenced by the presence of outliers in the ways that the mean and standard deviation are. It is, thus, recommended for skewed datasets. A.1.5 Distribution The distribution of a variable/dataset describes the general patterns that appear in the dataset. It often shows how frequently elements in the dataset appear. It shows how the data varies and gives some information about where a typical element in the data might fall. Distributions are most easily seen through data visualization. A.1.6 Outliers Outliers correspond to values in the dataset that fall far outside the range of “ordinary” values. In regard to a boxplot (by default), they correspond to values below \\(Q_1 - (1.5 * IQR)\\) or above \\(Q_3 + (1.5 * IQR)\\). Note that these terms (aside from Distribution) only apply to quantitative variables. "], ["B-appendixB.html", "B Inference Examples Needed packages B.1 Under construction… B.2 Inference mind map B.3 One mean B.4 One proportion B.5 Two proportions B.6 Two means (independent samples) B.7 Two means (paired samples)", " B Inference Examples This appendix is designed to provide you with examples of the five basic hypothesis tests and their corresponding confidence intervals. Traditional theory-based methods as well as computation-based methods are presented. Note: This appendix is still under construction. If you would like to contribute, please check us out on GitHub at https://github.com/moderndive/moderndive_book. Please check out our sneak peek of infer below in the meantime. For more details on infer visit https://infer.netlify.com/. Needed packages library(dplyr) library(ggplot2) library(mosaic) library(knitr) library(readr) B.1 Under construction… The content here will be deprecated with a shift to using the infer package in the months to come. B.2 Inference mind map To help you better navigate and choose the appropriate analysis, we've created a mind map on http://coggle.it available here and below. Figure B.1: Mind map for Inference B.3 One mean B.3.1 Problem statement The National Survey of Family Growth conducted by the Centers for Disease Control gathers information on family life, marriage and divorce, pregnancy, infertility, use of contraception, and men's and women's health. One of the variables collected on this survey is the age at first marriage. 5,534 randomly sampled US women between 2006 and 2010 completed the survey. The women sampled here had been married at least once. Do we have evidence that the mean age of first marriage for all US women from 2006 to 2010 is greater than 23 years? (Tweaked a bit from Diez, Barr, and Çetinkaya-Rundel 2014 [Chapter 4]) B.3.2 Competing hypotheses B.3.2.1 In words Null hypothesis: The mean age of first marriage for all US women from 2006 to 2010 is equal to 23 years. Alternative hypothesis: The mean age of first marriage for all US women from 2006 to 2010 is greater than 23 years. B.3.2.2 In symbols (with annotations) \\(H_0: \\mu = \\mu_{0}\\), where \\(\\mu\\) represents the mean age of first marriage for all US women from 2006 to 2010 and \\(\\mu_0\\) is 23. \\(H_A: \\mu > 23\\) B.3.2.3 Set \\(\\alpha\\) It's important to set the significance level before starting the testing using the data. 
Let’s set the significance level at 5% here. B.3.3 Exploring the sample data #download.file("http://ismayc.github.io/teaching/sample_problems/ageAtMar.csv", # destfile = "data/ageAtMar.csv", # method = "curl") ageAtMar <- read_csv("data/ageAtMar.csv") age_summ <- ageAtMar %>% summarize(sample_size = n(), mean = mean(age), sd = sd(age), minimum = min(age), lower_quartile = quantile(age, 0.25), median = median(age), upper_quartile = quantile(age, 0.75), max = max(age)) kable(age_summ) sample_size mean sd minimum lower_quartile median upper_quartile max 5534 23.4 4.72 10 20 23 26 43 The histogram below also shows the distribution of age. ageAtMar %>% ggplot(aes(x = age)) + geom_histogram(binwidth = 3, color = "white") B.3.3.1 Guess about statistical significance We are looking to see if the observed sample mean of 23.44 is statistically greater than \\(\\mu_0 = 23\\). They seem to be quite close, but we have a large sample size here. Let’s guess that the large sample size will lead us to reject this practically small difference. B.3.4 Non-traditional methods B.3.4.1 Bootstrapping for hypothesis test In order to look to see if the observed sample mean of 23.44 is statistically greater than \\(\\mu_0 = 23\\), we need to account for the sample size. We also need to determine a process that replicates how the original sample of size 5534 was selected. We can use the idea of bootstrapping to simulate the population from which the sample came and then generate samples from that simulated population to account for sampling variability. Recall how bootstrapping would apply in this context: Sample with replacement from our original sample of 5534 women and repeat this process 10,000 times, calculate the mean for each of the 10,000 bootstrap samples created in Step 1., combine all of these bootstrap statistics calculated in Step 2 into a boot_distn object, and shift the center of this distribution over to the null value of 23. (This is needed since it will be centered at 23.44 via the process of bootstrapping.) set.seed(2017) mu0 <- 23 shift <- mu0 - age_summ$mean null_distn_one_mean <- do(10000) * resample(ageAtMar, replace = TRUE) %>% mutate(age = age + shift) %>% summarize(mean_age = mean(age)) ggplot(null_distn_one_mean, aes(x = mean_age)) + geom_histogram(bins = 30, color = "white") We can next use this distribution to observe our \\(p\\)-value. Recall this is a right-tailed test so we will be looking for values that are greater than or equal to 23.44 for our \\(p\\)-value. obs_mean <- age_summ$mean ggplot(null_distn_one_mean, aes(x = mean_age)) + geom_histogram(bins = 30, color = "white") + geom_vline(color = "red", xintercept = obs_mean) B.3.4.1.1 Calculate \\(p\\)-value pvalue <- null_distn_one_mean %>% filter( mean_age >= obs_mean ) %>% nrow() / nrow(null_distn_one_mean) pvalue [1] 0 So our \\(p\\)-value is 0 and we reject the null hypothesis at the 5% level. You can also see this from the histogram above that we are far into the tail of the null distribution. B.3.4.2 Bootstrapping for confidence interval We can also create a confidence interval for the unknown population parameter \\(\\mu\\) using our sample data using bootstrapping. Note that we don’t need to shift this distribution since we want the center of our confidence interval to be our point estimate \\(\\bar{x}_{obs} = 23.44\\). 
boot_distn_one_mean <- do(10000) * resample(ageAtMar, replace = TRUE) %>% summarize(mean_age = mean(age)) ggplot(boot_distn_one_mean, aes(x = mean_age)) + geom_histogram(bins = 30, color = "white") boot_distn_one_mean %>% summarize(lower = quantile(mean_age, probs = 0.025), upper = quantile(mean_age, probs = 0.975)) lower upper 1 23.3 23.6 We see that 23 is not contained in this confidence interval as a plausible value of \\(\\mu\\) (the unknown population mean) and the entire interval is larger than 23. This matches with our hypothesis test results of rejecting the null hypothesis in favor of the alternative (\\(\\mu > 23\\)). Interpretation: We are 95% confident the true mean age of first marriage for all US women from 2006 to 2010 is between and . B.3.5 Traditional methods B.3.5.1 Check conditions Remember that in order to use the shortcut (formula-based, theoretical) approach, we need to check that some conditions are met. Independent observations: The observations are collected independently. The cases are selected independently through random sampling so this condition is met. Approximately normal: The distribution of the response variable should be normal or the sample size should be at least 30. The histogram for the sample above does show some skew. The Q-Q plot below also shows some skew. ggplot(data = ageAtMar, mapping = aes(sample = age)) + stat_qq() The sample size here is quite large though (\\(n = 5534\\)) so both conditions are met. B.3.5.2 Test statistic The test statistic is a random variable based on the sample data. Here, we want to look at a way to estimate the population mean \\(\\mu\\). A good guess is the sample mean \\(\\bar{X}\\). Recall that this sample mean is actually a random variable that will vary as different samples are (theoretically, would be) collected. We are looking to see how likely is it for us to have observed a sample mean of \\(\\bar{x}_{obs} = 23.44\\) or larger assuming that the population mean is 23 (assuming the null hypothesis is true). If the conditions are met and assuming \\(H_0\\) is true, we can “standardize” this original test statistic of \\(\\bar{X}\\) into a \\(T\\) statistic that follows a \\(t\\) distribution with degrees of freedom equal to \\(df = n - 1\\): \\[ T =\\dfrac{ \\bar{X} - \\mu_0}{ S / \\sqrt{n} } \\sim t (df = n - 1) \\] where \\(S\\) represents the standard deviation of the sample and \\(n\\) is the sample size. B.3.5.2.1 Observed test statistic While one could compute this observed test statistic by “hand”, the focus here is on the set-up of the problem and in understanding which formula for the test statistic applies. We can use the t.test function to perform this analysis for us. t.test(x = ageAtMar$age, alternative = "greater", mu = 23) One Sample t-test data: ageAtMar$age t = 7, df = 6000, p-value = 0.000000000002 alternative hypothesis: true mean is greater than 23 95 percent confidence interval: 23.3 Inf sample estimates: mean of x 23.4 We see here that the \\(t_{obs}\\) value is around 6.94. Recall that for large sample sizes the \\(t\\) distribution is essentially the standard normal distribution and this is why the statistic is reported as Z. B.3.5.3 Compute \\(p\\)-value The \\(p\\)-value—the probability of observing an \\(t_{obs}\\) value of 6.94 or more in our null distribution of a \\(t\\) with 5433 degrees of freedom—is essentially 0. 
This can also be calculated in R directly: pt(6.936, df = nrow(ageAtMar) - 1, lower.tail = FALSE) [1] 0.00000000000225 We can also use the \\(N(0, 1)\\) distribution here: pnorm(6.936, lower.tail = FALSE) [1] 0.00000000000202 B.3.5.4 State conclusion We, therefore, have sufficient evidence to reject the null hypothesis. Our initial guess that our observed sample mean was statistically greater than the hypothesized mean has supporting evidence here. Based on this sample, we have evidence that the mean age of first marriage for all US women from 2006 to 2010 is greater than 23 years. B.3.5.5 Confidence interval The confidence interval reported above with t.test is known as a one-sided confidence interval and gives the lowest value one could expect \\(\\mu\\) to be with 95% confidence. We usually want a range of values so we can use alternative = "two.sided" to get values similar to those from the bootstrapping process: t.test(x = ageAtMar$age, alternative = "two.sided", mu = 23)$conf [1] 23.3 23.6 attr(,"conf.level") [1] 0.95 B.3.6 Comparing results Observing the bootstrap distributions that were created, it makes quite a bit of sense that the results are so similar for traditional and non-traditional methods in terms of the \\(p\\)-value and the confidence interval since these distributions look very similar to normal distributions. The conditions also being met (the large sample size was the driver here) leads us to expect that any of the methods, whether traditional (formula-based) or non-traditional (computation-based), will lead to similar results. B.4 One proportion B.4.1 Problem statement The CEO of a large electric utility claims that 80 percent of his 1,000,000 customers are satisfied with the service they receive. To test this claim, the local newspaper surveyed 100 customers, using simple random sampling. 73 were satisfied and the remaining were unsatisfied. Based on these findings from the sample, can we reject the CEO's hypothesis that 80% of the customers are satisfied? [Tweaked a bit from http://stattrek.com/hypothesis-test/proportion.aspx?Tutorial=AP] B.4.2 Competing hypotheses B.4.2.1 In words Null hypothesis: The proportion of all customers of the large electric utility satisfied with the service they receive is equal to 0.80. Alternative hypothesis: The proportion of all customers of the large electric utility satisfied with the service they receive is different from 0.80. B.4.2.2 In symbols (with annotations) \\(H_0: \\pi = p_{0}\\), where \\(\\pi\\) represents the proportion of all customers of the large electric utility satisfied with the service they receive and \\(p_0\\) is 0.8. \\(H_A: \\pi \\ne 0.8\\) B.4.2.3 Set \\(\\alpha\\) It's important to set the significance level before starting the testing using the data. Let's set the significance level at 5% here. B.4.3 Exploring the sample data elec <- c(rep("satisfied", 73), rep("unsatisfied", 27)) %>% as_data_frame() %>% rename("satisfy" = value) The bar graph below also shows the distribution of satisfy. ggplot(data = elec, aes(x = satisfy)) + geom_bar() B.4.3.1 Guess about statistical significance We are looking to see if the sample proportion of 0.73 is statistically different from \\(p_0 = 0.8\\) based on this sample. They seem to be quite close, and our sample size is not huge here (\\(n = 100\\)). Let's guess that we do not have evidence to reject the null hypothesis. 
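Before simulating, the observed sample proportion can be computed directly from the elec data frame. This is a sketch that is not in the original text; it uses the same satisfy == "satisfied" comparison that appears later in this section.
# The observed proportion of satisfied customers: 73 out of 100.
elec %>%
  summarize(p_hat = mean(satisfy == "satisfied"))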
B.4.4 Non-traditional methods B.4.4.1 Simulation for hypothesis test In order to look to see if 0.73 is statistically different from 0.8, we need to account for the sample size. We also need to determine a process that replicates how the original sample of size 100 was selected. We can use the idea of an unfair coin to simulate this process. We will simulate flipping an unfair coin (with probability of success 0.8 matching the null hypothesis) 100 times. Then we will keep track of how many heads come up in those 100 flips. Our simulated statistic matches with how we calculated the original statistic \\(\\hat{p}\\): the number of heads (satisfied) out of our total sample of 100. We then repeat this process many times (say 10,000) to create the null distribution looking at the simulated proportions of successes: set.seed(2017) null_distn_one_prop <- do(10000) * rflip(100, prob = 0.8) ggplot(null_distn_one_prop, aes(x = prop)) + geom_histogram(bins = 30, color = "white") We can next use this distribution to observe our \\(p\\)-value. Recall this is a two-tailed test so we will be looking for values that are 0.8 - 0.73 = 0.07 away from 0.8 in BOTH directions for our \\(p\\)-value: p_hat <- 73/100 dist <- 0.8 - p_hat ggplot(null_distn_one_prop, aes(x = prop)) + geom_histogram(bins = 30, color = "white") + geom_vline(color = "red", xintercept = 0.8 + dist) + geom_vline(color = "red", xintercept = p_hat) B.4.4.1.1 Calculate \\(p\\)-value pvalue <- null_distn_one_prop %>% filter( (prop >= 0.8 + dist) | (prop <= p_hat) ) %>% nrow() / nrow(null_distn_one_prop) pvalue [1] 0.082 So our \\(p\\)-value is 0.082 and we fail to reject the null hypothesis at the 5% level. B.4.4.2 Bootstrapping for confidence interval We can also create a confidence interval for the unknown population parameter \\(\\pi\\) using our sample data. To do so, we use bootstrapping, which involves sampling with replacement from our original sample of 100 survey respondents and repeating this process 10,000 times, calculating the proportion of successes for each of the 10,000 bootstrap samples created in Step 1., combining all of these bootstrap statistics calculated in Step 2 into a boot_distn object, identifying the 2.5th and 97.5th percentiles of this distribution (corresponding to the 5% significance level chosen) to find a 95% confidence interval for \\(\\pi\\), and interpret this confidence interval in the context of the problem. boot_distn_one_prop <- do(10000) * (elec %>% resample(size = 100, replace = TRUE) )%>% summarize(success_rate = mean(satisfy == "satisfied")) Just as we use the mean function for calculating the mean over a numerical variable, we can also use it to compute the proportion of successes for a categorical variable where we specify what we are calling a “success” after the ==. (Think about the formula for calculating a mean and how R handles logical statements such as satisfy == "satisfied" for why this must be true.) ggplot(boot_distn_one_prop, aes(x = success_rate)) + geom_histogram(bins = 30, color = "white") boot_distn_one_prop %>% summarize(lower = quantile(success_rate, probs = 0.025), upper = quantile(success_rate, probs = 0.975)) lower upper 1 0.64 0.82 We see that 0.80 is contained in this confidence interval as a plausible value of \\(\\pi\\) (the unknown population proportion). This matches with our hypothesis test results of failing to reject the null hypothesis. 
Interpretation: We are 95% confident the true proportion of customers who are satisfied with the service they receive is between 0.64 and 0.82. Note: You could also use the null distribution with a shift to have its center at \\(\\hat{p} = 0.73\\) instead of at \\(p_0 = 0.8\\) and calculate its percentiles. The confidence interval produced via this method should be comparable to the one done using bootstrapping above. B.4.5 Traditional methods B.4.5.1 Check conditions Remember that in order to use the shortcut (formula-based, theoretical) approach, we need to check that some conditions are met. Independent observations: The observations are collected independently. The cases are selected independently through random sampling so this condition is met. Approximately normal: The number of expected successes and expected failures is at least 10. This condition is met since 73 and 27 are both greater than 10. B.4.5.2 Test statistic The test statistic is a random variable based on the sample data. Here, we want to look at a way to estimate the population proportion \\(\\pi\\). A good guess is the sample proportion \\(\\hat{P}\\). Recall that this sample proportion is actually a random variable that will vary as different samples are (theoretically, would be) collected. We are looking to see how likely it is for us to have observed a sample proportion as extreme as \\(\\hat{p}_{obs} = 0.73\\) assuming that the population proportion is 0.80 (assuming the null hypothesis is true). If the conditions are met and assuming \\(H_0\\) is true, we can standardize this original test statistic of \\(\\hat{P}\\) into a \\(Z\\) statistic that follows a \\(N(0, 1)\\) distribution. \\[ Z =\\dfrac{ \\hat{P} - p_0}{\\sqrt{\\dfrac{p_0(1 - p_0)}{n} }} \\sim N(0, 1) \\] B.4.5.2.1 Observed test statistic While one could compute this observed test statistic by “hand” by plugging the observed values into the formula, the focus here is on the set-up of the problem and in understanding which formula for the test statistic applies. The calculation has been done in R below for completeness though: p_hat <- 0.73 p0 <- 0.8 n <- 100 (z_obs <- (p_hat - p0) / sqrt( (p0 * (1 - p0)) / n)) [1] -1.75 We see here that the \\(z_{obs}\\) value is around -1.75. Our observed sample proportion of 0.73 is 1.75 standard errors below the hypothesized parameter value of 0.8. B.4.5.3 Compute \\(p\\)-value 2 * pnorm(z_obs) [1] 0.0801 The \\(p\\)-value—the probability of observing a \\(z_{obs}\\) value of -1.75 or more extreme (in both directions) in our null distribution—is around 8%. Note that we could also do this test directly using the prop.test function. stats::prop.test(x = table(elec$satisfy), n = length(elec$satisfy), alternative = "two.sided", p = 0.8, correct = FALSE) 1-sample proportions test without continuity correction data: table(elec$satisfy), null probability 0.8 X-squared = 3, df = 1, p-value = 0.08 alternative hypothesis: true p is not equal to 0.8 95 percent confidence interval: 0.636 0.807 sample estimates: p 0.73 prop.test does a \\(\\chi^2\\) test here but this matches up exactly with what we would expect: \\(\\chi^2_{obs} = 3.06 = (-1.75)^2 = (z_{obs})^2\\) and the \\(p\\)-values are the same because we are focusing on a two-tailed test. Note that the 95 percent confidence interval given above matches well with the one calculated using bootstrapping. B.4.5.4 State conclusion We, therefore, do not have sufficient evidence to reject the null hypothesis.
Our initial guess that our observed sample proportion was not statistically different from the hypothesized proportion has not been invalidated. Based on this sample, we do not have evidence that the proportion of all customers of the large electric utility satisfied with the service they receive is different from 0.80, at the 5% level. B.4.6 Comparing results Observing the bootstrap distribution and the null distribution that were created, it makes quite a bit of sense that the results are so similar for traditional and non-traditional methods in terms of the \\(p\\)-value and the confidence interval since these distributions look very similar to normal distributions. The conditions also being met leads us to expect that using any of the methods, whether they are traditional (formula-based) or non-traditional (computational-based), will lead to similar results. B.5 Two proportions B.5.1 Problem statement A 2010 survey asked 827 randomly sampled registered voters in California “Do you support? Or do you oppose? Drilling for oil and natural gas off the Coast of California? Or do you not know enough to say?” Conduct a hypothesis test to determine if the data provide strong evidence that the proportion of college graduates who do not have an opinion on this issue is different than that of non-college graduates. (Tweaked a bit from Diez, Barr, and Çetinkaya-Rundel 2014 [Chapter 6]) B.5.2 Competing hypotheses B.5.2.1 In words Null hypothesis: There is no association between having an opinion on drilling and having a college degree for all registered California voters in 2010. Alternative hypothesis: There is an association between having an opinion on drilling and having a college degree for all registered California voters in 2010. B.5.2.2 Another way in words Null hypothesis: The probability that a California voter in 2010 who is a college graduate has no opinion on drilling is the same as that probability for a non-college graduate. Alternative hypothesis: These parameter probabilities are different. B.5.2.3 In symbols (with annotations) \\(H_0: \\pi_{college} = \\pi_{no\\_college}\\) or \\(H_0: \\pi_{college} - \\pi_{no\\_college} = 0\\), where \\(\\pi\\) represents the probability of not having an opinion on drilling. \\(H_A: \\pi_{college} - \\pi_{no\\_college} \\ne 0\\) B.5.2.4 Set \\(\\alpha\\) It’s important to set the significance level before starting the testing using the data. Let’s set the significance level at 5% here. B.5.3 Exploring the sample data #download.file("http://ismayc.github.io/teaching/sample_problems/offshore.csv", # destfile = "data/offshore.csv", # method = "curl") offshore <- read_csv("data/offshore.csv") table(offshore$college_grad, offshore$response) no opinion opinion no 131 258 yes 104 334 off_summ <- offshore %>% group_by(college_grad) %>% summarize(prop_no_opinion = mean(response == "no opinion"), sample_size = n()) ggplot(offshore, aes(x = college_grad, fill = response)) + geom_bar(position = "fill") + coord_flip() B.5.3.1 Guess about statistical significance We are looking to see if a difference exists in the heights of the bars corresponding to no opinion in the plot. Based solely on the plot, we have little reason to believe that a difference exists since the bars seem to be about the same height, BUT…it’s important to use statistics to see if that difference is actually statistically significant!
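To attach numbers to those bar heights, here is a brief sketch (reusing the offshore data and the two-way table above) that converts the counts into row proportions; the “no opinion” column is the quantity the test below focuses on:
# margin = 1 divides each row by its row total, giving the proportion of
# "no opinion" and "opinion" responses within each college_grad group
prop.table(table(offshore$college_grad, offshore$response), margin = 1)
# the same per-group "no opinion" proportions are also available in off_summ
off_summ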
B.5.4 Non-traditional methods B.5.4.1 Collecting summary info Next we will assign some key values to variable names in R: phat_nograd <- off_summ$prop_no_opinion[1] phat_grad <- off_summ$prop_no_opinion[2] obs_diff <- phat_grad - phat_nograd n_nograd <- off_summ$sample_size[1] n_grad <- off_summ$sample_size[2] B.5.4.2 Randomization for hypothesis test In order to look to see if the observed sample proportion of no opinion for non-college graduates of 0.337 is statistically different than that for college graduates of 0.237, we need to account for the sample sizes. Note that this is the same as looking to see if \\(\\hat{p}_{grad} - \\hat{p}_{nograd}\\) is statistically different than 0. We also need to determine a process that replicates how the original group sizes of 389 and 438 were selected. We can use the idea of randomization testing (also known as permutation testing) to simulate the population from which the sample came (with two groups of different sizes) and then generate samples using shuffling from that simulated population to account for sampling variability. set.seed(2017) many_shuffles <- do(10000) * (offshore %>% mutate(college_grad = shuffle(college_grad)) %>% group_by(college_grad) %>% summarize(prop_no_opinion = mean(response == "no opinion")) ) null_distn_two_props <- many_shuffles %>% group_by(.index) %>% summarize(diffprop = diff(prop_no_opinion)) ggplot(null_distn_two_props, aes(x = diffprop)) + geom_histogram(bins = 25, color = "white") We can next use this distribution to observe our \\(p\\)-value. Recall this is a two-tailed test so we will be looking for values that are less than or equal to -0.099 or greater than or equal to 0.099 for our \\(p\\)-value. ggplot(null_distn_two_props, aes(x = diffprop)) + geom_histogram(bins = 20, color = "white") + geom_vline(color = "red", xintercept = obs_diff) + geom_vline(color = "red", xintercept = -obs_diff) B.5.4.2.1 Calculate \\(p\\)-value pvalue <- null_distn_two_props %>% filter( (diffprop <= obs_diff) | (diffprop >= -obs_diff) ) %>% nrow() / nrow(null_distn_two_props) pvalue [1] 0.0021 So our \\(p\\)-value is 0.002 and we reject the null hypothesis at the 5% level. You can also see from the histogram above that we are far into the tails of the null distribution. B.5.4.3 Bootstrapping for confidence interval We can also create a confidence interval for the unknown population parameter \\(\\pi_{college} - \\pi_{no\\_college}\\) using our sample data with bootstrapping. Here we will bootstrap each of the groups with replacement instead of shuffling. This is done using the groups argument in the resample function to fix the size of each group to be the same as the original group sizes of 389 for non-college graduates and 438 for college graduates.
boot_props <- do(10000) * offshore %>% resample(replace = TRUE, groups = college_grad) %>% group_by(college_grad) %>% summarize(prop_no_opinion = mean(response == "no opinion")) # Next, we calculate the difference in sample proportions for each of the 10,000 replications: boot_distn_two_props <- boot_props %>% group_by(.index) %>% summarize(diffprop = diff(prop_no_opinion)) ggplot(boot_distn_two_props, aes(x = diffprop)) + geom_histogram(bins = 30, color = "white") ci_boot <- boot_distn_two_props %>% summarize(lower = quantile(diffprop, probs = 0.025), upper = quantile(diffprop, probs = 0.975)) ci_boot # A tibble: 1 x 2 lower upper <dbl> <dbl> 1 -0.162 -0.0368 We see that 0 is not contained in this confidence interval as a plausible value of \\(\\pi_{college} - \\pi_{no\\_college}\\) (the unknown population parameter). This matches with our hypothesis test results of rejecting the null hypothesis. Since zero is not a plausible value of the population parameter, we have evidence that the proportion of college graduates in California with no opinion on drilling is different than that of non-college graduates. Interpretation: We are 95% confident the true proportion of non-college graduates with no opinion on offshore drilling in California is between 0.04 and 0.16 larger than that of college graduates. Note: You could also use the null distribution based on randomization with a shift to have its center at \\(\\hat{p}_{college} - \\hat{p}_{no\\_college} = -0.1\\) instead of at 0 and calculate its percentiles. The confidence interval produced via this method should be comparable to the one done using bootstrapping above. B.5.5 Traditional methods B.5.6 Check conditions Remember that in order to use the short-cut (formula-based, theoretical) approach, we need to check that some conditions are met. Independent observations: Each case that was selected must be independent of all the other cases selected. This condition is met since cases were selected at random to observe. Sample size: The number of pooled successes and pooled failures must be at least 10 for each group. We need to first figure out the pooled success rate: \\[\\hat{p}_{obs} = \\dfrac{131 + 104}{827} = 0.28.\\] We now determine expected (pooled) success and failure counts: \\(0.28 \\cdot (131 + 258) = 108.92\\), \\(0.72 \\cdot (131 + 258) = 280.08\\) \\(0.28 \\cdot (104 + 334) = 122.64\\), \\(0.72 \\cdot (104 + 334) = 315.36\\) All of these expected counts exceed 10, so this condition is met. Independent selection of samples: The cases are not paired in any meaningful way. We have no reason to suspect that a college graduate selected would have any relationship to a non-college graduate selected. B.5.7 Test statistic The test statistic is a random variable based on the sample data. Here, we are interested in seeing if our observed difference in sample proportions corresponding to no opinion on drilling (\\(\\hat{p}_{college, obs} - \\hat{p}_{no\\_college, obs}\\) = -0.099) is statistically different than 0.
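As a preview of the standardization in the next step, here is a minimal sketch (reusing obs_diff, n_nograd, and n_grad defined earlier) that computes the pooled proportion and the resulting standardized statistic by hand:
# pooled proportion of "no opinion" responses across both groups
p_hat_pool <- (131 + 104) / 827
# standard error of the difference under the null hypothesis, using the pooled estimate
se_pool <- sqrt(p_hat_pool * (1 - p_hat_pool) * (1 / n_nograd + 1 / n_grad))
# standardized observed difference; roughly -3.16 here, with the sign depending on the
# order of subtraction (prop.test below reports the equivalent chi-squared of about 10)
obs_diff / se_pool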
Assuming that conditions are met and the null hypothesis is true, we can use the standard normal distribution to standardize the difference in sample proportions (\\(\\hat{P}_{college} - \\hat{P}_{no\\_college}\\)) using the standard error of \\(\\hat{P}_{college} - \\hat{P}_{no\\_college}\\) and the pooled estimate: \\[ Z =\\dfrac{ (\\hat{P}_1 - \\hat{P}_2) - 0}{\\sqrt{\\dfrac{\\hat{P}(1 - \\hat{P})}{n_1} + \\dfrac{\\hat{P}(1 - \\hat{P})}{n_2} }} \\sim N(0, 1) \\] where \\(\\hat{P} = \\dfrac{\\text{total number of successes} }{ \\text{total number of cases}}.\\) B.5.7.1 Observed test statistic While one could compute this observed test statistic by “hand”, the focus here is on the set-up of the problem and in understanding which formula for the test statistic applies. We can use the prop.test function to perform this analysis for us. stats::prop.test(x = table(offshore$college_grad, offshore$response), n = nrow(offshore), alternative = "two.sided", correct = FALSE) 2-sample test for equality of proportions without continuity correction data: table(offshore$college_grad, offshore$response) X-squared = 10, df = 1, p-value = 0.002 alternative hypothesis: two.sided 95 percent confidence interval: 0.0377 0.1609 sample estimates: prop 1 prop 2 0.337 0.237 prop.test does a \\(\\chi^2\\) test here but this matches up exactly with what we would expect from the test statistic above since \\(Z^2 = \\chi^2\\) so \\(\\sqrt{9.99} = 3.16 = z_{obs}\\). The \\(p\\)-values are the same because we are focusing on a two-tailed test. The observed difference in sample proportions is 3.16 standard errors away from 0. The \\(p\\)-value—the probability of observing a \\(Z\\) value of 3.16 or more extreme in our null distribution—is 0.0016. This can also be calculated in R directly: 2 * pnorm(3.16, lower.tail = FALSE) [1] 0.00158 The 95% confidence interval is also stated above in the prop.test results. B.5.8 State conclusion We, therefore, have sufficient evidence to reject the null hypothesis. Our initial guess that a statistically significant difference did not exist in the proportions of no opinion on offshore drilling between college educated and non-college educated Californians was not validated. We do have evidence to suggest that there is a dependency between college graduation and having an opinion on offshore drilling for Californians. B.5.9 Comparing results Observing the bootstrap distribution and the null distribution that were created, it makes quite a bit of sense that the results are so similar for traditional and non-traditional methods in terms of the \\(p\\)-value and the confidence interval since these distributions look very similar to normal distributions. The conditions also being met leads us to expect that using any of the methods, whether they are traditional (formula-based) or non-traditional (computational-based), will lead to similar results. B.6 Two means (independent samples) B.6.1 Problem statement Average income varies from one region of the country to another, and it often reflects both lifestyles and regional living expenses. Suppose a new graduate is considering a job in two locations, Cleveland, OH and Sacramento, CA, and he wants to see whether the average income in one of these cities is higher than the other. He would like to conduct a hypothesis test based on two randomly selected samples from the 2000 Census.
(Tweaked a bit from Diez, Barr, and Çetinkaya-Rundel 2014 [Chapter 5]) B.6.2 Competing hypotheses B.6.2.1 In words Null hypothesis: There is no association between income and location (Cleveland, OH and Sacramento, CA). Alternative hypothesis: There is an association between income and location (Cleveland, OH and Sacramento, CA). B.6.2.2 Another way in words Null hypothesis: The mean income is the same for both cities. Alternative hypothesis: The mean income is different for the two cities. B.6.2.3 In symbols (with annotations) \\(H_0: \\mu_{sac} = \\mu_{cle}\\) or \\(H_0: \\mu_{sac} - \\mu_{cle} = 0\\), where \\(\\mu\\) represents the average income. \\(H_A: \\mu_{sac} - \\mu_{cle} \\ne 0\\) B.6.2.4 Set \\(\\alpha\\) It’s important to set the significance level before starting the testing using the data. Let’s set the significance level at 5% here. B.6.3 Exploring the sample data inc_summ <- cleSac %>% group_by(metro_area) %>% summarize(sample_size = n(), mean = mean(income), sd = sd(income), minimum = min(income), lower_quartile = quantile(income, 0.25), median = median(income), upper_quartile = quantile(income, 0.75), max = max(income)) kable(inc_summ) metro_area sample_size mean sd minimum lower_quartile median upper_quartile max Cleveland_ OH 212 27467 27681 0 8475 21000 35275 152400 Sacramento_ CA 175 32428 35774 0 8050 20000 49350 206900 The boxplot below also shows the mean for each group highlighted by the red dots. ggplot(cleSac, aes(x = metro_area, y = income)) + geom_boxplot() + stat_summary(fun.y = "mean", geom = "point", color = "red") B.6.3.1 Guess about statistical significance We are looking to see if a difference exists in the mean income of the two levels of the explanatory variable. Based solely on the boxplot, we have reason to believe that no difference exists. The distributions of income seem similar and the means fall in roughly the same place. B.6.4 Non-traditional methods B.6.4.1 Collecting summary info Next we will assign some key values to variable names in R: xbar_cle <- inc_summ$mean[1] xbar_sac <- inc_summ$mean[2] obs_diff <- xbar_sac - xbar_cle n_cle <- inc_summ$sample_size[1] n_sac <- inc_summ$sample_size[2] B.6.4.2 Randomization for hypothesis test In order to look to see if the observed sample mean for Sacramento of 32427.543 is statistically different than that for Cleveland of 27467.066, we need to account for the sample sizes. Note that this is the same as looking to see if \\(\\bar{x}_{sac} - \\bar{x}_{cle}\\) is statistically different than 0. We also need to determine a process that replicates how the original group sizes of 212 and 175 were selected. We can use the idea of randomization testing (also known as permutation testing) to simulate the population from which the sample came (with two groups of different sizes) and then generate samples using shuffling from that simulated population to account for sampling variability. set.seed(2017) many_shuffles <- do(10000) * (cleSac %>% mutate(metro_area = shuffle(metro_area)) %>% group_by(metro_area) %>% summarize(mean_inc = mean(income)) ) null_distn_two_means <- many_shuffles %>% group_by(.index) %>% summarize(diffmean = diff(mean_inc)) ggplot(null_distn_two_means, aes(x = diffmean)) + geom_histogram(bins = 30, color = "white") We can next use this distribution to observe our \\(p\\)-value. Recall this is a two-tailed test so we will be looking for values that are greater than or equal to 4960.477 or less than or equal to -4960.477 for our \\(p\\)-value.
ggplot(null_distn_two_means, aes(x = diffmean)) + geom_histogram(bins = 30, color = "white") + geom_vline(color = "red", xintercept = obs_diff) + geom_vline(color = "red", xintercept = -obs_diff) B.6.4.2.1 Calculate \\(p\\)-value pvalue <- null_distn_two_means %>% filter( (diffmean >= obs_diff) | (diffmean <= -obs_diff) ) %>% nrow() / nrow(null_distn_two_means) pvalue [1] 0.121 So our \\(p\\)-value is 0.121 and we fail to reject the null hypothesis at the 5% level. You can also see from the histogram above that we are not very far into the tail of the null distribution. B.6.4.3 Bootstrapping for confidence interval We can also create a confidence interval for the unknown population parameter \\(\\mu_{sac} - \\mu_{cle}\\) using our sample data with bootstrapping. Here we will bootstrap each of the groups with replacement instead of shuffling. This is done using the groups argument in the resample function to fix the size of each group to be the same as the original group sizes of 175 for Sacramento and 212 for Cleveland. boot_means <- do(10000) * cleSac %>% resample(replace = TRUE, groups = metro_area) %>% group_by(metro_area) %>% summarize(mean_inc = mean(income)) # Next, we calculate the difference in sample means for each of the 10,000 replications: boot_distn_two_means <- boot_means %>% group_by(.index) %>% summarize(diffmean = diff(mean_inc)) ggplot(boot_distn_two_means, aes(x = diffmean)) + geom_histogram(bins = 30, color = "white") ci_boot <- boot_distn_two_means %>% summarize(lower = quantile(diffmean, probs = 0.025), upper = quantile(diffmean, probs = 0.975)) ci_boot # A tibble: 1 x 2 lower upper <dbl> <dbl> 1 -1346 11441 We see that 0 is contained in this confidence interval as a plausible value of \\(\\mu_{sac} - \\mu_{cle}\\) (the unknown population parameter). This matches with our hypothesis test results of failing to reject the null hypothesis. Since zero is a plausible value of the population parameter, we do not have evidence that Sacramento incomes are different than Cleveland incomes. Interpretation: We are 95% confident the true mean yearly income for those living in Sacramento is between 1345.78 dollars lower and 11440.64 dollars higher than for those living in Cleveland. Note: You could also use the null distribution based on randomization with a shift to have its center at \\(\\bar{x}_{sac} - \\bar{x}_{cle} = \\$4960.48\\) instead of at 0 and calculate its percentiles. The confidence interval produced via this method should be comparable to the one done using bootstrapping above. B.6.5 Traditional methods B.6.5.0.1 Check conditions Remember that in order to use the short-cut (formula-based, theoretical) approach, we need to check that some conditions are met. Independent observations: The observations are independent in both groups. This condition is met for the metro_area groups since the cases are randomly selected from each city. Approximately normal: The distribution of the response for each group should be normal or the sample sizes should be at least 30. ggplot(cleSac, aes(x = income)) + geom_histogram(color = "white", binwidth = 20000) + facet_wrap(~ metro_area) We have some reason to doubt the normality assumption here since both histograms show deviation from a normal model for each group. The sample sizes for each group are greater than 100, though, so the assumptions should still apply. Independent samples: The samples should be collected without any natural pairing.
There is no mention of there being a relationship between those selected in Cleveland and in Sacramento. B.6.6 Test statistic The test statistic is a random variable based on the sample data. Here, we are interested in seeing if our observed difference in sample means (\\(\\bar{x}_{sac, obs} - \\bar{x}_{cle, obs}\\) = 4960.477) is statistically different than 0. Assuming that conditions are met and the null hypothesis is true, we can use the \\(t\\) distribution to standardize the difference in sample means (\\(\\bar{X}_{sac} - \\bar{X}_{cle}\\)) using the approximate standard error of \\(\\bar{X}_{sac} - \\bar{X}_{cle}\\) (invoking \\(S_{sac}\\) and \\(S_{cle}\\) as estimates of unknown \\(\\sigma_{sac}\\) and \\(\\sigma_{cle}\\)). \\[ T =\\dfrac{ (\\bar{X}_1 - \\bar{X}_2) - 0}{ \\sqrt{\\dfrac{S_1^2}{n_1} + \\dfrac{S_2^2}{n_2}} } \\sim t (df = min(n_1 - 1, n_2 - 1)) \\] where 1 = Sacramento and 2 = Cleveland with \\(S_1^2\\) and \\(S_2^2\\) the sample variance of the incomes of both cities, respectively, and \\(n_1 = 175\\) for Sacramento and \\(n_2 = 212\\) for Cleveland. B.6.6.1 Observed test statistic Note that we could also do (almost) this exact test directly using the t.test function. The x and y arguments are expected to both be numeric vectors here so we’ll need to appropriately filter our datasets. cleveland <- cleSac %>% filter(metro_area == "Cleveland_ OH") sacramento <- cleSac %>% filter(metro_area != "Cleveland_ OH") t.test(y = cleveland$income, x = sacramento$income, alternative = "two.sided") Welch Two Sample t-test data: sacramento$income and cleveland$income t = 2, df = 300, p-value = 0.1 alternative hypothesis: true difference in means is not equal to 0 95 percent confidence interval: -1543 11464 sample estimates: mean of x mean of y 32428 27467 Note that the degrees of freedom reported above are different than what we used above in specifying the Test Statistic. The degrees of freedom used here are based on the Satterthwaite approximation, which involves a quite complicated formula. For most problems, the much simpler “smaller sample size minus one” will suffice. While one could compute this observed test statistic by “hand”, the focus here is on the set-up of the problem and in understanding which formula for the test statistic applies. We see here that the observed test statistic value is around 1.5 in absolute value (its sign depends on which city’s mean is subtracted from the other) with \\(df = min(212 - 1, 175 - 1) = 174\\). Recall that for large degrees of freedom, the \\(t\\) distribution is roughly equal to the standard normal curve so our difference in df for the Satterthwaite and “min” variations doesn’t really matter. B.6.7 Compute \\(p\\)-value The \\(p\\)-value—the probability of observing a \\(t_{174}\\) value of 1.501 or more extreme (in both directions) in our null distribution—is 0.13. This can also be calculated in R directly: 2 * pt(-1.501, df = min(212 - 1, 175 - 1), lower.tail = TRUE) [1] 0.135 We can also approximate by using the standard normal curve: 2 * pnorm(-1.501) [1] 0.133 Note that the 95 percent confidence interval given above matches well with the one calculated using bootstrapping. B.6.8 State conclusion We, therefore, do not have sufficient evidence to reject the null hypothesis. Our initial guess that no statistically significant difference exists in the means was backed by this statistical analysis. We do not have evidence to suggest that the true mean income differs between Cleveland, OH and Sacramento, CA based on this data.
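For completeness, here is a small sketch (reusing xbar_sac, xbar_cle, n_sac, n_cle, and inc_summ from above) that plugs the sample statistics directly into the formula for \\(T\\):
s_cle <- inc_summ$sd[1]
s_sac <- inc_summ$sd[2]
# observed t statistic using the unpooled standard error; roughly 1.5 here,
# with df = min(n_sac - 1, n_cle - 1) = 174
(t_obs <- (xbar_sac - xbar_cle) / sqrt(s_sac^2 / n_sac + s_cle^2 / n_cle))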
B.6.9 Comparing results Observing the bootstrap distribution and the null distribution that were created, it makes quite a bit of sense that the results are so similar for traditional and non-traditional methods in terms of the \\(p\\)-value and the confidence interval since these distributions look very similar to normal distributions. The conditions also being met leads us to expect that using any of the methods, whether they are traditional (formula-based) or non-traditional (computational-based), will lead to similar results. B.7 Two means (paired samples) B.7.0.1 Problem statement Trace metals in drinking water affect the flavor and an unusually high concentration can pose a health hazard. Ten pairs of data were taken measuring zinc concentration in bottom water and surface water at 10 randomly selected locations on a stretch of river. Do the data suggest that the true average concentration in the surface water is smaller than that of bottom water? (Note that units are not given.) [Tweaked a bit from https://onlinecourses.science.psu.edu/stat500/node/51] B.7.1 Competing hypotheses B.7.1.1 In words Null hypothesis: The mean concentration in the bottom water is the same as that of the surface water at different paired locations. Alternative hypothesis: The mean concentration in the surface water is smaller than that of the bottom water at different paired locations. B.7.1.2 In symbols (with annotations) \\(H_0: \\mu_{diff} = 0\\), where \\(\\mu_{diff}\\) represents the mean difference in concentration for surface water minus bottom water. \\(H_A: \\mu_{diff} < 0\\) B.7.1.3 Set \\(\\alpha\\) It’s important to set the significance level before starting the testing using the data. Let’s set the significance level at 5% here. B.7.2 Exploring the sample data #download.file("http://ismayc.github.io/teaching/sample_problems/zinc_tidy.csv", # destfile = "data/zinc_tidy.csv", # method = "curl") zinc_tidy <- read_csv("data/zinc_tidy.csv") We want to look at the differences in surface - bottom for each location: zinc_diff <- zinc_tidy %>% group_by(loc_id) %>% summarize(pair_diff = diff(concentration)) zinc_summ <- zinc_diff %>% summarize(sample_size = n(), mean = mean(pair_diff), sd = sd(pair_diff), minimum = min(pair_diff), lower_quartile = quantile(pair_diff, 0.25), median = median(pair_diff), upper_quartile = quantile(pair_diff, 0.75), max = max(pair_diff)) kable(zinc_summ) sample_size mean sd minimum lower_quartile median upper_quartile max 10 -0.08 0.052 -0.177 -0.11 -0.084 -0.036 -0.015 The histogram below also shows the distribution of pair_diff. ggplot(zinc_diff, aes(x = pair_diff)) + geom_histogram(binwidth = 0.04, color = "white") B.7.2.1 Guess about statistical significance We are looking to see if the sample paired mean difference of -0.08 is statistically less than 0. The observed difference seems to be quite close to 0, and we have a small number of pairs here. Let’s guess that we will fail to reject the null hypothesis. B.7.3 Non-traditional methods B.7.3.1 Collecting summary info Next we will assign some key values to variable names in R: obs_diff <- zinc_summ$mean n_pairs <- zinc_summ$sample_size B.7.3.2 Randomization for hypothesis test In order to look to see if the observed sample mean difference \\(\\bar{x}_{diff} = -0.08\\) is statistically less than 0, we need to account for the number of pairs. We also need to determine a process that replicates how the paired data was selected in a way similar to how we calculated our original difference in sample means.
We can use the idea of randomization testing (also known as permutation testing) to simulate the population from which the sample came and then generate samples using shuffling from that simulated population to account for sampling variability. In this case, we will shuffle along each paired location. So values that were on the bottom of location 1 may now be switched to be on the surface or vice versa. set.seed(2017) many_shuffles <- do(10000) * (zinc_tidy %>% mutate(location = shuffle(location, groups = loc_id)) %>% group_by(loc_id) %>% summarize(pair_diff = diff(concentration)) ) null_distn_paired_means <- many_shuffles %>% group_by(.index) %>% summarize(mean_diff = mean(pair_diff)) ggplot(null_distn_paired_means, aes(x = mean_diff)) + geom_histogram(bins = 30, color = "white") We can next use this distribution to observe our \\(p\\)-value. Recall this is a left-tailed test so we will be looking for values that are less than or equal to -0.08 for our \\(p\\)-value. ggplot(null_distn_paired_means, aes(x = mean_diff)) + geom_histogram(bins = 30, color = "white") + geom_vline(color = "red", xintercept = obs_diff) B.7.3.2.1 Calculate \\(p\\)-value pvalue <- null_distn_paired_means %>% filter(mean_diff <= obs_diff) %>% nrow() / nrow(null_distn_paired_means) pvalue So our \\(p\\)-value is essentially 0 and we reject the null hypothesis at the 5% level. You can also see from the histogram above that we are far into the left tail of the null distribution. B.7.3.3 Bootstrapping for confidence interval We can also create a confidence interval for the unknown population parameter \\(\\mu_{diff}\\) using our sample data (the calculated differences) with bootstrapping. This is similar to the bootstrapping done in a one sample mean case, except now our data is differences instead of raw numerical data. boot_distn_paired_means <- do(10000) * resample(zinc_diff, replace = TRUE) %>% summarize(mean_diff = mean(pair_diff)) ggplot(boot_distn_paired_means, aes(x = mean_diff)) + geom_histogram(bins = 30, color = "white") boot_distn_paired_means %>% summarize(lower = quantile(mean_diff, probs = 0.025), upper = quantile(mean_diff, probs = 0.975)) lower upper 1 -0.112 -0.0506 We see that 0 is not contained in this confidence interval as a plausible value of \\(\\mu_{diff}\\) (the unknown population parameter). This matches with our hypothesis test results of rejecting the null hypothesis. Since zero is not a plausible value of the population parameter and since the entire confidence interval falls below zero, we have evidence that surface zinc concentration levels are lower, on average, than bottom level zinc concentrations. Interpretation: We are 95% confident the true mean zinc concentration on the surface is between 0.051 and 0.112 units smaller than on the bottom. Note: You could also use the null distribution based on randomization with a shift to have its center at \\(\\bar{x}_{diff} = -0.08\\) instead of at 0 and calculate its percentiles. The confidence interval produced via this method should be comparable to the one done using bootstrapping above. B.7.4 Traditional methods B.7.4.1 Check conditions Remember that in order to use the shortcut (formula-based, theoretical) approach, we need to check that some conditions are met. Independent observations: The observations among pairs are independent. The locations are selected independently through random sampling so this condition is met.
Approximately normal: The distribution of the population of differences is normal or the number of pairs is at least 30. The histogram above does show some skew so we have reason to doubt the population being normal based on this sample. We also only have 10 pairs which is fewer than the 30 needed. A theory-based test may not be valid here. B.7.4.2 Test statistic The test statistic is a random variable based on the sample data. Here, we want to look at a way to estimate the population mean difference \\(\\mu_{diff}\\). A good guess is the sample mean difference \\(\\bar{X}_{diff}\\). Recall that this sample mean is actually a random variable that will vary as different samples are (theoretically, would be) collected. We are looking to see how likely it is for us to have observed a sample mean difference of \\(\\bar{x}_{diff, obs} = -0.0804\\) or smaller assuming that the population mean difference is 0 (assuming the null hypothesis is true). If the conditions are met and assuming \\(H_0\\) is true, we can “standardize” this original test statistic of \\(\\bar{X}_{diff}\\) into a \\(T\\) statistic that follows a \\(t\\) distribution with degrees of freedom equal to \\(df = n - 1\\): \\[ T =\\dfrac{ \\bar{X}_{diff} - 0}{ S_{diff} / \\sqrt{n} } \\sim t (df = n - 1) \\] where \\(S_{diff}\\) represents the standard deviation of the sample differences and \\(n\\) is the number of pairs. B.7.4.2.1 Observed test statistic While one could compute this observed test statistic by “hand”, the focus here is on the set-up of the problem and in understanding which formula for the test statistic applies. We can use the t.test function on the differences to perform this analysis for us. stats::t.test(x = zinc_diff$pair_diff, alternative = "less", mu = 0) One Sample t-test data: zinc_diff$pair_diff t = -5, df = 9, p-value = 0.0004 alternative hypothesis: true mean is less than 0 95 percent confidence interval: -Inf -0.0501 sample estimates: mean of x -0.0804 We see here that the \\(t_{obs}\\) value is around -5. B.7.4.3 Compute \\(p\\)-value The \\(p\\)-value—the probability of observing a \\(t_{obs}\\) value of -5 or less in our null distribution of a \\(t\\) with 9 degrees of freedom—is 0.0004. This can also be calculated in R directly: pt(-5, df = nrow(zinc_diff) - 1, lower.tail = TRUE) [1] 0.000369 B.7.4.4 State conclusion We, therefore, have sufficient evidence to reject the null hypothesis. Our initial guess that our observed sample mean difference was not statistically less than the hypothesized mean of 0 has been invalidated here. Based on this sample, we have evidence that the mean concentration in the bottom water is greater than that of the surface water at different paired locations. B.7.5 Comparing results Observing the bootstrap distribution and the null distribution that were created, it makes quite a bit of sense that the results are so similar for traditional and non-traditional methods in terms of the \\(p\\)-value and the confidence interval since these distributions look very similar to normal distributions. The conditions were not met since the number of pairs was small, but the sample data was not highly skewed. Using any of the methods, whether they are traditional (formula-based) or non-traditional (computational-based), leads to similar results.
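An equivalent way to run the traditional test above, sketched here under the assumption that the location column of zinc_tidy uses the labels "bottom" and "surface" (the labels themselves are not shown in the output above), is to pass the two paired measurement vectors to t.test with paired = TRUE:
# the two measurement vectors must be aligned by location for the pairing to be valid
surface <- zinc_tidy %>% filter(location == "surface") %>% arrange(loc_id)
bottom <- zinc_tidy %>% filter(location == "bottom") %>% arrange(loc_id)
# paired = TRUE works with the per-location differences, just like zinc_diff above,
# so this should reproduce t of about -5 on 9 degrees of freedom
stats::t.test(x = surface$concentration, y = bottom$concentration, paired = TRUE, alternative = "less")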
"], ["C-appendixC.html", "C Reach for the Stars Needed packages C.1 Sorted barplots C.2 Interactive graphics", " C Reach for the Stars Needed packages library(dplyr) library(ggplot2) library(knitr) library(dygraphs) library(nycflights13) C.1 Sorted barplots Building upon the example in Section 3.8: flights_table <- table(flights$carrier) flights_table 9E AA AS B6 DL EV F9 FL HA MQ OO UA US 18460 32729 714 54635 48110 54173 685 3260 342 26397 32 58665 20536 VX WN YV 5162 12275 601 We can sort this table from highest to lowest counts by using the sort function: sorted_flights <- sort(flights_table, decreasing = TRUE) names(sorted_flights) [1] "UA" "B6" "EV" "DL" "AA" "MQ" "US" "9E" "WN" "VX" "FL" "AS" "F9" "YV" "HA" [16] "OO" It is often preferred for barplots to be ordered corresponding to the heights of the bars. This allows the reader to more easily compare the ordering of different airlines in terms of departed flights (Robbins 2013). We can also much more easily answer questions like “How many airlines have more departing flights than Southwest Airlines?”. We can use the sorted table giving the number of flights defined as sorted_flights to reorder the carrier. ggplot(data = flights, mapping = aes(x = carrier)) + geom_bar() + scale_x_discrete(limits = names(sorted_flights)) Figure C.1: Number of flights departing NYC in 2013 by airline - Descending numbers The last addition here specifies the values of the horizontal x axis on a discrete scale to correspond to those given by the entries of sorted_flights. C.2 Interactive graphics C.2.1 Interactive linegraphs Another useful tool for viewing linegraphs such as this is the dygraph function in the dygraphs package in combination with the dyRangeSelector function. This allows us to zoom in on a selected range and get an interactive plot for us to work with: library(dygraphs) flights_day <- mutate(flights, date = as.Date(time_hour)) flights_summarized <- flights_day %>% group_by(date) %>% summarize(median_arr_delay = median(arr_delay, na.rm = TRUE)) rownames(flights_summarized) <- flights_summarized$date flights_summarized <- select(flights_summarized, -date) dyRangeSelector(dygraph(flights_summarized)) The syntax here is a little different than what we have covered so far. The dygraph function is expecting for the dates to be given as the rownames of the object. We then remove the date variable from the flights_summarized data frame since it is accounted for in the rownames. Lastly, we run the dygraph function on the new data frame that only contains the median arrival delay as a column and then provide the ability to have a selector to zoom in on the interactive plot via dyRangeSelector. (Note that this plot will only be interactive in the HTML version of this book.) "],