From 7c41dce8e50544103f3a95be6493326499798a12 Mon Sep 17 00:00:00 2001 From: ismayc Date: Mon, 20 Aug 2018 09:29:03 -0700 Subject: [PATCH] Add bookdown build gitbook/pdf --- bib/packages.bib | 78 - docs/10-hypothesis-testing.html | 10 +- docs/11-inference-for-regression.html | 236 ++- docs/12-thinking-with-data.html | 126 +- docs/2-getting-started.html | 3 +- docs/3-viz.html | 689 ++++++- docs/4-tidy.html | 193 +- docs/5-wrangling.html | 408 ++++- docs/6-regression.html | 1605 ++++++++++++++++- docs/7-multiple-regression.html | 979 +++++++++- docs/8-sampling.html | 1094 ++++++++++- docs/9-confidence-intervals.html | 1044 ++++++++++- docs/A-appendixA.html | 3 +- docs/B-appendixB.html | 160 +- docs/C-appendixC.html | 7 +- docs/index.html | 7 +- docs/ismaykim.pdf | Bin 23258988 -> 23261949 bytes docs/ismaykim.tex | 113 +- .../figure-html/gapminder-1.png | Bin 119802 -> 121639 bytes docs/ismaykim_files/figure-html/jitter-1.png | Bin 105355 -> 102672 bytes .../figure-html/jitter-example-plot-2-1.png | Bin 43223 -> 43238 bytes .../figure-html/monthtempbox3-1.png | Bin 124637 -> 124270 bytes docs/libs/kePrint-0.0.1/kePrint.js | 4 + docs/references.html | 3 +- docs/scripts/03-visualization.R | 14 +- docs/scripts/04-tidy.R | 10 +- docs/scripts/05-wrangling.R | 20 +- docs/scripts/06-regression.R | 26 +- docs/scripts/07-multiple-regression.R | 20 +- docs/scripts/08-sampling.R | 22 +- docs/scripts/09-confidence-intervals.R | 11 +- docs/scripts/11-inference-for-regression.R | 4 +- docs/scripts/12-thinking-with-data.R | 4 +- docs/search_index.json | 22 +- 34 files changed, 6603 insertions(+), 312 deletions(-) create mode 100644 docs/libs/kePrint-0.0.1/kePrint.js diff --git a/bib/packages.bib b/bib/packages.bib index c7bb3b922..76cf6466a 100755 --- a/bib/packages.bib +++ b/bib/packages.bib @@ -6,13 +6,6 @@ @Manual{R-base year = {2018}, url = {https://www.R-project.org/}, } -@Manual{R-bindrcpp, - title = {bindrcpp: An 'Rcpp' Interface to Active Bindings}, - author = {Kirill Müller}, - year = {2018}, - note = {R package version 0.2.2}, - url = {https://CRAN.R-project.org/package=bindrcpp}, -} @Manual{R-bookdown, title = {bookdown: Authoring Books and Technical Documents with R Markdown}, author = {Yihui Xie}, @@ -20,13 +13,6 @@ @Manual{R-bookdown note = {R package version 0.7}, url = {https://CRAN.R-project.org/package=bookdown}, } -@Manual{R-broom, - title = {broom: Convert Statistical Analysis Objects into Tidy Tibbles}, - author = {David Robinson and Alex Hayes}, - year = {2018}, - note = {R package version 0.5.0}, - url = {https://CRAN.R-project.org/package=broom}, -} @Manual{R-dplyr, title = {dplyr: A Grammar of Data Manipulation}, author = {Hadley Wickham and Romain François and Lionel Henry and Kirill Müller}, @@ -48,20 +34,6 @@ @Manual{R-fivethirtyeight url = {https://github.com/rudeboybert/fivethirtyeight}, year = {2018}, } -@Manual{R-forcats, - title = {forcats: Tools for Working with Categorical Variables (Factors)}, - author = {Hadley Wickham}, - year = {2018}, - note = {R package version 0.3.0}, - url = {https://CRAN.R-project.org/package=forcats}, -} -@Manual{R-gapminder, - title = {gapminder: Data from Gapminder}, - author = {Jennifer Bryan}, - year = {2017}, - note = {R package version 0.3.0}, - url = {https://CRAN.R-project.org/package=gapminder}, -} @Manual{R-ggplot2, title = {ggplot2: Create Elegant Data Visualisations Using the Grammar of Graphics}, author = {Hadley Wickham and Winston Chang and Lionel Henry and Thomas Lin Pedersen and Kohske Takahashi and Claus Wilke and Kara Woo}, @@ 
-75,13 +47,6 @@ @Manual{R-ggplot2movies note = {R package version 0.0.1}, url = {https://CRAN.R-project.org/package=ggplot2movies}, } -@Manual{R-gridExtra, - title = {gridExtra: Miscellaneous Functions for "Grid" Graphics}, - author = {Baptiste Auguie}, - year = {2017}, - note = {R package version 2.3}, - url = {https://CRAN.R-project.org/package=gridExtra}, -} @Manual{R-infer, title = {infer: Tidy Statistical Inference}, author = {Andrew Bray and Chester Ismay and Ben Baumer and Mine Cetinkaya-Rundel}, @@ -89,14 +54,6 @@ @Manual{R-infer url = {https://github.com/tidymodels/infer}, year = {2018}, } -@Manual{R-ISLR, - title = {ISLR: Data for an Introduction to Statistical Learning with -Applications in R}, - author = {Gareth James and Daniela Witten and Trevor Hastie and Rob Tibshirani}, - year = {2017}, - note = {R package version 1.2}, - url = {https://CRAN.R-project.org/package=ISLR}, -} @Manual{R-janitor, title = {janitor: Simple Tools for Examining and Cleaning Dirty Data}, author = {Sam Firke}, @@ -117,20 +74,6 @@ @Manual{R-knitr note = {R package version 1.20}, url = {https://CRAN.R-project.org/package=knitr}, } -@Manual{R-moderndive, - title = {moderndive: Tidyverse-Friendly Introductory Linear Regression}, - author = {Chester Ismay}, - year = {2018}, - note = {R package version 0.2.0}, - url = {https://CRAN.R-project.org/package=moderndive}, -} -@Manual{R-mvtnorm, - title = {mvtnorm: Multivariate Normal and t Distributions}, - author = {Alan Genz and Frank Bretz and Tetsuhisa Miwa and Xuefei Mi and Torsten Hothorn}, - year = {2018}, - note = {R package version 1.0-8}, - url = {https://CRAN.R-project.org/package=mvtnorm}, -} @Manual{R-nycflights13, title = {nycflights13: Flights that Departed NYC in 2013}, author = {Hadley Wickham}, @@ -138,13 +81,6 @@ @Manual{R-nycflights13 note = {R package version 1.0.0}, url = {https://CRAN.R-project.org/package=nycflights13}, } -@Manual{R-patchwork, - title = {patchwork: The Composer of ggplots}, - author = {Thomas Lin Pedersen}, - year = {2017}, - note = {R package version 0.0.1}, - url = {https://github.com/thomasp85/patchwork}, -} @Manual{R-readr, title = {readr: Read Rectangular Text Data}, author = {Hadley Wickham and Jim Hester and Romain Francois}, @@ -159,13 +95,6 @@ @Manual{R-rmarkdown note = {R package version 1.10}, url = {https://CRAN.R-project.org/package=rmarkdown}, } -@Manual{R-scales, - title = {scales: Scale Functions for Visualization}, - author = {Hadley Wickham}, - year = {2018}, - note = {R package version 1.0.0}, - url = {https://CRAN.R-project.org/package=scales}, -} @Manual{R-skimr, title = {skimr: Compact and Flexible Summaries of Data}, author = {Amelia McNamara and Eduardo {Arino de la Rubia} and Hao Zhu and Shannon Ellis and Michael Quinn}, @@ -173,13 +102,6 @@ @Manual{R-skimr note = {R package version 1.0.3}, url = {https://CRAN.R-project.org/package=skimr}, } -@Manual{R-stringr, - title = {stringr: Simple, Consistent Wrappers for Common String Operations}, - author = {Hadley Wickham}, - year = {2018}, - note = {R package version 1.3.1}, - url = {https://CRAN.R-project.org/package=stringr}, -} @Manual{R-tibble, title = {tibble: Simple Data Frames}, author = {Kirill Müller and Hadley Wickham}, diff --git a/docs/10-hypothesis-testing.html b/docs/10-hypothesis-testing.html index 8d310f7aa..fd2bc48e6 100644 --- a/docs/10-hypothesis-testing.html +++ b/docs/10-hypothesis-testing.html @@ -25,7 +25,7 @@ - + @@ -47,6 +47,7 @@ + @@ -539,7 +540,12 @@

Needed packages

DataCamp

Our approach of using data science tools to understand the second major component of statistical inference, hypothesis testing, uses the same tools as in Mine Cetinkaya-Rundel and Andrew Bray’s DataCamp courses “Inference for Numerical Data” and “Inference for Categorical Data.” If you’re interested in complementing your learning below in an interactive online environment, click on the images below to access the courses.

-

+
+ +
+
+ +

diff --git a/docs/11-inference-for-regression.html b/docs/11-inference-for-regression.html index 894f1a0bb..069d8e147 100644 --- a/docs/11-inference-for-regression.html +++ b/docs/11-inference-for-regression.html @@ -25,7 +25,7 @@ - + @@ -47,6 +47,7 @@ + @@ -544,7 +545,9 @@

Needed packages

DataCamp

Our approach to understanding both the statistical and practical significance of any regression results is aligned with the approach taken in Jo Hardin’s DataCamp course “Inference for Regression.” If you’re interested in complementing your learning below in an interactive online environment, click on the image below to access the course.

-

+
+ +

11.1 Simulation-based Inference for Regression

@@ -700,14 +703,237 @@

11.3.2 Refresher: Visualizations<

11.3.3 Refresher: Regression tables

-

Last, let’s recall the regressions we fit. First, the regression with no interaction effect: note the use of + in the formula in Table ??.

+

Last, let’s recall the regressions we fit. First, the regression with no interaction effect: note the use of + in the formula in Table 11.1.

score_model_2 <- lm(score ~ age + gender, data = evals_multiple)
 get_regression_table(score_model_2)
Table 11.1: Model 1: Regression table with no interaction effect included

| term       | estimate | std_error | statistic | p_value | lower_ci | upper_ci |
|------------|----------|-----------|-----------|---------|----------|----------|
| intercept  | 4.484    | 0.125     | 35.79     | 0.000   | 4.238    | 4.730    |
| age        | -0.009   | 0.003     | -3.28     | 0.001   | -0.014   | -0.003   |
| gendermale | 0.191    | 0.052     | 3.63      | 0.000   | 0.087    | 0.294    |

Second, the regression with an interaction effect: note the use of * in the formula.

score_model_3 <- lm(score ~ age * gender, data = evals_multiple)
 get_regression_table(score_model_3)
Table 11.2: Model 2: Regression table with interaction effect included

| term           | estimate | std_error | statistic | p_value | lower_ci | upper_ci |
|----------------|----------|-----------|-----------|---------|----------|----------|
| intercept      | 4.883    | 0.205     | 23.80     | 0.000   | 4.480    | 5.286    |
| age            | -0.018   | 0.004     | -3.92     | 0.000   | -0.026   | -0.009   |
| gendermale     | -0.446   | 0.265     | -1.68     | 0.094   | -0.968   | 0.076    |
| age:gendermale | 0.014    | 0.006     | 2.45      | 0.015   | 0.003    | 0.024    |

11.3.4 Script of R code

diff --git a/docs/12-thinking-with-data.html b/docs/12-thinking-with-data.html index d1f084590..c1fcf0607 100644 --- a/docs/12-thinking-with-data.html +++ b/docs/12-thinking-with-data.html @@ -25,7 +25,7 @@ - + @@ -47,6 +47,7 @@ + @@ -577,7 +578,9 @@

Needed packages

DataCamp

The case study of Seattle house prices below was the inspiration for a large part of ModernDive co-author Albert Y. Kim’s DataCamp course “Modeling with Data in the Tidyverse.” If you’re interested in complementing your learning below in an interactive online environment, click on the image below to access the course. The relevant chapters are Chapter 1 “Introduction to Modeling” and Chapter 3 “Modeling with Multiple Regression.”

-

+
+ +

Case studies involving data in the fivethirtyeight R package form the basis of ModernDive co-author Chester Ismay’s DataCamp course “Effective Data Storytelling in the Tidyverse.” This free course can be accessed here.


@@ -694,7 +697,124 @@

12.1.1 Exploratory data analysis

12.1.2 log10 transformations

At its simplest, the log10() transformation returns base 10 logarithms. For example, since \(1000 = 10^3\), log10(1000) returns 3. To undo a log10-transformation, we raise 10 to this value: since \(10^{3} = 1000\), running 10^(3) returns the original value of 1000. Log-transformations allow us to focus on multiplicative changes instead of additive ones, thereby emphasizing changes in “orders of magnitude.” Let’s illustrate this idea in Table ?? with examples of prices of consumer goods in US dollars.
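Before turning to the table, a quick check of these identities in R (a minimal sketch):

log10(1000)  # returns 3, since 1000 = 10^3
10^(3)       # returns 1000, undoing the log10-transformation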

| Price      | log10(Price) | Order of magnitude    | Examples             |
|------------|--------------|-----------------------|----------------------|
| $1         | 0            | Singles               | Cups of coffee       |
| $10        | 1            | Tens                  | Books                |
| $100       | 2            | Hundreds              | Mobile phones        |
| $1,000     | 3            | Thousands             | High definition TV's |
| $10,000    | 4            | Tens of thousands     | Cars                 |
| $100,000   | 5            | Hundreds of thousands | Luxury cars & houses |
| $1,000,000 | 6            | Millions              | Luxury houses        |

Let’s break this down:

  1. When purchasing a cup of coffee, we tend to think of prices in single dollars, e.g. $2 or $3. However, when purchasing, say, mobile phones, we don’t tend to think of prices in single dollars, e.g. $676 or $757, but rather tend to round to the nearest unit of hundreds of dollars, e.g. $200 or $500.
  2. diff --git a/docs/2-getting-started.html b/docs/2-getting-started.html index dc109b15d..e55bcc9cd 100644 --- a/docs/2-getting-started.html +++ b/docs/2-getting-started.html @@ -25,7 +25,7 @@ - + @@ -47,6 +47,7 @@ + diff --git a/docs/3-viz.html b/docs/3-viz.html index 79e139d66..c997ea7d2 100644 --- a/docs/3-viz.html +++ b/docs/3-viz.html @@ -25,7 +25,7 @@ - + @@ -47,6 +47,7 @@ + @@ -536,7 +537,9 @@

    Needed packages

    DataCamp

    Our approach to introducing data visualization via the Grammar of Graphics and the ggplot2 package is very similar to the approach taken in David Robinson’s DataCamp course “Introduction to the Tidyverse,” a course targeted at people new to R and the tidyverse. If you’re interested in complementing your learning below in an interactive online environment, click on the image below to access the course. The relevant chapters of the course are Chapter 2 on “Data visualization” and Chapter 4 on “Types of visualizations.”

    -

    +
    + +

    3.1 The Grammar of Graphics

    @@ -558,7 +561,134 @@

    3.1.1 Components of the Grammar

    3.1.2 Gapminder

In February 2006, a statistician named Hans Rosling gave a TED talk titled “The best stats you’ve ever seen” where he presented global economic, health, and development data from the website gapminder.org. For example, of the 142 countries included for 2007, consider only the first 6 countries when listed alphabetically:

Table 3.1: Gapminder 2007 Data: First 6 of 142 countries

| Country     | Continent | Life Expectancy | Population | GDP per Capita |
|-------------|-----------|-----------------|------------|----------------|
| Afghanistan | Asia      | 43.83           | 31889923   | 974.58         |
| Albania     | Europe    | 76.42           | 3600523    | 5937.03        |
| Algeria     | Africa    | 72.30           | 33333216   | 6223.37        |
| Angola      | Africa    | 42.73           | 12420476   | 4797.23        |
| Argentina   | Americas  | 75.32           | 40301927   | 12779.38       |
| Australia   | Oceania   | 81.23           | 20434176   | 34435.37       |

    Each row in this table corresponds to a country in 2007. For each row, we have 5 columns:

    1. Country: Name of country.
    2. @@ -583,7 +713,70 @@

      3.1.2 Gapminder

    Recall that data here corresponds to each of the variables being in the same data frame and the “data variable” corresponds to a column in a data frame.

    While in this example we are considering one type of geometric object (of type point), graphics are not limited to just points. Some plots involve lines while others involve bars. Let’s summarize the three essential components of the grammar in a table:

Table 3.2: Summary of Grammar of Graphics for this plot

| data variable   | aes   | geom  |
|-----------------|-------|-------|
| GDP per Capita  | x     | point |
| Life Expectancy | y     | point |
| Population      | size  | point |
| Continent       | color | point |
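As a minimal sketch of how these mappings translate into ggplot2 code (assuming the 2007 subset is stored in a data frame named gapminder2007 with columns gdpPercap, lifeExp, pop, and continent):

library(ggplot2)
# Map GDP per capita to x, life expectancy to y, population to size, continent to color
ggplot(data = gapminder2007, 
       mapping = aes(x = gdpPercap, y = lifeExp, size = pop, color = continent)) +
  geom_point()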

    3.1.3 Other components of the Grammar

    @@ -1056,12 +1249,82 @@

    3.8 5NG#5: Barplots

    number = c(3, 2) )

We see that both the fruits and fruits_counted data frames represent the same collection of fruit, but whereas fruits just lists each fruit individually:

Table 3.3: Fruits

| fruit  |
|--------|
| apple  |
| apple  |
| apple  |
| orange |
| orange |

fruits_counted has a variable number which represents the pre-counted tallies of each fruit.

Table 3.4: Fruits (Pre-Counted)

| fruit  | number |
|--------|--------|
| apple  | 3      |
| orange | 2      |

    3.8.1 Barplots via geom_bar/geom_col

    -

    Let’s generate barplots using these two different representations of the same basket of fruit: 3 apples and 2 oranges. Using the not pre-counted data fruits from Table ??:

    +

    Let’s generate barplots using these two different representations of the same basket of fruit: 3 apples and 2 oranges. Using the not pre-counted data fruits from Table 3.3:

    ggplot(data = fruits, mapping = aes(x = fruit)) +
       geom_bar()
    @@ -1070,7 +1333,7 @@

    3.8.1 Barplots via geom_bar/geom_ Figure 3.19: Barplot when counts are not pre-counted

    -

    and using the pre-counted data fruits_counted from Table ??:

    +

    and using the pre-counted data fruits_counted from Table 3.4:

    ggplot(data = fruits_counted, mapping = aes(x = fruit, y = number)) +
       geom_col()
    @@ -1101,13 +1364,295 @@

    3.8.1 Barplots via geom_bar/geom_

    To get an understanding of what the names of these airlines are corresponding to these carrier codes, we can look at the airlines data frame in the nycflights13 package.

    airlines
| carrier | name                        |
|---------|-----------------------------|
| 9E      | Endeavor Air Inc.           |
| AA      | American Airlines Inc.      |
| AS      | Alaska Airlines Inc.        |
| B6      | JetBlue Airways             |
| DL      | Delta Air Lines Inc.        |
| EV      | ExpressJet Airlines Inc.    |
| F9      | Frontier Airlines Inc.      |
| FL      | AirTran Airways Corporation |
| HA      | Hawaiian Airlines Inc.      |
| MQ      | Envoy Air                   |
| OO      | SkyWest Airlines Inc.       |
| UA      | United Air Lines Inc.       |
| US      | US Airways Inc.             |
| VX      | Virgin America              |
| WN      | Southwest Airlines Co.      |
| YV      | Mesa Airlines Inc.          |

    Going back to our barplot, we see that United Air Lines, JetBlue Airways, and ExpressJet Airlines had the most flights depart New York City in 2013. To get the actual number of flights by each airline we can use the group_by(), summarize(), and n() functions in the dplyr package on the carrier variable in flights, which we will introduce formally in Chapter 5.

    flights_table <- flights %>% 
       group_by(carrier) %>% 
       summarize(number = n())
     flights_table
| carrier | number |
|---------|--------|
| 9E      | 18460  |
| AA      | 32729  |
| AS      | 714    |
| B6      | 54635  |
| DL      | 48110  |
| EV      | 54173  |
| F9      | 685    |
| FL      | 3260   |
| HA      | 342    |
| MQ      | 26397  |
| OO      | 32     |
| UA      | 58665  |
| US      | 20536  |
| VX      | 5162   |
| WN      | 12275  |
| YV      | 601    |

    In this table, the counts of the carriers are pre-counted. To create a barplot using the data frame flights_table, we

    • use geom_col() instead of geom_bar()
    • @@ -1254,8 +1799,128 @@

      3.8.4 Summary

      3.9 Conclusion

      3.9.1 Putting it all together

      -

      Let’s recap all five of the Five Named Graphs (5NG) in Table ?? summarizing their differences. Using these 5NG, you’ll be able to visualize the distributions and relationships of variables contained in a wide array of datasets. This will be even more the case as we start to map more variables to more of each geometric object’s aesthetic attribute options, further unlocking the awesome power of the ggplot2 package.

      - +

      Let’s recap all five of the Five Named Graphs (5NG) in Table 3.5 summarizing their differences. Using these 5NG, you’ll be able to visualize the distributions and relationships of variables contained in a wide array of datasets. This will be even more the case as we start to map more variables to more of each geometric object’s aesthetic attribute options, further unlocking the awesome power of the ggplot2 package.

Table 3.5: Summary of 5NG

|   | Named graph | Shows | Geometric object | Notes |
|---|-------------|-------|------------------|-------|
| 1 | Scatterplot | Relationship between 2 numerical variables | geom_point() | |
| 2 | Linegraph | Relationship between 2 numerical variables | geom_line() | Used when there is a sequential order to x-variable e.g. time |
| 3 | Histogram | Distribution of 1 numerical variable | geom_histogram() | Facetted histogram shows distribution of 1 numerical variable split by 1 categorical variable |
| 4 | Boxplot | Distribution of 1 numerical variable split by 1 categorical variable | geom_boxplot() | |
| 5 | Barplot | Distribution of 1 categorical variable | geom_bar() when counts are not pre-counted; geom_col() when counts are pre-counted | Stacked & dodged barplots show distribution of 2 categorical variables |

      3.9.2 Review questions

      diff --git a/docs/4-tidy.html b/docs/4-tidy.html index 6aa2cee99..3f1fb347e 100644 --- a/docs/4-tidy.html +++ b/docs/4-tidy.html @@ -25,7 +25,7 @@ - + @@ -47,6 +47,7 @@ + @@ -538,7 +539,9 @@

      Needed packages

      DataCamp

      Our approach to introducing the concept of “tidy” data is aligned with the approach taken in Alison Hill’s DataCamp course “Working with Data in the Tidyverse,” a course where students learn to work with data using tools from the tidyverse in R. If you’re interested in complementing your learning below in an interactive online environment, click on the image below to access the course. The relevant chapter is Chapter 3 “Tidy your data.”

      -

      +
      + +

      @@ -572,11 +575,187 @@

      4.1 What is tidy data?

      For example, say the following table consists of stock prices:

Table 4.1: Stock Prices (Non-Tidy Format)

| Date       | Boeing Stock Price | Amazon Stock Price | Google Stock Price |
|------------|--------------------|--------------------|--------------------|
| 2009-01-01 | $173.55            | $174.90            | $174.34            |
| 2009-01-02 | $172.61            | $171.42            | $170.04            |

      Although the data are neatly organized in a spreadsheet-type format, they are not in tidy format since there are three variables corresponding to three unique pieces of information (Date, Stock Name, and Stock Price), but there are not three columns. In tidy data format each variable should be its own column, as shown below. Notice that both tables present the same information, but in different formats.

Table 4.2: Stock Prices (Tidy Format)

| Date       | Stock Name | Stock Price |
|------------|------------|-------------|
| 2009-01-01 | Boeing     | $173.55     |
| 2009-01-02 | Boeing     | $172.61     |
| 2009-01-01 | Amazon     | $174.90     |
| 2009-01-02 | Amazon     | $171.42     |
| 2009-01-01 | Google     | $174.34     |
| 2009-01-02 | Google     | $170.04     |

      However, consider the following table

Table 4.3: Date, Boeing Price, Weather Data

| Date       | Boeing Price | Weather  |
|------------|--------------|----------|
| 2009-01-01 | $173.55      | Sunny    |
| 2009-01-02 | $172.61      | Overcast |

      In this case, even though the variable “Boeing Price” occurs again, the data is tidy since there are three variables corresponding to three unique pieces of information (Date, Boeing stock price, and the weather that particular day).

      The non-tidy data format in the original table is also known as “wide” format whereas the tidy data format in the second table is also known as “long/narrow” data format.

      In this book, we will work mostly with datasets that are already in tidy format even though a lot of the world’s data isn’t always in this nice format that the tidyverse gets its name from. Data that is in wide format can be converted to “tidy” format by using the gather() function in the tidyr package (Wickham and Henry 2018) in the tidyverse; we’ll show an example of this in Section 4.4. For other examples of converting a dataset into “tidy” format, check out the different functions available for data tidying and a case study using data from the World Health Organization in R for Data Science (Grolemund and Wickham 2016).
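As a minimal sketch of such a conversion, assuming the wide-format prices from Table 4.1 are stored in a data frame named stock_prices_wide with columns Date, Boeing, Amazon, and Google:

library(dplyr)
library(tidyr)
# Gather the three stock columns into key/value pairs, keeping Date as-is
stock_prices_tidy <- stock_prices_wide %>% 
  gather(key = "Stock Name", value = "Stock Price", -Date)
stock_prices_tidy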

      @@ -785,7 +964,9 @@

      4.5 Optional: Normal forms of dat

      We saw an example of this above with the airlines dataset. While the flights data frame could also include a column with the names of the airlines instead of the carrier code, this would be repetitive since there is a unique mapping of the carrier code to the name of the airline/carrier.

      Below an example is given showing how to join the airlines data frame together with the flights data frame by linking together the two datasets via a common key of "carrier". Note that this “joined” data frame is assigned to a new data frame called joined_flights. The key variable that we frequently join by is one of the identification variables mentioned above.

      library(dplyr)
      -joined_flights <- inner_join(x = flights, y = airlines, by = "carrier")
      +joined_flights <- inner_join(x = flights, + y = airlines, + by = "carrier")

    View(joined_flights)

    If we View this dataset, we see a new variable has been created called name. (We will see in Subsection 5.9.2 ways to change name to a more descriptive variable name.) More discussion about joining data frames together will be given in Chapter 5. We will see there that the names of the columns to be linked need not match as they did here with "carrier".
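For instance, as a purely hypothetical sketch: if airlines had instead named its key column carrier_code, the two linking columns could be matched explicitly:

# dplyr is already loaded above; carrier_code is a hypothetical column name
joined_flights <- inner_join(x = flights, 
                             y = airlines, 
                             by = c("carrier" = "carrier_code"))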

    diff --git a/docs/5-wrangling.html b/docs/5-wrangling.html index bd91955e0..2ec6ace3d 100644 --- a/docs/5-wrangling.html +++ b/docs/5-wrangling.html @@ -25,7 +25,7 @@ - + @@ -47,6 +47,7 @@ + @@ -548,9 +549,13 @@

    Needed packages

    DataCamp

    Our approach to introducing data wrangling tools from the dplyr package is very similar to the approach taken in David Robinson’s DataCamp course “Introduction to the Tidyverse,” a course targeted at people new to R and the tidyverse. If you’re interested in complementing your learning below in an interactive online environment, click on the image below to access the course. The relevant chapters are Chapter 1 on “Data wrangling” and Chapter 3 on “Grouping and summarizing.”

    -

    +
    + +

While not required for this book, if you would like a quick peek at more powerful tools to explore, tame, tidy, and transform data, we suggest you take Alison Hill’s DataCamp course “Working with Data in the Tidyverse.” Click on the image below to access the course. The relevant chapter is Chapter 3 “Tidy your data.”

    -

    +
    + +

    5.1 The pipe %>%

    @@ -612,7 +617,9 @@

    5.3 Filter observations using fil

    To see many of these in action, let’s select all flights that left JFK airport heading to Burlington, Vermont ("BTV") or Seattle, Washington ("SEA") in the months of October, November, or December. Run the following

    btv_sea_flights_fall <- flights %>% 
    -  filter(origin == "JFK", (dest == "BTV" | dest == "SEA"), month >= 10)
    +  filter(origin == "JFK", 
    +         dest == "BTV" | dest == "SEA", 
    +         month >= 10)
     View(btv_sea_flights_fall)

Note: even though colloquially speaking one might say “all flights heading to Burlington, Vermont and Seattle, Washington,” in terms of computer logical operations, we really mean “all flights heading to Burlington, Vermont or Seattle, Washington.” For a given row in the data, dest can be “BTV”, “SEA”, or something else, but not “BTV” and “SEA” at the same time.

    Another example uses the ! to pick rows that don’t match a condition. The ! can be read as “not.” Here we are selecting rows corresponding to flights that didn’t go to Burlington, VT or Seattle, WA.
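One way to write this filter (a sketch consistent with the code above; the object name is illustrative):

not_btv_sea_flights <- flights %>% 
  filter(!(dest == "BTV" | dest == "SEA"))
View(not_btv_sea_flights)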

    @@ -650,7 +657,26 @@

    5.4 Summarize variables using sum summarize(mean = mean(temp), std_dev = sd(temp)) summary_temp

| mean | std_dev |
|------|---------|
| NA   | NA      |

    We’ve created a small data frame here called summary_temp that includes both the mean and the std_dev of the temp variable in weather. Notice as shown in Figures 5.2 and 5.3, the data frame weather went from many rows to a single row of just the summary values in the data frame summary_temp.

    But why are the values returned NA? This stands for “not available or not applicable” and is how R encodes missing values; if in a data frame for a particular row and column no value exists, NA is stored instead. Furthermore, by default any time you try to summarize a number of values (using mean() and sd() for example) that has one or more missing values, then NA is returned.

Values can be missing for many reasons. Perhaps the data was collected but someone forgot to enter it? Perhaps the data was not collected at all because it was too difficult? Perhaps there was an erroneous value that someone entered that has been corrected to read as missing? You’ll often encounter issues with missing values.

    @@ -659,7 +685,28 @@

    5.4 Summarize variables using sum summarize(mean = mean(temp, na.rm = TRUE), std_dev = sd(temp, na.rm = TRUE)) summary_temp

| mean     | std_dev  |
|----------|----------|
| 55.26039 | 17.78785 |

It is not good practice to include an na.rm = TRUE in your summary commands by default; you should attempt to run code first without this argument, as this will alert you to the presence of missing data. Only after you’ve identified where missing values occur and have thought about the potential causes of this missingness should you consider using na.rm = TRUE. In the upcoming Learning Checks we’ll consider the possible ramifications of blindly sweeping rows with missing values under the rug.

    @@ -664,7 +859,7 @@

    7.1.1 Exploratory data analysis \begin{center} - +
    \end{center} --> @@ -690,7 +885,107 @@

    7.1.2 Multiple regression

Just as we did when we had a single numerical explanatory variable \(x\) in Subsection 6.1.2 and when we had a single categorical explanatory variable \(x\) in Subsection 6.2.2, we fit a regression model and obtained the regression table in our two numerical explanatory variable scenario. To fit a regression model and get a table using get_regression_table(), we now use a + to consider multiple explanatory variables. In this case, since we want to regress Balance on Limit and Income simultaneously, we input Balance ~ Limit + Income.

    Balance_model <- lm(Balance ~ Limit + Income, data = Credit)
     get_regression_table(Balance_model)
Table 7.3: Multiple regression table

| term      | estimate | std_error | statistic | p_value | lower_ci | upper_ci |
|-----------|----------|-----------|-----------|---------|----------|----------|
| intercept | -385.179 | 19.465    | -19.8     | 0       | -423.446 | -346.912 |
| Limit     | 0.264    | 0.006     | 45.0      | 0       | 0.253    | 0.276    |
| Income    | -7.663   | 0.385     | -19.9     | 0       | -8.420   | -6.906   |

    How do we interpret these three values that define the regression plane?

• Intercept: -$385.18 (rounded to two decimal places to represent cents). The intercept in our case represents the credit card balance for an individual who has both a credit Limit of $0 and Income of $0. In our data, however, the intercept has limited practical interpretation as no individuals had Limit or Income values of $0, and furthermore the smallest credit card balance was $0. Rather, it is used to situate the regression plane in 3D space.
    • @@ -715,10 +1010,138 @@

      7.1.2 Multiple regression

    7.1.3 Observed/fitted values and residuals

    -

    As we did previously in Table ??, let’s unpack the output of the get_regression_points() function for our model for credit card balance for all 400 card holders in the dataset. Recall that each card holder corresponds to one of the 400 rows in the Credit data frame and also for one of the 400 3D points in the 3D scatterplots in Subsection 7.1.1.

    +

    As we did previously in Table 7.4, let’s unpack the output of the get_regression_points() function for our model for credit card balance for all 400 card holders in the dataset. Recall that each card holder corresponds to one of the 400 rows in the Credit data frame and also for one of the 400 3D points in the 3D scatterplots in Subsection 7.1.1.

    regression_points <- get_regression_points(Balance_model)
     regression_points
Table 7.4: Regression points (first 5 rows of 400)

| ID | Balance | Limit | Income | Balance_hat | residual |
|----|---------|-------|--------|-------------|----------|
| 1  | 333     | 3606  | 14.9   | 454         | -120.8   |
| 2  | 903     | 6645  | 106.0  | 559         | 344.3    |
| 3  | 580     | 7075  | 104.6  | 683         | -103.4   |
| 4  | 964     | 9504  | 148.9  | 986         | -21.7    |
| 5  | 331     | 4897  | 55.9   | 481         | -150.0   |
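As a quick check of where the Balance_hat column comes from, here is the fitted value for card holder 1 computed by hand from the rounded coefficients in Table 7.3 (the small gap from the 454 shown is due to rounding):

# Balance_hat for card holder 1: b0 + b1 * Limit + b2 * Income
-385.179 + 0.264 * 3606 + (-7.663) * 14.9  # roughly 452.6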

    Recall the format of the output:

    • Balance corresponds to \(y\) (the observed value)
    • @@ -786,9 +1209,83 @@

7.2.1 Exploratory data analysis Let’s reload the evals data and select() only the needed subset of variables. Note that these are different from the variables chosen in Chapter 6. Let’s give this the name evals_ch7.

      evals_ch7 <- evals %>%
         select(score, age, gender)
      -

      Let’s look at the raw data values both by bringing up RStudio’s spreadsheet viewer and the glimpse() function, although in Table ?? we only show 5 randomly selected instructors out of 463:

      +

      Let’s look at the raw data values both by bringing up RStudio’s spreadsheet viewer and the glimpse() function, although in Table 7.5 we only show 5 randomly selected instructors out of 463:

      View(evals_ch7)
Table 7.5: Random sample of 5 instructors

| score | age | gender |
|-------|-----|--------|
| 3.6   | 34  | male   |
| 4.9   | 43  | male   |
| 3.3   | 47  | male   |
| 4.4   | 33  | female |
| 4.7   | 60  | male   |

      Let’s look at some summary statistics using the skim() function from the skimr package:

      evals_ch7 %>% 
         skim()
      @@ -796,15 +1293,15 @@

      7.2.1 Exploratory data analysis

Furthermore, let’s compute the correlation between the two numerical variables we have, score and age. Recall that correlation coefficients only exist between numerical variables. We observe that they are weakly negatively correlated.
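One way to compute this correlation (a minimal sketch using dplyr):

evals_ch7 %>% 
  summarize(correlation = cor(score, age))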

      @@ -840,7 +1337,107 @@

      7.2.2 Multiple regression: Parall

Much like we started to consider multiple explanatory variables using the + sign in Subsection 7.1.2, let’s fit a regression model and get the regression table. This time we give the name score_model_2 to our regression model fit, so as not to overwrite the model score_model from Section 6.1.2.

      score_model_2 <- lm(score ~ age + gender, data = evals_ch7)
       get_regression_table(score_model_2)
Table 7.6: Regression table

| term       | estimate | std_error | statistic | p_value | lower_ci | upper_ci |
|------------|----------|-----------|-----------|---------|----------|----------|
| intercept  | 4.484    | 0.125     | 35.79     | 0.000   | 4.238    | 4.730    |
| age        | -0.009   | 0.003     | -3.28     | 0.001   | -0.014   | -0.003   |
| gendermale | 0.191    | 0.052     | 3.63      | 0.000   | 0.087    | 0.294    |

      The modeling equation for this scenario is:

      \[\begin{align} \widehat{y} &= b_0 + b_1 \cdot x_1 + b_2 \cdot x_2 \\ @@ -871,7 +1468,130 @@

      7.2.3 Multiple regression: Intera

      Let’s fit a regression with an interaction term. Instead of using the + sign in the enumeration of explanatory variables, we use the * sign. Let’s fit this regression and save it in score_model_3, then we get the regression table using the get_regression_table() function as before.

      score_model_interaction <- lm(score ~ age * gender, data = evals_ch7)
       get_regression_table(score_model_interaction)
Table 7.7: Regression table

| term           | estimate | std_error | statistic | p_value | lower_ci | upper_ci |
|----------------|----------|-----------|-----------|---------|----------|----------|
| intercept      | 4.883    | 0.205     | 23.80     | 0.000   | 4.480    | 5.286    |
| age            | -0.018   | 0.004     | -3.92     | 0.000   | -0.026   | -0.009   |
| gendermale     | -0.446   | 0.265     | -1.68     | 0.094   | -0.968   | 0.076    |
| age:gendermale | 0.014    | 0.006     | 2.45      | 0.015   | 0.003    | 0.024    |

      The modeling equation for this scenario is:

      \[\begin{align} \widehat{y} &= b_0 + b_1 \cdot x_1 + b_2 \cdot x_2 + b_3 \cdot x_1 \cdot x_2\\ @@ -893,7 +1613,48 @@

      7.2.3 Multiple regression: Intera &= 4.883 -0.018 \cdot \mbox{age} \end{align}\]

      Let’s summarize these values in a table:

Table 7.8: Comparison of male and female intercepts and age slopes

| Gender             | Intercept | Slope for age |
|--------------------|-----------|---------------|
| Male instructors   | 4.44      | -0.004        |
| Female instructors | 4.88      | -0.018        |
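These male-instructor values can be checked directly from the interaction coefficients in Table 7.7:

intercept_male <- 4.883 + (-0.446)  # 4.437, shown rounded as 4.44
slope_male <- -0.018 + 0.014        # -0.004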

We see that while male instructors have a lower intercept, as they age they have a less steep associated average decrease in teaching scores: -0.004 teaching score units per year as opposed to -0.018 for female instructors. This is consistent with the different slopes and intercepts of the red and blue regression lines fit in Figure 7.5. Recall our definition of a model having an interaction effect: when the associated effect of one variable, in this case age, depends on the value of another variable, in this case gender.

      But how do we know when it’s appropriate to include an interaction effect? For example, which is the more appropriate model? The regular multiple regression model without an interaction term we saw in Section 7.2.2 or the multiple regression model with the interaction term we just saw? We’ll revisit this question in Chapter 11 on “inference for regression.”

    @@ -902,7 +1663,135 @@

    7.2.4 Observed/fitted values and

    Now say we want to apply the above calculations for male and female instructors for all 463 instructors in the evals_ch7 dataset. As our multiple regression models get more and more complex, computing such values by hand gets more and more tedious. The get_regression_points() function spares us this tedium and returns all fitted values and all residuals. For simplicity, let’s focus only on the fitted interaction model, which is saved in score_model_interaction.

    regression_points <- get_regression_points(score_model_interaction)
     regression_points
Table 7.9: Regression points (first 5 rows of 463)

| ID | score | age | gender | score_hat | residual |
|----|-------|-----|--------|-----------|----------|
| 1  | 4.7   | 36  | female | 4.25      | 0.448    |
| 2  | 4.1   | 36  | female | 4.25      | -0.152   |
| 3  | 3.9   | 36  | female | 4.25      | -0.352   |
| 4  | 4.8   | 36  | female | 4.25      | 0.548    |
| 5  | 4.6   | 59  | male   | 4.20      | 0.399    |

    Recall the format of the output: