欢迎您访问程序员文章站,本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

17-R-DataCamp-Exploratory-Data-Analysis-in-R-Case-Study

程序员文章站 2024-01-30 23:22:34
...

17-R-DataCamp-Exploratory-Data-Analysis-in-R-Case-Study

1. Data Cleaning and Summarizing with dplyr

1.1 The United Nations Voting Dataset (video)
1.2 Filtering rows

Instruction :

# Attach dplyr for the pipe and the data-manipulation verbs
library(dplyr)

# Show the raw votes data
votes

# Keep only rows whose vote code is 1, 2, or 3
# (the exercise labels these "yes", "abstain", and "no")
votes %>%
  filter(vote %in% c(1, 2, 3))
1.3 Adding a year column

Instruction :

# Drop vote codes above 3, then derive a year column from the session number
votes %>%
  filter(vote <= 3) %>%
  mutate(year = session + 1945)
1.4 Adding a country column

Instruction :

# Attach countrycode for translating country identifiers
library(countrycode)

# Translate code 100 from the "cown" scheme into a country name
countrycode(100, origin = "cown", destination = "country.name")

# votes_processed: vote codes up to 3 only, with year and country name added
votes_processed <- votes %>%
  filter(vote <= 3) %>%
  mutate(
    year = session + 1945,
    country = countrycode(ccode, origin = "cown", destination = "country.name")
  )
1.5 Grouping and summarizing (video)
1.6 Summarizing the full dataset

Instruction :

# Show the processed votes
votes_processed

# Count all rows and compute the share whose vote code is 1 ("yes")
votes_processed %>%
  summarise(
    total = n(),
    percent_yes = mean(vote == 1)
  )
1.7 Summarizing by year

Instruction :

# Same totals and "yes" share as before, now computed separately per year
votes_processed %>%
  group_by(year) %>%
  summarise(
    total = n(),
    percent_yes = mean(vote == 1)
  )
1.8 Summarizing by country

Instruction :

# by_country: row count and share of "yes" (vote code 1) votes per country
by_country <- votes_processed %>%
  group_by(country) %>%
  summarise(
    total = n(),
    percent_yes = mean(vote == 1)
  )
1.9 Sorting and filtering summarized data (video)
1.10 Sorting by percentage of "yes" votes

Instruction :

# by_country: row count and share of "yes" (vote code 1) votes per country
by_country <- votes_processed %>%
  group_by(country) %>%
  summarise(
    total = n(),
    percent_yes = mean(vote == 1)
  )

# Show the per-country summary
by_country

# Lowest percent_yes first
by_country %>%
  arrange(percent_yes)

# Highest percent_yes first
by_country %>%
  arrange(desc(percent_yes))
1.11 Filtering summarized output

Instruction :

# Sort by percent_yes (ascending), then keep countries with at least 100 rows
arrange(by_country, percent_yes) %>%
  filter(total >= 100)

2. Data Visualization with ggplot2

2.1 Visualization with ggplot2 (video)
2.2 Choosing an aesthetic
2.3 Plotting a line over time

Instruction :

# Attach ggplot2 for plotting
library(ggplot2)

# by_year: row count and share of "yes" (vote code 1) votes per year
by_year <- votes_processed %>%
  group_by(year) %>%
  summarise(
    total = n(),
    percent_yes = mean(vote == 1)
  )

# Draw percent_yes against year as a line
ggplot(by_year, aes(year, percent_yes)) +
  geom_line()
2.4 Other ggplot2 layers

Instruction :

# Plot percent_yes by year as points with a smoothing curve layered on top
ggplot(by_year, aes(x = year, y = percent_yes)) +
  geom_point() +
  geom_smooth()
2.5 Visualizing by country (video)
2.6 Summarizing by year and country

Instruction :

# by_year_country: row count and share of "yes" (vote code 1) votes for each
# year-country pair.
# summarize() only drops the last grouping level, so without ungroup() the
# result would stay grouped by year. Remove that leftover grouping so later
# steps (filtering, nesting by country, modelling) work on a plain tibble,
# matching how by_country_year_topic is built elsewhere in this document.
by_year_country <- votes_processed %>%
  group_by(year, country) %>%
  summarize(total = n(),
            percent_yes = mean(vote == 1)) %>%
  ungroup()
2.7 Plotting just the UK over time

Instruction :

# by_year_country: row count and share of "yes" (vote code 1) votes for each
# year-country pair.
# summarize() only drops the last grouping level, so ungroup() is added to
# avoid carrying a leftover grouping by year into the steps below.
by_year_country <- votes_processed %>%
  group_by(year, country) %>%
  summarize(total = n(),
            percent_yes = mean(vote == 1)) %>%
  ungroup()

# Show the year-country summary
by_year_country

# UK_by_year: only the United Kingdom rows
UK_by_year <- by_year_country %>%
  filter(country == "United Kingdom")

# Line plot of percent_yes over time for the UK only
ggplot(UK_by_year, aes(x = year, y = percent_yes)) +
  geom_line()
2.8 Plotting multiple countries

Instruction :

# The four countries to compare
countries <- c("United States", "United Kingdom", "France", "India")

# filtered_4_countries: rows of by_year_country for those countries only
filtered_4_countries <- by_year_country %>%
  filter(country %in% countries)

# One line per country showing percent_yes by year, distinguished by colour
ggplot(filtered_4_countries, aes(year, percent_yes, color = country)) +
  geom_line()
2.9 Faceting by country (video)
2.10 Faceting the time series

Instruction :

# The six countries to compare
countries <- c(
  "United States", "United Kingdom",
  "France", "Japan", "Brazil", "India"
)

# filtered_6_countries: rows of by_year_country for those countries only
filtered_6_countries <- by_year_country %>%
  filter(country %in% countries)

# percent_yes by year, drawn as one panel per country
ggplot(filtered_6_countries, aes(year, percent_yes)) +
  geom_line() +
  facet_wrap(~ country)
2.11 Faceting with free y-axis

Instruction :

# The six countries to compare
countries <- c(
  "United States", "United Kingdom",
  "France", "Japan", "Brazil", "India"
)

# filtered_6_countries: rows of by_year_country for those countries only
filtered_6_countries <- by_year_country %>%
  filter(country %in% countries)

# One panel per country; scales = "free_y" gives each panel its own y-axis
ggplot(filtered_6_countries, aes(x = year, y = percent_yes)) +
  geom_line() +
  facet_wrap(~ country, scales = "free_y")
2.12 Choose your own countries

Instruction :

# The original six countries plus Chile, China, and Colombia
countries <- c(
  "United States", "United Kingdom", "France", "Japan",
  "Brazil", "India", "Chile", "China", "Colombia"
)

# filtered_countries: rows of by_year_country for the chosen countries only
filtered_countries <- by_year_country %>%
  filter(country %in% countries)

# One panel per country; scales = "free_y" gives each panel its own y-axis
ggplot(filtered_countries, aes(x = year, y = percent_yes)) +
  geom_line() +
  facet_wrap(~ country, scales = "free_y")

3. Tidy Modeling with Broom

3.1 Linear regression (video)
3.2 Linear regression on the United States

Instruction :

# US_by_year: the United States rows of the year-country summary
US_by_year <- by_year_country %>%
  filter(country == "United States")

# Show the US data
US_by_year

# US_fit: linear model of percent_yes as a function of year
US_fit <- lm(percent_yes ~ year, data = US_by_year)

# Show the model summary
summary(US_fit)
3.3 Finding the slope of a linear regression
3.4 Finding the p-value of a linear regression
3.5 Tidying models with broom (video)
3.7 Tidying a linear regression model

Instruction :

# Attach broom for tidy()
library(broom)

# Apply tidy() to the fitted US model
tidy(US_fit)
3.8 Combining models for multiple countries

Instruction :

# United States: subset the summary, then regress percent_yes on year
US_by_year <- by_year_country %>%
  filter(country == "United States")
US_fit <- lm(percent_yes ~ year, data = US_by_year)

# United Kingdom: same subset-and-fit steps
UK_by_year <- by_year_country %>%
  filter(country == "United Kingdom")
UK_fit <- lm(percent_yes ~ year, data = UK_by_year)

# Apply tidy() to each fitted model
US_tidied <- tidy(US_fit)
UK_tidied <- tidy(UK_fit)

# Stack the two tidied results, US rows first
bind_rows(US_tidied, UK_tidied)
3.9 Nesting for multiple models (video)
3.10 Nesting a data frame

Instruction :

# Attach tidyr for nest() and unnest()
library(tidyr)

# Nest every column except country
nest(by_year_country, -country)
3.11 List columns

Instruction :

# nested: by_year_country with every column except country nested
nested <- nest(by_year_country, -country)

# Show the seventh element of the data list-column
# NOTE(review): the exercise expects this to be Brazil; index 7 depends on
# row order, so confirm against nested$country
nested[["data"]][[7]]
3.12 Unnesting

Instruction :

# nested: by_year_country with every column except country nested
nested <- nest(by_year_country, -country)

# Unnest again to undo the nesting
unnest(nested)
3.13 Fitting multiple models (video)
3.14 Performing linear regression on each nested dataset

Instruction :

# Attach tidyr (nest) and purrr (map)
library(tidyr)
library(purrr)

# Nest everything except country, then fit percent_yes ~ year on each
# element of the data list-column and store the fits in a model column
by_year_country %>%
  nest(-country) %>%
  mutate(model = map(data, ~ lm(percent_yes ~ year, data = .)))
3.15 Tidy each linear regression model

Instruction :

# Attach broom for tidy()
library(broom)

# Fit percent_yes ~ year on each nested data frame, then apply tidy() to each
# fit; mutate() evaluates its arguments in order, so tidied can use model
by_year_country %>%
  nest(-country) %>%
  mutate(
    model = map(data, ~ lm(percent_yes ~ year, data = .)),
    tidied = map(model, tidy)
  )
3.16 Unnesting a data frame

Instruction :

# country_coefficients: fit a model per nested data frame, tidy each fit,
# then unnest the tidied column
country_coefficients <- by_year_country %>%
  nest(-country) %>%
  mutate(model = map(data, ~ lm(percent_yes ~ year, data = .))) %>%
  mutate(tidied = map(model, tidy)) %>%
  unnest(tidied)

# Show the result
country_coefficients
3.17 Working with many tidy models (video)
3.18 Filtering model terms

Instruction :

# Show all model terms
country_coefficients

# Keep only the rows for the year term (the slope)
country_coefficients %>%
  filter(term == "year")
3.19 Filtering for significant countries

Instruction :

# slope_terms: only the rows for the year term (the slope)
slope_terms <- country_coefficients %>%
  filter(term == "year")

# Adjust the p-values with p.adjust(), then keep rows below 0.05
slope_terms %>%
  mutate(p.adjusted = p.adjust(p.value)) %>%
  filter(p.adjusted < 0.05)
3.20 Sorting by slope

Instruction :

# filtered_countries: slope (year) terms whose adjusted p-value is below 0.05
filtered_countries <- country_coefficients %>%
  filter(term == "year") %>%
  mutate(p.adjusted = p.adjust(p.value)) %>%
  filter(p.adjusted < .05)

# Sort for the countries increasing most quickly:
# the largest slope estimates must come first, so sort descending
filtered_countries %>%
  arrange(desc(estimate))


# Sort for the countries decreasing most quickly:
# the smallest (most negative) slope estimates come first, so sort ascending
filtered_countries %>%
  arrange(estimate)

4. Joining and Tidying

4.1 Joining datasets (video)
4.2 Joining datasets with inner_join

Instruction :

# Attach dplyr for the join verbs
library(dplyr)

# Show the processed votes
votes_processed

# Show the descriptions data
descriptions

# votes_joined: rows of votes_processed matched to descriptions on both the
# rcid and session columns (unmatched rows are dropped by inner_join)
votes_joined <- inner_join(
  votes_processed,
  descriptions,
  by = c("rcid", "session")
)
4.3 Filtering the joined dataset

Instruction :

# Keep rows where co equals 1 (the exercise's colonialism flag)
filter(votes_joined, co == 1)
4.4 Visualizing colonialism votes

Instruction :

# Attach ggplot2 for plotting
library(ggplot2)

# US_co_by_year: United States rows with co equal to 1, reduced to the
# per-year share of "yes" (vote code 1) votes
US_co_by_year <- votes_joined %>%
  filter(country == "United States", co == 1) %>%
  group_by(year) %>%
  summarise(percent_yes = mean(vote == 1))

# Draw that share against year as a line
ggplot(US_co_by_year, aes(year, percent_yes)) +
  geom_line()
4.5 Tidy data (video)
4.6 Tidy data observations
4.7 Using gather to tidy a dataset

Instruction :

# Attach tidyr for gather()
library(tidyr)

# Gather the columns me through ec into a topic / has_topic pair
votes_joined %>%
  gather(key = topic, value = has_topic, me:ec)

# votes_gathered: the same gathered data, keeping only rows where
# has_topic equals 1
votes_gathered <- votes_joined %>%
  gather(key = topic, value = has_topic, me:ec) %>%
  filter(has_topic == 1)
4.8 Recoding the topics

Instruction :

# votes_tidied: votes_gathered with each two-letter topic code replaced by
# its descriptive label
votes_tidied <- votes_gathered %>%
  mutate(
    topic = recode(
      topic,
      me = "Palestinian conflict",
      nu = "Nuclear weapons and nuclear material",
      di = "Arms control and disarmament",
      hr = "Human rights",
      co = "Colonialism",
      ec = "Economic development"
    )
  )
4.9 Summarize by country, year, and topic

Instruction :

# Show the tidied votes
votes_tidied

# by_country_year_topic: row count and share of "yes" (vote code 1) votes for
# every country-year-topic combination, with grouping removed afterwards
by_country_year_topic <- votes_tidied %>%
  group_by(country, year, topic) %>%
  summarise(
    total = n(),
    percent_yes = mean(vote == 1)
  ) %>%
  ungroup()

# Show the summary
by_country_year_topic
4.10 Visualizing trends in topics for one country

Instruction :

# Attach ggplot2 for plotting
library(ggplot2)

# US_by_country_year_topic: only the United States rows of the summary
US_by_country_year_topic <- by_country_year_topic %>%
  filter(country == "United States")

# percent_yes by year for the US, drawn as one panel per topic
ggplot(US_by_country_year_topic, aes(year, percent_yes)) +
  geom_line() +
  facet_wrap(~ topic)
4.11 Tidy modeling by topic and country (video)
4.12 Nesting by topic and country

Instruction :

# Attach purrr (map), tidyr (nest/unnest), and broom (tidy)
library(purrr)
library(tidyr)
library(broom)

# Show the country-year-topic summary
by_country_year_topic

# country_topic_coefficients: nest everything except country and topic, fit
# percent_yes ~ year on each nested data frame, tidy each fit, then unnest
# the tidied column
country_topic_coefficients <- by_country_year_topic %>%
  nest(-country, -topic) %>%
  mutate(model = map(data, ~ lm(percent_yes ~ year, data = .))) %>%
  mutate(tidied = map(model, tidy)) %>%
  unnest(tidied)

# Show the coefficients
country_topic_coefficients
4.13 Interpreting tidy models

Instruction :

# country_topic_filtered: slope (year) terms only, with p-values adjusted by
# p.adjust() and rows kept where the adjusted value is below 0.05
country_topic_filtered <- country_topic_coefficients %>%
  filter(term == "year") %>%
  mutate(p.adjusted = p.adjust(p.value)) %>%
  filter(p.adjusted < 0.05)
4.14 Steepest trends by topic
4.15 Checking models visually

Instruction :

# vanuatu_by_country_year_topic: only the Vanuatu rows of the summary.
# The filtered result has to be assigned, because the plot below reads it by
# name; without the assignment that object would not exist.
vanuatu_by_country_year_topic <- by_country_year_topic %>%
  filter(country == "Vanuatu")

# Plot of percentage "yes" over time, faceted by topic
ggplot(vanuatu_by_country_year_topic, aes(x = year, y = percent_yes)) +
  geom_line() +
  facet_wrap(~topic)
4.16 Conclusion