17-R-DataCamp-Exploratory-Data-Analysis-in-R-Case-Study
程序员文章站
2024-01-30 23:22:34
...
17-R-DataCamp-Exploratory-Data-Analysis-in-R-Case-Study
1. Data Cleaning and Summarizing with dplyr
1.1 The United Nations Voting Dataset (video)
1.2 Filtering rows
Instruction :
# Attach dplyr for the pipe and the data-manipulation verbs
library(dplyr)

# Show the raw votes table
votes

# Keep only the rows whose vote code is 1, 2, or 3
# (the "yes", "abstain", and "no" votes)
votes %>%
  filter(vote %in% c(1, 2, 3))
1.3 Adding a year column
Instruction :
# Keep vote codes up to 3, then derive a year column from the session number
votes %>%
  filter(vote <= 3) %>%
  mutate(year = session + 1945)
1.4 Adding a country column
Instruction :
# Attach countrycode for translating country codes into names
library(countrycode)

# Look up the country name for code 100
countrycode(100, origin = "cown", destination = "country.name")

# Build votes_processed: filtered votes plus year and country columns
votes_processed <- votes %>%
  filter(vote <= 3) %>%
  mutate(
    year = 1945 + session,
    country = countrycode(ccode, origin = "cown", destination = "country.name")
  )
1.5 Grouping and summarizing (video)
1.6 Summarizing the full dataset
Instruction :
# Show the processed votes
votes_processed

# Overall row count and the share of rows where vote == 1 ("yes")
votes_processed %>%
  summarize(
    total = n(),
    percent_yes = mean(vote == 1)
  )
1.7 Summarizing by year
Instruction :
# Same summary as before, now computed once per year
votes_processed %>%
  group_by(year) %>%
  summarise(
    total = n(),
    percent_yes = mean(vote == 1)
  )
1.8 Summarizing by country
Instruction :
# by_country: vote count and share of "yes" votes for each country
by_country <- votes_processed %>%
  group_by(country) %>%
  summarise(
    total = n(),
    percent_yes = mean(vote == 1)
  )
1.9 Sorting and filtering summarized data (video)
1.10 Sorting by percentage of "yes" votes
Instruction :
# Per-country summary: number of votes and share of "yes" votes
by_country <- votes_processed %>%
  group_by(country) %>%
  summarise(
    total = n(),
    percent_yes = mean(vote == 1)
  )

# Show the summary as-is
by_country

# Lowest share of "yes" votes first
by_country %>%
  arrange(percent_yes)

# Highest share of "yes" votes first
by_country %>%
  arrange(desc(percent_yes))
1.11 Filtering summarized output
Instruction :
# Sort by share of "yes" votes, then drop countries with fewer than 100 votes
by_country %>%
  arrange(percent_yes) %>%
  filter(total >= 100)
2. Data Visualization with ggplot2
2.1 Visualization with ggplot2 (video)
2.2 Choosing an aesthetic
2.3 Plotting a line over time
Instruction :
# by_year: vote count and share of "yes" votes for each year
by_year <- votes_processed %>%
  group_by(year) %>%
  summarise(
    total = n(),
    percent_yes = mean(vote == 1)
  )

# Attach ggplot2 for plotting
library(ggplot2)

# Line chart of the yearly share of "yes" votes
ggplot(by_year, aes(year, percent_yes)) +
  geom_line()
2.4 Other ggplot2 layers
Instruction :
# Points for each year, overlaid with a smoothed trend curve
ggplot(by_year, aes(x = year, y = percent_yes)) +
  geom_point() +
  geom_smooth()
2.5 Visualizing by country (video)
2.6 Summarizing by year and country
Instruction :
# by_year_country: one summary row per (year, country) pair
by_year_country <- votes_processed %>%
  group_by(year, country) %>%
  summarise(
    total = n(),
    percent_yes = mean(vote == 1)
  )
2.7 Plotting just the UK over time
Instruction :
# One summary row per (year, country) pair
by_year_country <- votes_processed %>%
  group_by(year, country) %>%
  summarise(
    total = n(),
    percent_yes = mean(vote == 1)
  )

# Show the year-by-country summary
by_year_country

# UK_by_year: only the United Kingdom rows
UK_by_year <- by_year_country %>%
  filter(country == "United Kingdom")

# Share of "yes" votes over time for the United Kingdom
ggplot(UK_by_year, aes(year, percent_yes)) +
  geom_line()
2.8 Plotting multiple countries
Instruction :
# The four countries to compare
countries <- c(
  "United States", "United Kingdom",
  "France", "India"
)

# filtered_4_countries: summary rows for those countries only
filtered_4_countries <- by_year_country %>%
  filter(country %in% countries)

# One coloured line per country
ggplot(filtered_4_countries, aes(year, percent_yes, color = country)) +
  geom_line()
2.9 Faceting by country (video)
2.10 Faceting the time series
Instruction :
# The six countries to compare
countries <- c(
  "United States", "United Kingdom",
  "France", "Japan", "Brazil", "India"
)

# filtered_6_countries: summary rows for those countries only
filtered_6_countries <- by_year_country %>%
  filter(country %in% countries)

# One panel per country, each showing the share of "yes" votes over time
ggplot(filtered_6_countries, aes(year, percent_yes)) +
  geom_line() +
  facet_wrap(~country)
2.11 Faceting with free y-axis
Instruction :
# The six countries to compare
countries <- c(
  "United States", "United Kingdom",
  "France", "Japan", "Brazil", "India"
)

# filtered_6_countries: summary rows for those countries only
filtered_6_countries <- by_year_country %>%
  filter(country %in% countries)

# One panel per country; each panel gets its own y-axis range
ggplot(filtered_6_countries, aes(x = year, y = percent_yes)) +
  geom_line() +
  facet_wrap(~ country, scales = "free_y")
2.12 Choose your own countries
Instruction :
# Nine countries: the previous six plus Chile, China, and Colombia
countries <- c(
  "United States", "United Kingdom", "France", "Japan",
  "Brazil", "India", "Chile", "China", "Colombia"
)

# filtered_countries: summary rows for the chosen countries only
filtered_countries <- by_year_country %>%
  filter(country %in% countries)

# One panel per country; each panel gets its own y-axis range
ggplot(filtered_countries, aes(x = year, y = percent_yes)) +
  geom_line() +
  facet_wrap(~ country, scales = "free_y")
3. Tidy Modeling with Broom
3.1 Linear regression (video)
3.2 Linear regression on the United States
Instruction :
# US_by_year: yearly summary rows for the United States
US_by_year <- by_year_country %>%
  filter(country == "United States")

# Show the United States rows
US_by_year

# US_fit: linear model of the share of "yes" votes against year
US_fit <- lm(percent_yes ~ year, data = US_by_year)

# Full model summary (coefficients, standard errors, p-values, ...)
summary(US_fit)
3.3 Finding the slope of a linear regression
3.4 Finding the p-value of a linear regression
3.5 Tidying models with broom (video)
3.7 Tidying a linear regression model
Instruction :
# Attach broom for converting model objects into data frames
library(broom)

# Coefficient table of US_fit as a data frame
tidy(US_fit)
3.8 Combining models for multiple countries
Instruction :
# United States: filter the yearly summary and fit percent_yes against year
US_by_year <- by_year_country %>%
  filter(country == "United States")
US_fit <- lm(percent_yes ~ year, data = US_by_year)

# United Kingdom: same filter-and-fit steps
UK_by_year <- by_year_country %>%
  filter(country == "United Kingdom")
UK_fit <- lm(percent_yes ~ year, data = UK_by_year)

# Coefficient tables for both models
US_tidied <- tidy(US_fit)
UK_tidied <- tidy(UK_fit)

# Stack the two coefficient tables into one data frame
bind_rows(US_tidied, UK_tidied)
3.9 Nesting for multiple models (video)
3.10 Nesting a data frame
Instruction :
# Attach tidyr for nest() / unnest()
library(tidyr)

# Collapse every column except country into a per-country list column
by_year_country %>%
  nest(-country)
3.11 List columns
Instruction :
# nested: one row per country, with the remaining columns in a list column
nested <- by_year_country %>%
  nest(-country)

# Show the seventh nested data frame
# (per the exercise, this is Brazil's data; the index depends on row order)
nested[["data"]][[7]]
3.12 Unnesting
Instruction :
# nested: one row per country, with the remaining columns in a list column
nested <- by_year_country %>%
  nest(-country)

# Expand the list column back into regular rows and columns.
# The column is named explicitly: calling unnest() with no columns is
# deprecated in tidyr >= 1.0, which requires `cols`.
nested %>%
  unnest(data)
3.13 Fitting multiple models (video)
3.14 Performing linear regression on each nested dataset
Instruction :
# Attach tidyr (nesting) and purrr (map)
library(tidyr)
library(purrr)

# Nest by country, then fit percent_yes ~ year on each nested data frame
by_year_country %>%
  nest(-country) %>%
  mutate(model = map(data, ~ lm(percent_yes ~ year, .)))
3.15 Tidy each linear regression model
Instruction :
# Attach broom for tidy()
library(broom)

# Per-country models, followed by a list column of their coefficient tables
by_year_country %>%
  nest(-country) %>%
  mutate(
    model = map(data, ~ lm(percent_yes ~ year, data = .))
  ) %>%
  mutate(
    tidied = map(model, tidy)
  )
3.16 Unnesting a data frame
Instruction :
# country_coefficients: fit one model per country, tidy each model,
# then expand the tidied coefficient tables into regular rows
country_coefficients <- by_year_country %>%
  nest(-country) %>%
  mutate(
    model = map(data, ~ lm(percent_yes ~ year, data = .)),
    tidied = map(model, tidy)
  ) %>%
  unnest(tidied)

# Show the combined coefficient table
country_coefficients
3.17 Working with many tidy models (video)
3.18 Filtering model terms
Instruction :
# Show every coefficient row
country_coefficients

# Keep the slope rows only (term "year"), dropping the intercepts
country_coefficients %>%
  filter(term == "year")
3.19 Filtering for significant countries
Instruction :
# slope_terms: the "year" coefficient of each country's model
slope_terms <- country_coefficients %>%
  filter(term == "year")

# Adjust the p-values for multiple testing, then keep slopes below .05
slope_terms %>%
  mutate(p.adjusted = p.adjust(p.value)) %>%
  filter(p.adjusted < .05)
3.20 Sorting by slope
Instruction :
# filtered_countries: slope terms whose adjusted p-value is below .05
filtered_countries <- country_coefficients %>%
  filter(term == "year") %>%
  mutate(p.adjusted = p.adjust(p.value)) %>%
  filter(p.adjusted < .05)

# Sort for the countries increasing most quickly:
# the largest positive slopes must come first, so sort descending
filtered_countries %>%
  arrange(desc(estimate))

# Sort for the countries decreasing most quickly:
# the most negative slopes must come first, so sort ascending
filtered_countries %>%
  arrange(estimate)
4. Joining and Tidying
4.1 Joining datasets (video)
4.2 Joining datasets with inner_join
Instruction :
# Attach dplyr for the join verbs
library(dplyr)

# Show the processed votes
votes_processed

# Show the roll-call descriptions
descriptions

# votes_joined: rows that match on both rcid and session
votes_joined <- votes_processed %>%
  inner_join(
    descriptions,
    by = c("rcid", "session")
  )
4.3 Filtering the joined dataset
Instruction :
# Keep the votes flagged as colonialism-related (co == 1)
votes_joined %>%
  filter(co == 1)
4.4 Visualizing colonialism votes
Instruction :
# Attach ggplot2 for plotting
library(ggplot2)

# US_co_by_year: yearly share of "yes" votes by the United States
# on colonialism-flagged votes
US_co_by_year <- votes_joined %>%
  filter(country == "United States", co == 1) %>%
  group_by(year) %>%
  summarise(percent_yes = mean(vote == 1))

# Line chart of that yearly share
ggplot(US_co_by_year, aes(year, percent_yes)) +
  geom_line()
4.5 Tidy data (video)
4.6 Tidy data observations
4.7 Using gather to tidy a dataset
Instruction :
# Attach tidyr for gather()
library(tidyr)

# Stack the me..ec topic columns into topic / has_topic pairs
votes_joined %>%
  gather(key = topic, value = has_topic, me:ec)

# votes_gathered: the same reshaping, keeping only rows where the topic applies
votes_gathered <- votes_joined %>%
  gather(key = topic, value = has_topic, me:ec) %>%
  filter(has_topic == 1)
4.8 Recoding the topics
Instruction :
# votes_tidied: swap each two-letter topic code for a readable label
votes_tidied <- votes_gathered %>%
  mutate(
    topic = recode(
      topic,
      me = "Palestinian conflict",
      nu = "Nuclear weapons and nuclear material",
      di = "Arms control and disarmament",
      hr = "Human rights",
      co = "Colonialism",
      ec = "Economic development"
    )
  )
4.9 Summarize by country, year, and topic
Instruction :
# Show the tidied votes
votes_tidied

# by_country_year_topic: vote count and share of "yes" votes for each
# country / year / topic combination, with grouping removed afterwards
by_country_year_topic <- votes_tidied %>%
  group_by(country, year, topic) %>%
  summarise(
    total = n(),
    percent_yes = mean(vote == 1)
  ) %>%
  ungroup()

# Show the summary
by_country_year_topic
4.10 Visualizing trends in topics for one country
Instruction :
# Attach ggplot2 for plotting
library(ggplot2)

# US_by_country_year_topic: the United States rows of the summary
US_by_country_year_topic <- by_country_year_topic %>%
  filter(country == "United States")

# Share of "yes" votes over time, one panel per topic
ggplot(US_by_country_year_topic, aes(year, percent_yes)) +
  geom_line() +
  facet_wrap(~ topic)
4.11 Tidy modeling by topic and country (video)
4.12 Nesting by topic and country
Instruction :
# Attach the modelling helpers: purrr (map), tidyr (nest/unnest), broom (tidy)
library(purrr)
library(tidyr)
library(broom)

# Show the country / year / topic summary
by_country_year_topic

# country_topic_coefficients: one linear model per (country, topic) pair,
# tidied and expanded into one row per model term
country_topic_coefficients <- by_country_year_topic %>%
  nest(-country, -topic) %>%
  mutate(
    model = map(data, ~ lm(percent_yes ~ year, data = .)),
    tidied = map(model, tidy)
  ) %>%
  unnest(tidied)

# Show the coefficient table
country_topic_coefficients
4.13 Interpreting tidy models
Instruction :
# country_topic_filtered: slope terms whose adjusted p-value is below .05
country_topic_filtered <- country_topic_coefficients %>%
  filter(term == "year") %>%
  mutate(p.adjusted = p.adjust(p.value)) %>%
  filter(p.adjusted < .05)
4.14 Steepest trends by topic
4.15 Checking models visually
Instruction :
# Create vanuatu_by_country_year_topic: the Vanuatu rows of the summary.
# The filtered result must be assigned, because the plot below refers to
# it by name.
vanuatu_by_country_year_topic <- by_country_year_topic %>%
  filter(country == "Vanuatu")

# Plot of percentage "yes" over time, faceted by topic
ggplot(vanuatu_by_country_year_topic, aes(x = year, y = percent_yes)) +
  geom_line() +
  facet_wrap(~topic)
4.16 Conclusion
上一篇: gitlab修改用户密码