16-R-DataCamp-Exploratory-Data-Analysis-in-R
程序员文章站
2024-01-30 23:26:52
...
16-R-DataCamp-Exploratory-Data-Analysis-in-R
1. Exploring Categorical Data
1.1 Exploring categorical data (video)
1.2 Bar chart expectations
1.3 Contingency table review
Instruction:
# Print the comics dataset
comics
# List the factor levels of the alignment column
levels(comics$align)
# List the factor levels of the gender column
levels(comics$gender)
# Cross-tabulate alignment (rows) against gender (columns)
table(comics$align, comics$gender)
1.4 Dropping levels
Instruction:
# Attach dplyr, which supplies filter()
library(dplyr)
# Print tab (created outside this snippet)
tab
# Keep rows whose align is not "Reformed Criminals", then drop factor levels
# that no longer occur in the remaining rows
comics_filtered <- droplevels(filter(comics, align != "Reformed Criminals"))
# Print the filtered data
comics_filtered
1.5 Side-by-side barcharts
Instruction:
# Attach ggplot2 for the plots below
library(ggplot2)
# Dodged bar chart: alignment on the x-axis, one bar per gender
ggplot(comics, aes(x = align, fill = gender)) +
  geom_bar(position = "dodge")
# Dodged bar chart: gender on the x-axis, one bar per alignment;
# the x-axis tick labels are rotated by 90 degrees
ggplot(comics, aes(x = gender, fill = align)) +
  geom_bar(position = "dodge") +
  theme(axis.text.x = element_text(angle = 90))
1.6 Bar chart interpretation
1.7 Counts vs proportions (video)
1.8 Conditional proportions
1.9 Counts vs proportions (2)
Instruction:
# Bar chart of counts: alignment on the x-axis, bars filled by gender
ggplot(comics, aes(x = align, fill = gender)) +
  geom_bar()
# Same chart with position = "fill", so each bar shows the gender
# proportions within that alignment; the y-axis is relabelled accordingly
ggplot(comics, aes(x = align, fill = gender)) +
  geom_bar(position = "fill") +
  ylab("proportion")
1.10 Distribution of one variable (video)
1.11 Marginal barchart
Instruction:
# Re-create align as a factor with the level order Bad, Neutral, Good.
# Note: factor() turns any value not listed in `levels` into NA.
comics$align <- factor(
  comics$align,
  levels = c("Bad", "Neutral", "Good")
)
# Bar chart of the alignment counts in the new level order
ggplot(comics, aes(x = align)) +
  geom_bar()
1.12 Conditional barchart
Instruction:
# Bar chart of alignment counts, drawn in a separate panel for each gender
ggplot(comics, aes(x = align)) +
  geom_bar() +
  facet_wrap(~ gender)
1.13 Improve piechart
Instruction:
# Desired display order for the flavor levels
lev <- c(
  "apple", "key lime", "boston creme", "blueberry",
  "cherry", "pumpkin", "strawberry"
)
# Re-create flavor as a factor that uses that level order
pies$flavor <- factor(pies$flavor, levels = lev)
# Bar chart of flavor counts with chartreuse bars and x-axis labels
# rotated by 90 degrees
ggplot(pies, aes(x = flavor)) +
  geom_bar(fill = "chartreuse") +
  theme(axis.text.x = element_text(angle = 90))
2. Exploring Numerical Data
2.1 Exploring numerical data (video)
2.2 Faceted histogram
Instruction:
# Attach ggplot2 for the plot below
library(ggplot2)
# Show the structure (columns and types) of cars
str(cars)
# Histogram of city_mpg, drawn in a separate panel for each value of suv
ggplot(cars, aes(x = city_mpg)) +
  geom_histogram() +
  facet_wrap(~ suv)
2.3 Boxplots and density plots
Instruction:
# Keep only the cars whose ncyl is 4, 6 or 8
common_cyl <- filter(cars, ncyl %in% c(4, 6, 8))
# Box plots of city_mpg, one box per cylinder count
# (ncyl is converted to a factor so it is treated as categorical)
ggplot(common_cyl, aes(x = as.factor(ncyl), y = city_mpg)) +
  geom_boxplot()
# Overlaid, semi-transparent density curves of city_mpg, one per
# cylinder count
ggplot(common_cyl, aes(x = city_mpg, fill = as.factor(ncyl))) +
  geom_density(alpha = .3)
2.4 Compare distribution via plots
2.5 Distribution of one variable (video)
2.6 Marginal and conditional histograms
Instruction:
# Histogram of horsepwr over all cars
ggplot(cars, aes(x = horsepwr)) +
  geom_histogram() +
  ggtitle("hist of horsepwr")
# Histogram of horsepwr for cars with msrp below 25000; the x-axis
# limits are fixed to 90-550
ggplot(filter(cars, msrp < 25000), aes(x = horsepwr)) +
  geom_histogram() +
  xlim(90, 550) +
  ggtitle("hist of horsepwr for affordable cars")
2.7 Marginal and conditional histograms interpretation
2.8 Three binwidths
Instruction:
# The same horsepwr histogram drawn three times with increasingly
# wide bins.

# Bin width 3
ggplot(cars, aes(x = horsepwr)) +
  geom_histogram(binwidth = 3) +
  ggtitle("hist of horsepwr with binwidth of 3")
# Bin width 30
ggplot(cars, aes(x = horsepwr)) +
  geom_histogram(binwidth = 30) +
  ggtitle("hist of horsepwr with binwidth of 30")
# Bin width 60
ggplot(cars, aes(x = horsepwr)) +
  geom_histogram(binwidth = 60) +
  ggtitle("hist of horsepwr with binwidth of 60")
2.9 Three binwidths interpretation
2.10 Box plots (video)
2.11 Box plots for outliers
Instruction 1:
# Single box plot of msrp (x = 1 is a dummy position for the one box)
ggplot(cars, aes(x = 1, y = msrp)) +
  geom_boxplot()
# Keep only the cars with msrp below 100000
cars_no_out <- filter(cars, msrp < 100000)
# Box plot of msrp again, this time on the reduced data
ggplot(cars_no_out, aes(x = 1, y = msrp)) +
  geom_boxplot()
2.12 Plot selection
Instruction:
# Single box plot of city_mpg (x = 1 is a dummy position for the one box)
ggplot(cars, aes(x = 1, y = city_mpg)) +
  geom_boxplot()
# Density curve of width
ggplot(cars, aes(x = width)) +
  geom_density()
2.13 Visualization in higher dimensions (video)
2.14 3 variable plot
Instruction:
# Histograms of hwy_mpg arranged in a grid: one row per ncyl value and
# one column per suv value
ggplot(common_cyl, aes(x = hwy_mpg)) +
  geom_histogram() +
  facet_grid(ncyl ~ suv) +
  ggtitle("Facet hists using hwy mileage and ncyl")
2.15 Interpret 3 var plot
3. Numerical Summaries
3.1 Measures of center (video)
3.2 Choice of center measure
3.3 Calculate center measures
Instruction 1:
# Keep only the observations from the year 2007. An exact match is used
# (not >=) so that gap2007 really is single-year data, as its name says.
gap2007 <- filter(gapminder, year == 2007)
# Mean and median of lifeExp within each continent
gap2007 %>%
  group_by(continent) %>%
  summarize(mean(lifeExp),
            median(lifeExp))
# Box plots of lifeExp, one box per continent
gap2007 %>%
  ggplot(aes(x = continent, y = lifeExp)) +
  geom_boxplot()
3.4 Measures of variability (video)
3.5 Choice of spread measure
3.6 Calculate spread measures
Instruction:
# Per continent: standard deviation and interquartile range of lifeExp,
# plus the number of rows in the group
summarize(
  group_by(gap2007, continent),
  sd(lifeExp),
  IQR(lifeExp),
  n()
)
# Overlaid, semi-transparent density curves of lifeExp, one per continent
ggplot(gap2007, aes(x = lifeExp, fill = continent)) +
  geom_density(alpha = 0.3)
3.7 Choose measures for center and spread
Instruction:
# Mean and standard deviation of lifeExp for the rows where continent
# is "Americas"
summarize(
  filter(gap2007, continent == "Americas"),
  mean(lifeExp),
  sd(lifeExp)
)
# Median and interquartile range of pop over all rows
summarize(
  gap2007,
  median(pop),
  IQR(pop)
)
3.8 Shape and transformations (video)
3.9 Describe the shape
3.10 Transformations
Instruction:
# Density curve of the untransformed pop variable
ggplot(gap2007, aes(x = pop)) +
  geom_density()
# Add log_pop, the natural logarithm of pop, as a new column
gap2007 <- mutate(gap2007, log_pop = log(pop))
# Density curve of the log-transformed variable
ggplot(gap2007, aes(x = log_pop)) +
  geom_density()
3.11 Outliers (video)
3.12 Identify outliers
Instruction:
# Keep the Asia rows and flag each one whose lifeExp is below 50
gap_asia <- mutate(
  filter(gap2007, continent == "Asia"),
  is_outlier = lifeExp < 50
)
# Box plot of lifeExp for the rows that are not flagged
# (x = 1 is a dummy position for the single box)
ggplot(filter(gap_asia, !is_outlier), aes(x = 1, y = lifeExp)) +
  geom_boxplot()
4. Case Study
4.1 Introducing the data (video)
4.2 Spam and num char
Instruction:
# Attach the plotting, data-manipulation and dataset packages
library(ggplot2)
library(dplyr)
library(openintro)
# Median and interquartile range of num_char within each spam group
summarize(
  group_by(email, spam),
  median(num_char),
  IQR(num_char)
)
# Box plots of log(num_char), one box per spam group
ggplot(
  mutate(email, log_num_char = log(num_char)),
  aes(x = spam, y = log_num_char)
) +
  geom_boxplot()
4.3 Spam and num char interpretation
4.4 Spam and !!!
Instruction:
# Median and interquartile range of exclaim_mess within each spam group
summarize(
  group_by(email, spam),
  median(exclaim_mess),
  IQR(exclaim_mess)
)
# Histograms of log(exclaim_mess + 0.01), one panel per spam group.
# The 0.01 offset keeps the logarithm finite when exclaim_mess is 0.
ggplot(
  mutate(email, log_exclaim_mess = log(exclaim_mess + 0.01)),
  aes(x = log_exclaim_mess)
) +
  geom_histogram() +
  facet_wrap(~spam)
4.5 Spam and !!! interpretation
4.6 Check-in 1 (video)
4.7 Collapsing levels
Instruction:
# Collapse image into a logical has_image (TRUE when image > 0), then
# draw filled bars showing the spam proportions within each has_image value
ggplot(
  mutate(email, has_image = image > 0),
  aes(x = has_image, fill = spam)
) +
  geom_bar(position = "fill")
4.8 Image and spam interpretation
4.9 Data Integrity
Instruction:
# Test if images count as attachments.
# If every image is also counted as an attachment, no email can have more
# images than attachments, so the number of rows where image > attach
# should be 0. The comparison is made per email; comparing the two column
# totals would only yield a single TRUE/FALSE and could not test this.
sum(email$image > email$attach)
4.10 Answering questions with chains
Instruction:
# Question 1: among emails with dollar > 0, the median of dollar within
# each spam group
summarize(
  group_by(filter(email, dollar > 0), spam),
  median(dollar)
)
# Question 2: among emails with dollar > 10, a bar chart of how many
# fall into each spam group
ggplot(filter(email, dollar > 10), aes(x = spam)) +
  geom_bar()
4.11 Check-in 2 (video)
4.12 What’s in a number
Instruction:
# Store number as a factor with the level order none, small, big
email$number_reordered <- factor(
  email$number,
  levels = c("none", "small", "big")
)
# Bar chart of number_reordered counts, one panel per spam group
ggplot(email, aes(x = number_reordered)) +
  geom_bar() +
  facet_wrap(~spam)
4.13 What’s in a number interpretation
4.14 Conclusion
上一篇: 输入一个整数n,输出这个数的正数序列