library(tidyverse)
library(lubridate)
library(ggthemes)
Read the data file, NY_school_enrollment_socioecon.csv, as the data.frame
object with the name, NY_school_enrollment_socioecon, using (1) the
read_csv() function and (2) its URL,
https://bcdanl.github.io/data/NY_school_enrollment_socioecon.csv.
# Location of the NY school enrollment / socioeconomic CSV file.
url <- "https://bcdanl.github.io/data/NY_school_enrollment_socioecon.csv"
# Read the CSV from the URL into a tibble.
NY_school_enrollment_socioecon <- read_csv(url)
For description of variables in
NY_school_enrollment_socioecon
, refer to the file,
ny_school_enrollment_socioecon_description.zip
, which is in
the Files section in our Canvas web-page. (I recommend you to extract
the zip file, and then read the file,
ny_school_enrollment_socioecon_description.csv, using Excel or
Numbers.)
Look up the meaning of variable pincp
from the
data.frame, NY_pincp
. Create the data frame,
NY_pincp
, which has only the following four variables from
NY_school_enrollment_socioecon
:
fips
year
county_name
pincp
# Keep the contiguous run of columns from fips through pincp
# (printed below as fips, year, county_name, pincp).
NY_pincp <- NY_school_enrollment_socioecon %>%
  select(fips:pincp)
# Print the result.
NY_pincp
fips <dbl> | year <dbl> | county_name <chr> | pincp <dbl> | |
---|---|---|---|---|
36001 | 2015 | Albany | 55120 | |
36001 | 2016 | Albany | 55126 | |
36001 | 2017 | Albany | 58814 | |
36001 | 2018 | Albany | 59547 | |
36001 | 2019 | Albany | 61876 | |
36001 | 2020 | Albany | 66632 | |
36003 | 2015 | Allegany | 32205 | |
36003 | 2016 | Allegany | 32417 | |
36003 | 2017 | Allegany | 34001 | |
36003 | 2018 | Allegany | 34553 |
Create the data frame, NY_pincp_wide
, which has one
observation for each county (62 observations in total) and the following
eight variables:
fips
: ID number for county
county_name
: County Name
pincp2015
: Annual personal income in year 2015
pincp2016
: Annual personal income in year 2016
pincp2017
: Annual personal income in year 2017
pincp2018
: Annual personal income in year 2018
pincp2019
: Annual personal income in year 2019
pincp2020
: Annual personal income in year 2020
# Reshape to one row per county: each year becomes its own column,
# named pincp<year> and holding that year's pincp value.
NY_pincp_wide <- NY_pincp %>%
  pivot_wider(
    names_from = year,
    values_from = pincp,
    names_prefix = "pincp"
  )
Use the data.frame NY_pincp_wide
from Q1b to create the
data frame, NY_pincp_long, which has six observations for each county
(372 observations in total) and the following four variables:
fips
: ID number for county
year
: integer variable of year (e.g., The possible values of year are 2015, 2016, 2017, 2018, 2019, and 2020.)
county_name
: County Name
pincp
: Annual personal income
# (1) using separate()
# Stack the pincp2015..pincp2020 columns into (year, pincp) pairs, then
# split each "pincp<year>" label on the literal "pincp": the piece before
# it is empty (dum, dropped) and the piece after it is the year, which
# convert = TRUE turns into an integer.
NY_pincp_long <- NY_pincp_wide %>%
  pivot_longer(
    cols = pincp2015:pincp2020,
    names_to = "year",
    values_to = "pincp"
  ) %>%
  separate(year,
    into = c("dum", "year"),
    sep = "pincp", convert = TRUE
  ) %>%
  select(-dum)
# (2) using str_replace()
# Same reshape as above, but strip the "pincp" prefix from the column
# label with str_replace() and convert the remainder to an integer year.
NY_pincp_long <- NY_pincp_wide %>%
  pivot_longer(
    cols = pincp2015:pincp2020,
    names_to = "year",
    values_to = "pincp"
  ) %>%
  mutate(
    year = str_replace(year, "pincp", ""),
    year = as.integer(year)
  )
Create the data.frame, NY_pincp_geo, by joining the two
data.frames, NY_county_geo and NY_pincp.
# Read the NY county geometry CSV from its URL.
NY_county_geo <- read_csv("https://bcdanl.github.io/data/NY_county_geo.csv")
The data.frame, NY_pincp_geo
, must include all the
observations and variables in the data.frame,
NY_county_geo
.
# Left join keeps every row and column of NY_county_geo and attaches the
# matching NY_pincp rows; the key is FIPS in the geo data, fips in NY_pincp.
NY_pincp_geo <- NY_county_geo %>%
  left_join(NY_pincp, by = c("FIPS" = "fips"))
In the following ggplot code with geom_polygon()
,
replace the blanks ([?]) with the appropriate object to draw a yearly
county map of pincp
.
# ggplot(data = [?]) +
# geom_polygon(mapping = aes(x = [?], y = [?], group = group,
# fill = [?] ) ,
# color = "grey", size = 0.1) +
# facet_wrap( [?] ) +
# labs( title = "Income in NY State" ) +
# coord_map("bonne", parameters = 41.6) + # for better aspect ratio
# scale_fill_gradient2( # mapping pincp rate to color level
# low = 'red',
# high = 'blue',
# na.value = "grey50",
# midpoint = quantile(
# NY_school_enrollment_socioecon$pincp, .50)) +
# theme_map() + # a better theme for map drawing
# theme(legend.position='top')
# Yearly county map of pincp: polygons filled by pincp, one facet per year,
# with a diverging red-blue fill scale centered on the median pincp.
ggplot(data = NY_pincp_geo) +
  geom_polygon(
    mapping = aes(x = long, y = lat, group = group, fill = pincp),
    color = "grey", size = 0.1
  ) +
  facet_wrap(~ year) +
  labs(
    title = "Average Income in NY Counties, 2015-2020",
    caption = "Source: Bureau of Labor Statistics"
  ) +
  coord_map("bonne", parameters = 41.6) + # for better aspect ratio
  scale_fill_gradient2( # mapping pincp values to color levels
    low = "red",
    high = "blue",
    na.value = "grey50",
    # na.rm = TRUE: quantile() stops with an error on missing values
    # unless they are removed first.
    midpoint = quantile(NY_school_enrollment_socioecon$pincp, .50,
      na.rm = TRUE
    ),
    name = "Average Income"
  ) +
  theme_map() + # a better theme for map drawing
  theme(
    legend.position = "bottom",
    plot.title = element_text(
      hjust = .5, size = 16,
      color = "purple"
    ),
    strip.background = element_rect(
      color = "white",
      fill = "white"
    )
  )
Read the data file, tech_stock.csv
, as the data.frame
object with the name, tech_stock
, using (1) the
read_csv()
function and (2) its URL,
https://bcdanl.github.io/data/tech_stock.csv
.
# Read the tech stock CSV from its URL.
tech_stock <- read_csv("https://bcdanl.github.io/data/tech_stock.csv")
# Print the result.
tech_stock
Date <date> | Open <dbl> | High <dbl> | Low <dbl> | Close <dbl> | Adj Close <dbl> | Volume <dbl> | company <chr> |
---|---|---|---|---|---|---|---|
1980-12-12 | 0.128348 | 0.128906 | 0.128348 | 0.128348 | 0.099874 | 469033600 | Apple |
1980-12-15 | 0.122210 | 0.122210 | 0.121652 | 0.121652 | 0.094663 | 175884800 | Apple |
1980-12-16 | 0.113281 | 0.113281 | 0.112723 | 0.112723 | 0.087715 | 105728000 | Apple |
1980-12-17 | 0.115513 | 0.116071 | 0.115513 | 0.115513 | 0.089886 | 86441600 | Apple |
1980-12-18 | 0.118862 | 0.119420 | 0.118862 | 0.118862 | 0.092492 | 73449600 | Apple |
1980-12-19 | 0.126116 | 0.126674 | 0.126116 | 0.126116 | 0.098137 | 48630400 | Apple |
1980-12-22 | 0.132254 | 0.132813 | 0.132254 | 0.132254 | 0.102913 | 37363200 | Apple |
1980-12-23 | 0.137835 | 0.138393 | 0.137835 | 0.137835 | 0.107256 | 46950400 | Apple |
1980-12-24 | 0.145089 | 0.145647 | 0.145089 | 0.145089 | 0.112901 | 48003200 | Apple |
1980-12-26 | 0.158482 | 0.159040 | 0.158482 | 0.158482 | 0.123323 | 55574400 | Apple |
Describe the daily trend of Close
for each company since
2010 in one ggplot.
# Daily Close per company since 2010, one colored line per company.
# >= keeps the first day of 2010 itself, matching "since 2010".
ggplot(
  filter(tech_stock, Date >= ymd("2010-01-01")),
  aes(x = Date, y = Close, color = company)
) +
  geom_line() +
  hrbrthemes::theme_ipsum()
# Same plot, but fct_reorder2() orders the legend by each company's Close
# at the latest Date, so legend order follows the right-hand line ends.
# >= keeps the first day of 2010 itself, matching "since 2010".
ggplot(
  filter(tech_stock, Date >= ymd("2010-01-01")),
  aes(
    x = Date, y = Close,
    color = fct_reorder2(company, Date, Close)
  )
) +
  geom_line() +
  hrbrthemes::theme_ipsum()
Describe the relationship between
log(Close)
and
log(Volume)
for the companies in
tech_stock
.
# Base plot: log(Close) on x, log(Volume) on y, all companies pooled.
p0 <- ggplot(
  tech_stock,
  aes(x = log(Close), y = log(Volume))
)
# Points are colored by company; the color aesthetic is set only on the
# point layer, so geom_smooth() fits a single line across all companies.
p0 +
  geom_point(
    alpha = .05,
    shape = 1,
    aes(color = company)
  ) +
  geom_smooth(method = lm) +
  # Draw legend keys fully opaque even though the points are nearly transparent.
  guides(colour = guide_legend(override.aes = list(alpha = 1)))
Does the relationship between log(Close)
and
log(Volume)
vary by companies and by the time periods?
# all years
# Color is mapped in the base plot, so points and the lm fit are both
# drawn per company.
p0 <- ggplot(
  tech_stock,
  aes(
    x = log(Close), y = log(Volume),
    color = company
  )
)
# One panel per company. The original also added theme_minimal(), but the
# later theme_bw() is a complete theme and fully replaced it, so it is
# dropped here with no change to the output.
p0 +
  geom_point(alpha = .05, shape = 1) +
  geom_smooth(method = lm) +
  facet_grid(. ~ company) +
  labs(title = "All years") +
  theme_bw() +
  guides(color = "none")
# since 2017
# Keep rows dated 2017-01-01 or later and add the calendar year of Date,
# used below as the facet row variable.
tech_stock2 <- tech_stock %>%
  filter(Date >= ymd("2017-01-01")) %>%
  mutate(year = year(Date))
p0 <- ggplot(
  tech_stock2,
  aes(
    x = log(Close), y = log(Volume),
    color = company
  )
)
# Year-by-company grid of panels; scales = "free_x" frees the x-axis
# range across panels. The lm fit is drawn without a confidence band.
p0 +
  geom_point(alpha = .075, shape = 1) +
  geom_smooth(method = lm, se = FALSE, size = .75) +
  facet_grid(year ~ company, scales = "free_x") +
  labs(title = "Since 2017") +
  theme_bw() +
  guides(color = "none")