Question 1

Question 2

Loading R packages for Homework Assignment 3

library(tidyverse)
library(lubridate)
library(ggthemes)

Question 1

Read the data file, NY_school_enrollment_socioecon.csv, as the data.frame object with the name, NY_school_enrollment_socioecon, using (1) the read_csv() function and (2) its URL, https://bcdanl.github.io/data/NY_school_enrollment_socioecon.csv.

# Address of the NY school enrollment / socioeconomic CSV file
url <- "https://bcdanl.github.io/data/NY_school_enrollment_socioecon.csv"
# Download the file and parse it with readr
NY_school_enrollment_socioecon <- read_csv(file = url)

For a description of the variables in NY_school_enrollment_socioecon, refer to the file, ny_school_enrollment_socioecon_description.zip, which is in the Files section of our Canvas web-page. (I recommend that you extract the zip file, and then read the file, ny_school_enrollment_socioecon_description.csv, using Excel or Numbers.)

Q1a

Look up the meaning of the variable pincp in the data.frame, NY_school_enrollment_socioecon. Create the data frame, NY_pincp, which has only the following four variables from NY_school_enrollment_socioecon:

fips
year
county_name
pincp

# Keep only the county identifier, year, county name, and pincp.
# The columns are named explicitly so the result does not depend on their
# position in the source file (a `fips:pincp` range would also pick up any
# column that happened to sit between the two endpoints).
NY_pincp <- NY_school_enrollment_socioecon %>%
  select(fips, year, county_name, pincp)

NY_pincp

ABCDEFGHIJ0123456789

fips <dbl>	year <dbl>	county_name <chr>	pincp <dbl>
36001	2015	Albany	55120
36001	2016	Albany	55126
36001	2017	Albany	58814
36001	2018	Albany	59547
36001	2019	Albany	61876
36001	2020	Albany	66632
36003	2015	Allegany	32205
36003	2016	Allegany	32417
36003	2017	Allegany	34001
36003	2018	Allegany	34553

Q1b

Create the data frame, NY_pincp_wide, which has one observation for each county (62 observations in total) and the following eight variables:

fips: ID number for county
county_name: County Name
pincp2015: Annual personal income in year 2015
pincp2016: Annual personal income in year 2016
pincp2017: Annual personal income in year 2017
pincp2018: Annual personal income in year 2018
pincp2019: Annual personal income in year 2019
pincp2020: Annual personal income in year 2020

# Reshape to wide form: each distinct year becomes its own column named
# pincp<year> (e.g. pincp2015) that holds the pincp value for that year.
NY_pincp_wide <- pivot_wider(
  NY_pincp,
  names_from = year,
  values_from = pincp,
  names_prefix = "pincp"
)

Q1c

Use the data.frame NY_pincp_wide from Q1b to create the data frame, NY_pincp_long, which has six observations for each county (372 observations in total) and the following four variables:

fips: ID number for county
year: integer variable of year (e.g., The possible values of year are 2015, 2016, 2017, 2018, 2019, and 2020.)
county_name: County Name
pincp: Annual personal income

# (1) using separate()
# Stack the pincp2015 ... pincp2020 columns back into year/pincp pairs, then
# split each "pincp<year>" label on the literal text "pincp". The piece in
# front of the separator is empty ("dum") and is dropped afterwards;
# convert = TRUE lets separate() turn the remaining year text into a number.
NY_pincp_long <- NY_pincp_wide %>%
  pivot_longer(cols = pincp2015:pincp2020,
               names_to = "year",
               values_to = "pincp") %>%
  separate(year, into = c("dum", "year"),
           sep = "pincp", convert = TRUE) %>%
  select(-dum)

# (2) using str_replace()
NY_pincp_long <- NY_pincp_wide %>%
  pivot_longer(
    cols = pincp2015:pincp2020,
    names_to = "year",
    values_to = "pincp"
  ) %>%
  # Remove the "pincp" prefix from each label and store the rest as an integer
  mutate(year = as.integer(str_replace(year, "pincp", "")))

Q1d

Create the data.frame, NY_pincp_geo, by joining the two data.frames, NY_county_geo and NY_pincp.

# County geography data for New York State, read from the course site
NY_county_geo <- read_csv(file = "https://bcdanl.github.io/data/NY_county_geo.csv")

The data.frame, NY_pincp_geo, must include all the observations and variables in the data.frame, NY_county_geo.

# Left join: every row and column of NY_county_geo is kept, and the NY_pincp
# rows whose fips equals the geo table's FIPS are attached to it.
NY_pincp_geo <- left_join(
  NY_county_geo,
  NY_pincp,
  by = c("FIPS" = "fips")
)

Q1e

In the following ggplot code with geom_polygon(), replace the blanks ([?]) with the appropriate object to draw a yearly county map of pincp.

# ggplot(data = [?]) +
#   geom_polygon(mapping = aes(x = [?], y = [?], group = group, 
#                              fill = [?] ) ,
#                color = "grey", size = 0.1) +
#   facet_wrap( [?] ) +
#   labs( title = "Income in NY State" ) +
#   coord_map("bonne", parameters = 41.6) +  # for better aspect ratio
#   scale_fill_gradient2( #  mapping pincp rate to color level
#   low = 'red', 
#   high = 'blue', 
#   na.value = "grey50",
#   midpoint = quantile(
#     NY_school_enrollment_socioecon$pincp, .50)) +
#   theme_map() +    # a better theme for map drawing
#   theme(legend.position='top')

# Yearly county map of pincp: one panel per year, with counties filled on a
# red-to-blue diverging scale whose midpoint is the median pincp of the
# full NY_school_enrollment_socioecon data.
ggplot(data = NY_pincp_geo) +
  geom_polygon(mapping = aes(x = long, y = lat, group = group,
                             fill = pincp),
               color = "grey", size = 0.1) +
  facet_wrap(~ year) +
  labs(title = "Average Income in NY Counties, 2015-2020",
       caption = "Source: Bureau of Labor Statistics") +
  coord_map("bonne", parameters = 41.6) +  # for better aspect ratio
  scale_fill_gradient2(  # mapping pincp values to color levels
    low = "red",
    high = "blue",
    na.value = "grey50",
    # quantile() stops on missing values unless na.rm = TRUE, so drop them
    # here to keep the midpoint computable if any pincp is NA
    midpoint = quantile(
      NY_school_enrollment_socioecon$pincp, .50, na.rm = TRUE),
    name = "Average Income") +
  theme_map() +    # a better theme for map drawing
  theme(legend.position = "bottom",
        plot.title = element_text(hjust = .5, size = 16,
                                  color = "purple"),
        strip.background = element_rect(color = "white",
                                        fill = "white"))

Question 2

Read the data file, tech_stock.csv, as the data.frame object with the name, tech_stock, using (1) the read_csv() function and (2) its URL, https://bcdanl.github.io/data/tech_stock.csv.

# Read the daily stock data from the course site, then print it
tech_stock <- read_csv(file = "https://bcdanl.github.io/data/tech_stock.csv")
tech_stock

ABCDEFGHIJ0123456789

Date <date>	Open <dbl>	High <dbl>	Low <dbl>	Close <dbl>	Adj Close <dbl>	Volume <dbl>	company <chr>
1980-12-12	0.128348	0.128906	0.128348	0.128348	0.099874	469033600	Apple
1980-12-15	0.122210	0.122210	0.121652	0.121652	0.094663	175884800	Apple
1980-12-16	0.113281	0.113281	0.112723	0.112723	0.087715	105728000	Apple
1980-12-17	0.115513	0.116071	0.115513	0.115513	0.089886	86441600	Apple
1980-12-18	0.118862	0.119420	0.118862	0.118862	0.092492	73449600	Apple
1980-12-19	0.126116	0.126674	0.126116	0.126116	0.098137	48630400	Apple
1980-12-22	0.132254	0.132813	0.132254	0.132254	0.102913	37363200	Apple
1980-12-23	0.137835	0.138393	0.137835	0.137835	0.107256	46950400	Apple
1980-12-24	0.145089	0.145647	0.145089	0.145089	0.112901	48003200	Apple
1980-12-26	0.158482	0.159040	0.158482	0.158482	0.123323	55574400	Apple

Q2a

Describe the daily trend of Close for each company since 2010 in one ggplot.

# Daily closing price for each company from 2010 onward, one colored line
# per company. `>=` is used so that 2010-01-01 itself counts as "since 2010".
tech_stock %>%
  filter(Date >= ymd("2010-01-01")) %>%
  ggplot(aes(x = Date, y = Close, color = company)) +
  geom_line() +
  hrbrthemes::theme_ipsum()

# Same plot as above, but the company factor is reordered with
# fct_reorder2(company, Date, Close), which changes the legend order.
# `>=` is used so that 2010-01-01 itself counts as "since 2010".
tech_stock %>%
  filter(Date >= ymd("2010-01-01")) %>%
  ggplot(aes(x = Date, y = Close,
             color = fct_reorder2(company, Date, Close))) +
  geom_line() +
  # without an explicit title the legend would be headed by the raw
  # fct_reorder2(...) expression
  labs(color = "company") +
  hrbrthemes::theme_ipsum()

Q2b

Describe the relationship between log(Close) and log(Volume) for the companies in tech_stock.

# Base plot: log(Close) on the x axis, log(Volume) on the y axis
p0 <- ggplot(tech_stock, aes(x = log(Close), y = log(Volume)))

p0 +
  # hollow, nearly transparent points; color is mapped only in this layer
  geom_point(aes(color = company), alpha = .05, shape = 1) +
  # one linear fit over all observations (no color grouping in this layer)
  geom_smooth(method = lm) +
  # draw the legend keys fully opaque so the colors stay readable
  guides(colour = guide_legend(override.aes = list(alpha = 1)))

Q2c

Does the relationship between log(Close) and log(Volume) vary by company and by time period?

# all years
# log(Close) vs. log(Volume) with a separate panel and linear fit for each
# company. Color is mapped at the plot level, so the fit is per company.
p0 <- ggplot(tech_stock,
             aes(x = log(Close), y = log(Volume),
                 color = company))
p0 +
  geom_point(alpha = .05, shape = 1) +
  geom_smooth(method = lm) +
  facet_grid(. ~ company) +
  labs(title = "All years") +
  # theme_bw() is a complete theme, so the theme_minimal() call that used to
  # precede it had no effect on the plot and has been removed
  theme_bw() +
  # the facets already label the companies, so the color legend is dropped
  guides(color = "none")

# since 2017
# Keep observations dated 2017-01-01 or later and record each one's
# calendar year so the plot can be faceted by year.
tech_stock2 <- tech_stock %>%
  filter(Date >= ymd("2017-01-01")) %>%
  mutate(year = year(Date))

# log(Close) vs. log(Volume) in a year-by-company grid of panels, with a
# linear fit (no confidence band) in each panel.
p0 <- ggplot(tech_stock2,
             aes(x = log(Close), y = log(Volume),
                 color = company))
p0 +
  geom_point(alpha = .075, shape = 1) +
  # FALSE is spelled out: `F` is an ordinary variable that can be reassigned
  geom_smooth(method = lm, se = FALSE, size = .75) +
  # free x scales let each company column use its own log(Close) range
  facet_grid(year ~ company, scales = "free_x") +
  labs(title = "Since 2017") +
  theme_bw() +
  guides(color = "none")

DANL 200: Introduction to Data Analytics

DANL 200 - Homework Assignment 3 - Example Answers

Byeong-Hak Choe

2023-02-14

Loading R packages for Homework Assignment 3

Question 1

Q1a

Q1b

Q1c

Q1d

Q1e

Question 2

Q2a

Q2b

Q2c