RStudio
rstudio
Environment
History
Traceback
Data
avg_payroll30 obs. of 2 variables
avg_stats_per_year148 obs. of 4 variables
avg_std_stats_per_year148 obs. of 4 variables
payroll_tab858 obs. of 15 variables
result30 obs. of 2 variables
spread_payrolls30 obs. of 3 variables
standard_payrolls30 obs. of 3 variables
Values
lahman_conList of 2
payroll_query"with total_payroll as\n (SELECT teamID, yearID, sum(salary) as payroll\n FROM Salaries \n GROUP BY teamID, yearID)\n SELECT Teams.teamID, \n Teams.yearID, \n Teams.lgID, \n payroll, \n franchID, \n rank, W,G, ((W*1.0/G)*100) as win_percentage\n FROM total_payroll, Teams\n WHERE total_payroll.yearID = Teams.yearID and\n total_payroll.teamID = Teams.teamID"
query_objectFormal class SQLiteResult
query_resultList of 2
salary_query"SELECT yearID, sum(salary) as total_payroll \n FROM Salaries \n WHERE lgID == 'AL'\n GROUP BY yearID"
Files
Plots
Packages
Help
Viewer
project2.Rmd
*
129:1
Chunk 6: problem2
R Markdown
Title
Baseball and the Power of Money
Install some packages
  
install_packages
Load the Data
  
load_data
SQL
  
sql
  
sql2
Wrangling
  
Problem 1
    
p1
  
Notes on missing data
Exploratory data analysis
Payroll Distribution
  
Problem 2.
    
problem2
  
Problem 3.
    
question1
    
question1.1
Correlation Between Payroll and Winning Percentage
  
Problem 4.
    
problem4
  
Question 2.
    
oakA
Data Transformations
  
Standardization across years
  
Problem 5.
    
problem5
  
Problem 6.
    
problem6
  
Question 3.
Expected wins
  
Problem 7.
    
problem7
Spending Efficiency
  
Problem 8
    
problem8
Question 4.
Conclusion
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# Payroll by year for seasons 1990-2014, drawn as a line in one panel per teamID.
payroll_tab %>%
  filter(yearID >= 1990, yearID <= 2014) %>%
  ggplot(aes(x = yearID, y = payroll)) +
  geom_line() +
  facet_wrap(~teamID) +
  labs(
    x = "Year",
    y = "Total Payroll",
    title = "Team Payrolls Over Time for Teams"
  ) +
  theme(
    # Small base font; x tick labels rotated 90 degrees.
    text = element_text(size = 7.5),
    axis.text.x = element_text(angle = 90, vjust = 1)
  )
# Same 1990-2014 window, all teams pooled into a single panel: one point per
# row of payroll_tab, overlaid with geom_smooth()'s default fit.
payroll_tab %>%
  filter(yearID >= 1990, yearID <= 2014) %>%
  ggplot(aes(x = yearID, y = payroll)) +
  geom_point() +
  geom_smooth() +
  labs(
    x = "Year",
    y = "Total Payroll",
    title = "Payrolls of Teams Over Time"
  )
      
```
The first figure produces a separate plot for each team. In each plot, we have the **year** on the x-axis 
and we show **total payroll** on the y-axis. With the first figure it is difficult to see, and make a 
statement about, the overall trend of all teams across many years, so we combine all of the 
teams and their payrolls in one plot (the second plot above). As you can see from the first figure, 
some teams only have payroll information for some later (more recent) years. In the second plot, 
there are many points that appear in vertical lines because our dataset has data about payrolls of 
teams in discrete years.
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
Console
~/russia_tea/
> payroll_tab %>% 
+   filter(teamID %in% c("OAK", "BOS", "NYA", "ATL", "TBA")) %>% 
+   ggplot(aes(x=yearID, y=efficiency)) +
+   geom_smooth() +
+   geom_point(aes(colour=teamID)) +
+   xlab("Year") +
+   ylab("Winning Efficiency") +
+   ggtitle("Efficiency of Teams Over Time") +
+   labs(colour="Team")
> payroll_tab %>% 
+   filter(teamID %in% c("OAK", "BOS", "NYA", "ATL", "TBA")) %>% 
+   ggplot(aes(x=yearID, y=efficiency, color = teamID)) +
+   geom_smooth() +
+   #geom_point(aes(colour=teamID)) +
+   xlab("Year") +
+   ylab("Winning Efficiency") +
+   ggtitle("Efficiency of Teams Over Time") +
+   labs(colour="Team")
> save.image("~/project2/project2_env.RData")
> install.packages("UScensus2010")
Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)
trying URL 'https://cran.rstudio.com/src/contrib/UScensus2010_0.11.tar.gz'
Content type 'application/x-gzip' length 63114 bytes (61 KB)
==================================================
downloaded 61 KB

* installing *source* package ‘UScensus2010’ ...
** package ‘UScensus2010’ successfully unpacked and MD5 sums checked
** R
** data
** inst
** preparing package for lazy loading
** help
*** installing help indices
** building package indices
** testing if installed package can be loaded
* DONE (UScensus2010)

The downloaded source packages are in
	‘/tmp/Rtmp9uHv2u/downloaded_packages’
> library(UScensus2010)
Loading required package: maptools
Loading required package: sp
Checking rgeos availability: TRUE
Loading required package: foreign


Package UScensus2010: US Census 2010 Suite of R Packages
Version 0.11 created on 2011-11-18.

Zack Almquist, University of California-Irvine
ne

For citation information, type citation("UScensus2010").
Type help(package=UScensus2010) to get started.
> help(package="UScensus2010")
> data(maryland)
Warning message:
In data(maryland) : data set ‘maryland’ not found
> data("maryland")
Warning message:
In data("maryland") : data set ‘maryland’ not found
> ?UScensus2010::county
> ?UScensus2010::state
No documentation for ‘state’ in specified packages and libraries:
you could try ‘??state’
> ?UScensus2010::city
> md <- UScensus2010::city("Maryland")
Error in if (!statefips & nchar(state) == 2) { : 
  argument is of length zero
> md <- UScensus2010::city(",aryland")
Error in if (!statefips & nchar(state) == 2) { : 
  argument is of length zero
> md <- UScensus2010::city(state="maryland")
Error in get(paste(state, ".cdp10", sep = "")) : 
  object 'maryland.cdp10' not found
In addition: Warning message:
In data(list = paste(state, ".cdp10", sep = ""), envir = parent.frame()) :
  data set ‘maryland.cdp10’ not found
> md <- UScensus2010::city(state="montgomery")
Error in city.aux(name, state, statefips, sp.object, proj) : 
  Not a State! 
> md <- UScensus2010::city("montgomery", state="maryland")
Error in get(paste(state, ".cdp10", sep = "")) : 
  object 'maryland.cdp10' not found
In addition: Warning message:
In data(list = paste(state, ".cdp10", sep = ""), envir = parent.frame()) :
  data set ‘maryland.cdp10’ not found
02 Nov 2016 15:07:22 [rsession-rstudio] ERROR session hadabend; LOGGED FROM: rstudio::core::Error {anonymous}::rInit(const rstudio::r::session::RInitInfo&) /home/ubuntu/rstudio/src/cpp/session/SessionMain.cpp:1862
Checking rgeos availability: TRUE
Error in library(packageName, lib.loc = lib, character.only = TRUE) : 
  there is no package called ‘UScensus2010’
02 Nov 2016 15:07:24 [rsession-rstudio] ERROR r error 4 (R code execution error) [errormsg=Error in library(packageName, lib.loc = lib, character.only = TRUE) : 
  there is no package called ‘UScensus2010’
, context=Error restoring session data (loading package UScensus2010)]; OCCURRED AT: rstudio::core::Error rstudio::r::exec::{anonymous}::evaluateExpressionsUnsafe(SEXP, SEXP, SEXPREC**, rstudio::r::sexp::Protect*) /home/ubuntu/rstudio/src/cpp/r/RExec.cpp:147; LOGGED FROM: void rstudio::r::session::search_path::{anonymous}::loadPackage(const string&, const string&) /home/ubuntu/rstudio/src/cpp/r/session/RSearchPath.cpp:198
Error restoring session data (loading package UScensus2010): R code execution error
> payroll_tab %>% 
+   filter(yearID >=1990 & yearID <= 2014) %>%
+     ggplot(aes(x=yearID, y=payroll)) +
+       geom_line() +
+       facet_wrap(~teamID) +
+       xlab("Year") +
+       ylab("Total Payroll") +
+       ggtitle("Team Payrolls Over Time for Teams") +
+       theme(text = element_text(size=20),
+             axis.text.x = element_text(angle=90, vjust=1))
> payroll_tab %>% 
+   filter(yearID >=1990 & yearID <= 2014) %>%
+     ggplot(aes(x=yearID, y=payroll)) +
+       geom_point() +
+       geom_smooth() +
+       xlab("Year") +
+       ylab("Total Payroll") +
+       ggtitle("Payrolls of Teams Over Time")
> payroll_tab %>% 
+   group_by(yearID) %>% 
+     summarise(avg_payroll = mean(payroll)) %>% 
+       ggplot(aes(x=yearID, y=avg_payroll)) + 
+         geom_bar(stat = "identity") +
+         xlab("Year") +
+         ylab("Average Payroll of Baseball Teams") +
+         ggtitle("Average Payroll of Baseball Teams over Time") +
+         geom_smooth()
> payroll_tab %>% 
+   filter(yearID >=1990 & yearID <= 2014) %>%
+     ggplot(aes(x=yearID, y=payroll)) +
+       geom_line() +
+       facet_wrap(~teamID) +
+       xlab("Year") +
+       ylab("Total Payroll") +
+       ggtitle("Team Payrolls Over Time for Teams") +
+       theme(text = element_text(size=10),
+             axis.text.x = element_text(angle=90, vjust=1))
> payroll_tab %>% 
+   filter(yearID >=1990 & yearID <= 2014) %>%
+     ggplot(aes(x=yearID, y=payroll)) +
+       geom_line() +
+       facet_wrap(~teamID) +
+       xlab("Year") +
+       ylab("Total Payroll") +
+       ggtitle("Team Payrolls Over Time for Teams") +
+       theme(text = element_text(),
+             axis.text.x = element_text(angle=90, vjust=1))
> payroll_tab %>% 
+   filter(yearID >=1990 & yearID <= 2014) %>%
+     ggplot(aes(x=yearID, y=payroll)) +
+       geom_point() +
+       geom_smooth() +
+       xlab("Year") +
+       ylab("Total Payroll") +
+       ggtitle("Payrolls of Teams Over Time")
> ?theme
> payroll_tab %>% 
+   group_by(yearID) %>% 
+     summarise(avg_payroll = mean(payroll)) %>% 
+       ggplot(aes(x=yearID, y=avg_payroll)) + 
+         geom_bar(stat = "identity") +
+         xlab("Year") +
+         ylab("Average Payroll of Baseball Teams") +
+         ggtitle("Average Payroll of Baseball Teams over Time") +
+         geom_smooth()
> payroll_tab %>% 
+   group_by(yearID) %>% 
+   summarise(max_payroll = max(payroll), min_payroll = min(payroll)) %>% 
+     ggplot(aes(x = yearID, y = (max_payroll-min_payroll))) +
+       geom_bar(stat = "identity") +
+       xlab("Year") +
+       ylab("Payroll Spread") +
+       ggtitle("Difference in Payroll Between Wealthy and Poor Teams Over Time") +
+       geom_smooth()
> payroll_tab %>% 
+   group_by(yearID) %>% 
+   summarise(max_payroll = max(payroll), min_payroll = min(payroll)) %>% 
+     ggplot(aes(x = yearID, y = (max_payroll-min_payroll))) +
+       geom_bar(stat = "identity") +
+       xlab("Year") +
+       ylab("Payroll Spread") +
+       ggtitle("Difference in Payroll Between Wealthy and Poor Teams Over Time") +
+       geom_smooth() +
+       scale_x_continuous(breaks = (max(payroll_tab$yearID) - min(payroll_tab$yearID)))
> payroll_tab %>% 
+   group_by(yearID) %>% 
+   summarise(max_payroll = max(payroll), min_payroll = min(payroll)) %>% 
+     ggplot(aes(x = yearID, y = (max_payroll-min_payroll))) +
+       geom_bar(stat = "identity") +
+       xlab("Year") +
+       ylab("Payroll Spread") +
+       ggtitle("Difference in Payroll Between Wealthy and Poor Teams Over Time") +
+       geom_smooth() +
+       scale_x_continuous(breaks = 10)
> payroll_tab %>% 
+   group_by(yearID) %>% 
+   summarise(max_payroll = max(payroll), min_payroll = min(payroll)) %>% 
+     ggplot(aes(x = yearID, y = (max_payroll-min_payroll))) +
+       geom_bar(stat = "identity") +
+       xlab("Year") +
+       ylab("Payroll Spread") +
+       ggtitle("Difference in Payroll Between Wealthy and Poor Teams Over Time") +
+       geom_smooth() +
+       scale_x_continuous(breaks = 10)
> payroll_tab %>% 
+   group_by(yearID) %>% 
+   summarise(max_payroll = max(payroll), min_payroll = min(payroll)) %>% 
+     ggplot(aes(x = yearID, y = (max_payroll-min_payroll))) +
+       geom_bar(stat = "identity") +
+       xlab("Year") +
+       ylab("Payroll Spread") +
+       ggtitle("Difference in Payroll Between Wealthy and Poor Teams Over Time") +
+       geom_smooth()
> payroll_tab %>% 
+   group_by(yearID) %>% 
+   summarise(max_payroll = max(payroll), min_payroll = min(payroll)) %>% 
+     ggplot(aes(x = yearID, y = (max_payroll-min_payroll))) +
+       geom_bar(stat = "identity") +
+       xlab("Year") +
+       ylab("Payroll Spread") +
+       ggtitle("Difference in Payroll Between Wealthy and Poor Teams Over Time") +
+       geom_smooth() +
+       scale_x_continuous(breaks = number_ticks(10))
Error in check_breaks_labels(breaks, labels) : 
  could not find function "number_ticks"
> ?pretty
> pretty(10,n)
Error in n%/%3 : non-numeric argument to binary operator
> pretty(10,1)
[1] 10
> pretty(10,2)
[1] 10
> pretty(10,3)
[1]  0 10
> pretty(limits,10)
Error in pretty(limits, 10) : object 'limits' not found
> ?pretty_breaks
> pretty_breaks(3)
Error: could not find function "pretty_breaks"
> payroll_tab %>% 
+   group_by(yearID) %>% 
+   summarise(max_payroll = max(payroll), min_payroll = min(payroll)) %>% 
+     ggplot(aes(x = yearID, y = (max_payroll-min_payroll))) +
+       geom_bar(stat = "identity") +
+       xlab("Year") +
+       ylab("Payroll Spread") +
+       ggtitle("Difference in Payroll Between Wealthy and Poor Teams Over Time") +
+       geom_smooth() +
+       scale_x_continuous(breaks = pretty_breaks(10))
Error in check_breaks_labels(breaks, labels) : 
  could not find function "pretty_breaks"
> payroll_tab %>% 
+   group_by(yearID) %>% 
+   summarise(max_payroll = max(payroll), min_payroll = min(payroll)) %>% 
+     ggplot(aes(x = yearID, y = (max_payroll-min_payroll))) +
+       geom_bar(stat = "identity") +
+       xlab("Year") +
+       ylab("Payroll Spread") +
+       ggtitle("Difference in Payroll Between Wealthy and Poor Teams Over Time") +
+       geom_smooth() +
+       scale_x_continuous(breaks = scales::pretty_breaks(10))
> payroll_tab %>% 
+   group_by(yearID) %>% 
+   summarise(max_payroll = max(payroll), min_payroll = min(payroll)) %>% 
+     ggplot(aes(x = yearID, y = (max_payroll-min_payroll))) +
+       geom_bar(stat = "identity") +
+       xlab("Year") +
+       ylab("Payroll Spread") +
+       ggtitle("Difference in Payroll Between Wealthy and Poor Teams Over Time") +
+       geom_smooth() +
+       scale_x_continuous(breaks = scales::pretty_breaks(15))
> ?range
> range(payroll$yearID)
Error: object 'payroll' not found
> range(payroll_tab$yearID)
[1] 1985 2014
> diff(range(payroll_tab$yearID))
[1] 29
> payroll_tab %>% 
+   group_by(yearID) %>% 
+   summarise(max_payroll = max(payroll), min_payroll = min(payroll)) %>% 
+     ggplot(aes(x = yearID, y = (max_payroll-min_payroll))) +
+       geom_bar(stat = "identity") +
+       xlab("Year") +
+       ylab("Payroll Spread") +
+       ggtitle("Difference in Payroll Between Wealthy and Poor Teams Over Time") +
+       geom_smooth() +
+       scale_x_continuous(breaks = scales::pretty_breaks(diff(range(payroll_tab$yearID))))
> payroll_tab %>% 
+   group_by(yearID) %>% 
+   summarise(max_payroll = max(payroll), min_payroll = min(payroll)) %>% 
+     ggplot(aes(x = yearID, y = (max_payroll-min_payroll))) +
+       geom_bar(stat = "identity") +
+       xlab("Year") +
+       ylab("Payroll Spread") +
+       ggtitle("Difference in Payroll Between Wealthy and Poor Teams Over Time") +
+       geom_smooth() +
+       scale_x_continuous(breaks = scales::pretty_breaks(diff(range(payroll_tab$yearID)))) +
+       theme(text = element_text(),
+             axis.text.x = element_text(angle=90, vjust=1))
> payroll_tab %>% 
+   group_by(yearID) %>% 
+   summarise(max_payroll = max(payroll), min_payroll = min(payroll)) %>% 
+     ggplot(aes(x = yearID, y = (max_payroll-min_payroll))) +
+       geom_bar(stat = "identity") +
+       xlab("Year") +
+       ylab("Payroll Spread") +
+       ggtitle("Difference in Payroll Between Wealthy and Poor Teams Over Time") +
+       geom_smooth() +
+       scale_x_continuous(breaks = scales::pretty_breaks(diff(range(payroll_tab$yearID)))/2) +
+       theme(text = element_text(),
+             axis.text.x = element_text(angle=90, vjust=1))
Error in scales::pretty_breaks(diff(range(payroll_tab$yearID)))/2 : 
  non-numeric argument to binary operator
> payroll_tab %>% 
+   group_by(yearID) %>% 
+   summarise(max_payroll = max(payroll), min_payroll = min(payroll)) %>% 
+     ggplot(aes(x = yearID, y = (max_payroll-min_payroll))) +
+       geom_bar(stat = "identity") +
+       xlab("Year") +
+       ylab("Payroll Spread") +
+       ggtitle("Difference in Payroll Between Wealthy and Poor Teams Over Time") +
+       geom_smooth() +
+       scale_x_continuous(breaks = scales::pretty_breaks(diff(range(payroll_tab$yearID)))) +
+       theme(text = element_text(),
+             axis.text.x = element_text(angle=90, vjust=1))
> payroll_tab %>% 
+   group_by(yearID) %>% 
+   summarise(max_payroll = max(payroll), min_payroll = min(payroll)) %>% 
+     ggplot(aes(x = yearID, y = (max_payroll-min_payroll))) +
+       geom_bar(stat = "identity") +
+       xlab("Year") +
+       ylab("Payroll Spread") +
+       ggtitle("Difference in Payroll Between Wealthy and Poor Teams Over Time") +
+       geom_smooth() +
+       scale_x_continuous(breaks = scales::pretty_breaks(20)) +
+       theme(text = element_text(),
+             axis.text.x = element_text(angle=90, vjust=1))
> payroll_tab %>% sample_n(10) %>% select(teamID, yearID,year_range)
# A tibble: 10 × 3
   teamID yearID  year_range
    <chr>  <int>      <fctr>
1     CAL   1991 (1991,1997]
2     CHN   2006 (2002,2008]
3     DET   2001 (1997,2002]
4     NYA   1993 (1991,1997]
5     SEA   1986 (1985,1991]
6     CHN   1988 (1985,1991]
7     ARI   2009 (2008,2014]
8     HOU   1999 (1997,2002]
9     SFN   1985 (1985,1991]
10    SEA   2010 (2008,2014]
> avg_stats_per_year %>% sample_n(1) %>% select(teamID,average_pay_in_years,average_win_percent_in_years,year_range)
Source: local data frame [0 x 4]
Groups: year_range [0]

# ... with 4 variables: teamID <chr>, average_pay_in_years <dbl>, average_win_percent_in_years <dbl>,
#   year_range <fctr>
> avg_stats_per_year %>% select(teamID,average_pay_in_years,average_win_percent_in_years,year_range)
Source: local data frame [148 x 4]
Groups: year_range [5]

   teamID average_pay_in_years average_win_percent_in_years  year_range
    <chr>                <dbl>                        <dbl>      <fctr>
1     ATL             14475059                     40.22038 (1985,1991]
2     BAL             11658262                     45.40360 (1985,1991]
3     BOS             14563356                     52.89024 (1985,1991]
4     CAL             15077312                     51.74897 (1985,1991]
5     CHA              9008958                     48.18396 (1985,1991]
6     CHN             13605046                     48.44389 (1985,1991]
7     CIN             10646369                     52.73049 (1985,1991]
8     CLE              9232153                     44.49431 (1985,1991]
9     DET             13402658                     50.97960 (1985,1991]
10    HOU             13020061                     51.23457 (1985,1991]
# ... with 138 more rows
> avg_stats_per_year
Source: local data frame [148 x 4]
Groups: year_range [5]

    year_range teamID average_pay_in_years average_win_percent_in_years
        <fctr>  <chr>                <dbl>                        <dbl>
1  (1985,1991]    ATL             14475059                     40.22038
2  (1985,1991]    BAL             11658262                     45.40360
3  (1985,1991]    BOS             14563356                     52.89024
4  (1985,1991]    CAL             15077312                     51.74897
5  (1985,1991]    CHA              9008958                     48.18396
6  (1985,1991]    CHN             13605046                     48.44389
7  (1985,1991]    CIN             10646369                     52.73049
8  (1985,1991]    CLE              9232153                     44.49431
9  (1985,1991]    DET             13402658                     50.97960
10 (1985,1991]    HOU             13020061                     51.23457
# ... with 138 more rows
> avg_stats_per_year %>% sample_n(10)
Source: local data frame [50 x 4]
Groups: year_range [5]

    year_range teamID average_pay_in_years average_win_percent_in_years
        <fctr>  <chr>                <dbl>                        <dbl>
1  (1985,1991]    BOS             14563356                     52.89024
2  (1985,1991]    PHI             11807505                     46.30896
3  (1985,1991]    TOR             13027137                     55.87318
4  (1985,1991]    SEA              7085071                     45.00230
5  (1985,1991]    CLE              9232153                     44.49431
6  (1985,1991]    CHA              9008958                     48.18396
7  (1985,1991]    OAK             12618240                     55.55556
8  (1985,1991]    LAN             16466313                     51.33359
9  (1985,1991]    CAL             15077312                     51.74897
10 (1985,1991]    SFN             11856147                     50.92593
# ... with 40 more rows
> avg_stats_per_year %>% sample_n(10)
Source: local data frame [50 x 4]
Groups: year_range [5]

    year_range teamID average_pay_in_years average_win_percent_in_years
        <fctr>  <chr>                <dbl>                        <dbl>
1  (1985,1991]    DET             13402658                     50.97960
2  (1985,1991]    HOU             13020061                     51.23457
3  (1985,1991]    NYA             17883336                     51.80968
4  (1985,1991]    ATL             14475059                     40.22038
5  (1985,1991]    TOR             13027137                     55.87318
6  (1985,1991]    TEX              7901650                     47.40945
7  (1985,1991]    BOS             14563356                     52.89024
8  (1985,1991]    MON             11252155                     51.49272
9  (1985,1991]    CLE              9232153                     44.49431
10 (1985,1991]    PHI             11807505                     46.30896
# ... with 40 more rows
> avg_stats_per_year %>% 
+   ggplot(
+     aes(x=average_pay_in_years, y=average_win_percent_in_years, label=teamID)) + 
+   geom_point() +
+   geom_text() +
+   facet_wrap(~year_range) +
+   xlab("Average Team Payroll") +
+   ylab("Average Winning Percentage") +
+   ggtitle("Average Winning Percentage vs. Average Payroll across Time") +
+   geom_smooth(method = 'lm')
> avg_stats_per_year %>% 
+   ggplot(
+     aes(x=average_pay_in_years, y=average_win_percent_in_years, label=teamID)) + 
+   geom_text() +
+   facet_wrap(~year_range) +
+   xlab("Average Team Payroll") +
+   ylab("Average Winning Percentage") +
+   ggtitle("Average Winning Percentage vs. Average Payroll across Time") +
+   geom_smooth(method = 'lm')
> avg_stats_per_year %>% 
+   ggplot(aes(x=average_pay_in_years, y=average_win_percent_in_years)) + 
+   geom_point(aes(colour=ifelse(teamID=="OAK", 'Oakland As', "Other Team"))) +
+   facet_wrap(~year_range) +
+   xlab("Average Team Payroll") +
+   ylab("Average Winning Percentage") +
+   ggtitle("Oakland A's Spending Efficency Over Time") +
+   geom_smooth(method = 'lm') +
+   labs(colour="Team")
> head(standard_payrolls)
# A tibble: 6 × 3
  yearID average_payroll_year st_dev_payroll_year
   <int>                <dbl>               <dbl>
1   1985             10075565             2470845
2   1986             11840558             3186956
3   1987             10483668             3848337
4   1988             11555862             3386331
5   1989             13845989             3568844
6   1990             17072354             3771834
> avg_std_stats_per_year %>% 
+   ggplot(
+     aes(x=average_pay_in_years, y=average_win_percent_in_years, label=teamID)) + 
+   geom_point() +
+   geom_text() +
+   facet_wrap(~year_range) +
+   xlab("Average Standard Team Payroll") +
+   ylab("Average Winning Percentage") +
+   ggtitle("Average Winning Percentage vs. Average Standard Payroll across Time") +
+   geom_smooth(method = 'lm')
> payroll_tab %>% 
+   ggplot(
+     aes(x=standard_payroll, y=win_percentage, label=teamID)) + 
+   geom_point(aes(colour=yearID)) +
+   #geom_text() +
+   xlab("Standard Team Payroll") +
+   ylab("Winning Percentage") +
+   ggtitle("Winning Percentage vs. Standard Payroll across Time") +
+   geom_smooth(method = 'lm') +
+   labs(colour = "Year")
> payroll_tab %>% 
+   ggplot(aes(x=standard_payroll, y=win_percentage, label=teamID)) + 
+     geom_point(aes(colour=yearID)) +
+     xlab("Standard Team Payroll") +
+     ylab("Winning Percentage") +
+     ggtitle("Winning Percentage vs. Standard Payroll across Time") +
+     geom_smooth(method = 'lm') +
+     labs(colour = "Year")
> head(payroll_tab %>% select(teamID, yearID,win_percentage, expected_win_pct))
# A tibble: 6 × 4
  teamID yearID win_percentage expected_win_pct
   <chr>  <int>          <dbl>            <dbl>
1    ATL   1985       40.74074         54.78726
2    BAL   1985       51.55280         51.50267
3    BOS   1985       49.69325         50.83169
4    CAL   1985       55.55556         54.40368
5    CHA   1985       52.14724         49.76791
6    CHN   1985       47.53086         52.65835
> payroll_tab <- payroll_tab %>% mutate(efficiency = win_percentage-expected_win_pct)
> payroll_tab %>% 
+   filter(teamID %in% c("OAK", "BOS", "NYA", "ATL", "TBA")) %>% 
+   ggplot(aes(x=yearID, y=efficiency)) +
+   geom_smooth() +
+   geom_point(aes(colour=teamID)) +
+   xlab("Year") +
+   ylab("Winning Efficiency") +
+   ggtitle("Efficiency of Teams (Overall) Over Time") +
+   labs(colour="Team")
> payroll_tab %>% 
+   filter(teamID %in% c("OAK", "BOS", "NYA", "ATL", "TBA")) %>% 
+   ggplot(aes(x=yearID, y=efficiency, color = teamID)) +
+   geom_smooth() +
+   #geom_point(aes(colour=teamID)) +
+   xlab("Year") +
+   ylab("Winning Efficiency") +
+   ggtitle("Efficiency of Specific Teams Over Time") +
+   labs(colour="Team")
> avg_stats_per_year %>% sample_n(10)
Source: local data frame [50 x 4]
Groups: year_range [5]

    year_range teamID average_pay_in_years average_win_percent_in_years
        <fctr>  <chr>                <dbl>                        <dbl>
1  (1985,1991]    DET             13402658                     50.97960
2  (1985,1991]    NYA             17883336                     51.80968
3  (1985,1991]    MIN             10584470                     49.17695
4  (1985,1991]    CLE              9232153                     44.49431
5  (1985,1991]    CHN             13605046                     48.44389
6  (1985,1991]    KCA             15132358                     51.64481
7  (1985,1991]    NYN             16158735                     59.38786
8  (1985,1991]    LAN             16466313                     51.33359
9  (1985,1991]    ML4             11362523                     49.58017
10 (1985,1991]    OAK             12618240                     55.55556
# ... with 40 more rows
> avg_stats_per_year <- payroll_tab %>% 
+   group_by(year_range,teamID) %>% 
+   summarise(average_pay_in_years = mean(payroll), 
+             average_win_percent_in_years = mean(win_percentage, na.rm=TRUE))
> avg_stats_per_year %>% sample_n(10)
Source: local data frame [0 x 4]
Groups: year_range [0]

# ... with 4 variables: year_range <fctr>, teamID <chr>, average_pay_in_years <dbl>,
#   average_win_percent_in_years <dbl>
> avg_stats_per_year
Source: local data frame [148 x 4]
Groups: year_range [?]

    year_range teamID average_pay_in_years average_win_percent_in_years
        <fctr>  <chr>                <dbl>                        <dbl>
1  (1985,1991]    ATL             14475059                     40.22038
2  (1985,1991]    BAL             11658262                     45.40360
3  (1985,1991]    BOS             14563356                     52.89024
4  (1985,1991]    CAL             15077312                     51.74897
5  (1985,1991]    CHA              9008958                     48.18396
6  (1985,1991]    CHN             13605046                     48.44389
7  (1985,1991]    CIN             10646369                     52.73049
8  (1985,1991]    CLE              9232153                     44.49431
9  (1985,1991]    DET             13402658                     50.97960
10 (1985,1991]    HOU             13020061                     51.23457
# ... with 138 more rows
> head(avg_stats_per_year)
Source: local data frame [6 x 4]
Groups: year_range [1]

   year_range teamID average_pay_in_years average_win_percent_in_years
       <fctr>  <chr>                <dbl>                        <dbl>
1 (1985,1991]    ATL             14475059                     40.22038
2 (1985,1991]    BAL             11658262                     45.40360
3 (1985,1991]    BOS             14563356                     52.89024
4 (1985,1991]    CAL             15077312                     51.74897
5 (1985,1991]    CHA              9008958                     48.18396
6 (1985,1991]    CHN             13605046                     48.44389
> avg_stats_per_year %>% 
+   ggplot(
+     aes(x=average_pay_in_years, y=average_win_percent_in_years, label=teamID)) + 
+   geom_point() +
+   geom_text() +
+   facet_wrap(~year_range) +
+   xlab("Average Team Payroll") +
+   ylab("Average Winning Percentage") +
+   ggtitle("Average Winning Percentage vs. Average Payroll across Time") +
+   geom_smooth(method = 'lm')
> avg_stats_per_year %>% 
+   ggplot(aes(x=average_pay_in_years, y=average_win_percent_in_years, label=teamID)) + 
+     geom_point() +
+     geom_text() +
+     facet_wrap(~year_range) +
+     xlab("Average Team Payroll") +
+     ylab("Average Winning Percentage") +
+     ggtitle("Average Winning Percentage vs. Average Payroll across Time") +
+     geom_smooth(method = 'lm') +
+     theme(text = element_text(),
+       axis.text.x = element_text(angle=90, vjust=1))
> avg_stats_per_year %>% 
+   ggplot(aes(x=average_pay_in_years, y=average_win_percent_in_years)) + 
+     geom_point(aes(colour=ifelse(teamID=="OAK", 'Oakland As', "Other Teams"))) +
+     facet_wrap(~year_range) +
+     xlab("Average Team Payroll") +
+     ylab("Average Winning Percentage") +
+     ggtitle("Oakland A's Spending Efficency Over Time") +
+     geom_smooth(method = 'lm') +
+     labs(colour="Team") +
+     theme(text = element_text(),
+       axis.text.x = element_text(angle=90, vjust=1))
> payroll_tab %>% 
+   ggplot(aes(x=standard_payroll, y=win_percentage, label=teamID)) + 
+     geom_point(aes(colour=Rank)) +
+     xlab("Standard Team Payroll") +
+     ylab("Winning Percentage") +
+     ggtitle("Winning Percentage vs. Standard Payroll across Time") +
+     geom_smooth(method = 'lm') +
+     labs(colour = "Year")
> payroll_tab %>% 
+   ggplot(aes(x=standard_payroll, y=win_percentage, label=teamID)) + 
+     geom_point(aes(colour=Rank)) +
+     xlab("Standard Team Payroll") +
+     ylab("Winning Percentage") +
+     ggtitle("Winning Percentage vs. Standard Payroll across Time") +
+     geom_smooth(method = 'lm') +
+     labs(colour = "Rank") +
+     scale_colour_gradient(low="red", high="blue")
> payroll_tab %>% 
+   ggplot(aes(x=standard_payroll, y=win_percentage, label=teamID)) + 
+     geom_point(aes(colour=Rank)) +
+     xlab("Standard Team Payroll") +
+     ylab("Winning Percentage") +
+     ggtitle("Winning Percentage vs. Standard Payroll across Time") +
+     geom_smooth(method = 'lm') +
+     labs(colour = "Rank") +
+     scale_colour_gradient(low="green", high="red")
> payroll_tab %>% 
+   filter(yearID >=1990 & yearID <= 2014) %>%
+     ggplot(aes(x=yearID, y=payroll)) +
+       geom_line() +
+       facet_wrap(~teamID) +
+       xlab("Year") +
+       ylab("Total Payroll") +
+       ggtitle("Team Payrolls Over Time for Teams") +
+       theme(text = element_text(size = 10),
+             axis.text.x = element_text(angle=90, vjust=1))
> ?rvest
No documentation for ‘rvest’ in specified packages and libraries:
you could try ‘??rvest’
> ??rvest
> payroll_tab %>% select(teamID, yearID, payroll_tab, average_payroll_year, st_dev_payroll_year)
Error: All select() inputs must resolve to integer column positions.
The following do not:
*  payroll_tab
> payroll_tab %>% select(teamID, yearID, payroll_tab, average_payroll_year, st_dev_payroll_year)
Error: All select() inputs must resolve to integer column positions.
The following do not:
*  payroll_tab
> payroll_tab %>% select(teamID, yearID, average_payroll_year, st_dev_payroll_year)
# A tibble: 858 × 4
   teamID yearID average_payroll_year st_dev_payroll_year
    <chr>  <int>                <dbl>               <dbl>
1     ATL   1985             10075565             2470845
2     BAL   1985             10075565             2470845
3     BOS   1985             10075565             2470845
4     CAL   1985             10075565             2470845
5     CHA   1985             10075565             2470845
6     CHN   1985             10075565             2470845
7     CIN   1985             10075565             2470845
8     CLE   1985             10075565             2470845
9     DET   1985             10075565             2470845
10    HOU   1985             10075565             2470845
# ... with 848 more rows
> payroll_tab %>% select(teamID, yearID, average_payroll_year, st_dev_payroll_year) %>% sample_n(5)
# A tibble: 5 × 4
  teamID yearID average_payroll_year st_dev_payroll_year
   <chr>  <int>                <dbl>               <dbl>
1    SEA   2013            101150855            48830287
2    ATL   1997             40260210            13060728
3    DET   2002             67469251            24692193
4    TEX   1995             33981049             9447998
5    TOR   2009             88824233            33857093
> head(standard_payrolls)
# A tibble: 6 × 3
  yearID average_payroll_year st_dev_payroll_year
   <int>                <dbl>               <dbl>
1   1985             10075565             2470845
2   1986             11840558             3186956
3   1987             10483668             3848337
4   1988             11555862             3386331
5   1989             13845989             3568844
6   1990             17072354             3771834
> payroll_tab %>% select(teamID, yearID, average_payroll_year, st_dev_payroll_year) %>% head()
# A tibble: 6 × 4
  teamID yearID average_payroll_year st_dev_payroll_year
   <chr>  <int>                <dbl>               <dbl>
1    ATL   1985             10075565             2470845
2    BAL   1985             10075565             2470845
3    BOS   1985             10075565             2470845
4    CAL   1985             10075565             2470845
5    CHA   1985             10075565             2470845
6    CHN   1985             10075565             2470845
> payroll_tab %>% 
+   select(teamID, yearID, average_payroll_year, st_dev_payroll_year) %>% 
+   sample_n(5)
# A tibble: 5 × 4
  teamID yearID average_payroll_year st_dev_payroll_year
   <chr>  <int>                <dbl>               <dbl>
1    MON   1991             23578785             6894669
2    CHN   2014             99800016            45705053
3    BAL   2000             55537837            21416220
4    TOR   1996             34177984            10688535
5    ML4   1994             33137010             8528749
Checking rgeos availability: TRUE
> payroll_tab %>% 
+   filter(yearID >=1990 & yearID <= 2014) %>%
+     ggplot(aes(x=yearID, y=payroll)) +
+       geom_line() +
+       facet_wrap(~teamID) +
+       xlab("Year") +
+       ylab("Total Payroll") +
+       ggtitle("Team Payrolls Over Time for Teams") +
+       theme(text = element_text(size = 7.5),
+             axis.text.x = element_text(angle=90, vjust=1))
>
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX