dplyr – case stadies

# Find variables in common, i.e. return a data frame that contains every variable that appears in all 28 datasets (if any does).
lahmanNames %>% reduce(intersect)
lahmanNames %>%  
  # Bind the data frames in lahmanNames into a single table that includes a new column called dataframe, which contains the names of the data frames.
  bind_rows(.id="dataframe") %>%
  # Group the result by var
  group_by(var) %>%
  # Tally the number of appearances
  tally() %>%
  # Filter the data to only variables that appear more than once.
  filter(n>1) %>% 
  # Arrange the results so that the most frequently used variables appear before less frequently used variables.
  arrange(desc(n))
lahmanNames %>% 
#Bind the data frames in lahmanNames into a single table that includes the column dataframe, which contains the names of the data frames. 
bind_rows(.id="dataframe") %>%
  #Filter the results to just rows that contain playerID.
  filter(var=="playerID") %>% 
  # Extract the dataframe variable
#Next two are equivalent:
count(names, vars="band")
group_by(names, band) %>% count()
tbl %>%
# Find unique rows of columns a,b, and c
distinct(a, b, c) 
</code></pre>
<pre><code>
players %>% 
  # Find all players who do not appear in Salaries
  anti_join(Salaries) %>%
  # Count them
  count()
players %>% 
  anti_join(Salaries, by = "playerID") %>% 
  # How many unsalaried players appear in Appearances?
  semi_join(Appearances, by="playerID") %>% 
  count()
  
players %>% 
  # Find all players who do not appear in Salaries
  anti_join(Salaries, by="playerID") %>% 
  # Join them to Appearances
  left_join(Appearances, by="playerID") %>% 
  # Calculate total_games for each player
  group_by(playerID) %>%
  summarize(total_games=sum(G_all, na.rm=TRUE)) %>%
  # Arrange in descending order by total_games
  arrange(desc(total_games))
players %>%
  # Find unsalaried players
  anti_join(Salaries, by="playerID") %>% 
  # Join Batting to the unsalaried players
  left_join(Batting, by="playerID") %>% 
  # Group by player
  group_by(playerID) %>% 
  # Sum at-bats for each player
  #Calculate the sum of at-bats for each player as total_at_bat; 
  #this  will be the sum of the AB variable in Batting, ignoring all NAs
  summarize(total_at_bat=sum(AB, na.rm=TRUE)) %>% 
  # Arrange in descending order
  arrange(desc(total_at_bat))
# Find the distinct players that appear in HallOfFame
nominated <- HallOfFame %>% 
  distinct(playerID)

nominated %>% 
  # Count the number of players in nominated
  count()

nominated_full <- nominated %>% 
  # Join to Master
  left_join(Master, by = "playerID") %>% 
  # Return playerID, nameFirst, nameLast
  select(playerID, nameFirst, nameLast)
# Find distinct players in HallOfFame with inducted == "Y"
inducted <- HallOfFame %>% 
  filter(inducted=="Y") %>%
  distinct(playerID)  
inducted %>% 
  # Count the number of players in inducted
  count()

inducted_full <- inducted %>% 
  # Join to Master
  left_join(Master, by="playerID") %>% 
  # Return playerID, nameFirst, nameLast
  select(playerID, nameFirst, nameLast)
# Tally the number of awards in AwardsPlayers by playerID
nAwards <- AwardsPlayers %>% 
  group_by(playerID) %>% 
  tally()

nAwards %>% 
  # Filter to just the players in inducted 
  semi_join(inducted, by = "playerID") %>% 
  # Calculate the mean number of awards per player
  summarize(avg_n = mean(n, na.rm = TRUE))

nAwards %>% 
  # Filter to just the players in nominated 
  semi_join(nominated, by = "playerID") %>%
  # Filter to players NOT in inducted 
  anti_join(inducted, by = "playerID") %>%
  # Calculate the mean number of awards per player
  summarize(avg_n = mean(n, na.rm = TRUE))
# Find the players who are in nominated, but not inducted
notInducted <- nominated %>% 
  setdiff(inducted)

Salaries %>% 
  # Find the players who are in notInducted
  semi_join(notInducted, by = "playerID") %>%
  # Calculate the max salary by player
  group_by(playerID) %>% 
  summarize(max_salary = max(salary, na.rm = TRUE)) %>% 
  # Calculate the average of the max salaries
  summarize(avg_salary = mean(max_salary, na.rm = TRUE))

# Repeat for players who were inducted
Salaries %>% 
  semi_join(inducted, by = "playerID") %>% 
  group_by(playerID) %>% 
  summarize(max_salary = max(salary, na.rm = TRUE)) %>% 
  summarize(avg_salary = mean(max_salary, na.rm = TRUE))
Appearances %>% 
  # Filter Appearances against nominated
  semi_join(nominated, by="playerID") %>% 
  # Find last year played by player
  group_by(playerID) %>% 
  summarize(last_year=max(yearID)) %>% 
  # Join to full HallOfFame
  left_join(HallOfFame, by="playerID") %>% 
  # Filter for unusual observations
  filter(yearID <= last_year)