What Matters in Winning College Basketball Games?
- bphilton5
- Dec 10, 2020
- 2 min read
Data Source
All data is scraped with Python from sports-reference.com; the code is below.
Filters
All college basketball teams from 2000 to 2019.
Key Points
No single statistical item matters by itself.
FG percentage matters the most, but not all that much.
I was surprised that the R^2 of steals per game isn't negative.
I think digging deeper would give a better sense of how college basketball teams are winning.
Python Script
#pip install beautifulsoup4
#pip install lxml
#pip install requests
import pandas as pd
import requests
import bs4
from bs4 import BeautifulSoup
# Seasons to scrape: 2000 through 2019 (range() excludes its end value).
n = range(2000, 2020)

# A season's page address is url2 + <year> + url3.
url2 = 'https://www.sports-reference.com/cbb/seasons/'
url3 = '-school-stats.html'

# One accumulator per output column.  Every name is bound to its own list
# literal, so no two columns ever share (alias) the same list object.
pl_team_list = []
pl_wins_list = []
pl_loses_list = []
pl_total_points_list = []
pl_total_points_againest_list = []
pl_FG_percentage_list = []
pl_3_percentage_list = []
pl_FT_percentage_list = []
pl_total_rebounds_list = []
pl_assist_list = []
pl_steals_list = []
pl_turnovers_list = []
# Pair every accumulator list with the position of its statistic among a
# row's <td class="right"> cells.
# NOTE(review): these positions are tied to the column layout of the site's
# school-stats table -- re-check them if the page layout changes.
stat_columns = (
    (pl_wins_list, 1),
    (pl_loses_list, 2),
    (pl_total_points_list, 16),
    (pl_total_points_againest_list, 17),
    (pl_FG_percentage_list, 22),
    (pl_3_percentage_list, 25),
    (pl_FT_percentage_list, 28),
    (pl_total_rebounds_list, 30),
    (pl_assist_list, 31),
    (pl_steals_list, 32),
    (pl_turnovers_list, 33),
)
last_stat_position = max(position for _, position in stat_columns)

# Download each season's page and append one entry per team row to every
# accumulator list.  The loop variable is deliberately not called `n`, so the
# season range is not overwritten, and the URL is not called `all`, so the
# builtin of that name stays usable.
for year in n:
    season_url = url2 + str(year) + url3

    # Without a timeout a stalled connection would hang the script forever;
    # raise_for_status() turns an HTTP error page into an explicit failure
    # instead of letting it be parsed as if it were the stats page.
    response = requests.get(season_url, timeout=30)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    league_table = soup.find(
        'table', class_='per_match_toggle sortable stats_table')
    if league_table is None:
        # Fail loudly: silently skipping a season would leave a gap in the
        # data set that is easy to miss later.
        raise RuntimeError('No school-stats table found at ' + season_url)

    for table_body in league_table.find_all('tbody'):
        for row in table_body.find_all('tr'):
            # Rows without a left-aligned <td> carry no team name; skip them.
            team_cell = row.find('td', class_='left')
            if team_cell is None:
                continue

            # Collect the numeric cells once per row.  Indexing a list never
            # yields None (it raises IndexError), so rows that are too short
            # are filtered with an explicit length check.
            right_cells = row.find_all('td', class_='right')
            if len(right_cells) <= last_stat_position:
                continue

            pl_team_list.append(team_cell.text)
            for values, position in stat_columns:
                # Always read `.text`.  On a BeautifulSoup tag, an attribute
                # such as `.int` is a search for a child <int> tag and
                # evaluates to None, which would leave the column empty.
                values.append(right_cells[position].text)
# (heading, collected values) pairs, listed in the order the columns should
# appear in the resulting frame.
output_columns = [
    ('Team', pl_team_list),
    ('Wins', pl_wins_list),
    ('Loses', pl_loses_list),
    ('Total Points', pl_total_points_list),
    ('Total Points Against', pl_total_points_againest_list),
    ('FG Percentage', pl_FG_percentage_list),
    ('3 Percentage', pl_3_percentage_list),
    ('FT Percentage', pl_FT_percentage_list),
    ('Total Rebounds', pl_total_rebounds_list),
    ('Assist', pl_assist_list),
    ('Steals', pl_steals_list),
    ('Turnovers', pl_turnovers_list),
]
df = pd.DataFrame(dict(output_columns))

# Save the combined table as an Excel workbook at a fixed location.
workbook_path = 'C://Users/bphil/OneDrive/Desktop/ncaa_basketball_teams.xlsx'
df.to_excel(workbook_path)

# Bare expression: has no effect when run as a plain script; presumably kept
# so that a notebook cell ending here displays the frame.
df





Comments