Step 1: Download anaconda from here: https://www.anaconda.com/products/individual
Step 2: Open Spyder that is installed with it
Step 3: Paste the code below into a blank script, pasting over the lines at the top of the script that get automatically generated
Step 4: Click the run button
You might get errors caused by installation issues. Post them here and I'll tell you how to fix them.
If you want more years, go to
"years = ['2020']" in the code
Add any years you want to it, but put single quotes around each one, e.g. '2019'.
Use a comma to separate them, e.g. ['2020','2019','2018']
I haven't tested many years in the past, could have errors in formatting.
Let me know if any of the data is incorrect. This is a year old and I haven't used it (very tired of losing at college basketball).
It will give you a CSV file with the data, wherever you have the script saved.
Step 2: Open Spyder that is installed with it
Step 3: Paste the code below into a blank script, pasting over the lines at the top of the script that get automatically generated
Step 4: Click the run button
You might get errors caused by installation issues. Post them here and I'll tell you how to fix them.
If you want more years, go to
"years = ['2020']" in the code
Add any years you want to it, but put single quotes around each one, e.g. '2019'.
Use a comma to separate them, e.g. ['2020','2019','2018']
I haven't tested many years in the past, could have errors in formatting.
Let me know if any of the data is incorrect. This is a year old and I haven't used it (very tired of losing at college basketball).
It will give you a CSV file with the data, wherever you have the script saved.
Code:
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 30 20:10:35 2020

@author: Waterstpub87

Download per-game team logs for every school listed on a
sports-reference.com college-basketball season page and write them to CSV.

Two files are written next to the script:

* ``CBBDB.csv``       - the raw, concatenated game-log tables.
* ``CBBD<year>.csv``  - the cleaned table (renamed columns, integer stats,
  a ``Pace`` column, normalised school/opponent names).  ``<year>`` is the
  last entry of ``years``.
"""
import numpy as np
import pandas as pd

# Seasons to download.  Add more as quoted strings, e.g. ['2020','2019'].
years = ['2020']

SEASON_URL = 'https://www.sports-reference.com/cbb/seasons/{year}-school-stats.html'
GAMELOG_URL = 'https://www.sports-reference.com/cbb/schools/{slug}/{year}-gamelogs.html'

# Punctuation clean-up applied to a school name BEFORE lower-casing.
# Order matters: '&' removal can leave '--', which the last rule collapses.
SLUG_PUNCTUATION = [
    (' ', '-'),
    ('.', ''),
    ('&', ''),
    ('(', ''),
    (')', ''),
    ("'", ''),
    ('--', '-'),
]

# Substring rewrites applied to the lower-cased slug, in this exact order
# ('purdue-fort-wayne' must run before the shorter 'fort-wayne').
SLUG_ALIASES = [
    ('little-rock', 'arkansas-little-rock'),
    ('uc-', 'california-'),
    ('university-of-california', 'california'),
    ('purdue-fort-wayne', 'ipfw'),
    ('fort-wayne', 'ipfw'),
    ('omaha', 'nebraska-omaha'),
    ('siu-edwardsville', 'southern-illinois-edwardsville'),
    ('texas-rio-grande-valley', 'texas-pan-american'),
    ('cal-state-long-beach', 'long-beach-state'),
]

# Whole-name overrides: schools whose slug cannot be derived by rewriting.
SLUG_OVERRIDES = {
    'Louisiana': 'louisiana-lafayette',
    'VMI': 'virginia-military-institute',
}

# Flat names given to the 41 columns of the concatenated game-log table.
# The stray spaces the pasted script had in 'Points', '3P%', 'STL' and
# 'OPPFT%' (paste artifacts) are removed here.
RAW_COLUMNS = [
    'G', 'Date', 'Location', 'Opp', 'Results', 'Points', 'Points Against',
    'FG', 'FGA', 'FG%', '3P', '3PA', '3P%', 'FT', 'FTA', 'FT%',
    'ORB', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'Blank',
    'OPPFP', 'OPFPA', 'OPFG%', 'OPP3P', 'OPP3PA', 'OPP3P%',
    'OPPFT', 'OPPFTA', 'OPPFT%', 'OPPORB', 'OPPTRB', 'OPPAST', 'OPPSTL',
    'OPPBLK', 'OPPTOV', 'OPPPF', 'School',
]

# The spacer column and the percentage columns are not kept.
DROPPED_COLUMNS = ['Blank', 'FG%', '3P%', 'FT%', 'OPFG%', 'OPP3P%', 'OPPFT%']

# Once 'School' has been moved to position 2, every column from this
# position onwards is a counting stat that is cast to int.
FIRST_STAT_COLUMN = 6

# Substring renames for the team's own name, applied in order.
SCHOOL_RENAMES = [
    ('Cal State Long Beach', 'Long Beach State'),
    ('SIU Edwardsville', 'Southern Illinois-Edwardsville'),
    ('VMI', 'Virginia Military Institute'),
]

# Substring renames for the opponent name.  These are applied strictly in
# this order and several entries only exist to repair the output of an
# earlier one (e.g. 'Penn' -> 'Pennsylvania' turns 'Penn State' into
# 'Pennsylvania State', which a later rule turns back).  Do not reorder.
OPPONENT_RENAMES = [
    ('UMBC', 'Maryland-Baltimore County'),
    ('UNLV', 'Nevada-Las Vegas'),
    ('Detroit', 'Detroit Mercy'),
    ('BYU', 'Brigham Young'),
    ('Southern Miss', 'Southern Mississippi'),
    ('UTEP', 'Texas-El Paso'),
    ('UTSA', 'Texas-San Antonio'),
    ('UCF', 'Central Florida'),
    ('LSU', 'Louisiana State'),
    ('Ole Miss', 'Mississippi'),
    ('LIU-Brooklyn', 'Long Island University'),
    ('UMass-Lowell', 'Massachusetts-Lowell'),
    ('California', 'University of California'),
    ('USC', 'Southern California'),
    ('UConn', 'Connecticut'),
    ('UMass', 'Massachusetts'),
    ('UCSB', 'UC-Santa Barbara'),
    ('UNC Wilmington', 'North Carolina-Wilmington'),
    ("St. Peter's", "Saint Peter's"),
    ('UNC Asheville', 'North Carolina-Asheville'),
    ('NC State', 'North Carolina State'),
    ('UNC', 'North Carolina'),
    ('Central Connecticut', 'Central Connecticut State'),
    ('UT-Martin', 'Tennessee-Martin'),
    ('TCU', 'Texas Christian'),
    ("Saint Mary's", "Saint Mary's (CA)"),
    ('Pitt', 'Pittsburgh'),
    ('VCU', 'Virginia Commonwealth'),
    ('UIC', 'Illinois-Chicago'),
    ('SMU', 'Southern Methodist'),
    ('Penn', 'Pennsylvania'),
    ('USC Upstate', 'South Carolina Upstate'),
    ('UMKC', 'Missouri-Kansas City'),
    ('UNC Greensboro', 'North Carolina-Greensboro'),
    ("St. Joseph's", "Saint Joseph's"),
    ('ETSU', 'East Tennessee State'),
    ('Pennsylvania State', 'Penn State'),
    ('North Carolina Greensboro', 'North Carolina-Greensboro'),
    ('Southern California Upstate', 'South Carolina Upstate'),
    ('University of California Baptist', 'California Baptist'),
    ('SIU-Edwardsville', 'Southern Illinois-Edwardsville'),
    ('VMI', 'Virginia Military Institute'),
]


def replace_in_order(names, replacements):
    """Apply literal substring replacements to a string Series, in order.

    Parameters
    ----------
    names : pandas.Series
        Series of strings (missing values pass through unchanged).
    replacements : iterable of (old, new) pairs
        Each ``old`` is matched literally (``regex=False``) so characters
        such as ``.`` or ``(`` never act as regex syntax, whatever the
        installed pandas version uses as its default.

    Returns
    -------
    pandas.Series
        A new Series; ``names`` itself is not modified.
    """
    for old, new in replacements:
        names = names.str.replace(old, new, regex=False)
    return names


def school_url_slugs(schools):
    """Turn school display names into the slugs used in game-log URLs.

    Parameters
    ----------
    schools : pandas.Series
        Cleaned school names (already stripped of the 'NCAA' marker).

    Returns
    -------
    pandas.Series
        Slugs aligned with ``schools``.
    """
    slugs = replace_in_order(schools, SLUG_PUNCTUATION).str.lower()
    slugs = replace_in_order(slugs, SLUG_ALIASES)
    # Overrides are matched on the full school name with a boolean mask.
    # (The chained ``.loc[name]['URL'] = ...`` form assigns to a temporary
    # object and leaves the table untouched.)
    for school, slug in SLUG_OVERRIDES.items():
        slugs = slugs.mask(schools == school, slug)
    return slugs


def load_school_directory(year):
    """Download one season's school table and attach a 'URL' slug column.

    Parameters
    ----------
    year : str
        Season, e.g. ``'2020'``.

    Returns
    -------
    pandas.DataFrame
        One row per school with at least 'School' and 'URL' columns;
        repeated header rows ('Overall' / 'School') are removed.
    """
    season_table = pd.read_html(SEASON_URL.format(year=year))[0]
    # NOTE(review): the script reads the 'Overall' column group and expects
    # a 'School' column inside it - confirm against the current page layout.
    directory = season_table['Overall'].copy()
    directory['School'] = (
        directory['School'].str.replace('NCAA', '', regex=False).str.strip()
    )
    directory['URL'] = school_url_slugs(directory['School'])
    return directory[~directory['School'].isin(['Overall', 'School'])]


def fetch_game_log(slug, school, year):
    """Download one school's game-log table and tag it with the school name.

    Parameters
    ----------
    slug : str
        URL slug of the school.
    school : str
        Display name stored in the added 'School1' column.
    year : str
        Season, e.g. ``'2020'``.

    Returns
    -------
    pandas.DataFrame
        The first table on the game-log page plus a 'School1' column.
    """
    game_log = pd.read_html(GAMELOG_URL.format(slug=slug, year=year))[0]
    game_log['School1'] = school
    return game_log


def estimate_pace(games):
    """Return the per-game pace estimate used by this script.

    It is the average of the team's and the opponent's
    ``FGA + 0.49 * FTA + TOV - ORB``.

    Parameters
    ----------
    games : pandas.DataFrame
        Must contain numeric 'FGA', 'FTA', 'TOV', 'ORB', 'OPFPA',
        'OPPFTA', 'OPPORB' and 'OPPTOV' columns.

    Returns
    -------
    pandas.Series
    """
    team = games['FGA'] + (.49 * games['FTA']) + games['TOV'] - games['ORB']
    opponent = (games['OPFPA'] + (.49 * games['OPPFTA'])
                - games['OPPORB'] + games['OPPTOV'])
    return (.50 * team) + (.50 * opponent)


def tidy_game_logs(raw):
    """Clean the concatenated raw game logs.

    Steps: apply flat column names, move 'School' to the third position,
    drop the spacer and percentage columns, remove repeated header rows,
    fill missing values with 0, cast the stat columns to int, add 'Pace'
    and normalise school / opponent names.

    Parameters
    ----------
    raw : pandas.DataFrame
        Concatenated game-log tables with exactly ``len(RAW_COLUMNS)``
        columns (the last one being the added school name).

    Returns
    -------
    pandas.DataFrame
        A new frame; ``raw`` is left untouched.

    Raises
    ------
    ValueError
        If ``raw`` does not have ``len(RAW_COLUMNS)`` columns (raised by
        pandas when the column names are assigned).
    """
    games = raw.copy()
    games.columns = RAW_COLUMNS
    games.insert(2, 'School', games.pop('School'))
    games = games.drop(columns=DROPPED_COLUMNS)
    # Header rows repeated inside the page's table show up as data rows
    # whose Date cell is the literal text 'School' or 'Date'.
    games = games[~games['Date'].isin(['School', 'Date'])]
    games = games.fillna(0)
    stat_columns = list(games.columns[FIRST_STAT_COLUMN:])
    games = games.astype({column: int for column in stat_columns})
    games['Pace'] = estimate_pace(games)
    games['School'] = replace_in_order(games['School'], SCHOOL_RENAMES)
    games['Opp'] = replace_in_order(games['Opp'], OPPONENT_RENAMES)
    return games


def build_game_log_database(seasons):
    """Download, clean and save the game logs for the given seasons.

    Writes ``CBBDB.csv`` (raw) and ``CBBD<last season>.csv`` (cleaned) to
    the current working directory.

    Parameters
    ----------
    seasons : list of str
        Seasons to download, e.g. ``['2020', '2019']``.

    Returns
    -------
    pandas.DataFrame
        The cleaned table that was written to ``CBBD<last season>.csv``.

    Raises
    ------
    ValueError
        If ``seasons`` is empty.
    RuntimeError
        If not a single game log could be downloaded.
    """
    if not seasons:
        raise ValueError('years is empty - list at least one season')

    frames = []
    for season in seasons:
        directory = load_school_directory(season)
        for school, slug in zip(directory['School'], directory['URL']):
            if not isinstance(slug, str):
                continue  # rows without a school name have no page to fetch
            # Best effort per school: a missing or malformed page must not
            # abort the whole run, but the failure is reported, not hidden.
            try:
                frames.append(fetch_game_log(slug, school, season))
            except Exception as error:
                print('Skipping {} ({}): {}'.format(school, season, error))

    if not frames:
        raise RuntimeError('No game logs could be downloaded.')

    # Collecting frames and concatenating once replaces the old
    # DataFrame.append chain, which needed a specific first school to
    # succeed and no longer exists in pandas 2.x.
    raw = pd.concat(frames)
    raw.to_csv('CBBDB.csv')

    games = tidy_game_logs(raw)
    games.to_csv('CBBD' + seasons[-1] + '.csv')
    return games


if __name__ == '__main__':
    results = build_game_log_database(years)