Chapter 7 Appendix
7.1 Data Scraping Functions
7.1.1 OUA Web Scraper
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
def get_links():
    print("getting links...")
    teams = ['algoma', 'brock', 'carleton', 'guelph', 'lakehead', 'laurentian',
             'laurier', 'mcmaster', 'nipissing', 'ottawa', 'queens', 'ryerson',
             'toronto', 'waterloo', 'western', 'windsor', 'york']
    years = ['2014-15', '2015-16', '2016-17', '2017-18', '2018-19']
    original_url = 'http://oua.ca/sports/mbkb/'
    end_url = '?view=gamelog'
    href_list = []
    for year in years:
        for team in teams:
            current_url = original_url + year + '/teams/' + team + end_url
            r = requests.get(current_url)
            raw_html = r.content
            soup = BeautifulSoup(raw_html, 'html.parser')
            tables = soup.findAll('table')
            # Pick the longest table whose links point at boxscore
            # pages: that table is the season game log.
            max_len = 0
            index = 0
            for i in range(len(tables)):
                tags = tables[i].findAll('a')
                if len(tags) > 0:
                    url = tags[0].get('href', None)
                    if url and "/boxscores/20" in url and len(tables[i]) > max_len:
                        index = i
                        max_len = len(tables[i])
            table = tables[index]
            for tag in table.findAll('a'):
                # Boxscore hrefs are relative ('../boxscores/...');
                # rewrite them into absolute team-stats URLs.
                url = re.sub(r"\.\.", original_url + year, tag.get('href', None))
                url += '?view=teamstats'
                href_list.append(url)
    print("done getting links")
    return href_list
def scrape(url):
    """Build a data dictionary for each team-stats URL on the OUA
    website. Takes a list of URLs; some games carry extra fields in
    their stats table, and this function handles the games that do
    not have those extra fields."""
    dictlist = {}
    for i in range(len(url)):
        print(url[i])
        r = requests.get(url[i])
        raw_html = r.content
        soup = BeautifulSoup(raw_html, 'html.parser')
        stats = soup.findAll("table")
        scores = soup.findAll('div', {'class': 'teams clearfix'})[0].table
        # Some pages have a different number of tables and the team
        # stats table is not always in the same position: fall back
        # to stats[8], but prefer the table captioned 'Team Statistics'.
        table = stats[8]
        for j in range(2, len(stats)):
            if str(stats[j].caption) == ('<caption class="caption offscreen">'
                                         '<h2>Team Statistics</h2></caption>'):
                table = stats[j]
                break
        d = {}
        dictlist[url[i]] = {
            "Away": table.findAll('th', {'scope': 'col'})[1].text.strip(),
            "Home": table.findAll('th', {'scope': 'col'})[2].text.strip(),
        }
        # The score table marks rows with 'winner'/'loser' classes;
        # some pages omit them, so every lookup is guarded.
        # d[None] = None just records that a field was missing.
        try:
            winner = scores.findAll('tr', {'class': 'winner'})[0]
        except IndexError:
            d[None] = None
        try:
            loser = scores.findAll('tr', {'class': 'loser'})[0]
        except IndexError:
            d[None] = None
        try:
            dictlist[url[i]].update({"Winner": winner.th.text.strip()})
        except IndexError:
            d[None] = None
        try:
            dictlist[url[i]].update({"Loser": loser.th.text.strip()})
        except IndexError:
            d[None] = None
        # Quarter-by-quarter and total points for the winner and loser.
        labels = ['1st Qtr Pts', '2nd Qtr Pts', '3rd Qtr Pts',
                  '4th Qtr Pts', 'Total Pts']
        for k, label in enumerate(labels):
            try:
                dictlist[url[i]].update({"Winner " + label:
                    winner.findAll('td')[k].text.strip()})
            except IndexError:
                d[None] = None
            try:
                dictlist[url[i]].update({"Loser " + label:
                    loser.findAll('td')[k].text.strip()})
            except IndexError:
                d[None] = None
        # Each of the 16 stat rows yields an Away value (even td
        # index) and a Home value (odd td index).
        for j in range(16):
            try:
                dictlist[url[i]].update({
                    table.findAll('th', {'scope': 'row'})[j].text.strip()
                    + ' Away': table.findAll('td')[2 * j].text.strip()})
            except IndexError:
                d[None] = None
            try:
                dictlist[url[i]].update({
                    table.findAll('th', {'scope': 'row'})[j].text.strip()
                    + ' Home': table.findAll('td')[2 * j + 1].text.strip()})
            except IndexError:
                d[None] = None
        # Row 17 ('Trends') only has an Away cell.
        try:
            dictlist[url[i]].update({
                table.findAll('th', {'scope': 'row'})[16].text.strip()
                + ' Away': table.findAll('td')[32].text.strip()})
        except IndexError:
            d[None] = None
    z = {**dictlist, **d}
    return z
if __name__ == '__main__':
    q = get_links()
    a = scrape(q)
    df = pd.DataFrame(a)
    df = df.T
    # Replace literal dashes (e.g. in '26-60' shooting lines) with a
    # ' -- ' separator before writing out.
    df = df.replace(r'\-', ' -- ', regex=True).astype(object)
    df = df[['Away', 'FG Away', 'FG% Away', '3PT FG Away',
             '3PT FG% Away', 'FT Away', 'FT% Away', 'Rebounds Away',
             'Assists Away', 'Turnovers Away', 'Points Off Turnovers Away',
             '2nd Chance Points Away', 'Points in the Paint Away',
             'Fastbreak Points Away', 'Bench Points Away',
             'Largest Lead Away', 'Time of Largest Lead Away',
             'Home', 'FG Home', 'FG% Home', '3PT FG Home', '3PT FG% Home',
             'FT Home', 'FT% Home', 'Rebounds Home', 'Assists Home',
             'Turnovers Home', 'Points Off Turnovers Home',
             '2nd Chance Points Home', 'Points in the Paint Home',
             'Fastbreak Points Home', 'Bench Points Home',
             'Largest Lead Home', 'Time of Largest Lead Home',
             'Trends Away', 'Winner',
             'Winner 1st Qtr Pts', 'Winner 2nd Qtr Pts', 'Winner 3rd Qtr Pts',
             'Winner 4th Qtr Pts', 'Winner Total Pts', 'Loser',
             'Loser 1st Qtr Pts', 'Loser 2nd Qtr Pts',
             'Loser 3rd Qtr Pts', 'Loser 4th Qtr Pts', 'Loser Total Pts']]
    df.to_csv('gbyg.csv', header=True)
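The script stores each box score as one CSV row keyed by its URL. Shooting lines arrive as made-attempted strings, and the replace step above turns each dash into ' -- ', so '26-60' is saved as '26 -- 60'. The following is a minimal sketch, not part of the original scraper, of loading gbyg.csv and splitting those fields back into numeric columns; the column names are taken from the listing above.

import pandas as pd

df = pd.read_csv('gbyg.csv', index_col=0)
# Shooting lines were saved as '26 -- 60'-style strings by the
# replace step above; split them into made/attempted counts.
for col in ['FG Away', '3PT FG Away', 'FT Away',
            'FG Home', '3PT FG Home', 'FT Home']:
    made_att = df[col].str.split(' -- ', expand=True)
    df[col + ' Made'] = pd.to_numeric(made_att[0], errors='coerce')
    df[col + ' Att'] = pd.to_numeric(made_att[1], errors='coerce')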
7.1.2 OUA Player Stats Scraper
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
def get_links():
    print("getting links...")
    teams = ['algoma', 'brock', 'carleton', 'guelph', 'lakehead', 'laurentian',
             'laurier', 'mcmaster', 'nipissing', 'ottawa', 'queens', 'ryerson',
             'toronto', 'waterloo', 'western', 'windsor', 'york']
    years = ['2014-15', '2015-16', '2016-17', '2017-18', '2018-19']
    original_url = 'http://oua.ca/sports/mbkb/'
    end_url = '?view=gamelog'
    href_list = []
    for year in years:
        for team in teams:
            current_url = original_url + year + '/teams/' + team + end_url
            r = requests.get(current_url)
            raw_html = r.content
            soup = BeautifulSoup(raw_html, 'html.parser')
            tables = soup.findAll('table')
            # Pick the longest table whose links point at boxscore
            # pages: that table is the season game log.
            max_len = 0
            index = 0
            for i in range(len(tables)):
                tags = tables[i].findAll('a')
                if len(tags) > 0:
                    url = tags[0].get('href', None)
                    if url and "/boxscores/20" in url and len(tables[i]) > max_len:
                        index = i
                        max_len = len(tables[i])
            table = tables[index]
            for tag in table.findAll('a'):
                # Rewrite relative boxscore hrefs into absolute
                # team-stats URLs.
                url = re.sub(r"\.\.", original_url + year,
                             tag.get('href', None))
                url += '?view=teamstats'
                href_list.append(url)
    print("done getting links")
    return href_list
def vsplayers_scrape(url):
    """Scrape the visiting team's STARTERS rows from each box score."""
    vslist = {}
    for j in range(len(url)):
        print(url[j])
        r = requests.get(url[j])
        raw_html = r.content
        soup = BeautifulSoup(raw_html, 'html.parser')
        boxscore = soup.find_all('article',
                                 {'class': 'game-boxscore bkb clearfix'})
        players = boxscore[0].find_all('div', {'class': 'player-stats'})
        team1 = players[0].find_all('div', {'class': 'stats-wrap clearfix'})
        visitorteam = team1[0].find_all('div',
            {'class': 'stats-box full lineup visitor clearfix'})
        vslist[url[j]] = {}
        # Find the tbody whose first row is labelled STARTERS.
        for k in range(len(visitorteam[0].find_all('tbody'))):
            body = visitorteam[0].find_all('tbody')[k]
            if body.tr.text.strip() == 'STARTERS':
                starters = body
                starterstr = starters.find_all('tr')
        if len(starterstr) > 4:
            headers = visitorteam[0].find_all('th')
            # Header columns 1-13 line up with each player's td cells.
            for i in range(len(starters.find_all('th')) - 1):
                cells = starterstr[i + 1].find_all('td')
                row = {'Away': visitorteam[0].caption.text.strip(),
                       visitorteam[0].thead.th.text.strip():
                           starters.find_all('th')[i + 1].text.strip()}
                for c in range(13):
                    row[headers[c + 1].text.strip()] = cells[c].text.strip()
                vslist[url[j], i] = row
    return vslist
def vrplayers_scrape(url):
    """Scrape the visiting team's RESERVES rows from each box score."""
    vrlist = {}
    for j in range(len(url)):
        print(url[j])
        r = requests.get(url[j])
        raw_html = r.content
        soup = BeautifulSoup(raw_html, 'html.parser')
        boxscore = soup.find_all('article',
                                 {'class': 'game-boxscore bkb clearfix'})
        players = boxscore[0].find_all('div', {'class': 'player-stats'})
        team1 = players[0].find_all('div', {'class': 'stats-wrap clearfix'})
        visitorteam = team1[0].find_all('div',
            {'class': 'stats-box full lineup visitor clearfix'})
        vrlist[url[j]] = {}
        # Find the tbody whose first row is labelled RESERVES.
        for k in range(len(visitorteam[0].find_all('tbody'))):
            body = visitorteam[0].find_all('tbody')[k]
            if body.tr.text.strip() == 'RESERVES':
                reserves = body
                reservestr = reserves.find_all('tr')
        if len(reservestr) > 0:
            headers = visitorteam[0].find_all('th')
            # Header columns 1-13 line up with each player's td cells.
            for i in range(len(reserves.find_all('th')) - 1):
                cells = reservestr[i + 1].find_all('td')
                row = {'Away': visitorteam[0].caption.text.strip(),
                       visitorteam[0].thead.th.text.strip():
                           reserves.find_all('th')[i + 1].text.strip()}
                for c in range(13):
                    row[headers[c + 1].text.strip()] = cells[c].text.strip()
                vrlist[url[j], i] = row
    return vrlist
def hsplayers_scrape(url):
    """Scrape the home team's STARTERS rows from each box score."""
    slist = {}
    for j in range(len(url)):
        print(url[j])
        r = requests.get(url[j])
        raw_html = r.content
        soup = BeautifulSoup(raw_html, 'html.parser')
        boxscore = soup.find_all('article',
                                 {'class': 'game-boxscore bkb clearfix'})
        players = boxscore[0].find_all('div', {'class': 'player-stats'})
        team2 = players[0].find_all('div',
            {'class': 'stats-wrap clearfix'})[1]
        hometeam = team2.find_all('div',
            {'class': 'stats-box full lineup home clearfix'})
        slist[url[j]] = {}
        # Find the tbody whose first row is labelled STARTERS.
        for k in range(len(hometeam[0].find_all('tbody'))):
            body = hometeam[0].find_all('tbody')[k]
            if body.tr.text.strip() == 'STARTERS':
                starters = body
                starterstr = starters.find_all('tr')
        if len(starterstr) > 4:
            headers = hometeam[0].find_all('th')
            # Header columns 1-13 line up with each player's td cells.
            for i in range(len(starters.find_all('th')) - 1):
                cells = starterstr[i + 1].find_all('td')
                row = {'Home': hometeam[0].caption.text.strip(),
                       hometeam[0].thead.th.text.strip():
                           starters.find_all('th')[i + 1].text.strip()}
                for c in range(13):
                    row[headers[c + 1].text.strip()] = cells[c].text.strip()
                slist[url[j], i] = row
    return slist
def hrplayers_scrape(url):
    """Scrape the home team's RESERVES rows from each box score."""
    rlist = {}
    for j in range(len(url)):
        print(url[j])
        r = requests.get(url[j])
        raw_html = r.content
        soup = BeautifulSoup(raw_html, 'html.parser')
        boxscore = soup.find_all('article',
                                 {'class': 'game-boxscore bkb clearfix'})
        players = boxscore[0].find_all('div', {'class': 'player-stats'})
        team2 = players[0].find_all('div',
            {'class': 'stats-wrap clearfix'})[1]
        hometeam = team2.find_all('div',
            {'class': 'stats-box full lineup home clearfix'})
        rlist[url[j]] = {}
        # Find the tbody whose first row is labelled RESERVES.
        for k in range(len(hometeam[0].find_all('tbody'))):
            body = hometeam[0].find_all('tbody')[k]
            if body.tr.text.strip() == 'RESERVES':
                reserves = body
                reservestr = reserves.find_all('tr')
        if len(reservestr) > 0:
            headers = hometeam[0].find_all('th')
            # Header columns 1-13 line up with each player's td cells.
            for i in range(len(reserves.find_all('th')) - 1):
                cells = reservestr[i + 1].find_all('td')
                row = {'Home': hometeam[0].caption.text.strip(),
                       hometeam[0].thead.th.text.strip():
                           reserves.find_all('th')[i + 1].text.strip()}
                for c in range(13):
                    row[headers[c + 1].text.strip()] = cells[c].text.strip()
                rlist[url[j], i] = row
    return rlist
q = get_links()
b = hrplayers_scrape(q)
c = hsplayers_scrape(q)
d = vsplayers_scrape(q)
e = vrplayers_scrape(q)
# Clean each frame the same way: ' -- ' placeholder for dashes,
# newlines stripped.
df1 = pd.DataFrame(b)
df1 = df1.T
df1 = df1.replace(r'\-', ' -- ', regex=True).astype(object)
df1 = df1.replace(r'\n', '', regex=True).astype(object)
df1.to_csv('home reserves.csv', header=True)
df2 = pd.DataFrame(c)
df2 = df2.T
df2 = df2.replace(r'\-', ' -- ', regex=True).astype(object)
df2 = df2.replace(r'\n', '', regex=True).astype(object)
df2.to_csv('home starters.csv', header=True)
df3 = pd.DataFrame(d)
df3 = df3.T
df3 = df3.replace(r'\-', ' -- ', regex=True).astype(object)
df3 = df3.replace(r'\n', '', regex=True).astype(object)
df3.to_csv('visitors starters.csv', header=True)
df4 = pd.DataFrame(e)
df4 = df4.T
df4 = df4.replace(r'\-', ' -- ', regex=True).astype(object)
df4 = df4.replace(r'\n', '', regex=True).astype(object)
df4.to_csv('visitors reserves.csv', header=True)
concat = pd.concat([df2, df3, df1, df4], sort=False)
concat.to_csv('player_data.csv', header=True)
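All four scrape functions re-request every box-score URL, so each page is downloaded four times. A minimal sketch of a shared cache that the four functions could call in place of their requests.get/BeautifulSoup pairs; get_soup is a hypothetical helper, not part of the original scripts.

import requests
from bs4 import BeautifulSoup

_page_cache = {}

def get_soup(page_url):
    # Hypothetical helper: fetch and parse each box-score page once,
    # then serve the cached soup on repeat calls.
    if page_url not in _page_cache:
        r = requests.get(page_url)
        _page_cache[page_url] = BeautifulSoup(r.content, 'html.parser')
    return _page_cache[page_url]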
7.1.3 Synergy Data Scraper
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
import time
import os
import sys
browser = webdriver.Chrome(os.path.join(sys.path[0], 'chromedriver'))
def login():
    login_url = 'https://www.synergysportstech.com/Synergy/Default.aspx'
    browser.get(login_url)
    # Credentials are redacted in this listing.
    username1 = browser.find_element_by_css_selector('#txtUserName')
    username = "************"
    username1.send_keys(username)
    password1 = browser.find_element_by_css_selector('#txtPassword')
    password = "************"
    password1.send_keys(password)
    browser.find_element_by_css_selector('#btnLogin').click()
def get_links():
    browser.get('https://www.synergysportstech.com/Synergy/Sport/Basketball/web/teamsst/Video/SelectGame2.aspx')
    el5 = browser.find_element_by_css_selector('#ctl00_MainContent_lstSeason')
    for option in el5.find_elements_by_tag_name('option'):
        if option.text == '2014 - 2015':
            option.click()  # select() in earlier versions of webdriver
            break
    time.sleep(3)
    el2 = browser.find_element_by_css_selector('#ctl00_MainContent_lstDivisionGroup')
    for option in el2.find_elements_by_tag_name('option'):
        if option.text == 'U Sports':
            option.click()
            break
    el4 = browser.find_element_by_css_selector('#ctl00_MainContent_lstViewMax')
    for option in el4.find_elements_by_tag_name('option'):
        if option.text == '1600':
            option.click()
            break
    el = browser.find_element_by_css_selector('#ctl00_MainContent_lstSubType')
    for option in el.find_elements_by_tag_name('option'):
        if option.text == 'Regular Season':
            option.click()
            break
    time.sleep(5)
    el3 = browser.find_element_by_css_selector('#ctl00_MainContent_lstDivisions')
    for option in el3.find_elements_by_tag_name('option'):
        if option.text == 'Ontario University Athletics':
            option.click()
            break
    time.sleep(5)
    links = browser.find_elements_by_tag_name('table')
    html = links[2].get_attribute('innerHTML')
    soup1 = BeautifulSoup(html, 'html.parser')
    href_list1 = soup1.find_all('a')
    url_list = []
    root_url = 'https://www.synergysportstech.com/Synergy/Sport/Basketball/web/teamsst/Video/'
    for link in href_list1:
        if "GameGrid2" in link['href']:
            url_list.append(root_url + link['href'])
    return url_list
def scrape(url_list):
    games = {}
    for url in url_list:
        games[url] = {}
        browser.get(url)
        browser.find_element_by_link_text('Game Breakdown').click()
        time.sleep(5)
        table = browser.find_elements_by_class_name('Tier')
        soup = BeautifulSoup(table[2].get_attribute('innerHTML'),
                             'html.parser')
        soup2 = BeautifulSoup(table[0].get_attribute('innerHTML'),
                              'html.parser')
        tr = soup2.find_all('tr')
        Away_Team = tr[1].td.text.strip()
        Away_Total_Score = tr[1].find_all('td')[1].text.strip()
        Home_Team = tr[2].td.text.strip()
        Home_Total_Score = tr[2].find_all('td')[1].text.strip()
        tierrow = soup.find_all('tr', {'class': 'TierRow'})
        games[url][Home_Team] = {}
        games[url][Away_Team] = {}
        # Column 1 of each breakdown row is the home value,
        # column 2 the away value.
        for i in range(len(tierrow)):
            row = tierrow[i]
            rowname = row.find_all('td')[0].text.strip()
            games[url][Home_Team][rowname] = row.find_all('td')[1].text.strip()
            games[url][Away_Team][rowname] = row.find_all('td')[2].text.strip()
        games[url][Home_Team]['Total Points'] = Home_Total_Score
        games[url][Away_Team]['Total Points'] = Away_Total_Score
        # Scores are scraped as strings, so compare as integers.
        if int(Home_Total_Score) > int(Away_Total_Score):
            games[url][Home_Team]['Winner'] = 1
        elif int(Away_Total_Score) > int(Home_Total_Score):
            games[url][Away_Team]['Winner'] = 1
    return games
if __name__ == '__main__':
    login()
    urls = get_links()
    games = scrape(urls)
    # Rows are keyed by (game URL, team name) pairs.
    df = pd.DataFrame.from_dict({(i, j): games[i][j]
                                 for i in games.keys()
                                 for j in games[i].keys()})
    df = df.T
    df = df.fillna(0)
    df.to_csv('General2014-15.csv')
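Because from_dict is keyed by (game URL, team) pairs, the transposed frame, and hence General2014-15.csv, carries a two-level row index. A minimal sketch of reading the file back (assuming it was produced by the listing above) and pulling out the winning teams:

import pandas as pd

# The first two CSV columns hold the (game URL, team) pair, so read
# them back as a MultiIndex.
df = pd.read_csv('General2014-15.csv', index_col=[0, 1])
winners = df[df['Winner'] == 1]   # fillna(0) above left Winner as 0/1
print(winners['Total Points'].head())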