Skip to content

Commit

Permalink
2.9.2
Browse files Browse the repository at this point in the history
* renamed Sofascore file, class, and in __init__.py to all align on capitalization
* updates to FBRef.py for issues found during unit test dev
  • Loading branch information
oseymour committed Dec 7, 2023
1 parent 734b80c commit 27d7a42
Show file tree
Hide file tree
Showing 11 changed files with 766 additions and 464 deletions.
66 changes: 49 additions & 17 deletions ScraperFC/FBRef.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from IPython.display import clear_output
import numpy as np
import pandas as pd
from ScraperFC.shared_functions import get_source_comp_info, xpath_soup
from ScraperFC.shared_functions import get_source_comp_info, xpath_soup, \
NoMatchLinksException, UnavailableSeasonException
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
Expand All @@ -12,7 +13,8 @@
import time
import re
from datetime import datetime

from io import StringIO
import warnings

class FBRef:
""" ScraperFC module for FBRef
Expand Down Expand Up @@ -153,7 +155,7 @@ def get_match_links(self, year, league):
"""
source_comp_info = get_source_comp_info(year,league,'FBRef')

print('Gathering match links.')
print(f'Gathering {year} {league} match links.')
season_link = self.get_season_link(year, league)
if season_link == -1:
return None
Expand Down Expand Up @@ -195,10 +197,9 @@ def scrape_league_table(self, year, league):
Returns
-------
: Pandas DataFrame
If league is not MLS, dataframe of the scraped league table
DataFrame may be empty if the league has no tables. Otherwise, the league table.
: list
If the league is MLS, a tuple of (west conference table, east conference table). Both tables are \
dataframes.
If the league has multiple tables (e.g. Champions League, Liga MX, MLS) then a list of DataFrames will be returned.
"""
_ = get_source_comp_info(year,league,'FBRef')

Expand All @@ -209,18 +210,49 @@ def scrape_league_table(self, year, league):
soup = BeautifulSoup(response.content, 'html.parser')

lg_table_html = soup.find_all('table', {'id': re.compile('overall')})

if league != 'MLS':
assert len(lg_table_html) == 1
lg_table_html = lg_table_html[0]
lg_table = pd.read_html(str(lg_table_html))[0]
return lg_table


if league == 'Ligue 2' and year == '2018':
# 2018 Ligue 2 page has a small sub-table of the playoff qualifiers for some dumb reason
lg_table_html = lg_table_html[:1]

if len(lg_table_html) == 0:
# Some competitions have no tables (e.g. early women's champions league)
warnings.warn(f'No league/group tables found for {year} {league}.')
lg_table = pd.DataFrame()
elif len(lg_table_html) == 1:
# This will apply to most leagues
lg_table = pd.read_html(StringIO(str(lg_table_html[0])))[0]
else:
assert len(lg_table_html) == 2
east_table = pd.read_html(str(lg_table_html[0]))[0]
west_table = pd.read_html(str(lg_table_html[1]))[0]
return (east_table, west_table)
# Some comps have multiple tables (champions league, liga mx, mls)
warnings.warn(f'Multiple league/group tables found for {year} {league}.')
lg_table = [pd.read_html(StringIO(str(html)))[0] for html in lg_table_html]

return lg_table

# if league == 'MLS':
# assert len(lg_table_html) == 2
# east_table = pd.read_html(StringIO(str(lg_table_html[0])))[0]
# west_table = pd.read_html(StringIO(str(lg_table_html[1])))[0]
# return (east_table, west_table)
# elif league == 'Liga MX':
# apertura = pd.read_html(StringIO(str(lg_table_html[0])))[0]
# clausura = pd.read_html(StringIO(str(lg_table_html[1])))[0]
# lg_table = pd.read_html(StringIO(str(lg_table_html[2])))[0]
# rel_table = pd.read_html(StringIO(str(lg_table_html[3])))[0]
# return (apertura, clausura, lg_table, rel_table)
# elif league == 'Ligue 2' and year == '2018':
# # 2018 Ligue 2 page has a small sub-table of the playoff qualifiers for some dumb reason
# lg_table_html = lg_table_html[0]
# lg_table = pd.read_html(StringIO(str(lg_table_html)))[0]
# return lg_table
# elif league == 'Women Champions League' and year < 2024:
# warnings.warn('Women\'s Champions League has no group stage prior to 2024.')
# return pd.DataFrame()
# else:
# assert len(lg_table_html) == 1
# lg_table_html = lg_table_html[0]
# lg_table = pd.read_html(StringIO(str(lg_table_html)))[0]
# return lg_table

####################################################################################################################
def scrape_stats(self, year, league, stat_category, normalize=False):
Expand Down
4 changes: 4 additions & 0 deletions ScraperFC/shared_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -583,6 +583,10 @@ def get_source_comp_info(year, league, source):
},
}

# If all args are None then return full source comp info (used in unit tests)
if year==None and league==None and source==None:
return source_comp_info

# Check source
if type(source) != str:
raise TypeError("Source must be a string.")
Expand Down
Binary file added dist/ScraperFC-2.9.2-py3-none-any.whl
Binary file not shown.
Binary file added dist/ScraperFC-2.9.2.tar.gz
Binary file not shown.
4 changes: 3 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,6 @@ bs4
lxml
tqdm
ScraperFC
sphinx_rtd_theme
sphinx_rtd_theme
io
warnings
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setuptools.setup(
name = "ScraperFC",
version = "2.9.0",
version = "2.9.2",
author = "Owen Seymour",
author_email = "osmour043@gmail.com",
description = "Package for scraping soccer data from a variety of sources.",
Expand Down
164 changes: 50 additions & 114 deletions tests/TestFBRef.py
Original file line number Diff line number Diff line change
@@ -1,128 +1,64 @@
import sys
sys.path.insert(0, '..') # import local ScraperFC, not pypi installed version
import ScraperFC as sfc
import numpy as np
import itertools
import datetime # DO NOT import datetime as datetime, need to check type datetime.date, not datetime.datetime.date
import pandas as pd
from tqdm.auto import tqdm
import traceback
sys.path.append('../')
from ScraperFC import FBRef, get_source_comp_info, NoMatchLinksException, \
UnavailableSeasonException
from shared_test_functions import get_random_league_seasons
import random
from pandas import DataFrame


########################################################################################################################
class TestFBRef:

####################################################################################################################
def verify_match_df(self, match):
player_stats_columns = [
'Team Sheet', 'Summary', 'GK', 'Passing', 'Pass Types', 'Defense', 'Possession', 'Misc'
]
assert type(match['Link']) is str
assert type(match['Date']) is datetime.date
assert type(match['Stage']) in [int, str]
assert type(match['Home Team']) is str
assert type(match['Away Team']) is str
assert type(match['Home Team ID']) is str
assert type(match['Away Team ID']) is str
assert type(match['Home Formation']) in [type(None), str]
assert type(match['Away Formation']) in [type(None), str]
assert type(match['Home Goals']) is int
assert type(match['Away Goals']) is int
assert type(match['Home Ast']) is int or np.isnan(match['Home Ast'])
assert type(match['Away Ast']) is int or np.isnan(match['Away Ast'])

assert type(match['Home xG']) in [type(None), float]
assert type(match['Away xG']) in [type(None), float]
assert type(match['Home npxG']) in [type(None), float]
assert type(match['Away npxG']) in [type(None), float]
assert type(match['Home xAG']) in [type(None), float]
assert type(match['Away xAG']) in [type(None), float]

assert type(match['Home Player Stats']) is pd.core.frame.DataFrame
assert list(match['Home Player Stats'].columns) == player_stats_columns
for c in match['Home Player Stats'].columns:
stat_type = type(match['Home Player Stats'][c].values[0])
assert stat_type in [type(None), pd.core.frame.DataFrame]

assert type(match['Away Player Stats']) is pd.core.frame.DataFrame
assert list(match['Away Player Stats'].columns) == player_stats_columns
for c in match['Away Player Stats'].columns:
stat_type = type(match['Away Player Stats'][c].values[0])
assert stat_type in [type(None), pd.core.frame.DataFrame]

assert type(match['Shots']) is pd.core.frame.DataFrame
assert list(match['Shots'].columns) == ['Both', 'Home', 'Away']
for c in match['Shots'].columns:
stat_type = type(match['Shots'][c].values[0])
assert stat_type in [type(None), pd.core.frame.DataFrame]

####################################################################################################################
def verify_matches_df(self, matches, match_links):
assert matches.shape[0] == len(match_links)
for i in matches.index:
self.verify_match_df(matches.loc[i,:])

####################################################################################################################
def verify_all_stats(self, stats):
fbref = sfc.FBRef()
def test_get_season_link(self):
year, league = get_random_league_seasons('FBRef', 1)[0]
try:
stats_categories = fbref.stats_categories.keys()

assert len(stats.keys()) == len(stats_categories)

for category in stats_categories:
assert len(stats[category]) == 3
squad, opponent, player = stats[category]

if squad is not None:
pass

if opponent is not None:
pass

if player is not None:
pass
fbref = FBRef()
season_link = fbref.get_season_link(year, league)
assert type(season_link) is str
except UnavailableSeasonException:
pass
finally:
fbref.close()

####################################################################################################################
def test_fbref(self):
print('Testing FBRef.')
fbref = sfc.FBRef()
def test_get_match_links_type(self):
year, league = get_random_league_seasons('FBRef', 1)[0]
try:
iterator = tqdm(get_random_league_seasons('FBRef', 'all'), desc='TestFBRef')
for league, year in iterator:
# year = int(year) # year became a string during random sampling?
# league = str(league) # league also became a weird type of numpy string?
print(year, league)

# Skip invalid years -----------------------------------------------------------------------------------
try:
sfc.shared_functions.check_season(year, league, 'FBRef')
except sfc.InvalidYearException:
continue

# Season link ------------------------------------------------------------------------------------------
try:
season_link = fbref.get_season_link(year, league)
except sfc.UnavailableSeasonException:
continue
assert type(season_link) is str

# Match links ------------------------------------------------------------------------------------------
# try:
match_links = fbref.get_match_links(year, league)
# except sfc.NoMatchLinksException:
# continue
assert len(match_links) > 0
fbref = FBRef()
match_links = fbref.get_match_links(year, league)
assert type(match_links) is list
except (NoMatchLinksException, UnavailableSeasonException):
pass
finally:
fbref.close()

# Get match data and stats -----------------------------------------------------------------------------
matches = fbref.scrape_matches(year, league)
stats = fbref.scrape_all_stats(year, league)
def test_get_match_link_contents(self):
year, league = get_random_league_seasons('FBRef', 1)[0]
try:
fbref = FBRef()
match_links = fbref.get_match_links(year, league)
link = random.sample(match_links, 1)[0]
finder = get_source_comp_info(year, league, 'FBRef')['FBRef'][league]['finder']
assert type(link) is str
if type(finder) is list:
for f in finder:
assert f in link
else:
assert finder in link
assert 'fbref.com' in link
except (NoMatchLinksException, UnavailableSeasonException):
pass
finally:
fbref.close()

# Check match data and stats ---------------------------------------------------------------------------
self.verify_all_stats(stats)
self.verify_matches_df(matches, match_links)
def test_scrape_league_table(self):
year, league = get_random_league_seasons('FBRef', 1)[0]
try:
fbref = FBRef()
lgtbl = fbref.scrape_league_table(year, league)
assert type(lgtbl) in (list, DataFrame)
if type(lgtbl) is list:
for x in lgtbl:
assert type(x) is DataFrame
except UnavailableSeasonException:
pass
finally:
fbref.close()
Loading

0 comments on commit 27d7a42

Please sign in to comment.