Skip to content

Commit

Permalink
Handle canceled / forfeited games (#309)
Browse files Browse the repository at this point in the history
Fixes #304
  • Loading branch information
lorenzodb1 authored Jul 27, 2023
1 parent a910622 commit 526dda0
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 24 deletions.
29 changes: 18 additions & 11 deletions soccerdata/fbref.py
Original file line number Diff line number Diff line change
Expand Up @@ -1086,25 +1086,26 @@ def _concat(dfs: List[pd.DataFrame], key: List[str]) -> pd.DataFrame:
The level 0 headers are not consistent across seasons and leagues, this
function tries to determine uniform column names.
If there are dataframes with different columns, we will use the ones from
the dataframe with the most columns.
Parameters
----------
dfs : list(pd.DataFrame)
Input dataframes.
key : list(str)
List of columns that uniquely identify each df.
Raises
------
RuntimeError
If the dfs cannot be merged due to the columns not matching each other.
Returns
-------
pd.DataFrame
Concatenated dataframe with uniform column names.
"""
all_columns = []

# Step 0: Sort dfs by the number of columns
dfs = sorted(dfs, key=lambda x: len(x.columns), reverse=True)

# Step 1: Clean up the columns of each dataframe that should be merged
for df in dfs:
columns = pd.DataFrame(df.columns.tolist())
Expand All @@ -1125,17 +1126,17 @@ def _concat(dfs: List[pd.DataFrame], key: List[str]) -> pd.DataFrame:
columns.loc[mask, 1] = ""
df.columns = pd.MultiIndex.from_tuples(columns.to_records(index=False).tolist())

# all dataframes should now have the same length and level 1 columns
# throw a warning if not all dataframes have the same length and level 1 columns
if len(all_columns) and all_columns[0].shape[1] == 2:
for i, columns in enumerate(all_columns):
if not columns[1].equals(all_columns[0][1]):
res = all_columns[0].merge(columns, indicator=True, how='outer')
raise RuntimeError(
warnings.warn(
(
"Cannot merge the data for {first} and {cur}.\n\n"
"Different columns found for {first} and {cur}.\n\n"
+ "The following columns are missing in {first}: {extra_cols}.\n\n"
+ "The following columns are missing in {cur}: {missing_cols}.\n\n"
+ "Please try to scrape the data again with caching disabled."
+ "The columns of the dataframe with the most columns will be used."
).format(
first=dfs[0].iloc[:1][key].values,
cur=dfs[i].iloc[:1][key].values,
Expand All @@ -1156,8 +1157,10 @@ def _concat(dfs: List[pd.DataFrame], key: List[str]) -> pd.DataFrame:
)
),
),
stacklevel=1,
)

if len(all_columns) and all_columns[0].shape[1] == 2:
# Step 2: Look for the most complete level 0 columns
columns = reduce(lambda left, right: left.combine_first(right), all_columns)

Expand All @@ -1167,15 +1170,19 @@ def _concat(dfs: List[pd.DataFrame], key: List[str]) -> pd.DataFrame:
columns.loc[mask, 1] = ""
column_idx = pd.MultiIndex.from_tuples(columns.to_records(index=False).tolist())

for df in dfs:
for i, df in enumerate(dfs):
if df.columns.equals(column_idx):
# This dataframe already has the uniform column index
pass
elif len(df.columns) == len(column_idx):
if len(df.columns) == len(column_idx):
# This dataframe has the same number of columns and the same
# level 1 columns, we assume that the level 0 columns can be
# replaced
df.columns = column_idx
else:
# This dataframe has a different number of columns, so we want
# to make sure its columns match with column_idx
dfs[i] = df.reindex(columns=column_idx, fill_value=None)

return pd.concat(dfs)

Expand Down
21 changes: 8 additions & 13 deletions tests/test_FBref.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,19 +128,14 @@ def test_concat() -> None:
)


def test_concat_not_matching_columns() -> None:
    """_concat must raise when the level-1 columns of the inputs disagree."""
    # Two frames sharing only the key column; the stat columns differ,
    # so no uniform column index can be derived.
    left = pd.DataFrame(
        columns=pd.MultiIndex.from_tuples(
            [("Unnamed: a", "player"), ("Performance", "Goals"), ("Performance", "Assists")]
        )
    )
    right = pd.DataFrame(
        columns=pd.MultiIndex.from_tuples(
            [("Unnamed: a", "player"), ("Unnamed: b", "Goals"), ("Performance", "Fouls")]
        )
    )
    with pytest.raises(RuntimeError):
        _concat([left, right], key=["player"])
def test_concat_with_forfeited_game() -> None:
    """Reading order of matches must not change the resulting schema.

    NOTE(review): one of the extra match ids presumably refers to a
    canceled/forfeited game with sparser stats — verify against FBref.
    """
    reader = sd.FBref(["ITA-Serie A"], 2021)
    stats_a = reader.read_player_match_stats(match_id=["e0a20cfe", "34e95e35"])
    stats_b = reader.read_player_match_stats(match_id=["e0a20cfe", "a3e10e13"])
    for stats in (stats_a, stats_b):
        assert isinstance(stats, pd.DataFrame)
    # Regardless of the order in which the matches are read, the result should be the same.
    assert stats_a.columns.equals(stats_b.columns)


def test_combine_big5() -> None:
Expand Down

0 comments on commit 526dda0

Please sign in to comment.