Handle canceled / forfeited games #309

Merged
merged 8 commits on Jul 27, 2023
29 changes: 18 additions & 11 deletions soccerdata/fbref.py
@@ -1086,25 +1086,26 @@ def _concat(dfs: List[pd.DataFrame], key: List[str]) -> pd.DataFrame:
The level 0 headers are not consistent across seasons and leagues; this
function tries to determine uniform column names.
If there are dataframes with different columns, we will use the ones from
the dataframe with the most columns.
Parameters
----------
dfs : list(pd.DataFrame)
Input dataframes.
key : list(str)
List of columns that uniquely identify each df.
Raises
------
RuntimeError
If the dfs cannot be merged due to the columns not matching each other.
Returns
-------
pd.DataFrame
Concatenated dataframe with uniform column names.
"""
all_columns = []

# Step 0: Sort dfs by the number of columns
dfs = sorted(dfs, key=lambda x: len(x.columns), reverse=True)

# Step 1: Clean up the columns of each dataframe that should be merged
for df in dfs:
columns = pd.DataFrame(df.columns.tolist())
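
The docstring's promise that the dataframe with the most columns dictates the layout rests on the sort in Step 0: after sorting, dfs[0] is the reference the other frames are matched against further down. A minimal sketch, using hypothetical, empty frames that only carry column layouts (the names are made up, not taken from FBref):

    import pandas as pd

    # Hypothetical frames: a regular match with full stats and a forfeited
    # match for which far fewer stats columns are available.
    df_full = pd.DataFrame(
        columns=pd.MultiIndex.from_tuples(
            [("Unnamed: a", "player"), ("Performance", "Goals"), ("Performance", "Assists")]
        )
    )
    df_bare = pd.DataFrame(columns=pd.MultiIndex.from_tuples([("Unnamed: a", "player")]))

    # Step 0: the widest frame comes first and defines the target layout.
    dfs = sorted([df_bare, df_full], key=lambda x: len(x.columns), reverse=True)
    assert dfs[0].columns.equals(df_full.columns)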
@@ -1125,17 +1126,17 @@ def _concat(dfs: List[pd.DataFrame], key: List[str]) -> pd.DataFrame:
columns.loc[mask, 1] = ""
df.columns = pd.MultiIndex.from_tuples(columns.to_records(index=False).tolist())

# all dataframes should now have the same length and level 1 columns
# throw a warning if not all dataframes have the same length and level 1 columns
if len(all_columns) and all_columns[0].shape[1] == 2:
for i, columns in enumerate(all_columns):
if not columns[1].equals(all_columns[0][1]):
res = all_columns[0].merge(columns, indicator=True, how='outer')
raise RuntimeError(
warnings.warn(
(
"Cannot merge the data for {first} and {cur}.\n\n"
"Different columns found for {first} and {cur}.\n\n"
+ "The following columns are missing in {first}: {extra_cols}.\n\n"
+ "The following columns are missing in {cur}: {missing_cols}.\n\n"
+ "Please try to scrape the data again with caching disabled."
+ "The columns of the dataframe with the most columns will be used."
).format(
first=dfs[0].iloc[:1][key].values,
cur=dfs[i].iloc[:1][key].values,
@@ -1156,8 +1157,10 @@ def _concat(dfs: List[pd.DataFrame], key: List[str]) -> pd.DataFrame:
)
),
),
stacklevel=1,
)
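
The column-difference report in this warning builds on pandas' merge indicator (the res frame constructed above). A minimal sketch of that mechanism with made-up column pairs; the real code formats the result into the warning message:

    import pandas as pd

    # Hypothetical (level 0, level 1) column pairs of two dataframes, one row per column.
    cols_a = pd.DataFrame([("Performance", "Goals"), ("Performance", "Assists")])
    cols_b = pd.DataFrame([("Performance", "Goals"), ("Performance", "Fouls")])

    # An outer merge with indicator=True tags every pair with the side it appears on:
    # "both", "left_only" or "right_only".
    res = cols_a.merge(cols_b, indicator=True, how="outer")
    missing_in_a = res.loc[res["_merge"] == "right_only", 1].tolist()  # ['Fouls']
    missing_in_b = res.loc[res["_merge"] == "left_only", 1].tolist()   # ['Assists']

Downgrading the RuntimeError to warnings.warn means a single odd fixture no longer aborts the whole scrape; the mismatch is reported and then resolved by the alignment step below.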

if len(all_columns) and all_columns[0].shape[1] == 2:
# Step 2: Look for the most complete level 0 columns
columns = reduce(lambda left, right: left.combine_first(right), all_columns)
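
Step 2 folds the per-dataframe column layouts together cell by cell: combine_first keeps the first non-missing value, so a level 0 header that could not be recovered from one frame can be filled in from another. A sketch with hypothetical layouts (NaN standing in for a header Step 1 could not resolve):

    from functools import reduce

    import numpy as np
    import pandas as pd

    # One row per (level 0, level 1) column pair, as built in Step 1.
    layout_a = pd.DataFrame([(np.nan, "player"), ("Performance", "Goals")])
    layout_b = pd.DataFrame([("Unnamed: a", "player"), (np.nan, "Goals")])

    # combine_first fills missing cells of the left frame from the right frame.
    columns = reduce(lambda left, right: left.combine_first(right), [layout_a, layout_b])
    print(columns.to_records(index=False).tolist())
    # [('Unnamed: a', 'player'), ('Performance', 'Goals')]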

@@ -1167,15 +1170,19 @@ def _concat(dfs: List[pd.DataFrame], key: List[str]) -> pd.DataFrame:
columns.loc[mask, 1] = ""
column_idx = pd.MultiIndex.from_tuples(columns.to_records(index=False).tolist())

for df in dfs:
for i, df in enumerate(dfs):
if df.columns.equals(column_idx):
# This dataframe already has the uniform column index
pass
elif len(df.columns) == len(column_idx):
if len(df.columns) == len(column_idx):
# This dataframe has the same number of columns and the same
# level 1 columns, so we assume that the level 0 columns can be
# replaced
df.columns = column_idx
else:
# This dataframe has a different number of columns, so we want
# to make sure its columns match with column_idx
dfs[i] = df.reindex(columns=column_idx, fill_value=None)

return pd.concat(dfs)
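
The two branches in the loop above apply the uniform index differently; only the reindex branch is new in this PR. A sketch of both cases with a hypothetical column_idx (names are illustrative only):

    import pandas as pd

    # Hypothetical uniform column index produced by Step 2.
    column_idx = pd.MultiIndex.from_tuples(
        [("Unnamed: a", "player"), ("Performance", "Goals"), ("Performance", "Assists")]
    )

    # Same number of columns: only the labels are swapped, the data stays in place.
    df_same_width = pd.DataFrame(
        [["Player A", 1, 2]],
        columns=pd.MultiIndex.from_tuples(
            [("Unnamed: a", "player"), ("Unnamed: b", "Goals"), ("Performance", "Assists")]
        ),
    )
    df_same_width.columns = column_idx

    # Fewer columns (e.g. a canceled or forfeited game with missing stats tables):
    # reindex inserts the absent columns, filled with missing values.
    df_fewer = pd.DataFrame(
        [["Player B", 0]],
        columns=pd.MultiIndex.from_tuples(
            [("Unnamed: a", "player"), ("Performance", "Goals")]
        ),
    )
    df_fewer = df_fewer.reindex(columns=column_idx, fill_value=None)

    print(pd.concat([df_same_width, df_fewer], ignore_index=True))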

21 changes: 8 additions & 13 deletions tests/test_FBref.py
@@ -128,19 +128,14 @@ def test_concat() -> None:
)


def test_concat_not_matching_columns() -> None:
df1 = pd.DataFrame(
columns=pd.MultiIndex.from_tuples(
[("Unnamed: a", "player"), ("Performance", "Goals"), ("Performance", "Assists")]
)
)
df2 = pd.DataFrame(
columns=pd.MultiIndex.from_tuples(
[("Unnamed: a", "player"), ("Unnamed: b", "Goals"), ("Performance", "Fouls")]
)
)
with pytest.raises(RuntimeError):
_concat([df1, df2], key=["player"])
def test_concat_with_forfeited_game() -> None:
fbref_seriea = sd.FBref(["ITA-Serie A"], 2021)
df_1 = fbref_seriea.read_player_match_stats(match_id=["e0a20cfe", "34e95e35"])
df_2 = fbref_seriea.read_player_match_stats(match_id=["e0a20cfe", "a3e10e13"])
assert isinstance(df_1, pd.DataFrame)
assert isinstance(df_2, pd.DataFrame)
# Regardless of the order in which the matches are read, the result should be the same.
assert df_1.columns.equals(df_2.columns)


def test_combine_big5() -> None: