Skip to content

Commit

Permalink
Handle canceled / forfeited games (#309)
Browse files Browse the repository at this point in the history
Fixes #304
  • Loading branch information
lorenzodb1 authored Jul 27, 2023
1 parent a910622 commit 526dda0
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 24 deletions.
29 changes: 18 additions & 11 deletions soccerdata/fbref.py
Original file line number Diff line number Diff line change
Expand Up @@ -1086,25 +1086,26 @@ def _concat(dfs: List[pd.DataFrame], key: List[str]) -> pd.DataFrame:
The level 0 headers are not consistent across seasons and leagues, this
function tries to determine uniform column names.
If there are dataframes with different columns, we will use the ones from
the dataframe with the most columns.
Parameters
----------
dfs : list(pd.DataFrame)
Input dataframes.
key : list(str)
List of columns that uniquely identify each df.
Raises
------
RuntimeError
If the dfs cannot be merged due to the columns not matching each other.
Returns
-------
pd.DataFrame
Concatenated dataframe with uniform column names.
"""
all_columns = []

# Step 0: Sort dfs by the number of columns
dfs = sorted(dfs, key=lambda x: len(x.columns), reverse=True)

# Step 1: Clean up the columns of each dataframe that should be merged
for df in dfs:
columns = pd.DataFrame(df.columns.tolist())
Expand All @@ -1125,17 +1126,17 @@ def _concat(dfs: List[pd.DataFrame], key: List[str]) -> pd.DataFrame:
columns.loc[mask, 1] = ""
df.columns = pd.MultiIndex.from_tuples(columns.to_records(index=False).tolist())

# all dataframes should now have the same length and level 1 columns
# throw a warning if not all dataframes have the same length and level 1 columns
if len(all_columns) and all_columns[0].shape[1] == 2:
for i, columns in enumerate(all_columns):
if not columns[1].equals(all_columns[0][1]):
res = all_columns[0].merge(columns, indicator=True, how='outer')
raise RuntimeError(
warnings.warn(
(
"Cannot merge the data for {first} and {cur}.\n\n"
"Different columns found for {first} and {cur}.\n\n"
+ "The following columns are missing in {first}: {extra_cols}.\n\n"
+ "The following columns are missing in {cur}: {missing_cols}.\n\n"
+ "Please try to scrape the data again with caching disabled."
+ "The columns of the dataframe with the most columns will be used."
).format(
first=dfs[0].iloc[:1][key].values,
cur=dfs[i].iloc[:1][key].values,
Expand All @@ -1156,8 +1157,10 @@ def _concat(dfs: List[pd.DataFrame], key: List[str]) -> pd.DataFrame:
)
),
),
stacklevel=1,
)

if len(all_columns) and all_columns[0].shape[1] == 2:
# Step 2: Look for the most complete level 0 columns
columns = reduce(lambda left, right: left.combine_first(right), all_columns)

Expand All @@ -1167,15 +1170,19 @@ def _concat(dfs: List[pd.DataFrame], key: List[str]) -> pd.DataFrame:
columns.loc[mask, 1] = ""
column_idx = pd.MultiIndex.from_tuples(columns.to_records(index=False).tolist())

for df in dfs:
for i, df in enumerate(dfs):
if df.columns.equals(column_idx):
# This dataframe already has the uniform column index
pass
elif len(df.columns) == len(column_idx):
if len(df.columns) == len(column_idx):
# This dataframe has the same number of columns and the same
# level 1 columns, we assume that the level 0 columns can be
# replaced
df.columns = column_idx
else:
# This dataframe has a different number of columns, so we want
# to make sure its columns match with column_idx
dfs[i] = df.reindex(columns=column_idx, fill_value=None)

return pd.concat(dfs)

Expand Down
21 changes: 8 additions & 13 deletions tests/test_FBref.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,19 +128,14 @@ def test_concat() -> None:
)


def test_concat_not_matching_columns() -> None:
    """_concat must raise when the level-1 columns of the inputs disagree."""
    # Two frames sharing only the key column; the stat columns differ,
    # so no uniform column index can be derived.
    left = pd.DataFrame(
        columns=pd.MultiIndex.from_tuples(
            [("Unnamed: a", "player"), ("Performance", "Goals"), ("Performance", "Assists")]
        )
    )
    right = pd.DataFrame(
        columns=pd.MultiIndex.from_tuples(
            [("Unnamed: a", "player"), ("Unnamed: b", "Goals"), ("Performance", "Fouls")]
        )
    )
    with pytest.raises(RuntimeError):
        _concat([left, right], key=["player"])
def test_concat_with_forfeited_game() -> None:
    """Reading order of matches must not change the resulting schema.

    NOTE(review): one of the extra match ids presumably refers to a
    canceled/forfeited game with sparser stats — verify against FBref.
    """
    reader = sd.FBref(["ITA-Serie A"], 2021)
    stats_a = reader.read_player_match_stats(match_id=["e0a20cfe", "34e95e35"])
    stats_b = reader.read_player_match_stats(match_id=["e0a20cfe", "a3e10e13"])
    for stats in (stats_a, stats_b):
        assert isinstance(stats, pd.DataFrame)
    # Regardless of the order in which the matches are read, the result should be the same.
    assert stats_a.columns.equals(stats_b.columns)


def test_combine_big5() -> None:
Expand Down

0 comments on commit 526dda0

Please sign in to comment.