Merge pull request #307 from PyPSA/read-excel-mute-follow-up

Mute Excel reading with a context manager
PyPSA · Feb 22, 2023 · 2d5b832 · 2d5b832
2 parents ced2e17 + 19c35f8
commit 2d5b832
Show file tree

Hide file tree

Showing 5 changed files with 43 additions and 34 deletions.
diff --git a/scripts/build_biomass_potentials.py b/scripts/build_biomass_potentials.py
@@ -196,7 +196,7 @@ def convert_nuts2_to_regions(bio_nuts2, regions):
 if __name__ == "__main__":
     if 'snakemake' not in globals():
         from helper import mock_snakemake
-        snakemake = mock_snakemake('build_biomass_potentials')
+        snakemake = mock_snakemake('build_biomass_potentials', simpl='', clusters='5')
 
     config = snakemake.config['biomass']
     year = config["year"]

diff --git a/scripts/build_energy_totals.py b/scripts/build_energy_totals.py
@@ -1,6 +1,6 @@
 from functools import partial
 from tqdm import tqdm
-from helper import mute
+from helper import mute_print
 
 import multiprocessing as mp
 import pandas as pd
@@ -9,7 +9,6 @@
 
 idx = pd.IndexSlice
 
-mute()
 
 def cartesian(s1, s2):
     """Cartesian product of two pd.Series"""
@@ -137,12 +136,13 @@ def build_eurostat(input_eurostat, countries, report_year,  year):
         2017: f"/{year}-ENERGY-BALANCES-June2017edition.xlsx"
     }
 
-    dfs = pd.read_excel(
-        input_eurostat + filenames[report_year],
-        sheet_name=None,
-        skiprows=1,
-        index_col=list(range(4)),
-    )
+    with mute_print():
+        dfs = pd.read_excel(
+            input_eurostat + filenames[report_year],
+            sheet_name=None,
+            skiprows=1,
+            index_col=list(range(4)),
+        )
 
     # sorted_index necessary for slicing
     lookup = eurostat_country_to_alpha2
@@ -379,11 +379,13 @@ def idees_per_country(ct, year):
 def build_idees(countries, year):
 
     nprocesses = snakemake.threads
+
     func = partial(idees_per_country, year=year)
     tqdm_kwargs = dict(ascii=False, unit=' country', total=len(countries),
                        desc='Build from IDEES database')
-    with mp.Pool(processes=nprocesses, initializer=mute) as pool:
-        totals_list = list(tqdm(pool.imap(func, countries), **tqdm_kwargs))
+    with mute_print():
+        with mp.Pool(processes=nprocesses) as pool:
+            totals_list = list(tqdm(pool.imap(func, countries), **tqdm_kwargs))
 
 
     totals = pd.concat(totals_list, axis=1)
@@ -568,7 +570,7 @@ def build_eea_co2(input_co2, year=1990, emissions_scope="CO2"):
 
     # https://www.eea.europa.eu/data-and-maps/data/national-emissions-reported-to-the-unfccc-and-to-the-eu-greenhouse-gas-monitoring-mechanism-16
     # downloaded 201228 (modified by EEA last on 201221)
-    df = pd.read_csv(input_co2, encoding="latin-1")
+    df = pd.read_csv(input_co2, encoding="latin-1", low_memory=False)
 
     df.replace(dict(Year="1985-1987"), 1986, inplace=True)
     df.Year = df.Year.astype(int)

diff --git a/scripts/build_industrial_production_per_country.py b/scripts/build_industrial_production_per_country.py
@@ -4,7 +4,7 @@
 import numpy as np
 import multiprocessing as mp
 from tqdm import tqdm
-from helper import mute
+from helper import mute_print
 
 tj_to_ktoe = 0.0238845
 ktoe_to_twh = 0.01163
@@ -113,15 +113,17 @@ def get_energy_ratio(country):
     else:
         # estimate physical output, energy consumption in the sector and country
         fn = f"{eurostat_dir}/{eb_names[country]}.XLSX"
-        df = pd.read_excel(fn, sheet_name='2016', index_col=2,
-                           header=0, skiprows=1).squeeze('columns')
+        with mute_print():
+            df = pd.read_excel(fn, sheet_name='2016', index_col=2,
+                            header=0, skiprows=1).squeeze('columns')
         e_country = df.loc[eb_sectors.keys(
         ), 'Total all products'].rename(eb_sectors)
 
     fn = f'{jrc_dir}/JRC-IDEES-2015_Industry_EU28.xlsx'
 
-    df = pd.read_excel(fn, sheet_name='Ind_Summary',
-                       index_col=0, header=0).squeeze('columns')
+    with mute_print():
+        df = pd.read_excel(fn, sheet_name='Ind_Summary',
+                        index_col=0, header=0).squeeze('columns')
 
     assert df.index[48] == "by sector"
     year_i = df.columns.get_loc(year)
@@ -140,8 +142,9 @@ def get_sector_data(sector, country):
         jrc_country = jrc_names.get(country, country)
         fn = f'{jrc_dir}/JRC-IDEES-2015_Industry_{jrc_country}.xlsx'
         sheet = sub_sheet_name_dict[sector]
-        df = pd.read_excel(fn, sheet_name=sheet,
-                           index_col=0, header=0).squeeze('columns')
+        with mute_print():
+            df = pd.read_excel(fn, sheet_name=sheet,
+                            index_col=0, header=0).squeeze('columns')
 
         year_i = df.columns.get_loc(year)
         df = df.iloc[find_physical_output(df), year_i]
@@ -168,7 +171,7 @@ def industry_production(countries):
     func = industry_production_per_country
     tqdm_kwargs = dict(ascii=False, unit=' country', total=len(countries),
                        desc="Build industry production")
-    with mp.Pool(processes=nprocesses, initializer=mute) as pool:
+    with mp.Pool(processes=nprocesses) as pool:
         demand_l = list(tqdm(pool.imap(func, countries), **tqdm_kwargs))
 
     demand = pd.concat(demand_l, axis=1).T

diff --git a/scripts/build_industry_sector_ratios.py b/scripts/build_industry_sector_ratios.py
@@ -1,9 +1,7 @@
 """Build industry sector ratios."""
 
 import pandas as pd
-from helper import mute
-
-mute()
+from helper import mute_print
 
 # GWh/ktoe OR MWh/toe
 toe_to_MWh = 11.630
@@ -77,13 +75,14 @@ def load_idees_data(sector, country="EU28"):
     def usecols(x):
         return isinstance(x, str) or x == year
 
-    idees = pd.read_excel(
-        f"{snakemake.input.idees}/JRC-IDEES-2015_Industry_{country}.xlsx",
-        sheet_name=list(sheets.values()),
-        index_col=0,
-        header=0,
-        usecols=usecols,
-    )
+    with mute_print():
+        idees = pd.read_excel(
+            f"{snakemake.input.idees}/JRC-IDEES-2015_Industry_{country}.xlsx",
+            sheet_name=list(sheets.values()),
+            index_col=0,
+            header=0,
+            usecols=usecols,
+        )
 
     for k, v in sheets.items():
         idees[k] = idees.pop(v).squeeze()

diff --git a/scripts/helper.py b/scripts/helper.py
@@ -1,5 +1,6 @@
 import os
 import sys
+import contextlib
 import yaml
 import pytz
 import pandas as pd
@@ -11,11 +12,15 @@
 import logging
 logger = logging.getLogger(__name__)
 
-def mute():
-    """hide irrelevant outputs of subprocess in multiprocessing pools.
-    also hide irrelevant outputs caused by pd.read_excel"""
-    sys.stdout = open(os.devnull, 'w')
 
+# Context manager that temporarily redirects stdout to os.devnull, silencing print output inside the `with` block
+@contextlib.contextmanager
+def mute_print():
+    with open(os.devnull, 'w') as devnull:
+        with contextlib.redirect_stdout(devnull):
+            yield
+
+
 def override_component_attrs(directory):
     """Tell PyPSA that links can have multiple outputs by
     overriding the component_attrs. This can be done for