Skip to content

Commit

Permalink
refine handling of slicing for year data
Browse files Browse the repository at this point in the history
  • Loading branch information
jlchang committed Oct 4, 2024
1 parent d48a84b commit ec9fde6
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 15 deletions.
44 changes: 37 additions & 7 deletions episodes/looping-data-sets.md
Original file line number Diff line number Diff line change
Expand Up @@ -142,11 +142,21 @@ Before we join the data from each CSV into a single DataFrame, we'll want to mak
```python
for csv in sorted(glob.glob(file_location + 'data/*.csv')):
year = csv[29:33] #the 30th to 33rd characters in each file match the year
# if you copied your data using wget, year should be set differently:
# year = csv[5:9] #the 5th to 9th characters in each file match the year
# for files we downloaded to Google Drive for Colab
print(f'filename: {csv} year: {year}')
```

:::::::::::::::::::::::::::::::::::::::::: spoiler

# if you downloaded your data files using wget
```python
for csv in sorted(glob.glob(file_location + 'data/*.csv')):
    year = csv[5:9] # the 6th to 9th characters in each file match the year
# for files downloaded using wget
print(f'filename: {csv} year: {year}')
```
::::::::::::::::::::::::::::::::::::::::::::::::::

```output
filename: drive/MyDrive/lc-python/data/2011_circ.csv year: 2011
filename: drive/MyDrive/lc-python/data/2012_circ.csv year: 2012
Expand All @@ -170,8 +180,9 @@ To collect the data from each CSV we'll use a list "accumulator" (as we covered
dfs = [] # an empty list to hold all of our DataFrames
counter = 1

for csv in sorted(glob.glob('data/*.csv')):
year = csv[5:9]
for csv in sorted(glob.glob(file_location + 'data/*.csv')):
year = csv[29:33] # the 30th to 33rd characters match the year
# for files we downloaded to Google Drive for Colab
data = pd.read_csv(csv)
data['year'] = year
print(f'{counter} Saving {len(data)} rows from {csv}')
Expand All @@ -180,6 +191,25 @@ for csv in sorted(glob.glob('data/*.csv')):

print(f'Number of saved DataFrames: {len(dfs)}')
```
:::::::::::::::::::::::::::::::::::::::::: spoiler

# if you downloaded your data files using wget
```python
dfs = [] # an empty list to hold all of our DataFrames
counter = 1

for csv in sorted(glob.glob(file_location + 'data/*.csv')):
    year = csv[5:9] # the 6th to 9th characters in each file match the year
# for files downloaded using wget
data = pd.read_csv(csv)
data['year'] = year
print(f'{counter} Saving {len(data)} rows from {csv}')
dfs.append(data)
counter += 1

print(f'Number of saved DataFrames: {len(dfs)}')
```
::::::::::::::::::::::::::::::::::::::::::::::::::

```output
1 Saving 80 rows from drive/MyDrive/lc-python/data/2011_circ.csv
Expand Down Expand Up @@ -252,7 +282,7 @@ Modify the following code to print out the lowest value in the `ytd` column from

```python
import pandas as pd
for csv in sorted(glob.glob('data/*.csv')):
for csv in sorted(glob.glob(file_location + 'data/*.csv')):
data = pd.read_csv(____)
print(csv, data['____'].____())

Expand All @@ -264,7 +294,7 @@ for csv in sorted(glob.glob('data/*.csv')):

```python
import pandas as pd
for csv in sorted(glob.glob('data/*.csv')):
for csv in sorted(glob.glob(file_location + 'data/*.csv')):
data = pd.read_csv(csv)
print(csv, data['ytd'].min())

Expand All @@ -289,7 +319,7 @@ import pandas as pd

dfs = []

for csv in sorted(glob.glob('outputs/data*.csv')):
for csv in sorted(glob.glob(file_location + 'outputs/data*.csv')):
data = pd.read_csv(csv)
dfs.append(data)

Expand Down
9 changes: 6 additions & 3 deletions episodes/pandas.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,12 @@ import pandas as pd
dfs = []

for csv in sorted(glob.glob(file_location + 'data/*.csv')):
year = csv[29:33] #the 30th to 33rd characters in each file match the year
# if you copied your data using wget, year should be set differently:
# year = csv[5:9] #the 5th to 9th characters in each file match the year
if file_location == "drive/MyDrive/lc-python/":
year = csv[29:33] # the 30th to 33rd characters match the year
# for files we downloaded to Google Drive for Colab
else:
        year = csv[5:9] # the 6th to 9th characters in each file match the year
# for files downloaded using wget
data = pd.read_csv(csv)
data['year'] = year
dfs.append(data)
Expand Down
14 changes: 9 additions & 5 deletions episodes/tidy.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ exercises: 10
:::::::::::::::::::::::::::::::::::::::::: spoiler

## Setup instructions if your Google Drive is not mounted
If you did not run the commands from episode 7 in this Colab session, you will need to load the library `pandas` and make your google drive accessible:
If you did not run the commands from episode 11 in this Colab session, you will need to load the library `pandas` and make your google drive accessible:
```python
import pandas as pd
from google.colab import drive
Expand All @@ -45,16 +45,20 @@ file_location = ""
Remember that next time you use Colab, you'll need to get these files again unless you follow the [Setup instructions](https://broadinstitute.github.io/2024-09-27-python-intro-lesson/#setup) to copy the files to Google Drive.
::::::::::::::::::::::::::::::::::::::::::::::::::

## Put all of our Chicago public library circulation data in a single DataFrame.
Then we'll need to put all of the Chicago public library circulation data in a single DataFrame.

```python
import glob

dfs = []

for csv in sorted(glob.glob(file_location + 'data/*.csv')):
year = csv[29:33] #the 30th to 33rd characters in each file match the year
# if you copied your data using wget, year should be set differently:
# year = csv[5:9] #the 5th to 9th characters in each file match the year
if file_location == "drive/MyDrive/lc-python/":
year = csv[29:33] # the 30th to 33rd characters match the year
# for files we downloaded to Google Drive for Colab
else:
        year = csv[5:9] # the 6th to 9th characters in each file match the year
        # for files downloaded using wget
data = pd.read_csv(csv)
data['year'] = year
dfs.append(data)
Expand Down

0 comments on commit ec9fde6

Please sign in to comment.