Skip to content

Commit

Permalink
refine handling of slicing for year data
Browse files Browse the repository at this point in the history
  • Loading branch information
jlchang committed Oct 4, 2024
1 parent d48a84b commit ec9fde6
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 15 deletions.
44 changes: 37 additions & 7 deletions episodes/looping-data-sets.md
Original file line number Diff line number Diff line change
Expand Up @@ -142,11 +142,21 @@ Before we join the data from each CSV into a single DataFrame, we'll want to mak
```python
for csv in sorted(glob.glob(file_location + 'data/*.csv')):
year = csv[29:33] #the 30th to 33rd characters in each file match the year
# if you copied your data using wget, year should be set differently:
# year = csv[5:9] #the 5th to 9th characters in each file match the year
# for files we downloaded to Google Drive for Colab
print(f'filename: {csv} year: {year}')
```

:::::::::::::::::::::::::::::::::::::::::: spoiler

# if you downloaded your data files using wget
```python
for csv in sorted(glob.glob(file_location + 'data/*.csv')):
    year = csv[5:9] # the 6th to 9th characters in each file match the year
# for files downloaded using wget
print(f'filename: {csv} year: {year}')
```
::::::::::::::::::::::::::::::::::::::::::::::::::

```output
filename: drive/MyDrive/lc-python/data/2011_circ.csv year: 2011
filename: drive/MyDrive/lc-python/data/2012_circ.csv year: 2012
Expand All @@ -170,8 +180,9 @@ To collect the data from each CSV we'll use a list "accumulator" (as we covered
dfs = [] # an empty list to hold all of our DataFrames
counter = 1

for csv in sorted(glob.glob('data/*.csv')):
year = csv[5:9]
for csv in sorted(glob.glob(file_location + 'data/*.csv')):
year = csv[29:33] # the 30th to 33rd characters match the year
# for files we downloaded to Google Drive for Colab
data = pd.read_csv(csv)
data['year'] = year
print(f'{counter} Saving {len(data)} rows from {csv}')
Expand All @@ -180,6 +191,25 @@ for csv in sorted(glob.glob('data/*.csv')):

print(f'Number of saved DataFrames: {len(dfs)}')
```
:::::::::::::::::::::::::::::::::::::::::: spoiler

# if you downloaded your data files using wget
```python
dfs = [] # an empty list to hold all of our DataFrames
counter = 1

for csv in sorted(glob.glob(file_location + 'data/*.csv')):
    year = csv[5:9] # the 6th to 9th characters in each file match the year
# for files downloaded using wget
data = pd.read_csv(csv)
data['year'] = year
print(f'{counter} Saving {len(data)} rows from {csv}')
dfs.append(data)
counter += 1

print(f'Number of saved DataFrames: {len(dfs)}')
```
::::::::::::::::::::::::::::::::::::::::::::::::::

```output
1 Saving 80 rows from drive/MyDrive/lc-python/data/2011_circ.csv
Expand Down Expand Up @@ -252,7 +282,7 @@ Modify the following code to print out the lowest value in the `ytd` column from

```python
import pandas as pd
for csv in sorted(glob.glob('data/*.csv')):
for csv in sorted(glob.glob(file_location + 'data/*.csv')):
data = pd.read_csv(____)
print(csv, data['____'].____())

Expand All @@ -264,7 +294,7 @@ for csv in sorted(glob.glob('data/*.csv')):

```python
import pandas as pd
for csv in sorted(glob.glob('data/*.csv')):
for csv in sorted(glob.glob(file_location + 'data/*.csv')):
data = pd.read_csv(csv)
print(csv, data['ytd'].min())

Expand All @@ -289,7 +319,7 @@ import pandas as pd

dfs = []

for csv in sorted(glob.glob('outputs/data*.csv')):
for csv in sorted(glob.glob(file_location + 'outputs/data*.csv')):
data = pd.read_csv(csv)
dfs.append(data)

Expand Down
9 changes: 6 additions & 3 deletions episodes/pandas.md
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,12 @@ import pandas as pd
dfs = []

for csv in sorted(glob.glob(file_location + 'data/*.csv')):
year = csv[29:33] #the 30th to 33rd characters in each file match the year
# if you copied your data using wget, year should be set differently:
# year = csv[5:9] #the 5th to 9th characters in each file match the year
if file_location == "drive/MyDrive/lc-python/":
year = csv[29:33] # the 30th to 33rd characters match the year
# for files we downloaded to Google Drive for Colab
else:
        year = csv[5:9] # the 6th to 9th characters in each file match the year
# for files downloaded using wget
data = pd.read_csv(csv)
data['year'] = year
dfs.append(data)
Expand Down
14 changes: 9 additions & 5 deletions episodes/tidy.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ exercises: 10
:::::::::::::::::::::::::::::::::::::::::: spoiler

## Setup instructions if your Google Drive is not mounted
If you did not run the commands from episode 7 in this Colab session, you will need to load the library `pandas` and make your google drive accessible:
If you did not run the commands from episode 11 in this Colab session, you will need to load the library `pandas` and make your google drive accessible:
```python
import pandas as pd
from google.colab import drive
Expand All @@ -45,16 +45,20 @@ file_location = ""
Remember that next time you use Colab, you'll need to get these files again unless you follow the [Setup instructions](https://broadinstitute.github.io/2024-09-27-python-intro-lesson/#setup) to copy the files to Google Drive.
::::::::::::::::::::::::::::::::::::::::::::::::::

## Put all of our Chicago public library circulation data in a single DataFrame.
Then we'll need to put all of the Chicago public library circulation data in a single DataFrame.

```python
import glob

dfs = []

for csv in sorted(glob.glob(file_location + 'data/*.csv')):
year = csv[29:33] #the 30th to 33rd characters in each file match the year
# if you copied your data using wget, year should be set differently:
# year = csv[5:9] #the 5th to 9th characters in each file match the year
if file_location == "drive/MyDrive/lc-python/":
year = csv[29:33] # the 30th to 33rd characters match the year
# for files we downloaded to Google Drive for Colab
else:
        year = csv[5:9] # the 6th to 9th characters in each file match the year
        # for files downloaded using wget
data = pd.read_csv(csv)
data['year'] = year
dfs.append(data)
Expand Down

0 comments on commit ec9fde6

Please sign in to comment.