Compress all the datafiles for movielens
FrancescAlted committed May 17, 2017
1 parent bce07e7 commit 786f4cc
Showing 6 changed files with 68 additions and 9,954 deletions.
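The commit does not record the commands used to compress the data, but the diff below shows plain-text MovieLens files (e.g. movies.dat) being replaced by gzipped copies (movies.dat.gz). A minimal sketch of one way to produce such files, assuming the movielens-1m directory layout used in the notebooks (the file list, and users.dat in particular, is an assumption based on the standard MovieLens-1M release):

```python
import gzip
import os
import shutil

# Hedged sketch: compress each MovieLens data file, e.g.
# movies.dat -> movies.dat.gz. The file names are assumptions
# based on the diff and the standard MovieLens-1M layout.
dset = 'movielens-1m'
for name in ('ratings.dat', 'movies.dat', 'users.dat'):
    src = os.path.join(dset, name)
    with open(src, 'rb') as f_in, gzip.open(src + '.gz', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)  # stream bytes through gzip
    os.remove(src)  # keep only the compressed copy
```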
3-Using-Compression.ipynb (20 changes: 11 additions & 9 deletions)
@@ -22,7 +22,7 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 1,
+  "execution_count": 2,
   "metadata": {
    "collapsed": true,
    "deletable": true,
@@ -38,7 +38,7 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 2,
+  "execution_count": 8,
   "metadata": {
    "collapsed": false,
    "deletable": true,
@@ -49,11 +49,11 @@
    "# Import CSV files via pandas\n",
    "dset = 'movielens-1m'\n",
    "fdata = os.path.join(dset, 'ratings.dat.gz')\n",
-   "fitem = os.path.join(dset, 'movies.dat')\n",
+   "fitem = os.path.join(dset, 'movies.dat.gz')\n",
    "\n",
    "# pass in column names for each CSV\n",
    "r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']\n",
-   "ratings = pd.read_csv(fdata, sep=';', names=r_cols, compression='gzip')\n",
+   "ratings = pd.read_csv(fdata, sep=';', names=r_cols)\n",
    "\n",
    "m_cols = ['movie_id', 'title', 'genres']\n",
    "movies = pd.read_csv(fitem, sep=';', names=m_cols,\n",
@@ -62,7 +62,7 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 3,
+  "execution_count": 4,
   "metadata": {
    "collapsed": false,
    "deletable": true,
@@ -78,7 +78,7 @@
     "dtype: object"
    ]
   },
-  "execution_count": 3,
+  "execution_count": 4,
   "metadata": {},
   "output_type": "execute_result"
  }
@@ -89,7 +89,7 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 4,
+  "execution_count": 5,
   "metadata": {
    "collapsed": false,
    "deletable": true,
@@ -106,7 +106,7 @@
     "dtype: object"
    ]
   },
-  "execution_count": 4,
+  "execution_count": 5,
   "metadata": {},
   "output_type": "execute_result"
  }
@@ -345,7 +345,9 @@
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
-   "collapsed": false
+   "collapsed": false,
+   "deletable": true,
+   "editable": true
  },
  "outputs": [
   {
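The substantive change in this notebook is the switch to gzipped inputs: movies.dat becomes movies.dat.gz, and the explicit compression='gzip' argument to pd.read_csv is dropped. That is safe because read_csv defaults to compression='infer', which detects gzip from the .gz suffix. A small sketch using the file and column names from the diff (assuming the data directory exists):

```python
import os
import pandas as pd

dset = 'movielens-1m'
fdata = os.path.join(dset, 'ratings.dat.gz')
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

# compression defaults to 'infer', so the .gz suffix alone triggers
# transparent gzip decompression; no explicit keyword is needed.
ratings = pd.read_csv(fdata, sep=';', names=r_cols)

# Spelling out the default makes the equivalence explicit:
ratings = pd.read_csv(fdata, sep=';', names=r_cols, compression='infer')
```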
4-Structuring-Datasets.ipynb (79 changes: 57 additions & 22 deletions)
@@ -2,7 +2,10 @@
  "cells": [
  {
   "cell_type": "markdown",
-  "metadata": {},
+  "metadata": {
+   "deletable": true,
+   "editable": true
+  },
   "source": [
    "# Structuring Datasets"
   ]
@@ -281,9 +284,11 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 9,
+  "execution_count": 1,
   "metadata": {
-   "collapsed": true
+   "collapsed": true,
+   "deletable": true,
+   "editable": true
  },
  "outputs": [],
  "source": [
@@ -295,20 +300,22 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 10,
+  "execution_count": 4,
   "metadata": {
-   "collapsed": true
+   "collapsed": true,
+   "deletable": true,
+   "editable": true
  },
  "outputs": [],
  "source": [
    "# Import CSV files via pandas\n",
    "dset = 'movielens-1m'\n",
    "fdata = os.path.join(dset, 'ratings.dat.gz')\n",
-   "fitem = os.path.join(dset, 'movies.dat')\n",
+   "fitem = os.path.join(dset, 'movies.dat.gz')\n",
    "\n",
    "# pass in column names for each CSV\n",
    "r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']\n",
-   "ratings = pd.read_csv(fdata, sep=';', names=r_cols, compression='gzip')\n",
+   "ratings = pd.read_csv(fdata, sep=';', names=r_cols)\n",
    "\n",
    "m_cols = ['movie_id', 'title', 'genres']\n",
    "movies = pd.read_csv(fitem, sep=';', names=m_cols,\n",
@@ -317,9 +324,11 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 11,
+  "execution_count": 5,
   "metadata": {
-   "collapsed": true
+   "collapsed": true,
+   "deletable": true,
+   "editable": true
  },
  "outputs": [],
  "source": [
@@ -329,9 +338,11 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 12,
+  "execution_count": 6,
   "metadata": {
-   "collapsed": false
+   "collapsed": false,
+   "deletable": true,
+   "editable": true
  },
  "outputs": [
   {
@@ -346,7 +357,7 @@
     "dtype: object"
    ]
   },
-  "execution_count": 12,
+  "execution_count": 6,
   "metadata": {},
   "output_type": "execute_result"
  }
@@ -359,7 +370,9 @@
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
-   "collapsed": true
+   "collapsed": true,
+   "deletable": true,
+   "editable": true
  },
  "outputs": [],
  "source": [
@@ -393,7 +406,9 @@
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
-   "collapsed": false
+   "collapsed": false,
+   "deletable": true,
+   "editable": true
  },
  "outputs": [
   {
@@ -416,7 +431,9 @@
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
-   "collapsed": false
+   "collapsed": false,
+   "deletable": true,
+   "editable": true
  },
  "outputs": [
   {
@@ -438,7 +455,9 @@
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
-   "collapsed": false
+   "collapsed": false,
+   "deletable": true,
+   "editable": true
  },
  "outputs": [
   {
@@ -477,7 +496,9 @@
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
-   "collapsed": false
+   "collapsed": false,
+   "deletable": true,
+   "editable": true
  },
  "outputs": [
   {
@@ -508,14 +529,20 @@
  },
  {
   "cell_type": "markdown",
-  "metadata": {},
+  "metadata": {
+   "deletable": true,
+   "editable": true
+  },
   "source": [
    "As can be seen, the size of the denormalized table is much larger than the normalized one (156 MB vs 17 MB). But that is without using compression."
   ]
  },
  {
   "cell_type": "markdown",
-  "metadata": {},
+  "metadata": {
+   "deletable": true,
+   "editable": true
+  },
   "source": [
    "### Exercise 1\n",
    "\n",
@@ -595,7 +622,10 @@
  },
  {
   "cell_type": "markdown",
-  "metadata": {},
+  "metadata": {
+   "deletable": true,
+   "editable": true
+  },
   "source": [
    "### Exercise 2\n",
    "\n",
@@ -689,7 +719,10 @@
  },
  {
   "cell_type": "markdown",
-  "metadata": {},
+  "metadata": {
+   "deletable": true,
+   "editable": true
+  },
   "source": [
    "In the next section we will see the effect of querying normalized and denormalized tables."
   ]
@@ -698,7 +731,9 @@
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
-   "collapsed": true
+   "collapsed": true,
+   "deletable": true,
+   "editable": true
  },
  "outputs": [],
  "source": []
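The markdown cell in the hunk at @@ -508,14 +529,20 @@ above compares a normalized layout (separate ratings and movies tables) with a denormalized one: 156 MB vs 17 MB before compression. The denormalized table is presumably built by joining the two tables on movie_id, although the merge code itself is outside this diff; a self-contained toy sketch of that idea:

```python
import pandas as pd

# Toy stand-ins for the ratings and movies tables loaded in the notebook.
ratings = pd.DataFrame({'user_id': [1, 1, 2],
                        'movie_id': [10, 20, 10],
                        'rating': [4, 5, 3]})
movies = pd.DataFrame({'movie_id': [10, 20],
                       'title': ['Toy Story', 'Heat'],
                       'genres': ['Animation', 'Action|Crime']})

# Denormalize: repeat each movie's title and genres on every rating row.
denorm = ratings.merge(movies, on='movie_id')

# The movie metadata is now stored once per rating instead of once per
# movie, which is why the denormalized table grows so much larger.
print(denorm.memory_usage(deep=True).sum(), 'bytes')
```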