Skip to content

Commit

Permalink
feat: Fixes to support WSL
Browse files Browse the repository at this point in the history
Closes #3
  • Loading branch information
dashmug committed Feb 20, 2024
1 parent 6d2a2c4 commit 7c5cb63
Show file tree
Hide file tree
Showing 6 changed files with 108 additions and 6 deletions.
16 changes: 12 additions & 4 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,23 @@ FROM amazon/aws-glue-libs:glue_libs_4.0.0_image_01
# running in a container.
ENV PLATFORM="docker"

# Argument for passing the host user's UID to the container.
ARG USER_ID

# Switch to root to be able to make changes in the container filesystem.
USER root

# Install latest version of awscli.
RUN yum remove awscli -y \
&& curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" \
&& unzip awscliv2.zip \
&& ./aws/install
&& ./aws/install \
# Change UID of glue_user to be the same as host user. This allows
# JupyterLab to write to the host system as glue_user.
&& usermod -u $USER_ID glue_user \
# Clean up /tmp which may already have glue_user-owned files with the
# old UID.
&& rm -rf /tmp/*

# Switch to glue_user to be able to make changes for the user itself.
USER glue_user
Expand All @@ -20,9 +29,8 @@ USER glue_user
WORKDIR /home/glue_user/workspace
COPY requirements.txt .

# hadolint ignore=DL3013
RUN pip3 install --no-cache-dir --user --upgrade pip \
RUN pip3 install --no-cache-dir --user --upgrade pip==24.0 \
# Install dev requirements.
&& pip3 install --no-cache-dir --user -r requirements.txt \
&& pip3 install --no-cache-dir --user -r requirements.txt \
# Prepare a /tmp directory needed by Spark to start.
&& mkdir -p /tmp/spark-events
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ start: .env install requirements ## Rebuild the container according to the lates
ifeq ($(PLATFORM), docker)
@echo "ERROR: `make start` is meant to be used outside the container." && false
else
@docker compose up --build
@USER_ID=$$(id -u) docker compose up --build
endif


Expand Down Expand Up @@ -132,7 +132,7 @@ endif
.PHONY: clean-notebooks
clean-notebooks: ## Removes output cells from Jupyter notebooks
ifeq ($(PLATFORM), docker)
@jupyter nbconvert --clear-output notebooks/**/*.ipynb
@jupyter nbconvert --clear-output notebooks/**.ipynb
else
@$(COMPOSE_RUN) -c "make clean-notebooks"
endif
Expand Down
3 changes: 3 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ services:
build:
context: .
dockerfile: Dockerfile
args:
- USER_ID=${USER_ID:-}
- GROUP_ID=${GROUP_ID:-}
volumes:
- ~/.aws:/home/glue_user/.aws
- .:/home/glue_user/workspace
Expand Down
81 changes: 81 additions & 0 deletions notebooks/sample.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "e77e0d17",
"metadata": {},
"source": [
"This script is taken from the [AWS Glue documentation](https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-libraries.html#develop-local-docker-image)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "16d841a2-02e6-4f4a-9f4d-29d03f0cbf79",
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"\n",
"from awsglue import DynamicFrame\n",
"from awsglue.context import GlueContext\n",
"from awsglue.job import Job\n",
"from awsglue.utils import getResolvedOptions\n",
"from pyspark.context import SparkContext\n",
"\n",
"\n",
"class GluePythonSampleTest:\n",
" def __init__(self) -> None:\n",
" params = []\n",
" if \"--JOB_NAME\" in sys.argv:\n",
" params.append(\"JOB_NAME\")\n",
" args = getResolvedOptions(sys.argv, params)\n",
"\n",
" self.context = GlueContext(SparkContext.getOrCreate())\n",
" self.job = Job(self.context)\n",
"\n",
" jobname = args.get(\"JOB_NAME\", \"test\")\n",
" self.job.init(jobname, args)\n",
"\n",
" def run(self) -> None:\n",
" dyf = read_json(\n",
" self.context,\n",
" \"s3://awsglue-datasets/examples/us-legislators/all/persons.json\",\n",
" )\n",
" dyf.printSchema()\n",
"\n",
" self.job.commit()\n",
"\n",
"\n",
"def read_json(glue_context: GlueContext, path: str) -> DynamicFrame:\n",
" return glue_context.create_dynamic_frame.from_options(\n",
" connection_type=\"s3\",\n",
" connection_options={\"paths\": [path], \"recurse\": True},\n",
" format=\"json\",\n",
" )\n",
"\n",
"\n",
"GluePythonSampleTest().run()\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "PySpark",
"language": "python",
"name": "pysparkkernel"
},
"language_info": {
"codemirror_mode": {
"name": "python",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "pyspark",
"pygments_lexer": "python3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
5 changes: 5 additions & 0 deletions src/sample/script.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
"""
This script is taken from the AWS Glue documentation
https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-libraries.html#develop-local-docker-image
"""

import sys

from awsglue import DynamicFrame
Expand Down
5 changes: 5 additions & 0 deletions test/sample/test_script.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
"""
This script is taken from the AWS Glue documentation
https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-libraries.html#develop-local-docker-image
"""

import sys

import pytest
Expand Down

0 comments on commit 7c5cb63

Please sign in to comment.