diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 169a6707..a4610fad 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,14 +17,6 @@ jobs: requirements: ['-r requirements.txt'] include: # historical requirements - - name: "Minimum install_requires versions" - requirements: numpy~=1.12.0 pandas~=0.24.0 SQLAlchemy~=1.2.19 psycopg2~=2.7.0 PyMySQL==1.0.2 - pytest_flags: --ignore=siuba/dply/forcats.py siuba - python-version: 3.6 - - name: "2019-late dependencies" - requirements: numpy==1.17.4 pandas==0.24.2 SQLAlchemy==1.2.19 psycopg2==2.8.4 PyMySQL==1.0.2 - pytest_flags: --ignore=siuba/dply/forcats.py siuba - python-version: 3.6 - name: "2020-early dependencies" requirements: numpy==1.17.4 pandas~=0.25.3 SQLAlchemy~=1.3.11 psycopg2~=2.8.4 PyMySQL==1.0.2 pytest_flags: --ignore=siuba/dply/forcats.py siuba @@ -37,6 +29,14 @@ jobs: python-version: 3.8 requirements: numpy~=1.19.1 pandas~=1.1.0 SQLAlchemy~=1.4.13 psycopg2~=2.8.5 PyMySQL==1.0.2 latest: true + - name: "2022-early dependencies" + python-version: 3.8 + requirements: numpy~=1.22.0 pandas~=1.3.5 SQLAlchemy~=1.4.29 psycopg2-binary~=2.9.3 PyMySQL==1.0.2 + latest: true + - name: "2022-early dependencies (Python 3.10)" + python-version: 3.10.1 + requirements: numpy~=1.22.0 pandas~=1.3.5 SQLAlchemy~=1.4.29 psycopg2-binary~=2.9.3 PyMySQL==1.0.2 + latest: true steps: - uses: actions/checkout@v2 diff --git a/examples/examples-duckdb.ipynb b/examples/examples-duckdb.ipynb new file mode 100644 index 00000000..e568f373 --- /dev/null +++ b/examples/examples-duckdb.ipynb @@ -0,0 +1,2906 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# DuckDB\n" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# NBVAL_IGNORE_OUTPUT\n", + "from sqlalchemy import sql\n", + "from 
sqlalchemy import Table, Column, Integer, String, MetaData, ForeignKey, Sequence, DateTime\n", + "from datetime import datetime\n", + "from sqlalchemy import create_engine\n", + "import os\n", + "\n", + "# To use DuckDB with siuba, just follow these steps! \n", + "# 1. pip install duckdb\n", + "# 2. pip install duckdb_engine (install the DuckDB SQLAlchemy driver)\n", + "\n", + "# For more details on DuckDB, visit https://www.duckdb.org\n", + "\n", + "# This creates an in-memory duckdb, but a file-based one can also be created by replacing :memory: with a path to the file\n", + "engine = create_engine(\"duckdb:///:memory:\")\n", + "# engine = create_engine('duckdb:////path/to/duck.db')\n", + "\n", + "metadata = MetaData()\n", + "# The sequence is needed because SQLAlchemy defaults to using the legacy PostgreSQL data type of Serial, which DuckDB does not support\n", + "# See the readme of duckdb_engine for details: https://github.com/Mause/duckdb_engine\n", + "user_id_seq = Sequence('user_id_seq')\n", + "users = Table('users', metadata,\n", + " Column('id', Integer, user_id_seq, server_default=user_id_seq.next_value(), primary_key=True),\n", + " Column('name', String),\n", + " Column('fullname', String),\n", + ")\n", + "\n", + "# The sequence is needed because SQLAlchemy defaults to using the legacy PostgreSQL data type of Serial, which DuckDB does not support\n", + "# See the readme of duckdb_engine for details: https://github.com/Mause/duckdb_engine\n", + "\n", + "# A DateTime field was also added in order to test date logic\n", + "address_id_seq = Sequence('address_id_seq')\n", + "addresses = Table('addresses', metadata,\n", + " Column('id', Integer, address_id_seq,server_default=address_id_seq.next_value(), primary_key=True),\n", + " Column('user_id', None, ForeignKey('users.id')),\n", + " Column('email_address', String, nullable=False),\n", + " Column('update_dt', DateTime)\n", + " )\n", + "\n", + "metadata.drop_all(engine)\n", + "metadata.create_all(engine)\n", 
+ "\n", + "conn = engine.connect()\n", + "\n", + "ins = users.insert().values(name='jack', fullname='Jack Jones')\n", + "result = conn.execute(ins)\n", + "\n", + "\n", + "ins = users.insert()\n", + "conn.execute(ins, id=2, name='wendy', fullname='Wendy Williams')\n", + "\n", + "\n", + "conn.execute(addresses.insert(), [\n", + " {'user_id': 1, 'email_address' : 'jack@yahoo.com', 'update_dt':datetime.now()},\n", + " {'user_id': 1, 'email_address' : 'jack@msn.com', 'update_dt':datetime.now()},\n", + " {'user_id': 2, 'email_address' : 'www@www.org', 'update_dt':datetime.now()},\n", + " {'user_id': 2, 'email_address' : 'wendy@aol.com', 'update_dt':datetime.now()},\n", + "])\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Big Example" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SELECT anon_1.id, anon_1.user_id, anon_1.email_address, anon_1.update_dt, anon_1.num \n", + "FROM (SELECT id, user_id, email_address, update_dt, num, min(anon_2.id) OVER (PARTITION BY anon_2.user_id) AS win1, anon_2.id > min(anon_2.id) OVER (PARTITION BY anon_2.user_id) AS win2 \n", + "FROM (SELECT id, user_id, email_address, update_dt, dense_rank() OVER (PARTITION BY addresses.user_id ORDER BY addresses.id) AS num \n", + "FROM addresses) AS anon_2) AS anon_1 \n", + "WHERE anon_1.win2 AND (anon_1.email_address LIKE 'jack' || '%')\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\ProgramData\\Anaconda3\\envs\\new_duckdb\\lib\\site-packages\\siuba\\sql\\translate.py:24: SiubaSqlRuntimeWarning: \n", + "dense_rank sql translation defaults na_option to None. To return identical result as pandas, use na_option = 'keep'.\n", + "\n", + "This warning only displays once per function\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iduser_idemail_addressupdate_dtnum
021jack@msn.com2022-01-19 08:12:01.0206802
\n", + "
" + ], + "text/plain": [ + " id user_id email_address update_dt num\n", + "0 2 1 jack@msn.com 2022-01-19 08:12:01.020680 2" + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#If on Windows, be sure to pip install pyreadline\n", + "from siuba import *\n", + "from siuba.sql.verbs import LazyTbl, collect, show_query\n", + "from siuba.sql.dply.vector import dense_rank\n", + "import siuba.meta_hook.sqlalchemy.sql.functions as F\n", + "\n", + "from sqlalchemy import sql\n", + "\n", + "tbl_addresses = LazyTbl(conn, addresses)\n", + "tbl_users = LazyTbl(conn, users)\n", + "\n", + "#tbl_addresses >> mutate(_, num = dense_rank(_.id)) >> show_query(_)\n", + "q = (tbl_addresses\n", + " >> group_by(\"user_id\")\n", + " >> mutate(num = dense_rank(_.id))\n", + " >> filter(\n", + " _.id > _.id.min(),\n", + " _.email_address.str.startswith(\"jack\")\n", + " )\n", + " >> ungroup()\n", + " >> show_query(simplify = True)\n", + " >> collect()\n", + " )\n", + "\n", + "q" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Mutate" + ] + }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SELECT addresses.id, addresses.user_id, addresses.email_address, addresses.update_dt, dense_rank() OVER (ORDER BY addresses.id) + 1 AS rank \n", + "FROM addresses\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\ProgramData\\Anaconda3\\envs\\new_duckdb\\lib\\site-packages\\siuba\\sql\\translate.py:24: SiubaSqlRuntimeWarning: \n", + "dense_rank sql translation defaults na_option to None. To return identical result as pandas, use na_option = 'keep'.\n", + "\n", + "This warning only displays once per function\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "
# Source: lazy query\n",
+       "# DB Conn: Engine(duckdb:///:memory:)\n",
+       "# Preview:\n",
+       "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iduser_idemail_addressupdate_dtrank
011jack@yahoo.com2022-01-19 08:12:01.0206802
121jack@msn.com2022-01-19 08:12:01.0206803
232www@www.org2022-01-19 08:12:01.0206804
342wendy@aol.com2022-01-19 08:12:01.0206805
\n", + "

# .. may have more rows

" + ], + "text/plain": [ + "# Source: lazy query\n", + "# DB Conn: Engine(duckdb:///:memory:)\n", + "# Preview:\n", + " id user_id email_address update_dt rank\n", + "0 1 1 jack@yahoo.com 2022-01-19 08:12:01.020680 2\n", + "1 2 1 jack@msn.com 2022-01-19 08:12:01.020680 3\n", + "2 3 2 www@www.org 2022-01-19 08:12:01.020680 4\n", + "3 4 2 wendy@aol.com 2022-01-19 08:12:01.020680 5\n", + "# .. may have more rows" + ] + }, + "execution_count": 75, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "q = (tbl_addresses\n", + " >> mutate(rank = dense_rank(_.id) + 1)\n", + " >> show_query()\n", + " )\n", + "q" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SELECT addresses.id, addresses.user_id, addresses.email_address, addresses.update_dt, addresses.id > dense_rank() OVER (PARTITION BY addresses.user_id, addresses.user_id, addresses.user_id ORDER BY addresses.id) + 1 AS rank \n", + "FROM addresses\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\ProgramData\\Anaconda3\\envs\\new_duckdb\\lib\\site-packages\\siuba\\sql\\translate.py:24: SiubaSqlRuntimeWarning: \n", + "dense_rank sql translation defaults na_option to None. To return identical result as pandas, use na_option = 'keep'.\n", + "\n", + "This warning only displays once per function\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "
# Source: lazy query\n",
+       "# DB Conn: Engine(duckdb:///:memory:)\n",
+       "# Preview:\n",
+       "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iduser_idemail_addressupdate_dtrank
032www@www.org2022-01-19 08:12:01.020680True
142wendy@aol.com2022-01-19 08:12:01.020680True
211jack@yahoo.com2022-01-19 08:12:01.020680False
321jack@msn.com2022-01-19 08:12:01.020680False
\n", + "

# .. may have more rows

" + ], + "text/plain": [ + "# Source: lazy query\n", + "# DB Conn: Engine(duckdb:///:memory:)\n", + "# Preview:\n", + " id user_id email_address update_dt rank\n", + "0 3 2 www@www.org 2022-01-19 08:12:01.020680 True\n", + "1 4 2 wendy@aol.com 2022-01-19 08:12:01.020680 True\n", + "2 1 1 jack@yahoo.com 2022-01-19 08:12:01.020680 False\n", + "3 2 1 jack@msn.com 2022-01-19 08:12:01.020680 False\n", + "# .. may have more rows" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "q = (tbl_addresses\n", + " >> group_by(\"user_id\")\n", + " >> mutate(rank = _.id > dense_rank(_.id) + 1)\n", + " >> show_query()\n", + " )\n", + "q" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SELECT anon_1.email, anon_1.is_mikey, CAST(anon_1.is_mikey AS INTEGER) + 1 AS mikey2 \n", + "FROM (SELECT anon_2.email AS email, (anon_2.email LIKE 'mikey' || '%') AS is_mikey \n", + "FROM (SELECT addresses.email_address AS email \n", + "FROM addresses) AS anon_2) AS anon_1\n" + ] + }, + { + "data": { + "text/html": [ + "
# Source: lazy query\n",
+       "# DB Conn: Engine(duckdb:///:memory:)\n",
+       "# Preview:\n",
+       "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
emailis_mikeymikey2
0jack@yahoo.comFalse1
1jack@msn.comFalse1
2www@www.orgFalse1
3wendy@aol.comFalse1
\n", + "

# .. may have more rows

" + ], + "text/plain": [ + "# Source: lazy query\n", + "# DB Conn: Engine(duckdb:///:memory:)\n", + "# Preview:\n", + " email is_mikey mikey2\n", + "0 jack@yahoo.com False 1\n", + "1 jack@msn.com False 1\n", + "2 www@www.org False 1\n", + "3 wendy@aol.com False 1\n", + "# .. may have more rows" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# rename and first mutate in same query,\n", + "# second mutate is outer query (since uses to prev col)\n", + "# Need to convert the boolean variable to an integer in order for DuckDB to be able to add 1 to it\n", + "q = (tbl_addresses\n", + " >> select(_.email == _.email_address)\n", + " >> mutate(is_mikey = _.email.str.startswith(\"mikey\"), mikey2 = _.is_mikey.astype(int) + 1)\n", + " >> show_query()\n", + " )\n", + "q" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Filter" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SELECT anon_1.id, anon_1.user_id, anon_1.email_address, anon_1.update_dt \n", + "FROM (SELECT addresses.id AS id, addresses.user_id AS user_id, addresses.email_address AS email_address, addresses.update_dt AS update_dt \n", + "FROM addresses) AS anon_1 \n", + "WHERE anon_1.id > 1\n" + ] + }, + { + "data": { + "text/html": [ + "
# Source: lazy query\n",
+       "# DB Conn: Engine(duckdb:///:memory:)\n",
+       "# Preview:\n",
+       "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iduser_idemail_addressupdate_dt
021jack@msn.com2022-01-19 08:12:01.020680
132www@www.org2022-01-19 08:12:01.020680
242wendy@aol.com2022-01-19 08:12:01.020680
\n", + "

# .. may have more rows

" + ], + "text/plain": [ + "# Source: lazy query\n", + "# DB Conn: Engine(duckdb:///:memory:)\n", + "# Preview:\n", + " id user_id email_address update_dt\n", + "0 2 1 jack@msn.com 2022-01-19 08:12:01.020680\n", + "1 3 2 www@www.org 2022-01-19 08:12:01.020680\n", + "2 4 2 wendy@aol.com 2022-01-19 08:12:01.020680\n", + "# .. may have more rows" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "q = (tbl_addresses\n", + " >> filter(_.id > 1)\n", + " >> show_query()\n", + " )\n", + "q" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SELECT anon_1.id, anon_1.user_id, anon_1.email_address, anon_1.update_dt \n", + "FROM (SELECT addresses.id AS id, addresses.user_id AS user_id, addresses.email_address AS email_address, addresses.update_dt AS update_dt \n", + "FROM addresses) AS anon_1 \n", + "WHERE anon_1.id > 1\n" + ] + }, + { + "data": { + "text/html": [ + "
# Source: lazy query\n",
+       "# DB Conn: Engine(duckdb:///:memory:)\n",
+       "# Preview:\n",
+       "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iduser_idemail_addressupdate_dt
021jack@msn.com2022-01-19 08:12:01.020680
132www@www.org2022-01-19 08:12:01.020680
242wendy@aol.com2022-01-19 08:12:01.020680
\n", + "

# .. may have more rows

" + ], + "text/plain": [ + "# Source: lazy query\n", + "# DB Conn: Engine(duckdb:///:memory:)\n", + "# Preview:\n", + " id user_id email_address update_dt\n", + "0 2 1 jack@msn.com 2022-01-19 08:12:01.020680\n", + "1 3 2 www@www.org 2022-01-19 08:12:01.020680\n", + "2 4 2 wendy@aol.com 2022-01-19 08:12:01.020680\n", + "# .. may have more rows" + ] + }, + "execution_count": 79, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "q = (tbl_addresses\n", + " >> group_by(\"user_id\")\n", + " >> filter(_.id > 1)\n", + " >> show_query()\n", + " )\n", + "q" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SELECT anon_1.id, anon_1.user_id, anon_1.email_address, anon_1.update_dt \n", + "FROM (SELECT anon_2.id AS id, anon_2.user_id AS user_id, anon_2.email_address AS email_address, anon_2.update_dt AS update_dt, dense_rank() OVER (PARTITION BY anon_2.user_id, anon_2.user_id ORDER BY anon_2.id) AS win1, dense_rank() OVER (PARTITION BY anon_2.user_id, anon_2.user_id ORDER BY anon_2.id) > 1 AS win2 \n", + "FROM (SELECT addresses.id AS id, addresses.user_id AS user_id, addresses.email_address AS email_address, addresses.update_dt AS update_dt \n", + "FROM addresses) AS anon_2) AS anon_1 \n", + "WHERE anon_1.win2\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\ProgramData\\Anaconda3\\envs\\new_duckdb\\lib\\site-packages\\siuba\\sql\\translate.py:24: SiubaSqlRuntimeWarning: \n", + "dense_rank sql translation defaults na_option to None. To return identical result as pandas, use na_option = 'keep'.\n", + "\n", + "This warning only displays once per function\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iduser_idemail_addressupdate_dt
042wendy@aol.com2022-01-19 08:12:01.020680
121jack@msn.com2022-01-19 08:12:01.020680
\n", + "
" + ], + "text/plain": [ + " id user_id email_address update_dt\n", + "0 4 2 wendy@aol.com 2022-01-19 08:12:01.020680\n", + "1 2 1 jack@msn.com 2022-01-19 08:12:01.020680" + ] + }, + "execution_count": 80, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "q = (tbl_addresses\n", + " >> group_by(\"user_id\")\n", + " >> filter(dense_rank(_.id) > 1)\n", + " >> show_query()\n", + " >> collect()\n", + " )\n", + "\n", + "q" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Summarize" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SELECT addresses.user_id, avg(addresses.id) AS avg_id \n", + "FROM addresses GROUP BY addresses.user_id\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idavg_id
011.5
123.5
\n", + "
" + ], + "text/plain": [ + " user_id avg_id\n", + "0 1 1.5\n", + "1 2 3.5" + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "q = (tbl_addresses\n", + " >> group_by(\"user_id\")\n", + " >> summarize(avg_id = _.id.mean())\n", + " >> show_query()\n", + " >> collect()\n", + " )\n", + "\n", + "q" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SELECT avg(anon_1.id2) AS m_id \n", + "FROM (SELECT addresses.id AS id, addresses.user_id AS user_id, addresses.email_address AS email_address, addresses.update_dt AS update_dt, addresses.id + 1 AS id2 \n", + "FROM addresses) AS anon_1\n" + ] + }, + { + "data": { + "text/html": [ + "
# Source: lazy query\n",
+       "# DB Conn: Engine(duckdb:///:memory:)\n",
+       "# Preview:\n",
+       "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
m_id
03.5
\n", + "

# .. may have more rows

" + ], + "text/plain": [ + "# Source: lazy query\n", + "# DB Conn: Engine(duckdb:///:memory:)\n", + "# Preview:\n", + " m_id\n", + "0 3.5\n", + "# .. may have more rows" + ] + }, + "execution_count": 82, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "q = (tbl_addresses >> mutate(_, id2 = _.id + 1) >> summarize(_, m_id = _.id2.mean())) >> show_query()\n", + "q" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Count" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SELECT anon_1.user_id, anon_1.id, count(*) AS n \n", + "FROM (SELECT addresses.id AS id, addresses.user_id AS user_id, addresses.email_address AS email_address, addresses.update_dt AS update_dt \n", + "FROM addresses) AS anon_1 GROUP BY anon_1.user_id, anon_1.id ORDER BY n DESC\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_ididn
0111
1121
2231
3241
\n", + "
" + ], + "text/plain": [ + " user_id id n\n", + "0 1 1 1\n", + "1 1 2 1\n", + "2 2 3 1\n", + "3 2 4 1" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "q = (tbl_addresses\n", + " >> group_by(\"user_id\")\n", + " >> count(_.id)\n", + " >> show_query()\n", + " >> collect()\n", + ")\n", + "\n", + "q" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Joins" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SELECT anon_1.id, anon_1.user_id, anon_1.email_address, anon_1.update_dt, anon_2.fullname, anon_2.name \n", + "FROM (SELECT addresses.id AS id, addresses.user_id AS user_id, addresses.email_address AS email_address, addresses.update_dt AS update_dt \n", + "FROM addresses) AS anon_1 LEFT OUTER JOIN (SELECT users.id AS id, users.name AS name, users.fullname AS fullname \n", + "FROM users) AS anon_2 ON anon_1.user_id = anon_2.id\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iduser_idemail_addressupdate_dtfullnamename
011jack@yahoo.com2022-01-19 08:12:01.020680Jack Jonesjack
121jack@msn.com2022-01-19 08:12:01.020680Jack Jonesjack
232www@www.org2022-01-19 08:12:01.020680Wendy Williamswendy
342wendy@aol.com2022-01-19 08:12:01.020680Wendy Williamswendy
\n", + "
" + ], + "text/plain": [ + " id user_id email_address update_dt fullname \\\n", + "0 1 1 jack@yahoo.com 2022-01-19 08:12:01.020680 Jack Jones \n", + "1 2 1 jack@msn.com 2022-01-19 08:12:01.020680 Jack Jones \n", + "2 3 2 www@www.org 2022-01-19 08:12:01.020680 Wendy Williams \n", + "3 4 2 wendy@aol.com 2022-01-19 08:12:01.020680 Wendy Williams \n", + "\n", + " name \n", + "0 jack \n", + "1 jack \n", + "2 wendy \n", + "3 wendy " + ] + }, + "execution_count": 84, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# TODO: not executable like this, how to get first SELECT out of parens?\n", + "# E.g. can use users.join, etc..\n", + "q = (tbl_addresses\n", + " >> left_join(_, tbl_users, {\"user_id\": \"id\"})\n", + " >> show_query()\n", + " >> collect()\n", + " )\n", + "\n", + "q" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## case_when" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SELECT addresses.id, addresses.user_id, addresses.email_address, addresses.update_dt, CASE WHEN (addresses.id > 20) THEN 0 WHEN (addresses.id > 1) THEN 1 ELSE addresses.id END AS label \n", + "FROM addresses\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iduser_idemail_addressupdate_dtlabel
011jack@yahoo.com2022-01-19 08:12:01.0206801
121jack@msn.com2022-01-19 08:12:01.0206801
232www@www.org2022-01-19 08:12:01.0206801
342wendy@aol.com2022-01-19 08:12:01.0206801
\n", + "
" + ], + "text/plain": [ + " id user_id email_address update_dt label\n", + "0 1 1 jack@yahoo.com 2022-01-19 08:12:01.020680 1\n", + "1 2 1 jack@msn.com 2022-01-19 08:12:01.020680 1\n", + "2 3 2 www@www.org 2022-01-19 08:12:01.020680 1\n", + "3 4 2 wendy@aol.com 2022-01-19 08:12:01.020680 1" + ] + }, + "execution_count": 85, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "## TODO: fix sql case_when statements\n", + "q = (tbl_addresses\n", + " >> mutate(\n", + " label = case_when(_, {\n", + " _.id > 20: 0,\n", + " _.id > 1: 1,\n", + " True: _.id\n", + " })\n", + " )\n", + " >> show_query()\n", + " >> collect()\n", + " )\n", + "\n", + "q" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SELECT anon_1.id, anon_1.user_id, anon_1.email_address, anon_1.update_dt \n", + "FROM (SELECT addresses.id AS id, addresses.user_id AS user_id, addresses.email_address AS email_address, addresses.update_dt AS update_dt \n", + "FROM addresses) AS anon_1 \n", + "WHERE CASE WHEN (anon_1.id > 20) THEN true WHEN (anon_1.id > 1) THEN false ELSE true END\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iduser_idemail_addressupdate_dt
011jack@yahoo.com2022-01-19 08:12:01.020680
\n", + "
" + ], + "text/plain": [ + " id user_id email_address update_dt\n", + "0 1 1 jack@yahoo.com 2022-01-19 08:12:01.020680" + ] + }, + "execution_count": 86, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "## TODO: fix sql case_when statements\n", + "# works, but better to just use filter normally...\n", + "q = (tbl_addresses\n", + " >> filter(\n", + " case_when(_, {\n", + " _.id > 20: True,\n", + " _.id > 1: False,\n", + " True: True\n", + " })\n", + " )\n", + " >> show_query()\n", + " >> collect()\n", + " )\n", + "\n", + "q" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SELECT addresses.id, addresses.user_id, addresses.email_address, addresses.update_dt, CASE WHEN (addresses.id > avg(addresses.id) OVER (PARTITION BY addresses.user_id)) THEN 0 WHEN (addresses.id > 20) THEN 1 ELSE addresses.id END AS label \n", + "FROM addresses\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iduser_idemail_addressupdate_dtlabel
011jack@yahoo.com2022-01-19 08:12:01.0206801
121jack@msn.com2022-01-19 08:12:01.0206800
232www@www.org2022-01-19 08:12:01.0206803
342wendy@aol.com2022-01-19 08:12:01.0206800
\n", + "
" + ], + "text/plain": [ + " id user_id email_address update_dt label\n", + "0 1 1 jack@yahoo.com 2022-01-19 08:12:01.020680 1\n", + "1 2 1 jack@msn.com 2022-01-19 08:12:01.020680 0\n", + "2 3 2 www@www.org 2022-01-19 08:12:01.020680 3\n", + "3 4 2 wendy@aol.com 2022-01-19 08:12:01.020680 0" + ] + }, + "execution_count": 87, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "q = (tbl_addresses\n", + " >> group_by(\"user_id\")\n", + " >> mutate(\n", + " label = case_when(_, {\n", + " _.id > _.id.mean(): 0,\n", + " _.id > 20: 1,\n", + " True: _.id\n", + " })\n", + " )\n", + " >> show_query()\n", + " >> collect()\n", + " )\n", + "\n", + "q" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "█─'__call__'\n", + "├─\n", + "├─_\n", + "└─█─''\n", + " └─█─'__call__'\n", + " ├─\n", + " └─{_.id > 1: 'yeah', True: 'no'}" + ] + }, + "execution_count": 88, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# NBVAL_IGNORE_OUTPUT\n", + "case_when(_, {_.id > 1: \"yeah\", True: \"no\"})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## if_else" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SELECT addresses.id, addresses.user_id, addresses.email_address, addresses.update_dt, CASE WHEN (dense_rank() OVER (ORDER BY addresses.id) > 1) THEN 'yes' ELSE 'no' END AS big_id \n", + "FROM addresses\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\ProgramData\\Anaconda3\\envs\\new_duckdb\\lib\\site-packages\\siuba\\sql\\translate.py:24: SiubaSqlRuntimeWarning: \n", + "dense_rank sql translation defaults na_option to None. 
To return identical result as pandas, use na_option = 'keep'.\n", + "\n", + "This warning only displays once per function\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "
# Source: lazy query\n",
+       "# DB Conn: Engine(duckdb:///:memory:)\n",
+       "# Preview:\n",
+       "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iduser_idemail_addressupdate_dtbig_id
011jack@yahoo.com2022-01-19 08:12:01.020680no
121jack@msn.com2022-01-19 08:12:01.020680yes
232www@www.org2022-01-19 08:12:01.020680yes
342wendy@aol.com2022-01-19 08:12:01.020680yes
\n", + "

# .. may have more rows

" + ], + "text/plain": [ + "# Source: lazy query\n", + "# DB Conn: Engine(duckdb:///:memory:)\n", + "# Preview:\n", + " id user_id email_address update_dt big_id\n", + "0 1 1 jack@yahoo.com 2022-01-19 08:12:01.020680 no\n", + "1 2 1 jack@msn.com 2022-01-19 08:12:01.020680 yes\n", + "2 3 2 www@www.org 2022-01-19 08:12:01.020680 yes\n", + "3 4 2 wendy@aol.com 2022-01-19 08:12:01.020680 yes\n", + "# .. may have more rows" + ] + }, + "execution_count": 89, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "q = (tbl_addresses\n", + " >> mutate(big_id = if_else(dense_rank(_.id) > 1, \"yes\", \"no\"))\n", + " >> show_query()\n", + " )\n", + "q" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Head" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SELECT addresses.id, addresses.user_id, addresses.email_address, addresses.update_dt \n", + "FROM addresses \n", + " LIMIT 3\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iduser_idemail_addressupdate_dt
011jack@yahoo.com2022-01-19 08:12:01.020680
121jack@msn.com2022-01-19 08:12:01.020680
232www@www.org2022-01-19 08:12:01.020680
\n", + "
" + ], + "text/plain": [ + " id user_id email_address update_dt\n", + "0 1 1 jack@yahoo.com 2022-01-19 08:12:01.020680\n", + "1 2 1 jack@msn.com 2022-01-19 08:12:01.020680\n", + "2 3 2 www@www.org 2022-01-19 08:12:01.020680" + ] + }, + "execution_count": 90, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(tbl_addresses\n", + " >> head(3)\n", + " >> show_query()\n", + " >> collect()\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Rename" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SELECT addresses.id AS id2, addresses.user_id, addresses.email_address, addresses.update_dt \n", + "FROM addresses\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
id2user_idemail_addressupdate_dt
011jack@yahoo.com2022-01-19 08:12:01.020680
121jack@msn.com2022-01-19 08:12:01.020680
232www@www.org2022-01-19 08:12:01.020680
342wendy@aol.com2022-01-19 08:12:01.020680
\n", + "
" + ], + "text/plain": [ + " id2 user_id email_address update_dt\n", + "0 1 1 jack@yahoo.com 2022-01-19 08:12:01.020680\n", + "1 2 1 jack@msn.com 2022-01-19 08:12:01.020680\n", + "2 3 2 www@www.org 2022-01-19 08:12:01.020680\n", + "3 4 2 wendy@aol.com 2022-01-19 08:12:01.020680" + ] + }, + "execution_count": 91, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(tbl_addresses\n", + " >> rename(id2 = \"id\")\n", + " >> show_query()\n", + " >> collect()\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Distinct" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SELECT DISTINCT addresses.user_id, addresses.user_id + 1 AS user_id2 \n", + "FROM addresses\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_iduser_id2
012
123
\n", + "
" + ], + "text/plain": [ + " user_id user_id2\n", + "0 1 2\n", + "1 2 3" + ] + }, + "execution_count": 92, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(tbl_addresses\n", + " >> distinct(_.user_id, user_id2 = _.user_id + 1)\n", + " >> show_query()\n", + " >> collect()\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SELECT DISTINCT addresses.email_address, dense_rank() OVER (PARTITION BY addresses.user_id ORDER BY addresses.user_id) AS user_id2 \n", + "FROM addresses\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\ProgramData\\Anaconda3\\envs\\new_duckdb\\lib\\site-packages\\siuba\\sql\\translate.py:24: SiubaSqlRuntimeWarning: \n", + "dense_rank sql translation defaults na_option to None. To return identical result as pandas, use na_option = 'keep'.\n", + "\n", + "This warning only displays once per function\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
email_addressuser_id2
0jack@yahoo.com1
1jack@msn.com1
2www@www.org1
3wendy@aol.com1
\n", + "
" + ], + "text/plain": [ + " email_address user_id2\n", + "0 jack@yahoo.com 1\n", + "1 jack@msn.com 1\n", + "2 www@www.org 1\n", + "3 wendy@aol.com 1" + ] + }, + "execution_count": 93, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(tbl_addresses\n", + " >> group_by(\"user_id\")\n", + " >> distinct(_.email_address, user_id2 = dense_rank(_.user_id))\n", + " >> show_query()\n", + " >> collect()\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Technical" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Translating symbolic function calls" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
user_idn
012
122
\n", + "
" + ], + "text/plain": [ + " user_id n\n", + "0 1 2\n", + "1 2 2" + ] + }, + "execution_count": 94, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from siuba.dply.vector import n\n", + "\n", + "tbl_addresses \\\n", + " >> group_by(\"user_id\") \\\n", + " >> summarize(n = n(_)) \\\n", + " >> collect()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Translating str methods" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Translating dt methods" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iduser_idemail_addresshour
011jack@yahoo.com15
121jack@msn.com15
232www@www.org15
342wendy@aol.com15
\n", + "
" + ], + "text/plain": [ + " id user_id email_address hour\n", + "0 1 1 jack@yahoo.com 15\n", + "1 2 1 jack@msn.com 15\n", + "2 3 2 www@www.org 15\n", + "3 4 2 wendy@aol.com 15" + ] + }, + "execution_count": 95, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "# pd.read_sql(\"\"\"select * from information_schema.tables\"\"\",conn)\n", + "# pd.read_sql(\"\"\"select * from pragma_table_info('addresses')\"\"\",conn)\n", + "# pd.read_sql(\"\"\"select * from addresses\"\"\",conn)\n", + "pd.read_sql(\"\"\"SELECT addresses.id, addresses.user_id, addresses.email_address, EXTRACT(hour FROM current_timestamp) AS hour \n", + "FROM addresses\"\"\",conn)" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SELECT addresses.id, addresses.user_id, addresses.email_address, addresses.update_dt, EXTRACT(hour FROM addresses.update_dt) AS hour \n", + "FROM addresses\n" + ] + }, + { + "data": { + "text/html": [ + "
# Source: lazy query\n",
+       "# DB Conn: Engine(duckdb:///:memory:)\n",
+       "# Preview:\n",
+       "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iduser_idemail_addressupdate_dthour
011jack@yahoo.com2022-01-19 08:12:01.0206808
121jack@msn.com2022-01-19 08:12:01.0206808
232www@www.org2022-01-19 08:12:01.0206808
342wendy@aol.com2022-01-19 08:12:01.0206808
\n", + "

# .. may have more rows

" + ], + "text/plain": [ + "# Source: lazy query\n", + "# DB Conn: Engine(duckdb:///:memory:)\n", + "# Preview:\n", + " id user_id email_address update_dt hour\n", + "0 1 1 jack@yahoo.com 2022-01-19 08:12:01.020680 8\n", + "1 2 1 jack@msn.com 2022-01-19 08:12:01.020680 8\n", + "2 3 2 www@www.org 2022-01-19 08:12:01.020680 8\n", + "3 4 2 wendy@aol.com 2022-01-19 08:12:01.020680 8\n", + "# .. may have more rows" + ] + }, + "execution_count": 96, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "q = tbl_addresses >> mutate(hour = _.update_dt.dt.hour) >> show_query()\n", + "q" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## SQL escapes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Window functions" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SELECT anon_1.id, anon_1.user_id, anon_1.email_address, anon_1.update_dt, sum(anon_1.user_id) OVER (ORDER BY anon_1.id DESC ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cumsum \n", + "FROM (SELECT addresses.id AS id, addresses.user_id AS user_id, addresses.email_address AS email_address, addresses.update_dt AS update_dt \n", + "FROM addresses ORDER BY addresses.id DESC) AS anon_1 ORDER BY cumsum\n" + ] + }, + { + "data": { + "text/html": [ + "
# Source: lazy query\n",
+       "# DB Conn: Engine(duckdb:///:memory:)\n",
+       "# Preview:\n",
+       "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iduser_idemail_addressupdate_dtcumsum
042wendy@aol.com2022-01-19 08:12:01.0206802
132www@www.org2022-01-19 08:12:01.0206804
221jack@msn.com2022-01-19 08:12:01.0206805
311jack@yahoo.com2022-01-19 08:12:01.0206806
\n", + "

# .. may have more rows

" + ], + "text/plain": [ + "# Source: lazy query\n", + "# DB Conn: Engine(duckdb:///:memory:)\n", + "# Preview:\n", + " id user_id email_address update_dt cumsum\n", + "0 4 2 wendy@aol.com 2022-01-19 08:12:01.020680 2\n", + "1 3 2 www@www.org 2022-01-19 08:12:01.020680 4\n", + "2 2 1 jack@msn.com 2022-01-19 08:12:01.020680 5\n", + "3 1 1 jack@yahoo.com 2022-01-19 08:12:01.020680 6\n", + "# .. may have more rows" + ] + }, + "execution_count": 97, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from siuba.dply.vector import desc\n", + "(tbl_addresses\n", + " >> arrange(desc(_.id))\n", + " >> mutate(cumsum = _.user_id.cumsum())\n", + " >> arrange(_.cumsum)\n", + " >> show_query()\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Misc" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## postgres specific" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iduser_idemail_addressid2
011jack@yahoo.com1.0
121jack@msn.com2.0
232www@www.org3.0
342wendy@aol.com4.0
\n", + "
" + ], + "text/plain": [ + " id user_id email_address id2\n", + "0 1 1 jack@yahoo.com 1.0\n", + "1 2 1 jack@msn.com 2.0\n", + "2 3 2 www@www.org 3.0\n", + "3 4 2 wendy@aol.com 4.0" + ] + }, + "execution_count": 98, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#import pandas as pd\n", + "pd.read_sql(\"\"\"SELECT addresses.id, addresses.user_id, addresses.email_address, round(CAST(addresses.id AS NUMERIC), 2) AS id2 \n", + "FROM addresses\"\"\",conn)" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SELECT addresses.id, addresses.user_id, addresses.email_address, addresses.update_dt, round(CAST(addresses.id AS NUMERIC), 2) AS id2 \n", + "FROM addresses\n" + ] + }, + { + "ename": "NotImplementedError", + "evalue": "Not implemented Error: ROUND(DECIMAL, INTEGER) with non-constant precision is not supported", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mRuntimeError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32mC:\\ProgramData\\Anaconda3\\envs\\new_duckdb\\lib\\site-packages\\duckdb_engine\\__init__.py\u001b[0m in \u001b[0;36mexecute\u001b[1;34m(self, statement, parameters, context)\u001b[0m\n\u001b[0;32m 92\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 93\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mc\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mstatement\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparameters\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 94\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mRuntimeError\u001b[0m \u001b[1;32mas\u001b[0m 
\u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mRuntimeError\u001b[0m: Not implemented Error: ROUND(DECIMAL, INTEGER) with non-constant precision is not supported", + "\nThe above exception was the direct cause of the following exception:\n", + "\u001b[1;31mNotImplementedError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m~\\AppData\\Local\\Temp/ipykernel_24600/4260062141.py\u001b[0m in \u001b[0;36m\u001b[1;34m\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m (tbl_addresses\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[1;33m>>\u001b[0m \u001b[0mmutate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mid2\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mid\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mround\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[1;33m>>\u001b[0m \u001b[0mshow_query\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;33m>>\u001b[0m \u001b[0mcollect\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 5\u001b[0m )\n", + "\u001b[1;32mC:\\ProgramData\\Anaconda3\\envs\\new_duckdb\\lib\\site-packages\\siuba\\dply\\verbs.py\u001b[0m in \u001b[0;36m__rrshift__\u001b[1;34m(self, x)\u001b[0m\n\u001b[0;32m 97\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mPipeable\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcalls\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mx\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m+\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcalls\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 98\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 99\u001b[1;33m \u001b[1;32mreturn\u001b[0m 
\u001b[0mself\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mx\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 100\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 101\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m__call__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mx\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\ProgramData\\Anaconda3\\envs\\new_duckdb\\lib\\site-packages\\siuba\\dply\\verbs.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, x)\u001b[0m\n\u001b[0;32m 102\u001b[0m \u001b[0mres\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mx\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 103\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mf\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcalls\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 104\u001b[1;33m \u001b[0mres\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mres\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 105\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mres\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 106\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\ProgramData\\Anaconda3\\envs\\new_duckdb\\lib\\site-packages\\siuba\\siu.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, x)\u001b[0m\n\u001b[0;32m 200\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0moperator\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgetitem\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0minst\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m*\u001b[0m\u001b[0mrest\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 201\u001b[0m \u001b[1;32melif\u001b[0m 
\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfunc\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;34m\"__call__\"\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 202\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0minst\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfunc\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0mrest\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 203\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 204\u001b[0m \u001b[1;31m# in normal case, get method to call, and then call it\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\ProgramData\\Anaconda3\\envs\\new_duckdb\\lib\\site-packages\\siuba\\dply\\verbs.py\u001b[0m in \u001b[0;36mwrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 196\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mdispatch_func\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mNoArgs\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mstrip_kwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 197\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 198\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mdispatch_func\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0mstrip_args\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mstrip_kwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 199\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 200\u001b[0m \u001b[1;32mreturn\u001b[0m \u001b[0mwrapper\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + 
"\u001b[1;32mC:\\ProgramData\\Anaconda3\\envs\\new_duckdb\\lib\\functools.py\u001b[0m in \u001b[0;36mwrapper\u001b[1;34m(*args, **kw)\u001b[0m\n\u001b[0;32m 875\u001b[0m '1 positional argument')\n\u001b[0;32m 876\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 877\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mdispatch\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__class__\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 878\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 879\u001b[0m \u001b[0mfuncname\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'__name__'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'singledispatch function'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\ProgramData\\Anaconda3\\envs\\new_duckdb\\lib\\site-packages\\siuba\\sql\\verbs.py\u001b[0m in \u001b[0;36m_collect\u001b[1;34m(__data, as_df)\u001b[0m\n\u001b[0;32m 432\u001b[0m \u001b[0msql_db\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_FixedSqlDatabase\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mconn\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 433\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 434\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0msql_db\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread_sql\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0m__data\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlast_op\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 435\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 436\u001b[0m 
\u001b[1;32mreturn\u001b[0m \u001b[0mconn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0m__data\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlast_op\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\ProgramData\\Anaconda3\\envs\\new_duckdb\\lib\\site-packages\\pandas\\io\\sql.py\u001b[0m in \u001b[0;36mread_query\u001b[1;34m(self, sql, index_col, coerce_float, parse_dates, params, chunksize, dtype)\u001b[0m\n\u001b[0;32m 1577\u001b[0m \u001b[0margs\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_convert_params\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msql\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1578\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1579\u001b[1;33m \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1580\u001b[0m \u001b[0mcolumns\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mresult\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mkeys\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1581\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\ProgramData\\Anaconda3\\envs\\new_duckdb\\lib\\site-packages\\siuba\\sql\\utils.py\u001b[0m in \u001b[0;36mexecute\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 46\u001b[0m \u001b[1;32mclass\u001b[0m \u001b[0m_FixedSqlDatabase\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0m_pd_sql\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mSQLDatabase\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 47\u001b[0m \u001b[1;32mdef\u001b[0m 
\u001b[0mexecute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 48\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mconnectable\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 49\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 50\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\ProgramData\\Anaconda3\\envs\\new_duckdb\\lib\\site-packages\\sqlalchemy\\engine\\base.py\u001b[0m in \u001b[0;36mexecute\u001b[1;34m(self, statement, *multiparams, **params)\u001b[0m\n\u001b[0;32m 1287\u001b[0m )\n\u001b[0;32m 1288\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1289\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mmeth\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmultiparams\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0m_EMPTY_EXECUTION_OPTS\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1290\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1291\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_execute_function\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmultiparams\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[1;33m,\u001b[0m 
\u001b[0mexecution_options\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\ProgramData\\Anaconda3\\envs\\new_duckdb\\lib\\site-packages\\sqlalchemy\\sql\\elements.py\u001b[0m in \u001b[0;36m_execute_on_connection\u001b[1;34m(self, connection, multiparams, params, execution_options, _force)\u001b[0m\n\u001b[0;32m 323\u001b[0m ):\n\u001b[0;32m 324\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0m_force\u001b[0m \u001b[1;32mor\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msupports_execution\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 325\u001b[1;33m return connection._execute_clauseelement(\n\u001b[0m\u001b[0;32m 326\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmultiparams\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mexecution_options\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 327\u001b[0m )\n", + "\u001b[1;32mC:\\ProgramData\\Anaconda3\\envs\\new_duckdb\\lib\\site-packages\\sqlalchemy\\engine\\base.py\u001b[0m in \u001b[0;36m_execute_clauseelement\u001b[1;34m(self, elem, multiparams, params, execution_options)\u001b[0m\n\u001b[0;32m 1479\u001b[0m \u001b[0mlinting\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdialect\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcompiler_linting\u001b[0m \u001b[1;33m|\u001b[0m \u001b[0mcompiler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mWARN_LINTING\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1480\u001b[0m )\n\u001b[1;32m-> 1481\u001b[1;33m ret = self._execute_context(\n\u001b[0m\u001b[0;32m 1482\u001b[0m \u001b[0mdialect\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1483\u001b[0m 
\u001b[0mdialect\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexecution_ctx_cls\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_init_compiled\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\ProgramData\\Anaconda3\\envs\\new_duckdb\\lib\\site-packages\\sqlalchemy\\engine\\base.py\u001b[0m in \u001b[0;36m_execute_context\u001b[1;34m(self, dialect, constructor, statement, parameters, execution_options, *args, **kw)\u001b[0m\n\u001b[0;32m 1843\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1844\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mBaseException\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1845\u001b[1;33m self._handle_dbapi_exception(\n\u001b[0m\u001b[0;32m 1846\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstatement\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparameters\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcursor\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcontext\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1847\u001b[0m )\n", + "\u001b[1;32mC:\\ProgramData\\Anaconda3\\envs\\new_duckdb\\lib\\site-packages\\sqlalchemy\\engine\\base.py\u001b[0m in \u001b[0;36m_handle_dbapi_exception\u001b[1;34m(self, e, statement, parameters, cursor, context)\u001b[0m\n\u001b[0;32m 2028\u001b[0m )\n\u001b[0;32m 2029\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 2030\u001b[1;33m \u001b[0mutil\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mraise_\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mexc_info\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mwith_traceback\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mexc_info\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m2\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 
2031\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 2032\u001b[0m \u001b[1;32mfinally\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\ProgramData\\Anaconda3\\envs\\new_duckdb\\lib\\site-packages\\sqlalchemy\\util\\compat.py\u001b[0m in \u001b[0;36mraise_\u001b[1;34m(***failed resolving arguments***)\u001b[0m\n\u001b[0;32m 205\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 206\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 207\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mexception\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 208\u001b[0m \u001b[1;32mfinally\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 209\u001b[0m \u001b[1;31m# credit to\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;32mC:\\ProgramData\\Anaconda3\\envs\\new_duckdb\\lib\\site-packages\\sqlalchemy\\engine\\base.py\u001b[0m in \u001b[0;36m_execute_context\u001b[1;34m(self, dialect, constructor, statement, parameters, execution_options, *args, **kw)\u001b[0m\n\u001b[0;32m 1800\u001b[0m \u001b[1;32mbreak\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1801\u001b[0m \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mevt_handled\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1802\u001b[1;33m self.dialect.do_execute(\n\u001b[0m\u001b[0;32m 1803\u001b[0m \u001b[0mcursor\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstatement\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparameters\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcontext\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1804\u001b[0m )\n", + "\u001b[1;32mC:\\ProgramData\\Anaconda3\\envs\\new_duckdb\\lib\\site-packages\\duckdb_engine\\__init__.py\u001b[0m in 
\u001b[0;36mdo_execute\u001b[1;34m(self, cursor, statement, parameters, context)\u001b[0m\n\u001b[0;32m 129\u001b[0m \u001b[0mcontext\u001b[0m\u001b[1;33m:\u001b[0m \u001b[0mPGExecutionContext\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 130\u001b[0m ) -> None:\n\u001b[1;32m--> 131\u001b[1;33m \u001b[0mcursor\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mstatement\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparameters\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcontext\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 132\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 133\u001b[0m def do_executemany(\n", + "\u001b[1;32mC:\\ProgramData\\Anaconda3\\envs\\new_duckdb\\lib\\site-packages\\duckdb_engine\\__init__.py\u001b[0m in \u001b[0;36mexecute\u001b[1;34m(self, statement, parameters, context)\u001b[0m\n\u001b[0;32m 94\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mRuntimeError\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 95\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m0\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstartswith\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Not implemented Error\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 96\u001b[1;33m \u001b[1;32mraise\u001b[0m \u001b[0mNotImplementedError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0me\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 97\u001b[0m 
\u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 98\u001b[0m \u001b[1;32mraise\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", + "\u001b[1;31mNotImplementedError\u001b[0m: Not implemented Error: ROUND(DECIMAL, INTEGER) with non-constant precision is not supported" + ] + } + ], + "source": [ + "(tbl_addresses\n", + " >> mutate(id2 = _.id.round(2))\n", + " >> show_query()\n", + " >> collect()\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## autoload table w/ sqlalchemy" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SELECT users.id, users.name, users.fullname, users.id + 1 AS id2 \n", + "FROM users\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamefullnameid2
01jackJack Jones2
12wendyWendy Williams3
\n", + "
" + ], + "text/plain": [ + " id name fullname id2\n", + "0 1 jack Jack Jones 2\n", + "1 2 wendy Wendy Williams 3" + ] + }, + "execution_count": 100, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import sqlalchemy\n", + "\n", + "metadata2 = MetaData()\n", + "\n", + "#Since we are using an in memory DuckDB that is specific to the connection, need to autoload_with=conn not =engine\n", + "users2 = sqlalchemy.Table('users', metadata2, autoload = True, autoload_with = conn)\n", + "tbl_users2 = LazyTbl(conn, users2)\n", + "\n", + "(tbl_users\n", + " >> mutate(id2 = _.id + 1)\n", + " >> show_query()\n", + " >> collect()\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## auto table from string" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SELECT users.id, users.name, users.fullname, users.id + 1 AS id2 \n", + "FROM users\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamefullnameid2
01jackJack Jones2
12wendyWendy Williams3
\n", + "
" + ], + "text/plain": [ + " id name fullname id2\n", + "0 1 jack Jack Jones 2\n", + "1 2 wendy Wendy Williams 3" + ] + }, + "execution_count": 101, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import sqlalchemy\n", + "\n", + "metadata3 = MetaData()\n", + "\n", + "tbl_users3 = LazyTbl(conn, \"users\")\n", + "\n", + "(tbl_users\n", + " >> mutate(id2 = _.id + 1)\n", + " >> show_query()\n", + " >> collect()\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## LazyTbl repr" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
# Source: lazy query\n",
+       "# DB Conn: Engine(duckdb:///:memory:)\n",
+       "# Preview:\n",
+       "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idnamefullnameid2
01jackJack Jones2
12wendyWendy Williams3
\n", + "

# .. may have more rows

" + ], + "text/plain": [ + "# Source: lazy query\n", + "# DB Conn: Engine(duckdb:///:memory:)\n", + "# Preview:\n", + " id name fullname id2\n", + "0 1 jack Jack Jones 2\n", + "1 2 wendy Wendy Williams 3\n", + "# .. may have more rows" + ] + }, + "execution_count": 102, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tbl_users >> mutate(id2 = _.id + 1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": { + "height": "calc(100% - 180px)", + "left": "10px", + "top": "150px", + "width": "165px" + }, + "toc_section_display": true, + "toc_window_display": true + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/requirements-dev.txt b/requirements-dev.txt index c8db4e45..48a5f324 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -61,7 +61,7 @@ pytest==5.3.5 python-dateutil==2.8.1 pytz==2020.1 PyYAML==5.3.1 -pyzmq==19.0.0 +pyzmq==22.3.0 requests==2.24.0 scipy==1.5.2 six==1.14.0 diff --git a/requirements-test.txt b/requirements-test.txt index 6f5d0100..e08925e9 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -13,8 +13,8 @@ jsonschema==3.2.0 jupyter-client==6.0.0 jupyter-core==4.6.3 more-itertools==8.2.0 -nbformat==5.0.4 -nbval==0.9.5 +nbformat==5.1.3 +nbval==0.9.6 packaging==20.3 parso==0.6.2 pexpect==4.8.0 @@ -22,13 +22,13 @@ 
pickleshare==0.7.5 pluggy==0.13.1 prompt-toolkit==3.0.3 ptyprocess==0.6.0 -py==1.8.1 +py==1.11.0 Pygments==2.5.2 pyparsing==2.4.6 pyrsistent==0.15.7 -pytest==5.3.5 +pytest==6.2.5 python-dateutil==2.8.1 -pyzmq==19.0.0 +pyzmq==22.3.0 six==1.14.0 sortedcontainers==2.1.0 tornado==6.0.4 diff --git a/setup.py b/setup.py index ee1a6823..e05918c3 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ VERSION = str(ast.literal_eval(_version_re.search( f.read().decode('utf-8')).group(1))) -with open('README.md') as f: +with open('README.md', encoding="utf-8") as f: README = f.read() # setup ----------------------------------------------------------------------- diff --git a/siuba/__init__.py b/siuba/__init__.py index efc7c258..e9f771d6 100644 --- a/siuba/__init__.py +++ b/siuba/__init__.py @@ -1,5 +1,5 @@ # version --------------------------------------------------------------------- -__version__ = "0.0.25" +__version__ = "1.0.0a3" # default imports-------------------------------------------------------------- from .siu import _, Lam diff --git a/siuba/dply/forcats.py b/siuba/dply/forcats.py index f34f572f..55c00fee 100644 --- a/siuba/dply/forcats.py +++ b/siuba/dply/forcats.py @@ -16,6 +16,10 @@ def fct_reorder(fct, x, func = np.median, desc = False) -> pd.Categorical: func: function run over all values within a level of the categorical. desc: whether to sort in descending order. + Notes that NaN categories can't be ordered. When func returns NaN, sorting + is always done with NaNs last. + + Examples: >>> fct_reorder(['a', 'a', 'b'], [4, 3, 2]) ['a', 'a', 'b'] @@ -34,11 +38,11 @@ def fct_reorder(fct, x, func = np.median, desc = False) -> pd.Categorical: x_vals = x.values if isinstance(x, pd.Series) else x s = pd.Series(x_vals, index = fct) - # for each cat, calc agg func, make values of ordered the codes + # sort groups by calculated agg func. 
note that groupby uses dropna=True by default, + # but that's okay, since pandas categoricals can't order the NA category ordered = s.groupby(level = 0).agg(func).sort_values(ascending = not desc) - ordered[:] = np.arange(len(ordered)) - codes = ordered[s.index.values] - return pd.Categorical.from_codes(codes, list(ordered.index)) + + return pd.Categorical(fct, categories=ordered.index) # fct_recode ------------------------------------------------------------------ diff --git a/siuba/dply/vector.py b/siuba/dply/vector.py index ce02d18f..86c32acc 100644 --- a/siuba/dply/vector.py +++ b/siuba/dply/vector.py @@ -270,19 +270,19 @@ def coalesce(x, *args): *args: other Series that are the same length as x, or a scalar Examples: - >>> x = pd.Series([1., None, None]) + >>> x = pd.Series([1.1, None, None]) >>> abc = pd.Series(['a', 'b', None]) >>> xyz = pd.Series(['x', 'y', 'z']) >>> coalesce(x, abc) - 0 1 + 0 1.1 1 b 2 None dtype: object >>> coalesce(x, abc, xyz) - 0 1 - 1 b - 2 z + 0 1.1 + 1 b + 2 z dtype: object """ diff --git a/siuba/dply/verbs.py b/siuba/dply/verbs.py index f8b0d7f5..4c367a56 100644 --- a/siuba/dply/verbs.py +++ b/siuba/dply/verbs.py @@ -59,9 +59,14 @@ def install_pd_siu(): setattr(DataFrameGroupBy, method_name, f) DataFrameGroupBy._repr_html_ = _repr_grouped_df_html_ + DataFrameGroupBy.__repr__ = _repr_grouped_df_console_ def _repr_grouped_df_html_(self): - return "

(grouped data frame)

" + self._selected_obj._repr_html_() + "
" + return "

(grouped data frame)

" + self.obj._repr_html_() + "
" + +def _repr_grouped_df_console_(self): + return "(grouped data frame)\n" + repr(self.obj) + # TODO: should be a subclass of Call? class Pipeable: @@ -992,8 +997,18 @@ def nest(__data, *args, key = "data"): g_df = __data.groupby(grp_keys) splitter = g_df.grouper._get_splitter(g_df.obj[nest_keys]) + # TODO: iterating over splitter now only produces 1 item (the dataframe) + # check backwards compat + def _extract_subdf_pandas_1_3(entry): + # in pandas < 1.3, splitter.__iter__ returns tuple entries (ii, df) + if isinstance(entry, tuple): + return entry[1] + + # in pandas 1.3, each entry is just the dataframe + return entry + result_index = g_df.grouper.result_index - nested_dfs = [x for ii, x in splitter] + nested_dfs = [_extract_subdf_pandas_1_3(x) for x in splitter] out = pd.DataFrame({key: nested_dfs}, index = result_index).reset_index() @@ -1095,9 +1110,16 @@ def semi_join(left, right = None, on = None): on_cols, right_on = map(list, zip(*on.items())) right = right[right_on].rename(dict(zip(right_on, on_cols))) elif on is None: - on_cols = set(left.columns).intersection(set(right.columns)) + warnings.warn( + "No on column passed to join. " + "Inferring join columns instead using shared column names." + ) + + on_cols = list(set(left.columns).intersection(set(right.columns))) if not len(on_cols): - raise Exception("No joining column specified, and no shared column names") + raise Exception("No join column specified, and no shared column names") + + warnings.warn("Detected shared columns: %s" % on_cols) elif isinstance(on, str): on_cols = [on] else: diff --git a/siuba/experimental/datetime.py b/siuba/experimental/datetime.py index 4138f521..f6fc0441 100644 --- a/siuba/experimental/datetime.py +++ b/siuba/experimental/datetime.py @@ -104,7 +104,7 @@ def _get_series_dispatcher(f, x): >>> per = pd.PeriodIndex([a_date], freq = "S") >>> floor_date(per, "M") - PeriodIndex(['2020-02'], dtype='period[M]', freq='M') + PeriodIndex(['2020-02'], dtype='period[M]'... 
""" diff --git a/siuba/experimental/pd_groups/groupby.py b/siuba/experimental/pd_groups/groupby.py index e1cd3722..cc200e51 100644 --- a/siuba/experimental/pd_groups/groupby.py +++ b/siuba/experimental/pd_groups/groupby.py @@ -7,8 +7,12 @@ from pandas import Series from pandas.api.types import is_scalar -from pandas.core.groupby import SeriesGroupBy -from pandas.core import algorithms +from pandas.core.groupby import SeriesGroupBy, DataFrameGroupBy + +try: + from pandas.core.algorithms import take_1d +except ImportError: + from pandas.core.array_algos.take import take_1d # Custom SeriesGroupBy class ================================================== @@ -99,6 +103,7 @@ def broadcast_agg(groupby, result, obj): raise NotImplementedError() + @broadcast_agg.register(GroupByAgg) def _broadcast_agg_gba(groupby): """ @@ -113,9 +118,11 @@ def _broadcast_agg_gba(groupby): src = groupby._orig_obj ids, _, ngroup = groupby._orig_grouper.group_info - out = algorithms.take_1d(groupby.obj._values, ids) + out = take_1d(groupby.obj._values, ids) - return Series(out, index=src.index, name=src.name) + # Note: reductions like siuba.dply.vector.n(_) map DataFrameGroupBy -> GroupByAgg, + # so the underlying object is a DataFrame, and does not have a .name attribute. 
+ return Series(out, index=src.index, name=getattr(src, "name", None)) @broadcast_agg.register(SeriesGroupBy) def _broadcast_agg_sgb(groupby): diff --git a/siuba/experimental/pd_groups/test_pd_groups.py b/siuba/experimental/pd_groups/test_pd_groups.py index 65508389..f6b46b7f 100644 --- a/siuba/experimental/pd_groups/test_pd_groups.py +++ b/siuba/experimental/pd_groups/test_pd_groups.py @@ -41,6 +41,7 @@ f_min = method_agg_op('min', is_property = False, accessor = None) f_add = method_el_op2('add', is_property = False, accessor = None) f_abs = method_el_op('abs', is_property = False, accessor = None) +f_df_size = lambda x: GroupByAgg.from_result(x.size(), x) # GroupByAgg is liskov substitutable, so check that our functions operate # like similarly substitutable subtypes. This means that... @@ -78,6 +79,9 @@ def test_grouped_translator_methods(f_op, f_dst, cls_result): (lambda g: f_min(g.x), lambda g: g.x.transform('min')), (lambda g: f_min(f_min(g.x)), lambda g: g.x.transform('min')), (lambda g: f_abs(f_min(g.x)), lambda g: g.x.transform('min').abs()), + + # Note that there's no way to transform a DF method, so use an arbitrary column + (lambda g: f_df_size(g), lambda g: g.x.transform('size')), ]) def test_agg_groupby_broadcasted_equal_to_transform(f_op, f_dst): g = data_default.groupby('g') @@ -91,6 +95,20 @@ def test_agg_groupby_broadcasted_equal_to_transform(f_op, f_dst): assert_series_equal(broadcasted, dst, check_names = False) +# Test generic functions ====================================================== + +def test_fast_mutate_basic(): + # sanity check of https://github.com/machow/siuba/issues/355 + from siuba.siu import _ + + res_df = data_default.groupby("g") >> fast_mutate(num = _.x / _.y * 100) + + res = res_df.num + dst = data_default.x / data_default.y * 100 + + assert_series_equal(res.obj, dst, check_names=False) + + # Test user-defined functions ================================================= from .dialect import fast_mutate, fast_summarize, 
fast_filter, _transform_args diff --git a/siuba/meta_hook.py b/siuba/meta_hook.py index f4fa607b..4d10a2f0 100644 --- a/siuba/meta_hook.py +++ b/siuba/meta_hook.py @@ -1,3 +1,10 @@ +""" +DEPRECATED. + +Note that this module was experimental, and created very early in siuba's development. +You should not rely on it for anything important. +""" + from importlib.abc import Loader, MetaPathFinder from importlib.machinery import ModuleSpec from importlib.util import find_spec @@ -55,7 +62,8 @@ def exec_module(self, module): #self.orig_loader.exec_module(self.orig_module) #for k,v in self.orig_module.__dict__.items(): - for k,v in self.orig_module.__dict__.items(): + all_items = list(self.orig_module.__dict__.items()) + for k,v in all_items: if k.startswith('_'): module.__dict__[k] = v else: diff --git a/siuba/ops/generics.py b/siuba/ops/generics.py index 5fd54937..27b0c624 100644 --- a/siuba/ops/generics.py +++ b/siuba/ops/generics.py @@ -5,7 +5,6 @@ ops_infix = Namespace( __add__ = operation('__add__', 'elwise', 2), __and__ = operation('__and__', 'elwise', 2), - __div__ = operation('__div__', 'elwise', 2), __eq__ = operation('__eq__', 'elwise', 2), __floordiv__ = operation('__floordiv__', 'elwise', 2), __ge__ = operation('__ge__', 'elwise', 2), @@ -22,7 +21,6 @@ __pow__ = operation('__pow__', 'elwise', 2), __radd__ = operation('__radd__', 'elwise', 2), __rand__ = operation('__rand__', 'elwise', 2), - __rdiv__ = operation('__rdiv__', 'elwise', 2), __rfloordiv__ = operation('__rfloordiv__', 'elwise', 2), __rmod__ = operation('__rmod__', 'elwise', 2), __rmul__ = operation('__rmul__', 'elwise', 2), diff --git a/siuba/ops/support/examples.yml b/siuba/ops/support/examples.yml index f38ef08e..79041bbf 100644 --- a/siuba/ops/support/examples.yml +++ b/siuba/ops/support/examples.yml @@ -2,7 +2,6 @@ T: _.T __add__: _ + _ __and__: _ & _ __array__: _.__array__() -__div__: _.__div__(_) __eq__: _ == _ __floordiv__: _ // _ __ge__: _ >= _ @@ -19,7 +18,6 @@ __pos__: +_ __pow__: _**_ 
__radd__: _ + _ __rand__: _ & _ -__rdiv__: _.__rdiv__(_) __rfloordiv__: _ // _ __rmod__: _ % _ __rmul__: _ * _ diff --git a/siuba/sql/dialects/base.py b/siuba/sql/dialects/base.py index edeb0222..ae2d92aa 100644 --- a/siuba/sql/dialects/base.py +++ b/siuba/sql/dialects/base.py @@ -157,7 +157,6 @@ def req_bool(f): # infix ---- __add__ = sql_colmeth("__add__"), __and__ = req_bool(sql_colmeth("__and__")), - __div__ = sql_colmeth("__div__"), __eq__ = sql_colmeth("__eq__"), __floordiv__ = sql_func_floordiv, __ge__ = sql_colmeth("__ge__"), @@ -174,7 +173,6 @@ def req_bool(f): __pow__ = sql_not_impl(), __radd__ = sql_colmeth("__radd__"), __rand__ = req_bool(sql_colmeth("__rand__")), - __rdiv__ = sql_colmeth("__rdiv__"), __rfloordiv__ = lambda x, y: sql_func_floordiv(y, x), __rmod__ = sql_colmeth("__rmod__"), __rmul__ = sql_colmeth("__rmul__"), @@ -193,8 +191,8 @@ def req_bool(f): add = sql_colmeth("__add__"), #and = - div = sql_colmeth("__div__"), - divide = sql_colmeth("__div__"), + div = sql_colmeth("__truediv__"), + divide = sql_colmeth("__truediv__"), #divmod = eq = sql_colmeth("__eq__"), #floordiv = sql_colmeth("__floordiv__"), @@ -208,7 +206,7 @@ def req_bool(f): ne = sql_colmeth("__ne__"), pow = sql_not_impl(), radd = sql_colmeth("__radd__"), - rdiv = sql_colmeth("__rdiv__"), + rdiv = sql_colmeth("__rtruediv__"), #rdivmod = #rfloordiv = sql_colmeth("__pow__"), rmod = sql_colmeth("__rmod__"), diff --git a/siuba/sql/dialects/mysql.py b/siuba/sql/dialects/mysql.py index 3bce5d07..abb1043a 100644 --- a/siuba/sql/dialects/mysql.py +++ b/siuba/sql/dialects/mysql.py @@ -73,11 +73,9 @@ def sql_func_between(col, left, right, inclusive=True): # copied from postgres. MYSQL does true division over ints by default, # but it does not produce double precision. 
- __div__ = sql_func_truediv, div = sql_func_truediv, divide = sql_func_truediv, rdiv = lambda x,y: sql_func_truediv(y, x), - __rdiv__ = lambda x, y: sql_func_truediv(y, x), __truediv__ = sql_func_truediv, truediv = sql_func_truediv, diff --git a/siuba/sql/dialects/postgresql.py b/siuba/sql/dialects/postgresql.py index 2b17225d..8b80e7c7 100644 --- a/siuba/sql/dialects/postgresql.py +++ b/siuba/sql/dialects/postgresql.py @@ -72,11 +72,9 @@ def sql_func_truediv(x, y): # infix and infix methods ---- - __div__ = sql_func_truediv, div = sql_func_truediv, divide = sql_func_truediv, rdiv = lambda x,y: sql_func_truediv(y, x), - __rdiv__ = lambda x, y: sql_func_truediv(y, x), __truediv__ = sql_func_truediv, truediv = sql_func_truediv, diff --git a/siuba/sql/verbs.py b/siuba/sql/verbs.py index 3a2c9425..35841211 100644 --- a/siuba/sql/verbs.py +++ b/siuba/sql/verbs.py @@ -7,6 +7,7 @@ """ +import warnings from siuba.dply.verbs import ( singledispatch2, @@ -936,7 +937,7 @@ def _semi_join(left, right = None, on = None, *args, sql_on = None): right_sel = right.last_op.alias() # handle arguments ---- - on = _validate_join_arg_on(on, sql_on) + on = _validate_join_arg_on(on, sql_on, left_sel, right_sel) # create join conditions ---- bool_clause = _create_join_conds(left_sel, right_sel, on) @@ -962,7 +963,7 @@ def _anti_join(left, right = None, on = None, *args, sql_on = None): right_sel = right.last_op.alias() # handle arguments ---- - on = _validate_join_arg_on(on, sql_on) + on = _validate_join_arg_on(on, sql_on, left, right) # create join conditions ---- bool_clause = _create_join_conds(left_sel, right_sel, on) @@ -981,7 +982,7 @@ def _raise_if_args(args): if len(args): raise NotImplemented("*args is reserved for future arguments (e.g. 
suffix)") -def _validate_join_arg_on(on, sql_on = None): +def _validate_join_arg_on(on, sql_on = None, lhs = None, rhs = None): # handle sql on case if sql_on is not None: if on is not None: @@ -991,12 +992,34 @@ def _validate_join_arg_on(on, sql_on = None): # handle general cases if on is None: - raise NotImplementedError("on arg currently cannot be None (default) for SQL") + # TODO: currently, we check for lhs and rhs tables to indicate whether + # a verb supports inferring columns. Otherwise, raise an error. + if lhs is not None and rhs is not None: + # TODO: consolidate with duplicate logic in pandas verb code + warnings.warn( + "No on column passed to join. " + "Inferring join columns instead using shared column names." + ) + + on_cols = list(set(lhs.columns.keys()).intersection(set(rhs.columns.keys()))) + + if not on_cols: + raise ValueError( + "No join column specified, or shared column names in join." + ) + + # trivial dict mapping shared names to themselves + warnings.warn("Detected shared columns: %s" % on_cols) + on = dict(zip(on_cols, on_cols)) + + else: + raise NotImplementedError("on arg currently cannot be None (default) for SQL") elif isinstance(on, str): on = {on: on} elif isinstance(on, (list, tuple)): on = dict(zip(on, on)) + if not isinstance(on, Mapping): raise TypeError("on must be a Mapping (e.g. 
dict)") diff --git a/siuba/tests/conftest.py b/siuba/tests/conftest.py index f9646460..52d33ea6 100644 --- a/siuba/tests/conftest.py +++ b/siuba/tests/conftest.py @@ -1,5 +1,5 @@ import pytest -from .helpers import assert_equal_query, PandasBackend, SqlBackend, data_frame +from .helpers import assert_equal_query, PandasBackend, SqlBackend, BigqueryBackend, data_frame def pytest_addoption(parser): parser.addoption( @@ -10,7 +10,7 @@ def pytest_addoption(parser): pytest.param(lambda: SqlBackend("postgresql"), id = "postgresql", marks=pytest.mark.postgresql), pytest.param(lambda: SqlBackend("mysql"), id = "mysql", marks=pytest.mark.mysql), pytest.param(lambda: SqlBackend("sqlite"), id = "sqlite", marks=pytest.mark.sqlite), - pytest.param(lambda: SqlBackend("bigquery"), id = "bigquery", marks=pytest.mark.bigquery), + pytest.param(lambda: BigqueryBackend("bigquery"), id = "bigquery", marks=pytest.mark.bigquery), pytest.param(lambda: PandasBackend("pandas"), id = "pandas", marks=pytest.mark.pandas) ] diff --git a/siuba/tests/helpers.py b/siuba/tests/helpers.py index 6cab99cd..567f03b1 100644 --- a/siuba/tests/helpers.py +++ b/siuba/tests/helpers.py @@ -1,4 +1,5 @@ import sqlalchemy as sqla +import uuid from siuba.sql import LazyTbl from siuba.dply.verbs import ungroup, collect @@ -114,7 +115,7 @@ def load_df(self, df = None, **kwargs): table_name = self.unique_table_name() - return copy_to_sql(df, self.unique_table_name(), self.engine) + return copy_to_sql(df, table_name, self.engine) def load_cached_df(self, df): import hashlib @@ -129,6 +130,25 @@ def load_cached_df(self, df): return res +class BigqueryBackend(SqlBackend): + @classmethod + def unique_table_name(cls): + return "siuba_{}".format(uuid.uuid4()) + + def load_df(self, df = None, **kwargs): + df = super().load_df(df, **kwargs) + + # since we are using uuids, set table to expire after 1 day, so we can + # easily inspect the tables, but also ensure cleanup + self.engine.execute(""" + ALTER TABLE `{table_name}` 
+ SET OPTIONS ( + expiration_timestamp=TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL 1 DAY) + ) + """.format(table_name=df.tbl.name)) + + return df + def robust_multiple_sort(df, by): """Sort a DataFrame on multiple columns, slower but more reliable than df.sort_values diff --git a/siuba/tests/test_dply_forcats.py b/siuba/tests/test_dply_forcats.py index bac3a743..15a37244 100644 --- a/siuba/tests/test_dply_forcats.py +++ b/siuba/tests/test_dply_forcats.py @@ -34,6 +34,12 @@ def test_fct_reorder_custom_func(): assert_fct_equal(res, dst) +def test_fct_reorder_na_fct(): + import numpy as np + res = fct_reorder([None, 'x', 'y'], [4, 3, 2], np.max) + dst = Categorical([None, 'x', 'y'], ['y', 'x']) + + assert_fct_equal(res, dst) # fct_recode ------------------------------------------------------------------ diff --git a/siuba/tests/test_dply_series_methods.py b/siuba/tests/test_dply_series_methods.py index 32942540..93557739 100644 --- a/siuba/tests/test_dply_series_methods.py +++ b/siuba/tests/test_dply_series_methods.py @@ -263,15 +263,17 @@ def test_pandas_grouped_frame_fast_mutate(entry): res = fast_mutate(gdf, result = call_expr) dst = mutate(gdf, result = call_expr) - # TODO: apply mark to skip failing tests, rather than downcast - # pandas grouped aggs, when not using cython, _try_cast back to original type - # but since mutate uses apply, it doesn't :/. Currently only affects median func. - dst_obj = dst.obj + # TODO: apply mark to skip failing tests, rather than casting? 
+ # in pandas 1.2, grouped agg returns int, ungrouped agg returns float + # in pandas 1.3, grouped agg returns float, same as ungrouped agg + # (the difference is because the grouped agg in 1.2 did not use cython, + # and tries casting back to the original column dtype) + res_obj = res.obj if str_expr == '_.x.median()': - dst_obj['result'] = dst_obj['result'].astype(gdf.x.obj.dtype) + res_obj['result'] = res_obj['result'].astype(float) assert isinstance(dst, DataFrameGroupBy) - assert_frame_equal(res.obj, dst_obj) + assert_frame_equal(res_obj, dst.obj) @pytest.mark.skip_backend('sqlite') @@ -324,7 +326,7 @@ def test_pandas_grouped_frame_fast_summarize(agg_entry): # pandas grouped aggs, when not using cython, _try_cast back to original type # but since summarize uses apply, it doesn't :/. Currently only affects median func. if str_expr == '_.x.median()': - dst['result'] = dst['result'].astype(gdf.x.obj.dtype) + res['result'] = res['result'].astype(float) assert_frame_equal(res, dst) diff --git a/siuba/tests/test_verb_join.py b/siuba/tests/test_verb_join.py index 9b64ac40..20cb39b5 100644 --- a/siuba/tests/test_verb_join.py +++ b/siuba/tests/test_verb_join.py @@ -169,6 +169,27 @@ def test_semi_join_no_cross(backend, df1, df2): DF1.iloc[:1,] ) +def test_semi_join_no_on_arg(backend, df1): + df_ii = backend.load_df(data_frame(ii = [1,1])) + + with pytest.warns(UserWarning) as record: + assert_equal_query( + df1, + semi_join(_, df_ii), + DF1.iloc[:1,] + ) + + assert "No on column passed to join." in record[0].message.args[0] + assert "['ii']" in record[1].message.args[0] + +def test_semi_join_no_on_arg_fail(backend, df1): + df_ii = backend.load_df(data_frame(ZZ = [1,1])) + + with pytest.raises(Exception) as excinfo: + collect(semi_join(df1, df_ii)) + + assert "No join column specified" in str(excinfo.value) + def test_basic_anti_join_on_map(backend, df1, df2): assert_frame_sort_equal(