diff --git a/work/run_partner_manifest_validation.ipynb b/work/run_partner_manifest_validation.ipynb index 3efb895..099fcbd 100644 --- a/work/run_partner_manifest_validation.ipynb +++ b/work/run_partner_manifest_validation.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 4, + "execution_count": 1, "id": "2ed8b533", "metadata": {}, "outputs": [ @@ -24,7 +24,7 @@ "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mException\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m/var/folders/g7/vnlptw7d5pb544zsj_bypk44000h93/T/ipykernel_5923/3036708010.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mget_ipython\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun_line_magic\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'run'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'validate_partner_manifest_dev.ipynb'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m/var/folders/g7/vnlptw7d5pb544zsj_bypk44000h93/T/ipykernel_712/3036708010.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mget_ipython\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun_line_magic\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'run'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'validate_partner_manifest_dev.ipynb'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m~/miniconda3/envs/validate_partner_manifest_dev/lib/python3.7/site-packages/IPython/core/interactiveshell.py\u001b[0m in \u001b[0;36mrun_line_magic\u001b[0;34m(self, magic_name, line, _stack_depth)\u001b[0m\n\u001b[1;32m 2362\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'local_ns'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_local_scope\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mstack_depth\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2363\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbuiltin_trap\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2364\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2365\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2366\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/miniconda3/envs/validate_partner_manifest_dev/lib/python3.7/site-packages/decorator.py\u001b[0m in \u001b[0;36mfun\u001b[0;34m(*args, **kw)\u001b[0m\n\u001b[1;32m 230\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mkwsyntax\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 231\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkw\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfix\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkw\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msig\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 232\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mcaller\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mextras\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 233\u001b[0m \u001b[0mfun\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__name__\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 234\u001b[0m \u001b[0mfun\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__doc__\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__doc__\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/miniconda3/envs/validate_partner_manifest_dev/lib/python3.7/site-packages/IPython/core/magic.py\u001b[0m in \u001b[0;36m\u001b[0;34m(f, *a, **k)\u001b[0m\n\u001b[1;32m 185\u001b[0m \u001b[0;31m# but it's overkill for just that one bit of state.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 186\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mmagic_deco\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 187\u001b[0;31m \u001b[0mcall\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mlambda\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0ma\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mk\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 188\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 189\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcallable\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", @@ -32,7 +32,7 @@ "\u001b[0;32m~/miniconda3/envs/validate_partner_manifest_dev/lib/python3.7/site-packages/IPython/core/interactiveshell.py\u001b[0m in \u001b[0;36msafe_execfile_ipy\u001b[0;34m(self, fname, shell_futures, raise_exceptions)\u001b[0m\n\u001b[1;32m 2849\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrun_cell\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcell\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msilent\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mshell_futures\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mshell_futures\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2850\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mraise_exceptions\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2851\u001b[0;31m \u001b[0mresult\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mraise_error\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2852\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mresult\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msuccess\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2853\u001b[0m \u001b[0;32mbreak\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/miniconda3/envs/validate_partner_manifest_dev/lib/python3.7/site-packages/IPython/core/interactiveshell.py\u001b[0m in \u001b[0;36mraise_error\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 330\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0merror_before_exec\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 331\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0merror_in_exec\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 332\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0merror_in_exec\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 333\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 334\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__repr__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", " \u001b[0;31m[... skipping hidden 1 frame]\u001b[0m\n", - "\u001b[0;32m/var/folders/g7/vnlptw7d5pb544zsj_bypk44000h93/T/ipykernel_5923/3693612858.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Done for now'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m/var/folders/g7/vnlptw7d5pb544zsj_bypk44000h93/T/ipykernel_712/3693612858.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Done for now'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mException\u001b[0m: Done for now" ] } @@ -44,7 +44,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 2, "id": "c342af6c", "metadata": {}, "outputs": [ @@ -52,45 +52,193 @@ "name": "stderr", "output_type": "stream", "text": [ - "[WARNING] # manifest ../results/20220322/BP_ST_changes_22.03.22_Sam_NHM-BIOSCAN-Manifest Jan 2022.xlsx\n", + "[WARNING] # manifest ../results/20220405/NBGW-[20210527]-manifest.xlsx\n", + "[WARNING] extra columns in filled manifest compared to template: {'PLATE_TEMPERATURE_STORAGE', 'SORTING_SOLUTION_USED', 'CATCH_BOTTLE_TEMPERATURE_STORAGE'}\n" + ] + } + ], + "source": [ + "df = validate('../results/20220405/NBGW-[20210527]-manifest.xlsx', template_fn, verbose=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c238872a", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[WARNING] # manifest ../results/20220405/NBGW-[20210805]-manifest.xlsx\n", + "[WARNING] extra columns in filled manifest compared to template: {'PLATE_TEMPERATURE_STORAGE', 'SORTING_SOLUTION_USED', 'CATCH_BOTTLE_TEMPERATURE_STORAGE'}\n", + "[WARNING] ORDER: found unexpected rank for Acari (taxid 6933): subclass\n", + "[WARNING] ORDER: found unexpected rank for Collembola (taxid 30001): class\n", + "[ERROR] FAMILY: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", + "[ERROR] FAMILY: {'Not_applicable'} not found in NCBI Taxonomy\n", + "[ERROR] GENUS: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", + "[ERROR] GENUS: {'Not_applicable'} not found in NCBI Taxonomy\n" + ] + } + ], + "source": [ + "df = validate('../results/20220405/NBGW-[20210805]-manifest.xlsx', template_fn, verbose=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "f441b0df", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[WARNING] # manifest ../results/20220405/NBGW-[20210903]-manifest.xlsx\n", + "[WARNING] extra columns in filled manifest compared to template: {'PLATE_TEMPERATURE_STORAGE', 'SORTING_SOLUTION_USED', 'CATCH_BOTTLE_TEMPERATURE_STORAGE'}\n", + "[ERROR] ORDER: {'Opilones'} not found in NCBI Taxonomy\n" + ] + } + ], + "source": [ + "df = validate('../results/20220405/NBGW-[20210903]-manifest.xlsx', template_fn, verbose=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "9524459e", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[WARNING] # manifest ../results/20220405/NBGW-[20210930]-manifest.xlsx\n", + "[WARNING] extra columns in filled manifest compared to template: {'PLATE_TEMPERATURE_STORAGE', 'SORTING_SOLUTION_USED', 'CATCH_BOTTLE_TEMPERATURE_STORAGE'}\n" + ] + } + ], + "source": [ + "df = validate('../results/20220405/NBGW-[20210930]-manifest.xlsx', template_fn, verbose=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "fdff8970", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[WARNING] # manifest ../results/20220405/NBGW-[20211026]-manifest.xlsx\n", + "[WARNING] extra columns in filled manifest compared to template: {'PLATE_TEMPERATURE_STORAGE', 'SORTING_SOLUTION_USED', 'CATCH_BOTTLE_TEMPERATURE_STORAGE'}\n", + "[ERROR] found non-integer value in TIME_ELAPSED_FROM_COLLECTION_TO_PLATING: \"0.5\"\n" + ] + } + ], + "source": [ + "df = validate('../results/20220405/NBGW-[20211026]-manifest.xlsx', template_fn, verbose=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "962321eb", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[WARNING] # manifest ../results/20220405/NBGW-[20211130]-manifest.xlsx\n", + "[WARNING] extra columns in filled manifest compared to template: {'PLATE_TEMPERATURE_STORAGE', 'SORTING_SOLUTION_USED', 'CATCH_BOTTLE_TEMPERATURE_STORAGE'}\n" + ] + } + ], + "source": [ + "df = validate('../results/20220405/NBGW-[20211130]-manifest.xlsx', template_fn, verbose=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "940b497d", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[WARNING] # manifest ../results/20220405/NBGW-[20220105]-manifest.xlsx\n", + "[WARNING] extra columns in filled manifest compared to template: {'PLATE_TEMPERATURE_STORAGE', 'SORTING_SOLUTION_USED', 'CATCH_BOTTLE_TEMPERATURE_STORAGE'}\n", + "[WARNING] ORDER: found unexpected rank for Collembola (taxid 30001): class\n" + ] + } + ], + "source": [ + "df = validate('../results/20220405/NBGW-[20220105]-manifest.xlsx', template_fn, verbose=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "3c0658fc", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[WARNING] # manifest ../results/20220405/NBGW-[20220201]-manifest.xlsx\n", + "[WARNING] extra columns in filled manifest compared to template: {'PLATE_TEMPERATURE_STORAGE', 'SORTING_SOLUTION_USED', 'CATCH_BOTTLE_TEMPERATURE_STORAGE'}\n" + ] + } + ], + "source": [ + "df = validate('../results/20220405/NBGW-[20220201]-manifest.xlsx', template_fn, verbose=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "f38deb80", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[WARNING] # manifest ../results/20220405/NHM-BIOSCAN-Manifest_05042022.xlsx\n", "[WARNING] trailing spaces found in column 'FAMILY', SERIES [396]. Removing for validation\n", "[WARNING] trailing spaces found in column 'GENUS', SERIES [294, 295, 394, 395, 397, 398, 409, 410, 411, 412, 635, 636, 639]. Removing for validation\n", "[WARNING] trailing spaces found in column 'SCIENTIFIC_NAME', SERIES [294, 295]. Removing for validation\n", - "[WARNING] extra columns in filled manifest compared to template: {'Other/best taxon ID', 'CATCH_BOTTLE_TEMPERATURE_STORAGE', 'PLATE_TEMPERATURE_STORAGE', 'BAITED_TRAPS', 'SORTING_SOLUTION_USED'}\n", + "[WARNING] extra columns in filled manifest compared to template: {'BAITED_TRAPS', 'Other/best taxon ID', 'PLATE_TEMPERATURE_STORAGE', 'CATCH_BOTTLE_TEMPERATURE_STORAGE', 'SORTING_SOLUTION_USED'}\n", "[ERROR] template columns missing from filled manifest: {'MISC_METADATA'}\n", - "[ERROR] last well H12 is not blank at SERIES [192, 384, 480, 576, 672, 768, 864, 960, 1056, 288, 96]: in SCIENTIFIC_NAME, expected \"blank sample\", found ['NOT_APPLICABLE', 'NOT_APPLICABLE', 'NOT_APPLICABLE', 'NOT_APPLICABLE', 'NOT_APPLICABLE', 'NOT_APPLICABLE', 'NOT_APPLICABLE', 'NOT_APPLICABLE', 'NOT_APPLICABLE', 'NOT_APPLICABLE', 'NOT_APPLICABLE']. These samples will be included in further analysis.\n", - "[ERROR] invalid values in 'HAZARD_GROUP': {'NOT_APPLICABLE'}\n", - "[ERROR] invalid dates in 'DATE_OF_COLLECTION': ['NOT_APPLICABLE']\n", - "[ERROR] problem parsing coordinates 'NOT_APPLICABLE, NOT_APPLICABLE'\n", - "[WARNING] could not locate country for coordinates 'NOT_APPLICABLE, NOT_APPLICABLE', partner country 'NOT_APPLICABLE'\n", + "[ERROR] for blanks, ORGANISM_PART expected to be BLANK_SAMPLE, found {'NOT_APPLICABLE'}\n", + "[WARNING] for blanks, NOT_APPLICABLE expected, but not found in columns ['CATCH_LOT', 'BOTTLE_DIRECTION', 'HAZARD_GROUP', 'REGULATORY_COMPLIANCE', 'DATE_OF_COLLECTION', 'COLLECTION_LOCATION', 'DECIMAL_LATITUDE', 'DECIMAL_LONGITUDE', 'WHAT_3_WORDS']\n", "[ERROR] ORDER: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", - "[ERROR] ORDER: unexpected case for \"lepidoptera\", changing to \"Lepidoptera\" for validation\n", "[ERROR] ORDER: {'Not_applicable'} not found in NCBI Taxonomy\n", "[WARNING] ORDER: found unexpected rank for Acari (taxid 6933): subclass\n", "[WARNING] ORDER: found unexpected rank for Arachnida (taxid 6854): class\n", "[WARNING] ORDER: found unexpected rank for Collembola (taxid 30001): class\n", - "[ERROR] FAMILY: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", - "[ERROR] FAMILY: {'Staphyllinidae', 'Not_applicable', 'Ichneumonidade', 'Psilidade', 'Psycodidae'} not found in NCBI Taxonomy\n", - "[ERROR] GENUS: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", - "[ERROR] GENUS: {'Hellophilus', 'Not_applicable', 'Sullia'} not found in NCBI Taxonomy\n", - "[ERROR] SCIENTIFIC_NAME: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", - "[ERROR] SCIENTIFIC_NAME: {'Not_applicable'} not found in NCBI Taxonomy\n", - "[ERROR] invalid times in 'TIME_OF_COLLECTION': ['NOT_APPLICABLE']\n", - "[ERROR] invalid times in 'DURATION_OF_COLLECTION': ['P3D' 'P6D' 'NOT_APPLICABLE' 'P1D' 'P7D']\n", - "[ERROR] found non-integer value in TIME_ELAPSED_FROM_COLLECTION_TO_PLATING: \"NOT_APPLICABLE\"\n", - "[ERROR] invalid dates in 'DATE_OF_PRESERVATION': ['NOT_APPLICABLE']\n", - "[ERROR] found non-integer value in ELEVATION: \"NOT_APPLICABLE\"\n" + "[ERROR] invalid times in 'DURATION_OF_COLLECTION': ['P3D' 'P6D' 'P1D' 'P7D']\n", + "[ERROR] invalid dates in 'DATE_OF_PRESERVATION': ['NOT_APPLICABLE']\n" ] } ], "source": [ - "df = validate('../results/20220322/BP_ST_changes_22.03.22_Sam_NHM-BIOSCAN-Manifest Jan 2022.xlsx', template_fn, verbose=False)" + "df = validate('../results/20220405/NHM-BIOSCAN-Manifest_05042022.xlsx', template_fn, verbose=False)" ] }, { "cell_type": "code", "execution_count": null, - "id": "c238872a", + "id": "597705bb", "metadata": {}, "outputs": [], "source": [] diff --git a/work/validate_partner_manifest_dev.ipynb b/work/validate_partner_manifest_dev.ipynb index 3484fa6..3c40445 100644 --- a/work/validate_partner_manifest_dev.ipynb +++ b/work/validate_partner_manifest_dev.ipynb @@ -9,7 +9,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -30,7 +30,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -60,7 +60,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ @@ -70,7 +70,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ @@ -82,7 +82,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 31, "metadata": {}, "outputs": [ { @@ -124,7 +124,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 32, "metadata": {}, "outputs": [ { @@ -141,7 +141,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 33, "metadata": {}, "outputs": [ { @@ -169,7 +169,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 34, "metadata": {}, "outputs": [ { @@ -196,7 +196,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 35, "metadata": {}, "outputs": [ { @@ -224,7 +224,7 @@ "Name: TIME_OF_COLLECTION, Length: 1009, dtype: object" ] }, - "execution_count": 9, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } @@ -243,7 +243,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 36, "metadata": {}, "outputs": [ { @@ -281,7 +281,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 37, "metadata": {}, "outputs": [ { @@ -329,7 +329,34 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "NOT_COLLECTED 989\n", + " 50\n", + "blank sample 10\n", + "Formica rufa 3\n", + "NOT_APPLICABLE 2\n", + "Syrphus ribesii 1\n", + "Episyrphus balteatus 1\n", + "Name: SCIENTIFIC_NAME, dtype: int64" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['SCIENTIFIC_NAME'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 39, "metadata": {}, "outputs": [ { @@ -337,8 +364,17 @@ "output_type": "stream", "text": [ "[INFO] Checking and excluding blank samples\n", - "[ERROR] last well H12 is not blank at SERIES [96]: in SCIENTIFIC_NAME, expected \"blank sample\", found ['NOT_APPLICABLE']. These samples will be included in further analysis.\n", - "[INFO] found 10 blank samples based on SCIENTIFIC_NAME\n" + "[INFO] found and excluded 12 blank samples based on SCIENTIFIC_NAME\n", + "[ERROR] for blanks, ORGANISM_PART expected to be BLANK_SAMPLE, found {'NOT_APPLICABLE'}\n", + "[WARNING] for blanks, NOT_APPLICABLE expected, but not found in columns ['COLLECTION_LOCATION']\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(1056, 38)\n", + "(1044, 38)\n" ] } ], @@ -348,9 +384,12 @@ " \n", " logging.info('Checking and excluding blank samples')\n", " \n", + " blank_sample_sci_names = ['blank sample','NOT_APPLICABLE']\n", + " \n", + " \n", " # last well of plate expected to be blank\n", " last_well = df[df['TUBE_OR_WELL_ID'] == 'H12']\n", - " last_well_blanks = (last_well['SCIENTIFIC_NAME'] == 'blank sample')\n", + " last_well_blanks = (last_well['SCIENTIFIC_NAME'].isin(blank_sample_sci_names))\n", " if not last_well_blanks.all():\n", " logging.error('last well H12 is not blank at SERIES {}: in SCIENTIFIC_NAME, '\n", " 'expected \"blank sample\", found {}. '\n", @@ -359,10 +398,11 @@ " last_well[~last_well_blanks].SCIENTIFIC_NAME.to_list()\n", " ))\n", " \n", - " is_blank = (df['SCIENTIFIC_NAME'] == 'blank sample')\n", + " # also exclude blanks in non-last plate\n", + " is_blank = df['SCIENTIFIC_NAME'].isin(blank_sample_sci_names)\n", " blank_df = df[is_blank]\n", " \n", - " logging.info('found {} blank samples based on SCIENTIFIC_NAME'.format(blank_df.shape[0]))\n", + " logging.info('found and excluded {} blank samples based on SCIENTIFIC_NAME'.format(blank_df.shape[0]))\n", " \n", " # check organism part\n", " organism_part_pass = (blank_df['ORGANISM_PART'] == 'BLANK_SAMPLE')\n", @@ -384,13 +424,14 @@ " # logging.info('{} samples of {} left for downstream analysis'.format(df_flt.shape[0], df.shape[0]))\n", " \n", " return df[~is_blank]\n", - " \n", - "df = check_blanks(df)" + "print(df.shape)\n", + "df = check_blanks(df)\n", + "print(df.shape)" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 41, "metadata": {}, "outputs": [ { @@ -443,7 +484,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 42, "metadata": {}, "outputs": [ { @@ -451,7 +492,7 @@ "output_type": "stream", "text": [ "[INFO] validating date column 'DATE_OF_COLLECTION'\n", - "[ERROR] invalid dates in 'DATE_OF_COLLECTION': ['NOT_COLLECTED' 'NOT_APPLICABLE' '']\n" + "[ERROR] invalid dates in 'DATE_OF_COLLECTION': ['NOT_COLLECTED' '']\n" ] }, { @@ -472,7 +513,7 @@ "Name: DATE_OF_COLLECTION, Length: 993, dtype: datetime64[ns]" ] }, - "execution_count": 14, + "execution_count": 42, "metadata": {}, "output_type": "execute_result" } @@ -515,7 +556,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 43, "metadata": {}, "outputs": [ { @@ -523,7 +564,7 @@ "output_type": "stream", "text": [ "[INFO] validating time column 'TIME_OF_COLLECTION'\n", - "[ERROR] invalid times in 'TIME_OF_COLLECTION': ['NOT_APPLICABLE' '']\n" + "[ERROR] invalid times in 'TIME_OF_COLLECTION': ['']\n" ] }, { @@ -544,7 +585,7 @@ "Name: TIME_OF_COLLECTION, Length: 994, dtype: datetime64[ns]" ] }, - "execution_count": 15, + "execution_count": 43, "metadata": {}, "output_type": "execute_result" } @@ -575,7 +616,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 44, "metadata": {}, "outputs": [ { @@ -583,7 +624,7 @@ "output_type": "stream", "text": [ "[INFO] validating time period column 'DURATION_OF_COLLECTION'\n", - "[ERROR] invalid times in 'DURATION_OF_COLLECTION': ['NOT_APPLICABLE' '']\n" + "[ERROR] invalid times in 'DURATION_OF_COLLECTION': ['']\n" ] } ], @@ -632,7 +673,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 45, "metadata": {}, "outputs": [ { @@ -642,7 +683,7 @@ "[INFO] validating country with coordinates\n", "[ERROR] no partner location found for coordinates '52.0236, 0.2389'\n", "[ERROR] no partner location found for coordinates '51.917197, -1.148376'\n", - "[ERROR] multiple partner countries for coordinates 'NOT_APPLICABLE, NOT_APPLICABLE': ['UNITED KINGDOM' 'NOT_APPLICABLE']skipping coordinate validation\n", + "[ERROR] no partner location found for coordinates 'NOT_APPLICABLE, NOT_APPLICABLE'\n", "[ERROR] multiple partner countries for coordinates '50.598618, -3.7209498': ['UNITED KINGDOM' '']skipping coordinate validation\n", "[WARNING] could not locate country for coordinates ', ', partner country ''\n" ] @@ -787,7 +828,7 @@ " \n", " \n", "\n", - "

1046 rows × 6 columns

\n", + "

1044 rows × 6 columns

\n", "" ], "text/plain": [ @@ -819,10 +860,10 @@ "1054 , UNKNOWN \n", "1055 , UNKNOWN \n", "\n", - "[1046 rows x 6 columns]" + "[1044 rows x 6 columns]" ] }, - "execution_count": 17, + "execution_count": 45, "metadata": {}, "output_type": "execute_result" } @@ -931,7 +972,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -940,29 +981,25 @@ "text": [ "[INFO] validating taxonomy against NCBI\n", "[INFO] validating ORDER against NCBI\n", - "[ERROR] ORDER: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", "[ERROR] ORDER: unexpected case for \"NOT_COLLECTED\", changing to \"Not_collected\" for validation\n", "[ERROR] ORDER: unexpected case for \"diptera\", changing to \"Diptera\" for validation\n", "[ERROR] ORDER: unexpected case for \"Diptera and Arachnidae\", changing to \"Diptera and arachnidae\" for validation\n", - "[ERROR] ORDER: {'Acari (subclass)', '', 'Not_collected', 'Not_applicable', 'Tricoptera', 'Diptera and arachnidae', 'Symphyleona'} not found in NCBI Taxonomy\n", + "[ERROR] ORDER: {'', 'Acari (subclass)', 'Not_collected', 'Tricoptera', 'Diptera and arachnidae', 'Symphyleona'} not found in NCBI Taxonomy\n", "[WARNING] ORDER: found unexpected rank for Collembola (taxid 30001): class\n", "[INFO] ORDER: using only first matching rank for Plecoptera (taxid 50622): order\n", "[WARNING] ORDER: found unexpected rank for Protura (taxid 29999): class\n", "[INFO] validating FAMILY against NCBI\n", "[ERROR] FAMILY: unexpected case for \"NOT_COLLECTED\", changing to \"Not_collected\" for validation\n", - "[ERROR] FAMILY: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", "[ERROR] FAMILY: unexpected case for \"Unkown and Acari\", changing to \"Unkown and acari\" for validation\n", - "[ERROR] FAMILY: {'', 'Aphidiodea', 'Not_collected', 'Not_applicable', 'Unkown and acari'} not found in NCBI Taxonomy\n", + "[ERROR] FAMILY: {'', 'Not_collected', 'Aphidiodea', 'Unkown and acari'} not found in NCBI Taxonomy\n", "[WARNING] FAMILY: found unexpected rank for Aphidoidea (taxid 33385): superfamily\n", "[INFO] validating GENUS against NCBI\n", "[ERROR] GENUS: unexpected case for \"NOT_COLLECTED\", changing to \"Not_collected\" for validation\n", - "[ERROR] GENUS: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", - "[ERROR] GENUS: {'', 'Not_collected', 'Not_applicable'} not found in NCBI Taxonomy\n", + "[ERROR] GENUS: {'', 'Not_collected'} not found in NCBI Taxonomy\n", "[INFO] GENUS: using only first matching rank for Bombus (taxid 28641): genus\n", "[INFO] validating SCIENTIFIC_NAME against NCBI\n", "[ERROR] SCIENTIFIC_NAME: unexpected case for \"NOT_COLLECTED\", changing to \"Not_collected\" for validation\n", - "[ERROR] SCIENTIFIC_NAME: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", - "[ERROR] SCIENTIFIC_NAME: {'', 'Not_collected', 'Not_applicable'} not found in NCBI Taxonomy\n", + "[ERROR] SCIENTIFIC_NAME: {'', 'Not_collected'} not found in NCBI Taxonomy\n", "[INFO] cannot validate FAMILY for \"NOT_COLLECTED\", skipping taxonomy consistency check\n", "[INFO] cannot validate FAMILY for \"NOT_COLLECTED\", skipping taxonomy consistency check\n", "[INFO] cannot validate FAMILY for \"NOT_COLLECTED\", skipping taxonomy consistency check\n", @@ -973,7 +1010,6 @@ "[INFO] cannot validate FAMILY for \"NOT_COLLECTED\", skipping taxonomy consistency check\n", "[INFO] cannot validate FAMILY for \"NOT_COLLECTED\", skipping taxonomy consistency check\n", "[INFO] cannot validate ORDER for \"Acari (subclass)\", skipping taxonomy consistency check\n", - "[INFO] cannot validate ORDER for \"NOT_APPLICABLE\", skipping taxonomy consistency check\n", "[INFO] cannot validate FAMILY for \"NOT_COLLECTED\", skipping taxonomy consistency check\n", "[INFO] cannot validate ORDER for \"NOT_COLLECTED\", skipping taxonomy consistency check\n", "[INFO] cannot validate GENUS for \"NOT_COLLECTED\", skipping taxonomy consistency check\n", @@ -1335,7 +1371,7 @@ " \n", " \n", "\n", - "

1046 rows × 42 columns

\n", + "

1044 rows × 42 columns

\n", "" ], "text/plain": [ @@ -1437,10 +1473,10 @@ "1054 \n", "1055 \n", "\n", - "[1046 rows x 42 columns]" + "[1044 rows x 42 columns]" ] }, - "execution_count": 18, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -1585,7 +1621,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -1593,7 +1629,6 @@ "output_type": "stream", "text": [ "[INFO] validating int format in TIME_ELAPSED_FROM_COLLECTION_TO_PLATING\n", - "[ERROR] found non-integer value in TIME_ELAPSED_FROM_COLLECTION_TO_PLATING: \"NOT_APPLICABLE\"\n", "[ERROR] found non-integer value in TIME_ELAPSED_FROM_COLLECTION_TO_PLATING: \"\"\n" ] } @@ -1619,7 +1654,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -1627,7 +1662,7 @@ "output_type": "stream", "text": [ "[INFO] validating date column 'DATE_OF_COLLECTION'\n", - "[ERROR] invalid dates in 'DATE_OF_COLLECTION': ['NOT_COLLECTED' 'NOT_APPLICABLE' '']\n", + "[ERROR] invalid dates in 'DATE_OF_COLLECTION': ['NOT_COLLECTED' '']\n", "[INFO] validating date column 'DATE_OF_PRESERVATION'\n", "[ERROR] invalid dates in 'DATE_OF_PRESERVATION': ['' 'NOT_APPLICABLE']\n" ] @@ -1640,7 +1675,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -1649,7 +1684,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -1669,7 +1704,7 @@ "Length: 993, dtype: bool" ] }, - "execution_count": 22, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -1680,26 +1715,25 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "def compare_dates(before, after):\n", - " \n", - " ctdf = pd.concat([before.reset_index(), after.reset_index()], axis=1)\n", + " \n", + " logging.info(f'checking that {before.name} are earlier than {after.name}')\n", + "\n", + " ctdf = pd.concat([before, after], axis=1)\n", " date_conflict = ctdf[before.name] > ctdf[after.name]\n", " \n", - "# logging.info(date_conflict)\n", " if date_conflict.any():\n", " logging.error(f'{before.name} values are later than {after.name} for SERIES'\n", - " f' {ctdf[date_conflict].index.to_list()}')\n", - " \n", - "compare_dates(bd, ad)" + " f' {ctdf[date_conflict].index.to_list()}')" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -1720,48 +1754,44 @@ "[ERROR] Found and excluded 240 empty rows based on RACK_OR_PLATE_ID and TUBE_OR_WELL_ID\n", "[INFO] found 1056 samples across 11 plates\n", "[INFO] Checking and excluding blank samples\n", - "[ERROR] last well H12 is not blank at SERIES [96]: in SCIENTIFIC_NAME, expected \"blank sample\", found ['NOT_APPLICABLE']. These samples will be included in further analysis.\n", - "[INFO] found 10 blank samples based on SCIENTIFIC_NAME\n", + "[INFO] found and excluded 12 blank samples based on SCIENTIFIC_NAME\n", + "[ERROR] for blanks, ORGANISM_PART expected to be BLANK_SAMPLE, found {'NOT_APPLICABLE'}\n", + "[WARNING] for blanks, NOT_APPLICABLE expected, but not found in columns ['COLLECTION_LOCATION']\n", "[INFO] validating values in column 'PRESERVATIVE_SOLUTION'\n", "[INFO] validating values in column 'BOTTLE_DIRECTION'\n", "[ERROR] invalid values in 'BOTTLE_DIRECTION': {''}\n", "[INFO] validating values in column 'ORGANISM_PART'\n", "[ERROR] invalid values in 'ORGANISM_PART': {''}\n", "[INFO] validating values in column 'HAZARD_GROUP'\n", - "[ERROR] invalid values in 'HAZARD_GROUP': {'NOT_APPLICABLE', ''}\n", + "[ERROR] invalid values in 'HAZARD_GROUP': {''}\n", "[INFO] validating values in column 'REGULATORY_COMPLIANCE'\n", "[ERROR] invalid values in 'REGULATORY_COMPLIANCE': {'', 'y'}\n", "[INFO] validating date column 'DATE_OF_COLLECTION'\n", - "[ERROR] invalid dates in 'DATE_OF_COLLECTION': ['NOT_APPLICABLE' '']\n", + "[ERROR] invalid dates in 'DATE_OF_COLLECTION': ['']\n", "[INFO] validating country with coordinates\n", "[ERROR] no partner location found for coordinates '52.0236, 0.2389'\n", "[ERROR] no partner location found for coordinates '51.917197, -1.148376'\n", - "[ERROR] multiple partner countries for coordinates 'NOT_APPLICABLE, NOT_APPLICABLE': ['UNITED KINGDOM' 'NOT_APPLICABLE']skipping coordinate validation\n", + "[ERROR] no partner location found for coordinates 'NOT_APPLICABLE, NOT_APPLICABLE'\n", "[ERROR] multiple partner countries for coordinates '50.598618, -3.7209498': ['UNITED KINGDOM' '']skipping coordinate validation\n", "[WARNING] could not locate country for coordinates ', ', partner country ''\n", "[INFO] validating taxonomy against NCBI\n", "[INFO] validating ORDER against NCBI\n", - "[ERROR] ORDER: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", "[ERROR] ORDER: unexpected case for \"diptera\", changing to \"Diptera\" for validation\n", "[ERROR] ORDER: unexpected case for \"Diptera and Arachnidae\", changing to \"Diptera and arachnidae\" for validation\n", - "[ERROR] ORDER: {'Acari (subclass)', '', 'Not_applicable', 'Tricoptera', 'Diptera and arachnidae', 'Symphyleona'} not found in NCBI Taxonomy\n", + "[ERROR] ORDER: {'', 'Acari (subclass)', 'Tricoptera', 'Diptera and arachnidae', 'Symphyleona'} not found in NCBI Taxonomy\n", "[WARNING] ORDER: found unexpected rank for Collembola (taxid 30001): class\n", "[INFO] ORDER: using only first matching rank for Plecoptera (taxid 50622): order\n", "[WARNING] ORDER: found unexpected rank for Protura (taxid 29999): class\n", "[INFO] validating FAMILY against NCBI\n", - "[ERROR] FAMILY: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", "[ERROR] FAMILY: unexpected case for \"Unkown and Acari\", changing to \"Unkown and acari\" for validation\n", - "[ERROR] FAMILY: {'', 'Unkown and acari', 'Not_applicable', 'Aphidiodea'} not found in NCBI Taxonomy\n", + "[ERROR] FAMILY: {'', 'Aphidiodea', 'Unkown and acari'} not found in NCBI Taxonomy\n", "[WARNING] FAMILY: found unexpected rank for Aphidoidea (taxid 33385): superfamily\n", "[INFO] validating GENUS against NCBI\n", - "[ERROR] GENUS: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", - "[ERROR] GENUS: {'', 'Not_applicable'} not found in NCBI Taxonomy\n", + "[ERROR] GENUS: {''} not found in NCBI Taxonomy\n", "[INFO] GENUS: using only first matching rank for Bombus (taxid 28641): genus\n", "[INFO] validating SCIENTIFIC_NAME against NCBI\n", - "[ERROR] SCIENTIFIC_NAME: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", - "[ERROR] SCIENTIFIC_NAME: {'', 'Not_applicable'} not found in NCBI Taxonomy\n", + "[ERROR] SCIENTIFIC_NAME: {''} not found in NCBI Taxonomy\n", "[INFO] cannot validate ORDER for \"Acari (subclass)\", skipping taxonomy consistency check\n", - "[INFO] cannot validate ORDER for \"NOT_APPLICABLE\", skipping taxonomy consistency check\n", "[INFO] cannot validate ORDER for \"diptera\", skipping taxonomy consistency check\n", "[INFO] cannot validate ORDER for \"Diptera and Arachnidae\", skipping taxonomy consistency check\n", "[ERROR] Family Syrphidae (taxid 34680) does not belong to Hymenoptera (taxid 7399)\n", @@ -1775,19 +1805,19 @@ "[INFO] validating values in column 'SEX'\n", "[ERROR] invalid values in 'SEX': {''}\n", "[INFO] validating time column 'TIME_OF_COLLECTION'\n", - "[ERROR] invalid times in 'TIME_OF_COLLECTION': ['NOT_APPLICABLE' '']\n", + "[ERROR] invalid times in 'TIME_OF_COLLECTION': ['']\n", "[INFO] validating time period column 'DURATION_OF_COLLECTION'\n", - "[ERROR] invalid times in 'DURATION_OF_COLLECTION': ['NOT_APPLICABLE' '']\n", + "[ERROR] invalid times in 'DURATION_OF_COLLECTION': ['']\n", "[INFO] validating values in column 'COLLECTION_METHOD'\n", "[ERROR] invalid values in 'COLLECTION_METHOD': {''}\n", "[INFO] validating int format in TIME_ELAPSED_FROM_COLLECTION_TO_PLATING\n", "[INFO] excluding 50 [''] samples without data in 'TIME_ELAPSED_FROM_COLLECTION_TO_PLATING'\n", - "[ERROR] found non-integer value in TIME_ELAPSED_FROM_COLLECTION_TO_PLATING: \"NOT_APPLICABLE\"\n", "[INFO] validating date column 'DATE_OF_PRESERVATION'\n", - "[INFO] excluding 1044 [''] samples without data in 'DATE_OF_PRESERVATION'\n", + "[INFO] excluding 1042 [''] samples without data in 'DATE_OF_PRESERVATION'\n", "[ERROR] invalid dates in 'DATE_OF_PRESERVATION': ['NOT_APPLICABLE']\n", + "[INFO] checking that DATE_OF_COLLECTION are earlier than DATE_OF_PRESERVATION\n", "[INFO] validating int format in ELEVATION\n", - "[INFO] excluding 1046 [''] samples without data in 'ELEVATION'\n", + "[INFO] excluding 1044 [''] samples without data in 'ELEVATION'\n", "[INFO] # ended validate_partner_manifest_v.1.0\n" ] } @@ -1815,10 +1845,11 @@ " valid_dict = get_valid_dict(template_fn)\n", "\n", " # orange cols\n", + " # exclude empty series\n", " df = validate_series(df)\n", " df = validate_plates_wells(df, 'RACK_OR_PLATE_ID', 'TUBE_OR_WELL_ID')\n", " \n", - " # check blanks\n", + " # check and exclude blanks\n", " df = check_blanks(df)\n", " \n", " validate_values('PRESERVATIVE_SOLUTION', df, valid_dict)\n", @@ -1859,9 +1890,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "Exception", + "evalue": "Done for now", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mException\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m/var/folders/g7/vnlptw7d5pb544zsj_bypk44000h93/T/ipykernel_627/3693612858.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Done for now'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mException\u001b[0m: Done for now" + ] + } + ], "source": [ "raise Exception('Done for now')" ] @@ -1876,60 +1919,25 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[WARNING] # manifest ../results/20220322/NBGW-[20210805]-manifest.xlsx\n", - "[ERROR] Found and excluded 768 empty rows based on RACK_OR_PLATE_ID and TUBE_OR_WELL_ID\n", - "[WARNING] ORDER: found unexpected rank for Acari (taxid 6933): subclass\n", - "[WARNING] ORDER: found unexpected rank for Collembola (taxid 30001): class\n", - "[ERROR] FAMILY: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", - "[ERROR] FAMILY: {'Not_applicable'} not found in NCBI Taxonomy\n", - "[ERROR] GENUS: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", - "[ERROR] GENUS: {'Not_applicable'} not found in NCBI Taxonomy\n" - ] - } - ], + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "df = validate('../results/20220322/NBGW-[20210805]-manifest.xlsx', template_fn, verbose=False)" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[WARNING] # manifest ../results/20220322/NBGW-[20220105]-manifest.xlsx\n", - "[ERROR] Found and excluded non-numeric SERIES: ['']\n", - "[ERROR] in TUBE_OR_WELL_ID for plate NBGW_003, wells {'E12'} are missing, wells set() are excessive\n", - "[ERROR] last well H12 is not blank at SERIES [95]: in SCIENTIFIC_NAME, expected \"blank sample\", found ['NOT_APPLICABLE']. These samples will be included in further analysis.\n", - "[ERROR] invalid values in 'HAZARD_GROUP': {'NOT_APPLICABLE'}\n", - "[ERROR] invalid dates in 'DATE_OF_COLLECTION': ['NOT_APPLICABLE']\n", - "[ERROR] problem parsing coordinates 'NOT_APPLICABLE, NOT_APPLICABLE'\n", - "[WARNING] could not locate country for coordinates 'NOT_APPLICABLE, NOT_APPLICABLE', partner country 'NOT_APPLICABLE'\n", - "[ERROR] ORDER: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", - "[ERROR] ORDER: {'Not_applicable'} not found in NCBI Taxonomy\n", - "[WARNING] ORDER: found unexpected rank for Collembola (taxid 30001): class\n", - "[ERROR] FAMILY: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", - "[ERROR] FAMILY: {'Not_applicable'} not found in NCBI Taxonomy\n", - "[ERROR] GENUS: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", - "[ERROR] GENUS: {'Not_applicable'} not found in NCBI Taxonomy\n", - "[ERROR] SCIENTIFIC_NAME: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", - "[ERROR] SCIENTIFIC_NAME: {'Not_applicable'} not found in NCBI Taxonomy\n", - "[ERROR] invalid times in 'TIME_OF_COLLECTION': ['NOT_APPLICABLE']\n", - "[ERROR] invalid times in 'DURATION_OF_COLLECTION': ['NOT_APPLICABLE']\n", - "[ERROR] found non-integer value in TIME_ELAPSED_FROM_COLLECTION_TO_PLATING: \"NOT_APPLICABLE\"\n" - ] - } - ], + "outputs": [], "source": [ "df = validate('../results/20220322/NBGW-[20220105]-manifest.xlsx', template_fn, verbose=False)" ] @@ -1943,61 +1951,9 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[WARNING] # manifest ../results/20220315/Sam_NHM-BIOSCAN-Manifest Jan 2022.xlsx\n", - "[WARNING] trailing spaces found in column 'FAMILY', SERIES [396]. Removing for validation\n", - "[WARNING] trailing spaces found in column 'GENUS', SERIES [294, 295, 394, 395, 397, 398, 409, 410, 411, 412, 635, 636, 639]. Removing for validation\n", - "[WARNING] extra columns in filled manifest compared to template: {'BAITED_TRAPS', 'Other/best taxon ID'}\n", - "[ERROR] template columns missing from filled manifest: {'MISC_METADATA'}\n", - "[ERROR] last well H12 is not blank at SERIES [192, 384, 480, 576, 672, 768, 864, 960, 1056, 288, 96]: in SCIENTIFIC_NAME, expected \"blank sample\", found ['NOT_APPLICABLE', 'NOT_APPLICABLE', 'NOT_APPLICABLE', 'NOT_APPLICABLE', 'NOT_APPLICABLE', 'NOT_APPLICABLE', 'NOT_APPLICABLE', 'NOT_APPLICABLE', 'NOT_APPLICABLE', 'NOT_APPLICABLE', 'NOT_APPLICABLE']. These samples will be included in further analysis.\n", - "[ERROR] invalid values in 'BOTTLE_DIRECTION': {'East', 'West'}\n", - "[ERROR] invalid values in 'HAZARD_GROUP': {'NOT_APPLICABLE'}\n", - "[ERROR] invalid dates in 'DATE_OF_COLLECTION': ['NOT_APPLICABLE']\n", - "[ERROR] problem parsing coordinates 'NOT_APPLICABLE, NOT_APPLICABLE'\n", - "[WARNING] could not locate country for coordinates 'NOT_APPLICABLE, NOT_APPLICABLE', partner country 'NOT_APPLICABLE'\n", - "[ERROR] ORDER: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", - "[ERROR] ORDER: unexpected case for \"lepidoptera\", changing to \"Lepidoptera\" for validation\n", - "[ERROR] ORDER: {'Not_applicable'} not found in NCBI Taxonomy\n", - "[WARNING] ORDER: found unexpected rank for Acari (taxid 6933): subclass\n", - "[WARNING] ORDER: found unexpected rank for Arachnida (taxid 6854): class\n", - "[WARNING] ORDER: found unexpected rank for Collembola (taxid 30001): class\n", - "[ERROR] FAMILY: unexpected case for \"UNKNOWN\", changing to \"Unknown\" for validation\n", - "[ERROR] FAMILY: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", - "[ERROR] FAMILY: {'Psilidade', 'Not_applicable', 'Psycodidae', 'Ichneumonidade', 'Staphyllinidae'} not found in NCBI Taxonomy\n", - "[WARNING] FAMILY: found unexpected rank for Unknown (taxid 32644): species\n", - "[ERROR] GENUS: unexpected case for \"UNKNOWN\", changing to \"Unknown\" for validation\n", - "[ERROR] GENUS: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", - "[ERROR] GENUS: {'Sullia', 'Hellophilus', 'Not_applicable'} not found in NCBI Taxonomy\n", - "[WARNING] GENUS: found unexpected rank for Unknown (taxid 32644): species\n", - "[ERROR] SCIENTIFIC_NAME: unexpected case for \"UNKNOWN\", changing to \"Unknown\" for validation\n", - "[ERROR] SCIENTIFIC_NAME: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", - "[ERROR] SCIENTIFIC_NAME: unexpected case for \"lacticolella\", changing to \"Lacticolella\" for validation\n", - "[ERROR] SCIENTIFIC_NAME: unexpected case for \"equestris\", changing to \"Equestris\" for validation\n", - "[ERROR] SCIENTIFIC_NAME: unexpected case for \"ocellare\", changing to \"Ocellare\" for validation\n", - "[ERROR] SCIENTIFIC_NAME: unexpected case for \"marginata\", changing to \"Marginata\" for validation\n", - "[ERROR] SCIENTIFIC_NAME: unexpected case for \"platypterus\", changing to \"Platypterus\" for validation\n", - "[ERROR] SCIENTIFIC_NAME: unexpected case for \"vallata \", changing to \"Vallata \" for validation\n", - "[ERROR] SCIENTIFIC_NAME: unexpected case for \"rara \", changing to \"Rara \" for validation\n", - "[ERROR] SCIENTIFIC_NAME: unexpected case for \"balteatus\", changing to \"Balteatus\" for validation\n", - "[ERROR] SCIENTIFIC_NAME: unexpected case for \"luteolata\", changing to \"Luteolata\" for validation\n", - "[ERROR] SCIENTIFIC_NAME: unexpected case for \"pendulus\", changing to \"Pendulus\" for validation\n", - "[ERROR] SCIENTIFIC_NAME: {'Equestris', 'Lacticolella', 'Not_applicable', 'Marginata', 'Balteatus', 'Rara\\xa0', 'Ocellare', 'Pendulus', 'Luteolata', 'Vallata\\xa0'} not found in NCBI Taxonomy\n", - "[WARNING] SCIENTIFIC_NAME: found unexpected rank for Platypterus (taxid 484252): subgenus\n", - "[ERROR] invalid times in 'TIME_OF_COLLECTION': ['NOT_APPLICABLE' 'Not known']\n", - "[ERROR] invalid times in 'DURATION_OF_COLLECTION': ['P2D23H' 'P5D23H30M' 'NOT_APPLICABLE' 'P1D0H0M' 'Not known' 'P1D4H20M'\n", - " 'P1D0H50M' 'unknown' '72hrs' '50hrs']\n", - "[ERROR] found non-integer value in TIME_ELAPSED_FROM_COLLECTION_TO_PLATING: \"NOT_APPLICABLE\"\n", - "[ERROR] invalid dates in 'DATE_OF_PRESERVATION': ['NOT_APPLICABLE']\n", - "[ERROR] found non-integer value in ELEVATION: \"NOT_APPLICABLE\"\n" - ] - } - ], + "outputs": [], "source": [ "# test columns addition\n", "df = validate('../results/20220315/Sam_NHM-BIOSCAN-Manifest Jan 2022.xlsx', template_fn, verbose=False)" @@ -2005,20 +1961,9 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[WARNING] # manifest ../results/20220303_add_tax/final-CARR-20210727-manifestV1.xlsx\n", - "[WARNING] extra columns in filled manifest compared to template: {'PLATE_TEMPERATURE_STORAGE', 'SORTING_SOLUTION_USED', 'CATCH_BOTTLE_TEMPERATURE_STORAGE'}\n", - "[ERROR] for blanks, ORGANISM_PART expected to be BLANK_SAMPLE, found {'NOT_APPLICABLE'}\n", - "[ERROR] invalid dates in 'DATE_OF_PRESERVATION': ['NOT_APPLICABLE']\n" - ] - } - ], + "outputs": [], "source": [ "# test columns addition\n", "df = validate('../results/20220303_add_tax/final-CARR-20210727-manifestV1.xlsx', template_fn, verbose=False)\n", @@ -2027,26 +1972,9 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['../results/20220304/NBGW-[20210527]-manifest.xlsx',\n", - " '../results/20220304/NBGW-[20210903]-manifest.xlsx',\n", - " '../results/20220304/NBGW-[20210930]-manifest.xlsx',\n", - " '../results/20220304/NBGW-[20211026]-manifest.xlsx',\n", - " '../results/20220304/NBGW-[20211130]-manifest.xlsx',\n", - " '../results/20220304/NBGW-[20220201]-manifest.xlsx',\n", - " '../results/20220304/NGBW-[20210629]-manifest.xlsx']" - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import glob\n", "fns = list(glob.glob('../results/20220304/*.xlsx'))\n", @@ -2056,228 +1984,72 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[WARNING] # manifest ../results/20220304/NBGW-[20210527]-manifest.xlsx\n", - "[ERROR] Found and excluded 576 empty rows based on RACK_OR_PLATE_ID and TUBE_OR_WELL_ID\n", - "[ERROR] invalid times in 'TIME_OF_COLLECTION': ['9.04' '9.45']\n" - ] - } - ], + "outputs": [], "source": [ "df = validate(fns[0], template_fn, verbose=False)" ] }, { "cell_type": "code", - "execution_count": 44, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[WARNING] # manifest ../results/20220304/NBGW-[20210903]-manifest.xlsx\n", - "[ERROR] last well H12 is not blank at SERIES [96, 192, 288, 384]: in SCIENTIFIC_NAME, expected \"blank sample\", found ['NOT_APPLICABLE', 'NOT_APPLICABLE', 'NOT_APPLICABLE', 'NOT_APPLICABLE']\n", - "[ERROR] invalid values in 'ORGANISM_PART': {'Diptera placed in well, removed'}\n", - "[ERROR] invalid values in 'HAZARD_GROUP': {'NOT_APPLICABLE'}\n", - "[ERROR] invalid dates in 'DATE_OF_COLLECTION': ['NOT_APPLICABLE']\n", - "[WARNING] could not locate country for coordinates 'NOT_APPLICABLE, NOT_APPLICABLE', partner country 'NOT_APPLICABLE'\n", - "[ERROR] ORDER: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", - "[ERROR] ORDER: unexpected case for \"NOT COLLECTED\", changing to \"Not collected\" for validation\n", - "[ERROR] ORDER: {'Not collected', 'Not_applicable', 'Opilones'} not found in NCBI Taxonomy\n", - "[ERROR] FAMILY: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", - "[ERROR] FAMILY: {'Not_applicable'} not found in NCBI Taxonomy\n", - "[ERROR] GENUS: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", - "[ERROR] GENUS: {'Not_applicable'} not found in NCBI Taxonomy\n", - "[ERROR] SCIENTIFIC_NAME: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", - "[ERROR] SCIENTIFIC_NAME: {'Not_applicable'} not found in NCBI Taxonomy\n", - "[ERROR] invalid times in 'TIME_OF_COLLECTION': ['NOT_APPLICABLE']\n", - "[ERROR] invalid times in 'DURATION_OF_COLLECTION': ['NOT_APPLICABLE']\n", - "[ERROR] found non-integer value in TIME_ELAPSED_FROM_COLLECTION_TO_PLATING: \"NOT_APPLICABLE\"\n" - ] - } - ], + "outputs": [], "source": [ "df = validate(fns[1], template_fn, verbose=False)" ] }, { "cell_type": "code", - "execution_count": 45, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[WARNING] # manifest ../results/20220304/NBGW-[20210930]-manifest.xlsx\n", - "[ERROR] Found and excluded 672 empty rows based on RACK_OR_PLATE_ID and TUBE_OR_WELL_ID\n", - "[ERROR] last well H12 is not blank at SERIES [96, 192, 288]: in SCIENTIFIC_NAME, expected \"blank sample\", found ['NOT_APPLICABLE', 'NOT_APPLICABLE', 'NOT_APPLICABLE']\n", - "[ERROR] invalid values in 'HAZARD_GROUP': {'NOT_APPLICABLE'}\n", - "[ERROR] invalid dates in 'DATE_OF_COLLECTION': ['NOT_APPLICABLE']\n", - "[WARNING] could not locate country for coordinates 'NOT_APPLICABLE, NOT_APPLICABLE', partner country 'NOT_APPLICABLE'\n", - "[ERROR] ORDER: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", - "[ERROR] ORDER: {'Not_applicable'} not found in NCBI Taxonomy\n", - "[ERROR] FAMILY: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", - "[ERROR] FAMILY: {'Not_applicable'} not found in NCBI Taxonomy\n", - "[ERROR] GENUS: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", - "[ERROR] GENUS: {'Not_applicable'} not found in NCBI Taxonomy\n", - "[ERROR] SCIENTIFIC_NAME: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", - "[ERROR] SCIENTIFIC_NAME: {'Not_applicable'} not found in NCBI Taxonomy\n", - "[ERROR] invalid times in 'TIME_OF_COLLECTION': ['NOT_APPLICABLE']\n", - "[ERROR] invalid times in 'DURATION_OF_COLLECTION': ['NOT_APPLICABLE']\n", - "[ERROR] found non-integer value in TIME_ELAPSED_FROM_COLLECTION_TO_PLATING: \"NOT_APPLICABLE\"\n" - ] - } - ], + "outputs": [], "source": [ "df = validate(fns[2], template_fn, verbose=False)" ] }, { "cell_type": "code", - "execution_count": 46, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[WARNING] # manifest ../results/20220304/NBGW-[20211026]-manifest.xlsx\n", - "[ERROR] Found and excluded 672 empty rows based on RACK_OR_PLATE_ID and TUBE_OR_WELL_ID\n", - "[ERROR] last well H12 is not blank at SERIES [96, 192, 288]: in SCIENTIFIC_NAME, expected \"blank sample\", found ['NOT_APPLICABLE', 'NOT_APPLICABLE', 'NOT_APPLICABLE']\n", - "[ERROR] invalid values in 'HAZARD_GROUP': {'NOT_APPLICABLE'}\n", - "[ERROR] invalid dates in 'DATE_OF_COLLECTION': ['NOT_APPLICABLE']\n", - "[WARNING] could not locate country for coordinates 'NOT_APPLICABLE, NOT_APPLICABLE', partner country 'NOT_APPLICABLE'\n", - "[ERROR] ORDER: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", - "[ERROR] ORDER: {'Not_applicable'} not found in NCBI Taxonomy\n", - "[ERROR] FAMILY: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", - "[ERROR] FAMILY: {'Not_applicable'} not found in NCBI Taxonomy\n", - "[ERROR] GENUS: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", - "[ERROR] GENUS: {'Not_applicable'} not found in NCBI Taxonomy\n", - "[ERROR] SCIENTIFIC_NAME: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", - "[ERROR] SCIENTIFIC_NAME: {'Not_applicable'} not found in NCBI Taxonomy\n", - "[ERROR] invalid values in 'SEX': {'ADULT'}\n", - "[ERROR] invalid times in 'TIME_OF_COLLECTION': ['NOT_APPLICABLE']\n", - "[ERROR] invalid times in 'DURATION_OF_COLLECTION': ['NOT_APPLICABLE']\n", - "[ERROR] found non-integer value in TIME_ELAPSED_FROM_COLLECTION_TO_PLATING: \"0.5\"\n", - "[ERROR] found non-integer value in TIME_ELAPSED_FROM_COLLECTION_TO_PLATING: \"NOT_APPLICABLE\"\n" - ] - } - ], + "outputs": [], "source": [ "df = validate(fns[3], template_fn, verbose=False)" ] }, { "cell_type": "code", - "execution_count": 47, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[WARNING] # manifest ../results/20220304/NBGW-[20211130]-manifest.xlsx\n", - "[ERROR] Found and excluded 672 empty rows based on RACK_OR_PLATE_ID and TUBE_OR_WELL_ID\n", - "[ERROR] last well H12 is not blank at SERIES [96, 192, 288]: in SCIENTIFIC_NAME, expected \"blank sample\", found ['NOT_APPLICABLE', 'NOT_APPLICABLE', 'NOT_APPLICABLE']\n", - "[ERROR] invalid values in 'HAZARD_GROUP': {'NOT_APPLICABLE'}\n", - "[ERROR] invalid dates in 'DATE_OF_COLLECTION': ['NOT_APPLICABLE']\n", - "[WARNING] could not locate country for coordinates 'NOT_APPLICABLE, NOT_APPLICABLE', partner country 'NOT_APPLICABLE'\n", - "[ERROR] ORDER: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", - "[ERROR] ORDER: {'Not_applicable'} not found in NCBI Taxonomy\n", - "[ERROR] FAMILY: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", - "[ERROR] FAMILY: {'Not_applicable'} not found in NCBI Taxonomy\n", - "[ERROR] GENUS: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", - "[ERROR] GENUS: {'Not_applicable'} not found in NCBI Taxonomy\n", - "[ERROR] SCIENTIFIC_NAME: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", - "[ERROR] SCIENTIFIC_NAME: {'Not_applicable'} not found in NCBI Taxonomy\n", - "[ERROR] invalid times in 'TIME_OF_COLLECTION': ['NOT_APPLICABLE']\n", - "[ERROR] invalid times in 'DURATION_OF_COLLECTION': ['NOT_APPLICABLE']\n", - "[ERROR] found non-integer value in TIME_ELAPSED_FROM_COLLECTION_TO_PLATING: \"NOT_APPLICABLE\"\n" - ] - } - ], + "outputs": [], "source": [ "df = validate(fns[4], template_fn, verbose=False)" ] }, { "cell_type": "code", - "execution_count": 48, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[WARNING] # manifest ../results/20220304/NBGW-[20220201]-manifest.xlsx\n", - "[ERROR] Found and excluded 672 empty rows based on RACK_OR_PLATE_ID and TUBE_OR_WELL_ID\n", - "[ERROR] last well H12 is not blank at SERIES [96, 192, 288]: in SCIENTIFIC_NAME, expected \"blank sample\", found ['NOT_APPLICABLE', 'NOT_APPLICABLE', 'NOT_APPLICABLE']\n", - "[ERROR] invalid values in 'HAZARD_GROUP': {'NOT_APPLICABLE'}\n", - "[ERROR] invalid dates in 'DATE_OF_COLLECTION': ['NOT_APPLICABLE']\n", - "[WARNING] could not locate country for coordinates 'NOT_APPLICABLE, NOT_APPLICABLE', partner country 'NOT_APPLICABLE'\n", - "[ERROR] ORDER: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", - "[ERROR] ORDER: {'Not_applicable'} not found in NCBI Taxonomy\n", - "[ERROR] FAMILY: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", - "[ERROR] FAMILY: {'Not_applicable'} not found in NCBI Taxonomy\n", - "[ERROR] GENUS: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", - "[ERROR] GENUS: {'Not_applicable'} not found in NCBI Taxonomy\n", - "[ERROR] SCIENTIFIC_NAME: unexpected case for \"NOT_APPLICABLE\", changing to \"Not_applicable\" for validation\n", - "[ERROR] SCIENTIFIC_NAME: {'Not_applicable'} not found in NCBI Taxonomy\n", - "[ERROR] invalid times in 'TIME_OF_COLLECTION': ['NOT_APPLICABLE']\n", - "[ERROR] invalid times in 'DURATION_OF_COLLECTION': ['NOT_APPLICABLE']\n", - "[ERROR] found non-integer value in TIME_ELAPSED_FROM_COLLECTION_TO_PLATING: \"NOT_APPLICABLE\"\n" - ] - } - ], + "outputs": [], "source": [ "df = validate(fns[5], template_fn, verbose=False)" ] }, { "cell_type": "code", - "execution_count": 49, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[WARNING] # manifest ../results/20220304/NGBW-[20210629]-manifest.xlsx\n", - "[ERROR] Found and excluded 672 empty rows based on RACK_OR_PLATE_ID and TUBE_OR_WELL_ID\n", - "[WARNING] ORDER: found unexpected rank for Acari (taxid 6933): subclass\n", - "[ERROR] SCIENTIFIC_NAME: {'Bombus terrestris/lucorum'} not found in NCBI Taxonomy\n" - ] - } - ], + "outputs": [], "source": [ "df = validate(fns[6], template_fn, verbose=False)" ] }, { "cell_type": "code", - "execution_count": 50, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "Exception", - "evalue": "Done for now", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mException\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m/var/folders/g7/vnlptw7d5pb544zsj_bypk44000h93/T/ipykernel_46513/3693612858.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mException\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'Done for now'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;31mException\u001b[0m: Done for now" - ] - } - ], + "outputs": [], "source": [] }, {