Skip to content
This repository has been archived by the owner on Jan 28, 2025. It is now read-only.

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
mprallison authored Dec 7, 2018
1 parent f78dc9b commit 4652674
Showing 1 changed file with 117 additions and 0 deletions.
117 changes: 117 additions & 0 deletions jsonl_preprocess.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Script for transforming csv file into jsonl for db_in and training a spaCy model. \n",
"This is the most straightforward setup, where every abstract has an 'accept' annotation for its\n",
"corresponding journal, and a 'reject' annotation for every other journal.\n",
"\n",
"https://support.prodi.gy/t/best-practices-for-text-classifier-annotations/135"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from tqdm import tqdm\n",
"import json"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"#csv with 'abstract' column and 'journal code' column\n",
"df = pd.read_csv('/path/to/abstract.csv')\n",
"journal_list = df['journal code'].unique().tolist()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"#generate list of json accept annotations for every abstract\n",
"accept_list = []\n",
"for i in tqdm(range(0,df.shape[0])):\n",
" accept_list.append({\"answer\":\"accept\",\"text\":r'\"'+df['abstract'][i]+'\"',\"label\":df['journal code'][i]}) "
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def jsonlReject(text):\n",
" return {\"answer\":\"reject\",\"text\":r'\"'+text+'\"',\"label\":code}"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"#generate list of json reject annotations for every abstract - \n",
"#with every abstract rejected by every journal to which it does not correspond\n",
"reject_list = []\n",
"for code in tqdm(journal_list):\n",
" rej_code_bool = df['journal code'] != code\n",
" rej_df = df[rej_code_bool]\n",
" reject_list.extend(rej_df['abstract'].apply(lambda text:jsonlReject(text)))"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"jsonl = accept_list\n",
"jsonl.extend(reject_list)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"with open('/path/to/abstract.jsonl', 'w') as outfile:\n",
" for entry in jsonl:\n",
" json.dump(entry, outfile)\n",
" outfile.write('\\n')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

0 comments on commit 4652674

Please sign in to comment.