Add files via upload

sagepublishing · Dec 7, 2018 · 4652674 · 4652674
1 parent f78dc9b
commit 4652674
Showing 1 changed file with 117 additions and 0 deletions.
diff --git a/jsonl_preprocess.ipynb b/jsonl_preprocess.ipynb
@@ -0,0 +1,117 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Script for transforming csv file into jsonl for db_in and training a spaCy model. \n",
+    "This is the most straightforward setup, where every abstract has an 'accept' annotation for its\n",
+    "corresponding journal, and a 'reject' annotation for every other journal.\n",
+    "\n",
+    "https://support.prodi.gy/t/best-practices-for-text-classifier-annotations/135"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from tqdm import tqdm\n",
+    "import json"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#csv with 'abstract' column and 'journal code' column\n",
+    "df = pd.read_csv('/path/to/abstract.csv')\n",
+    "journal_list = df['journal code'].unique().tolist()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#generate list of json accept annotations for every abstract\n",
+    "accept_list = []\n",
+    "for i in tqdm(range(0,df.shape[0])):\n",
+    "    accept_list.append({\"answer\":\"accept\",\"text\":r'\"'+df['abstract'][i]+'\"',\"label\":df['journal code'][i]}) "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def jsonlReject(text):\n",
+    "    return {\"answer\":\"reject\",\"text\":r'\"'+text+'\"',\"label\":code}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#generate list of json reject annotations for every abstract - \n",
+    "#with every abstract rejected by every journal to which it does not correspond\n",
+    "reject_list = []\n",
+    "for code in tqdm(journal_list):\n",
+    "    rej_code_bool = df['journal code'] != code\n",
+    "    rej_df = df[rej_code_bool]\n",
+    "    reject_list.extend(rej_df['abstract'].apply(lambda text:jsonlReject(text)))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "jsonl = accept_list\n",
+    "jsonl.extend(reject_list)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open('/path/to/abstract.jsonl', 'w') as outfile:\n",
+    "    for entry in jsonl:\n",
+    "        json.dump(entry, outfile)\n",
+    "        outfile.write('\\n')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}