This repository has been archived by the owner on Jan 28, 2025. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
f78dc9b
commit 4652674
Showing
1 changed file
with
117 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": {}, | ||
"source": [ | ||
"Script for transforming csv file into jsonl for db_in and training a spaCy model. \n", | ||
"This is the most straightforward setup, where every abstract has an 'accept' annotation for its\n", | ||
"corresponding journal, and a 'reject' annotation for every other journal.\n", | ||
"\n", | ||
"https://support.prodi.gy/t/best-practices-for-text-classifier-annotations/135" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import pandas as pd\n", | ||
"from tqdm import tqdm\n", | ||
"import json" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"#csv with 'abstract' column and 'journal code' column\n", | ||
"df = pd.read_csv('/path/to/abstract.csv')\n", | ||
"journal_list = df['journal code'].unique().tolist()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"#generate list of json accept annotations for every abstract\n", | ||
"accept_list = []\n", | ||
"for i in tqdm(range(0,df.shape[0])):\n", | ||
" accept_list.append({\"answer\":\"accept\",\"text\":r'\"'+df['abstract'][i]+'\"',\"label\":df['journal code'][i]}) " | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 6, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def jsonlReject(text):\n", | ||
" return {\"answer\":\"reject\",\"text\":r'\"'+text+'\"',\"label\":code}" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"#generate list of json reject annotations for every abstract - \n", | ||
"#with every abstract rejected by every journal to which it does not correspond\n", | ||
"reject_list = []\n", | ||
"for code in tqdm(journal_list):\n", | ||
" rej_code_bool = df['journal code'] != code\n", | ||
" rej_df = df[rej_code_bool]\n", | ||
" reject_list.extend(rej_df['abstract'].apply(lambda text:jsonlReject(text)))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 34, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"jsonl = accept_list\n", | ||
"jsonl.extend(reject_list)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 29, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"with open('/path/to/abstract.jsonl', 'w') as outfile:\n", | ||
" for entry in jsonl:\n", | ||
" json.dump(entry, outfile)\n", | ||
" outfile.write('\\n')" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.6.6" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |