Download the CORD-19 dataset from Kaggle, then run the following commands in the order listed:
# Command sequence for the pipeline; each line is run from the directory that
# contains these scripts (every script is invoked via a ./ relative path).
# Install the Python packages listed in requirements.txt.
python3 -m pip install -r requirements.txt
# preprocessing.py is run three times below, always without arguments; the
# trailing comment on each run names a function.
# NOTE(review): presumably that function has to be the one enabled inside
# preprocessing.py before the corresponding run — confirm against the script.
./preprocessing.py # extract_text()
# Same script again; this time its standard output is saved to titles.csv.
./preprocessing.py > titles.csv # extract_titles()
# NOTE(review): presumably the byte-pair-encoding step that the apply_bpe()
# run right after it depends on — confirm against bpe.sh.
./bpe.sh
./preprocessing.py # apply_bpe()
# NOTE(review): presumably produces the ./docvec path handed to kmeans.py on
# the next line — confirm against doc2vec.py.
./doc2vec.py
# kmeans.py receives ./docvec as its only argument; its standard output is
# saved to clusters.csv.
./kmeans.py ./docvec > clusters.csv
# Standard output of csvmerge.py is saved to docs.csv.
# NOTE(review): presumably combines titles.csv and clusters.csv — confirm
# against csvmerge.py.
./csvmerge.py > docs.csv