From dbb207e6cb5a3a26f907879dbe1fe7c14eba8733 Mon Sep 17 00:00:00 2001 From: Raphael Sourty Date: Sun, 26 May 2024 01:43:20 +0200 Subject: [PATCH] Delete README.Md --- .github/workflows/CI.yml | 137 ------- .github/workflows/wheels.yml | 72 ++++ README.Md | 9 - benchmark.ipynb | 770 ----------------------------------- readme.md | 0 setup.py | 6 +- 6 files changed, 75 insertions(+), 919 deletions(-) delete mode 100644 .github/workflows/CI.yml create mode 100644 .github/workflows/wheels.yml delete mode 100644 README.Md delete mode 100644 benchmark.ipynb create mode 100644 readme.md diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml deleted file mode 100644 index 367ac29..0000000 --- a/.github/workflows/CI.yml +++ /dev/null @@ -1,137 +0,0 @@ -# This file is autogenerated by maturin v1.5.1 -# To update, run -# -# maturin generate-ci github -# -name: CI - -on: - push: - branches: - - main - tags: - - '*' - pull_request: - workflow_dispatch: - -permissions: - contents: read - -jobs: - linux: - runs-on: ${{ matrix.platform.runner }} - strategy: - matrix: - platform: - - runner: ubuntu-latest - target: x86_64 - - runner: ubuntu-latest - target: x86 - - runner: ubuntu-latest - target: aarch64 - - runner: ubuntu-latest - target: armv7 - - runner: ubuntu-latest - target: s390x - - runner: ubuntu-latest - target: ppc64le - steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 - with: - python-version: '3.10' - - name: Build wheels - uses: PyO3/maturin-action@v1 - with: - target: ${{ matrix.platform.target }} - args: --release --out dist --find-interpreter - sccache: 'true' - manylinux: auto - - name: Upload wheels - uses: actions/upload-artifact@v4 - with: - name: wheels-linux-${{ matrix.platform.target }} - path: dist - - windows: - runs-on: ${{ matrix.platform.runner }} - strategy: - matrix: - platform: - - runner: windows-latest - target: x64 - - runner: windows-latest - target: x86 - steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 - with: - python-version: '3.10' - architecture: ${{ matrix.platform.target }} - - name: Build wheels - uses: PyO3/maturin-action@v1 - with: - target: ${{ matrix.platform.target }} - args: --release --out dist --find-interpreter - sccache: 'true' - - name: Upload wheels - uses: actions/upload-artifact@v4 - with: - name: wheels-windows-${{ matrix.platform.target }} - path: dist - - macos: - runs-on: ${{ matrix.platform.runner }} - strategy: - matrix: - platform: - - runner: macos-latest - target: x86_64 - - runner: macos-14 - target: aarch64 - steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 - with: - python-version: '3.10' - - name: Build wheels - uses: PyO3/maturin-action@v1 - with: - target: ${{ matrix.platform.target }} - args: --release --out dist --find-interpreter - sccache: 'true' - - name: Upload wheels - uses: actions/upload-artifact@v4 - with: - name: wheels-macos-${{ matrix.platform.target }} - path: dist - - sdist: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - name: Build sdist - uses: PyO3/maturin-action@v1 - with: - command: sdist - args: --out dist - - name: Upload sdist - uses: actions/upload-artifact@v4 - with: - name: wheels-sdist - path: dist - - release: - name: Release - runs-on: ubuntu-latest - if: "startsWith(github.ref, 'refs/tags/')" - needs: [linux, windows, macos, sdist] - steps: - - uses: actions/download-artifact@v4 - - name: Publish to PyPI - uses: PyO3/maturin-action@v1 - env: - MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }} - with: - command: upload - args: --non-interactive --skip-existing wheels-*/* diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml new file mode 100644 index 0000000..9c88485 --- /dev/null +++ b/.github/workflows/wheels.yml @@ -0,0 +1,72 @@ +name: Build and Publish Wheels + +on: + push: + branches: + - main + - "release/*" + +jobs: + build: + runs-on: ubuntu-latest + + strategy: + matrix: + python-version: ["3.8", "3.9", "3.10", "3.11"] + + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: Install Rust + uses: actions-rs/toolchain@v1 + with: + toolchain: stable + override: true + + - name: Install build dependencies + run: pip install setuptools-rust + + - name: Build wheel + run: python -m pip install . + + - name: Create wheel and source distribution + run: | + python setup.py bdist_wheel + python setup.py sdist + + - name: Upload artifacts + uses: actions/upload-artifact@v3 + with: + name: dist + path: dist/* + + publish: + needs: build + runs-on: ubuntu-latest + + steps: + - name: Download artifact + uses: actions/download-artifact@v3 + with: + name: dist + path: dist + + - name: Set up Python 3.11 + uses: actions/setup-python@v4 + with: + python-version: "3.11" + + - name: Install twine + run: pip install twine + + - name: Publish to PyPI + env: + TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} + TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + run: twine upload dist/* diff --git a/README.Md b/README.Md deleted file mode 100644 index 9466296..0000000 --- a/README.Md +++ /dev/null @@ -1,9 +0,0 @@ -```python -pip install maturin -``` -``` -maturin develop -``` - - - diff --git a/benchmark.ipynb b/benchmark.ipynb deleted file mode 100644 index fc90df8..0000000 --- a/benchmark.ipynb +++ /dev/null @@ -1,770 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/raphael/miniforge3/envs/antilopa/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n", - "100%|██████████| 5183/5183 [00:00<00:00, 273874.69it/s]\n" - ] - } - ], - "source": [ - "from neural_cherche import utils\n", - "from nltk.corpus import stopwords\n", - "\n", - "stop_words = list(stopwords.words(fileids=\"english\"))\n", - "\n", - "documents, queries_ids, queries, qrels = utils.load_beir(\n", - " dataset_name=\"scifact\",\n", - " split=\"test\",\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from neural_cherche import retrieve\n", - "\n", - "from lenlp import sparse\n", - "\n", - "count_vectorizer = sparse.BM25Vectorizer(\n", - " normalize=True,\n", - " ngram_range=(3, 5),\n", - " analyzer=\"char_wb\",\n", - " stop_words=stop_words,\n", - ")\n", - "\n", - "retriever = retrieve.TfIdf(\n", - " key=\"id\",\n", - " on=[\"title\", \"text\"],\n", - " tfidf=count_vectorizer,\n", - ")\n", - "\n", - "documents_embeddings = retriever.encode_documents(\n", - " documents=documents,\n", - ")\n", - "\n", - "retriever = retriever.add(documents_embeddings=documents_embeddings)\n", - "\n", - "queries_embeddings = retriever.encode_queries(\n", - " queries=queries,\n", - ")\n", - "\n", - "scores = retriever(\n", - " queries_embeddings=queries_embeddings,\n", - " k=10,\n", - " batch_size=1024,\n", - " tqdm_bar=False,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'ndcg@10': 0.6937352011710838,\n", - " 'hits@1': 0.5666666666666667,\n", - " 'hits@2': 0.68,\n", - " 'hits@3': 0.7466666666666667,\n", - " 'hits@4': 0.8166666666666667,\n", - " 'hits@5': 0.8466666666666667,\n", - " 'hits@6': 0.8733333333333333,\n", - " 'hits@7': 0.9033333333333333,\n", - " 'hits@8': 0.9033333333333333,\n", - " 'hits@9': 0.9166666666666666,\n", - " 'hits@10': 0.9233333333333333}" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "utils.evaluate(\n", - " scores=scores,\n", - " qrels=qrels,\n", - " queries_ids=queries_ids,\n", - " metrics=[\"ndcg@10\"] + [f\"hits@{k}\" for k in range(1, 11)],\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "151 ms ± 13.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" - ] - } - ], - "source": [ - "{\n", - " \"ndcg@10\": 0.6947898732051012,\n", - " \"hits@1\": 0.5633333333333334,\n", - " \"hits@2\": 0.68,\n", - " \"hits@3\": 0.7566666666666667,\n", - " \"hits@4\": 0.8166666666666667,\n", - " \"hits@5\": 0.8433333333333334,\n", - " \"hits@6\": 0.8733333333333333,\n", - " \"hits@7\": 0.9,\n", - " \"hits@8\": 0.9066666666666666,\n", - " \"hits@9\": 0.9166666666666666,\n", - " \"hits@10\": 0.93,\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'ndcg@10': 0.6805801205908164,\n", - " 'hits@1': 0.5633333333333334,\n", - " 'hits@2': 0.6733333333333333,\n", - " 'hits@3': 0.7433333333333333,\n", - " 'hits@4': 0.7733333333333333,\n", - " 'hits@5': 0.81,\n", - " 'hits@6': 0.8333333333333334,\n", - " 'hits@7': 0.8633333333333333,\n", - " 'hits@8': 0.87,\n", - " 'hits@9': 0.8833333333333333,\n", - " 'hits@10': 0.8966666666666666}" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "from neural_cherche import models, rank\n", - "\n", - "model = models.ColBERT(\n", - " model_name_or_path=\"raphaelsty/neural-cherche-colbert\",\n", - " device=\"mps\",\n", - ")\n", - "\n", - "ranker = rank.ColBERT(\n", - " key=\"id\",\n", - " on=[\"title\", \"text\"],\n", - " model=model,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "ColBERT queries embeddings: 100%|██████████| 10/10 [00:01<00:00, 7.17it/s]\n", - "ColBERT documents embeddings: 100%|██████████| 122/122 [01:33<00:00, 1.31it/s]\n" - ] - } - ], - "source": [ - "queries_embeddings = retriever.encode_queries(\n", - " queries=queries,\n", - ")\n", - "\n", - "candidates = retriever(\n", - " queries_embeddings=queries_embeddings,\n", - " k=30,\n", - " batch_size=1024,\n", - " tqdm_bar=False,\n", - ")\n", - "\n", - "ranker_queries_embeddings = ranker.encode_queries(queries=queries, batch_size=32)\n", - "\n", - "ranker_documents_embeddings = ranker.encode_candidates_documents(\n", - " documents=documents, candidates=candidates, batch_size=32\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "ColBERT ranker: 99%|█████████▉| 299/301 [00:02<00:00, 126.34it/s]\n" - ] - } - ], - "source": [ - "scores = ranker(\n", - " documents=candidates,\n", - " queries_embeddings=ranker_queries_embeddings,\n", - " documents_embeddings=ranker_documents_embeddings,\n", - " k=10,\n", - " batch_size=32,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'ndcg@10': 0.7120634760361798,\n", - " 'hits@1': 0.5933333333333334,\n", - " 'hits@2': 0.7066666666666667,\n", - " 'hits@3': 0.7766666666666666,\n", - " 'hits@4': 0.85,\n", - " 'hits@5': 0.8633333333333333,\n", - " 'hits@6': 0.8933333333333333,\n", - " 'hits@7': 0.9133333333333333,\n", - " 'hits@8': 0.9233333333333333,\n", - " 'hits@9': 0.93,\n", - " 'hits@10': 0.94}" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], - "source": [ - "# MS MARCO, 8.84M documents\n", - "# 5 minutes 25 seconds to create the index\n", - "# 118 queries per second on 8.84M documents\n", - "\n", - "# 6,825,631 articles in wikipedia\n", - "# Assume wikipedia articles are 100 times longer than MS MARCO documents\n", - "# 300 minutes to create the index for wikipedia on a single computer,\n", - "# Could be 150 minutes" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "wikipedia = \"\"\"\n", - "Main menu\n", - " \n", - "WikipediaThe Free Encyclopedia\n", - "\n", - "Search\n", - "Create account\n", - "Log in\n", - "\n", - "Personal tools\n", - "Contents hide\n", - "(Top)\n", - "Design\n", - "Toggle Design subsection\n", - "Pretraining\n", - "Architecture details\n", - "Performance\n", - "Analysis\n", - "History\n", - "Recognition\n", - "References\n", - "Further reading\n", - "External links\n", - "BERT (language model)\n", - "\n", - "16 languages\n", - "Article\n", - "Talk\n", - "Read\n", - "Edit\n", - "View history\n", - "\n", - "Tools\n", - "From Wikipedia, the free encyclopedia\n", - "Bidirectional Encoder Representations from Transformers (BERT) is a language model based on the transformer architecture, notable for its dramatic improvement over previous state of the art models. It was introduced in October 2018 by researchers at Google.[1][2] A 2020 literature survey concluded that \"in a little over a year, BERT has become a ubiquitous baseline in Natural Language Processing (NLP) experiments counting over 150 research publications analyzing and improving the model.\"[3]\n", - "BERT was originally implemented in the English language at two model sizes:[1] (1) BERTBASE: 12 encoders with 12 bidirectional self-attention heads totaling 110 million parameters, and (2) BERTLARGE: 24 encoders with 16 bidirectional self-attention heads totaling 340 million parameters. Both models were pre-trained on the Toronto BookCorpus[4] (800M words) and English Wikipedia (2,500M words).\n", - "Design\n", - "BERT is an \"encoder-only\" transformer architecture.\n", - "On a high level, BERT consists of three modules:\n", - "embedding. This module converts an array of one-hot encoded tokens into an array of vectors representing the tokens.\n", - "a stack of encoders. These encoders are the Transformer encoders. They perform transformations over the array of representation vectors.\n", - "un-embedding. This module converts the final representation vectors into one-hot encoded tokens again.\n", - "The un-embedding module is necessary for pretraining, but it is often unnecessary for downstream tasks. Instead, one would take the representation vectors output at the end of the stack of encoders, and use those as a vector representation of the text input, and train a smaller model on top of that.\n", - "BERT uses WordPiece to convert each English word into an integer code. Its vocabulary has size 30,000. Any token not appearing in its vocabulary is replaced by [UNK] for \"unknown\".\n", - "Pretraining\n", - "BERT was pre-trained simultaneously on two tasks:[5]\n", - "language modeling: 15% of tokens were selected for prediction, and the training objective was to predict the selected token given its context. The selected token is\n", - "replaced with a [MASK] token with probability 80%,\n", - "replaced with a random word token with probability 10%,\n", - "not replaced with probability 10%.\n", - "For example, the sentence \"my dog is cute\" may have the 4-th token selected for prediction. The model would have input text\n", - "\"my dog is [MASK]\" with probability 80%,\n", - "\"my dog is happy\" with probability 10%,\n", - "\"my dog is cute\" with probability 10%.\n", - "After processing the input text, the model's 4-th output vector is passed to a separate neural network, which outputs a probability distribution over its 30,000-large vocabulary.\n", - "next sentence prediction: Given two spans of text, the model predicts if these two spans appeared sequentially in the training corpus, outputting either [IsNext] or [NotNext]. The first span starts with a special token [CLS] (for \"classify\"). The two spans are separated by a special token [SEP] (for \"separate\"). After processing the two spans, the 1-st output vector (the vector coding for [CLS]) is passed to a separate neural network for the binary classification into [IsNext] and [NotNext].\n", - "For example, given \"[CLS] my dog is cute [SEP] he likes playing\" the model should output token [IsNext].\n", - "Given \"[CLS] my dog is cute [SEP] how do magnets work\" the model should output token [NotNext].\n", - "As a result of this training process, BERT learns latent representations of words and sentences in context. After pre-training, BERT can be fine-tuned with fewer resources on smaller datasets to optimize its performance on specific tasks such as NLP tasks (language inference, text classification) and sequence-to-sequence based language generation tasks (question-answering, conversational response generation).[1][6] The pre-training stage is significantly more computationally expensive than fine-tuning.\n", - "Architecture details\n", - "This section describes BERTBASE. The other one, BERTLARGE, is similar, just larger.\n", - "The lowest layer is the embedding layer, which contains three components: word_embeddings, position_embeddings, token_type_embeddings.\n", - "word_embeddings takes in a one-hot vector of the input token. The one-hot vector input has dimension 30,000, because BERT has a vocabulary size that large.\n", - "position_embeddings performs absolute position embedding. It is like word_embeddings, but on a vocabulary consisting of just the time-stamps 0 to 511, since BERT has a context window of 512.\n", - "token_type_embeddings is like word_embeddings, but on a vocabulary consisting of just 0 and 1. The only type-1 tokens are those that appear after the [SEP]. All other tokens are type-0.\n", - "The three outputs are added, then pushed through a LayerNorm (layer normalization), obtaining an array of representation vectors, each having 768 dimensions.\n", - "After this, the representation vectors move through 12 Transformer encoders, then they are un-embedded by an affine-Add & LayerNorm-linear.\n", - "Performance\n", - "When BERT was published, it achieved state-of-the-art performance on a number of natural language understanding tasks:[1]\n", - "GLUE (General Language Understanding Evaluation) task set (consisting of 9 tasks)\n", - "SQuAD (Stanford Question Answering Dataset[7]) v1.1 and v2.0\n", - "SWAG (Situations With Adversarial Generations[8])\n", - "Analysis\n", - "The reasons for BERT's state-of-the-art performance on these natural language understanding tasks are not yet well understood.[9][10] Current research has focused on investigating the relationship behind BERT's output as a result of carefully chosen input sequences,[11][12] analysis of internal vector representations through probing classifiers,[13][14] and the relationships represented by attention weights.[9][10] The high performance of the BERT model could also be attributed to the fact that it is bidirectionally trained. This means that BERT, based on the Transformer model architecture, applies its self-attention mechanism to learn information from a text from the left and right side during training, and consequently gains a deep understanding of the context. For example, the word fine can have two different meanings depending on the context (I feel fine today, She has fine blond hair). BERT considers the words surrounding the target word fine from the left and right side.\n", - "However it comes at a cost: due to encoder-only architecture lacking a decoder, BERT can't be prompted and can't generate text, while bidirectional models in general do not work effectively without the right side,[clarification needed] thus being difficult to prompt, with even short text generation requiring sophisticated computationally expensive techniques.[15]\n", - "In contrast to deep learning neural networks which require very large amounts of data, BERT has already been pre-trained which means that it has learnt the representations of the words and sentences as well as the underlying semantic relations that they are connected with. BERT can then be fine-tuned on smaller datasets for specific tasks such as sentiment classification. The pre-trained models are chosen according to the content of the given dataset one uses but also the goal of the task. For example, if the task is a sentiment classification task on financial data, a pre-trained model for the analysis of sentiment of financial text should be chosen. The weights of the original pre-trained models were released on GitHub.[16]\n", - "History\n", - "BERT was originally published by Google researchers Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova. The design has its origins from pre-training contextual representations, including semi-supervised sequence learning,[17] generative pre-training, ELMo,[18] and ULMFit.[19] Unlike previous models, BERT is a deeply bidirectional, unsupervised language representation, pre-trained using only a plain text corpus. Context-free models such as word2vec or GloVe generate a single word embedding representation for each word in the vocabulary, whereas BERT takes into account the context for each occurrence of a given word. For instance, whereas the vector for \"running\" will have the same word2vec vector representation for both of its occurrences in the sentences \"He is running a company\" and \"He is running a marathon\", BERT will provide a contextualized embedding that will be different according to the sentence.[citation needed]\n", - "On October 25, 2019, Google announced that they had started applying BERT models for English language search queries within the US.[20] On December 9, 2019, it was reported that BERT had been adopted by Google Search for over 70 languages.[21] In October 2020, almost every single English-based query was processed by a BERT model.[22]\n", - "A later paper proposes RoBERTa, which preserves BERT's architecture, but improves its training, changing key hyperparameters, removing the next-sentence prediction task, and using much larger mini-batch sizes.[23]\n", - "Recognition\n", - "The research paper describing BERT won the Best Long Paper Award at the 2019 Annual Conference of the North American Chapter of the Association for Computational Linguistics (NAACL).[24]\n", - "References\n", - " Devlin, Jacob; Chang, Ming-Wei; Lee, Kenton; Toutanova, Kristina (October 11, 2018). \"BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding\". arXiv:1810.04805v2 [cs.CL].\n", - " \"Open Sourcing BERT: State-of-the-Art Pre-training for Natural Language Processing\". Google AI Blog. November 2, 2018. Retrieved November 27, 2019.\n", - " Rogers, Anna; Kovaleva, Olga; Rumshisky, Anna (2020). \"A Primer in BERTology: What We Know About How BERT Works\". Transactions of the Association for Computational Linguistics. 8: 842–866. arXiv:2002.12327. doi:10.1162/tacl_a_00349. S2CID 211532403.\n", - " Zhu, Yukun; Kiros, Ryan; Zemel, Rich; Salakhutdinov, Ruslan; Urtasun, Raquel; Torralba, Antonio; Fidler, Sanja (2015). \"Aligning Books and Movies: Towards Story-Like Visual Explanations by Watching Movies and Reading Books\". pp. 19–27. arXiv:1506.06724 [cs.CV].\n", - " \"Summary of the models — transformers 3.4.0 documentation\". huggingface.co. Retrieved February 16, 2023.\n", - " Horev, Rani (2018). \"BERT Explained: State of the art language model for NLP\". Towards Data Science. Retrieved September 27, 2021.\n", - " Rajpurkar, Pranav; Zhang, Jian; Lopyrev, Konstantin; Liang, Percy (October 10, 2016). \"SQuAD: 100,000+ Questions for Machine Comprehension of Text\". arXiv:1606.05250 [cs.CL].\n", - " Zellers, Rowan; Bisk, Yonatan; Schwartz, Roy; Choi, Yejin (August 15, 2018). \"SWAG: A Large-Scale Adversarial Dataset for Grounded Commonsense Inference\". arXiv:1808.05326 [cs.CL].\n", - " Kovaleva, Olga; Romanov, Alexey; Rogers, Anna; Rumshisky, Anna (November 2019). \"Revealing the Dark Secrets of BERT\". Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP). pp. 4364–4373. doi:10.18653/v1/D19-1445. S2CID 201645145.\n", - " Clark, Kevin; Khandelwal, Urvashi; Levy, Omer; Manning, Christopher D. (2019). \"What Does BERT Look at? An Analysis of BERT's Attention\". Proceedings of the 2019 ACL Workshop BlackboxNLP: Analyzing and Interpreting Neural Networks for NLP. Stroudsburg, PA, USA: Association for Computational Linguistics: 276–286. arXiv:1906.04341. doi:10.18653/v1/w19-4828.\n", - " Khandelwal, Urvashi; He, He; Qi, Peng; Jurafsky, Dan (2018). \"Sharp Nearby, Fuzzy Far Away: How Neural Language Models Use Context\". Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers). Stroudsburg, PA, USA: Association for Computational Linguistics: 284–294. arXiv:1805.04623. doi:10.18653/v1/p18-1027. S2CID 21700944.\n", - " Gulordava, Kristina; Bojanowski, Piotr; Grave, Edouard; Linzen, Tal; Baroni, Marco (2018). \"Colorless Green Recurrent Networks Dream Hierarchically\". Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers). Stroudsburg, PA, USA: Association for Computational Linguistics. pp. 1195–1205. arXiv:1803.11138. doi:10.18653/v1/n18-1108. S2CID 4460159.\n", - " Giulianelli, Mario; Harding, Jack; Mohnert, Florian; Hupkes, Dieuwke; Zuidema, Willem (2018). \"Under the Hood: Using Diagnostic Classifiers to Investigate and Improve how Language Models Track Agreement Information\". Proceedings of the 2018 EMNLP Workshop BlackboxNLP: Analyzing and Interpreting Neural Networks for NLP. Stroudsburg, PA, USA: Association for Computational Linguistics: 240–248. arXiv:1808.08079. doi:10.18653/v1/w18-5426. S2CID 52090220.\n", - " Zhang, Kelly; Bowman, Samuel (2018). \"Language Modeling Teaches You More than Translation Does: Lessons Learned Through Auxiliary Syntactic Task Analysis\". Proceedings of the 2018 EMNLP Workshop BlackboxNLP: Analyzing and Interpreting Neural Networks for NLP. Stroudsburg, PA, USA: Association for Computational Linguistics: 359–361. doi:10.18653/v1/w18-5448.\n", - " Patel, Ajay; Li, Bryan; Mohammad Sadegh Rasooli; Constant, Noah; Raffel, Colin; Callison-Burch, Chris (2022). \"Bidirectional Language Models Are Also Few-shot Learners\". arXiv:2209.14500 [cs.LG].\n", - " \"BERT\". GitHub. Retrieved March 28, 2023.\n", - " Dai, Andrew; Le, Quoc (November 4, 2015). \"Semi-supervised Sequence Learning\". arXiv:1511.01432 [cs.LG].\n", - " Peters, Matthew; Neumann, Mark; Iyyer, Mohit; Gardner, Matt; Clark, Christopher; Lee, Kenton; Luke, Zettlemoyer (February 15, 2018). \"Deep contextualized word representations\". arXiv:1802.05365v2 [cs.CL].\n", - " Howard, Jeremy; Ruder, Sebastian (January 18, 2018). \"Universal Language Model Fine-tuning for Text Classification\". arXiv:1801.06146v5 [cs.CL].\n", - " Nayak, Pandu (October 25, 2019). \"Understanding searches better than ever before\". Google Blog. Retrieved December 10, 2019.\n", - " Montti, Roger (December 10, 2019). \"Google's BERT Rolls Out Worldwide\". Search Engine Journal. Retrieved December 10, 2019.\n", - " \"Google: BERT now used on almost every English query\". Search Engine Land. October 15, 2020. Retrieved November 24, 2020.\n", - " Liu, Yinhan; Ott, Myle; Goyal, Naman; Du, Jingfei; Joshi, Mandar; Chen, Danqi; Levy, Omer; Lewis, Mike; Zettlemoyer, Luke; Stoyanov, Veselin (2019). \"RoBERTa: A Robustly Optimized BERT Pretraining Approach\". arXiv:1907.11692 [cs.CL].\n", - " \"Best Paper Awards\". NAACL. 2019. Retrieved March 28, 2020.\n", - "Further reading\n", - "Rogers, Anna; Kovaleva, Olga; Rumshisky, Anna (2020). \"A Primer in BERTology: What we know about how BERT works\". arXiv:2002.12327 [cs.CL].\n", - "External links\n", - "Official GitHub repository\n", - "BERT on Devopedia\n", - "vte\n", - "Google AI\n", - "vte\n", - "Google\n", - "vte\n", - "Natural language processing\n", - "vte\n", - "Differentiable computing\n", - "Categories: Google softwareLarge language models\n", - "This page was last edited on 7 May 2024, at 09:13 (UTC).\n", - "Text is available under the Creative Commons Attribution-ShareAlike License 4.0; additional terms may apply. By using this site, you agree to the Terms of Use and Privacy Policy. Wikipedia® is a registered trademark of the Wikimedia Foundation, Inc., a non-profit organization.\n", - "Privacy policyAbout WikipediaDisclaimersContact WikipediaCode of ConductDevelopersStatisticsCookie statementMobile view\n", - "Wikimedia FoundationPowered by MediaWiki\n", - "Toggle limited content width\n", - "\"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "from nltk.corpus import stopwords\n", - "\n", - "from lenlp import flash\n", - "\n", - "stop_words = list(stopwords.words(fileids=\"english\"))\n", - "flash_text = flash.FlashText(normalize=True)\n", - "flash_text = flash_text.add(x=stop_words)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "681 ms ± 20.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" - ] - } - ], - "source": [ - "%%timeit \n", - "extract = flash_text.extract(x=[wikipedia] * 1000, span_info=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "from flashtext import KeywordProcessor\n", - "\n", - "flash_text = KeywordProcessor()\n", - "flash_text.add_keywords_from_list(stop_words)" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1.33 s ± 23.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" - ] - } - ], - "source": [ - "%%timeit \n", - "extract = []\n", - "for page in [wikipedia] * 1000:\n", - " extract.append(flash_text.extract_keywords(page))" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/raphael/miniforge3/envs/antilopa/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - }, - { - "ename": "NameError", - "evalue": "name 'documents' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[19], line 16\u001b[0m\n\u001b[1;32m 5\u001b[0m count_vectorizer \u001b[38;5;241m=\u001b[39m sparse\u001b[38;5;241m.\u001b[39mCountVectorizer(\n\u001b[1;32m 6\u001b[0m normalize\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, ngram_range\u001b[38;5;241m=\u001b[39m(\u001b[38;5;241m3\u001b[39m, \u001b[38;5;241m5\u001b[39m), analyzer\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mchar_wb\u001b[39m\u001b[38;5;124m\"\u001b[39m, stop_words\u001b[38;5;241m=\u001b[39mstop_words\n\u001b[1;32m 7\u001b[0m )\n\u001b[1;32m 9\u001b[0m retriever \u001b[38;5;241m=\u001b[39m retrieve\u001b[38;5;241m.\u001b[39mBM25(\n\u001b[1;32m 10\u001b[0m key\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mid\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 11\u001b[0m on\u001b[38;5;241m=\u001b[39m[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtitle\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtext\u001b[39m\u001b[38;5;124m\"\u001b[39m],\n\u001b[1;32m 12\u001b[0m count_vectorizer\u001b[38;5;241m=\u001b[39mcount_vectorizer,\n\u001b[1;32m 13\u001b[0m )\n\u001b[1;32m 15\u001b[0m documents_embeddings \u001b[38;5;241m=\u001b[39m retriever\u001b[38;5;241m.\u001b[39mencode_documents(\n\u001b[0;32m---> 16\u001b[0m documents\u001b[38;5;241m=\u001b[39m\u001b[43mdocuments\u001b[49m,\n\u001b[1;32m 17\u001b[0m )\n\u001b[1;32m 19\u001b[0m retriever \u001b[38;5;241m=\u001b[39m retriever\u001b[38;5;241m.\u001b[39madd(documents_embeddings\u001b[38;5;241m=\u001b[39mdocuments_embeddings)\n", - "\u001b[0;31mNameError\u001b[0m: name 'documents' is not defined" - ] - } - ], - "source": [ - "from neural_cherche import retrieve\n", - "\n", - "from lenlp import sparse\n", - "\n", - "count_vectorizer = sparse.CountVectorizer(\n", - " normalize=True, ngram_range=(3, 5), analyzer=\"char_wb\", stop_words=stop_words\n", - ")\n", - "\n", - "retriever = retrieve.BM25(\n", - " key=\"id\",\n", - " on=[\"title\", \"text\"],\n", - " count_vectorizer=count_vectorizer,\n", - ")\n", - "\n", - "documents_embeddings = retriever.encode_documents(\n", - " documents=documents,\n", - ")\n", - "\n", - "retriever = retriever.add(documents_embeddings=documents_embeddings)" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'ndcg@10': 0.6805801205908164,\n", - " 'hits@1': 0.5633333333333334,\n", - " 'hits@2': 0.6733333333333333,\n", - " 'hits@3': 0.7433333333333333,\n", - " 'hits@4': 0.7733333333333333,\n", - " 'hits@5': 0.81,\n", - " 'hits@6': 0.8333333333333334,\n", - " 'hits@7': 0.8633333333333333,\n", - " 'hits@8': 0.87,\n", - " 'hits@9': 0.8833333333333333,\n", - " 'hits@10': 0.8966666666666666}" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "queries_embeddings = retriever.encode_queries(\n", - " queries=queries,\n", - ")\n", - "\n", - "scores = retriever(\n", - " queries_embeddings=queries_embeddings,\n", - " k=10,\n", - " batch_size=1024,\n", - " tqdm_bar=False,\n", - ")\n", - "\n", - "utils.evaluate(\n", - " scores=scores,\n", - " qrels=qrels,\n", - " queries_ids=queries_ids,\n", - " metrics=[\"ndcg@10\"] + [f\"hits@{k}\" for k in range(1, 11)],\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [], - "source": [ - "from neural_cherche import retrieve\n", - "from sklearn.feature_extraction.text import CountVectorizer\n", - "\n", - "count_vectorizer = CountVectorizer(\n", - " lowercase=True, ngram_range=(3, 5), analyzer=\"char_wb\", strip_accents=\"unicode\"\n", - ")\n", - "\n", - "retriever = retrieve.BM25(\n", - " key=\"id\",\n", - " on=[\"title\", \"text\"],\n", - " count_vectorizer=count_vectorizer,\n", - ")\n", - "\n", - "documents_embeddings = retriever.encode_documents(\n", - " documents=documents,\n", - ")\n", - "\n", - "retriever = retriever.add(documents_embeddings=documents_embeddings)" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'ndcg@10': 0.6391469805305356,\n", - " 'hits@1': 0.51,\n", - " 'hits@2': 0.62,\n", - " 'hits@3': 0.68,\n", - " 'hits@4': 0.74,\n", - " 'hits@5': 0.7866666666666666,\n", - " 'hits@6': 0.8066666666666666,\n", - " 'hits@7': 0.8266666666666667,\n", - " 'hits@8': 0.85,\n", - " 'hits@9': 0.8633333333333333,\n", - " 'hits@10': 0.8733333333333333}" - ] - }, - "execution_count": 36, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "queries_embeddings = retriever.encode_queries(\n", - " queries=queries,\n", - ")\n", - "\n", - "scores = retriever(\n", - " queries_embeddings=queries_embeddings,\n", - " k=10,\n", - " batch_size=1024,\n", - " tqdm_bar=False,\n", - ")\n", - "utils.evaluate(\n", - " scores=scores,\n", - " qrels=qrels,\n", - " queries_ids=queries_ids,\n", - " metrics=[\"ndcg@10\"] + [f\"hits@{k}\" for k in range(1, 11)],\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "{\n", - " \"ndcg@10\": 0.6805801205908164,\n", - " \"hits@1\": 0.5633333333333334,\n", - " \"hits@2\": 0.6733333333333333,\n", - " \"hits@3\": 0.7433333333333333,\n", - " \"hits@4\": 0.7733333333333333,\n", - " \"hits@5\": 0.81,\n", - " \"hits@6\": 0.8333333333333334,\n", - " \"hits@7\": 0.8633333333333333,\n", - " \"hits@8\": 0.87,\n", - " \"hits@9\": 0.8833333333333333,\n", - " \"hits@10\": 0.8966666666666666,\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[[0 0 0]\n", - " [0 0 1]\n", - " [0 0 0]]\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "from scipy.sparse import csr_matrix\n", - "\n", - "\n", - "def normalize_csr_matrix(csr_mat):\n", - " # Calculate the L2 norm for each row\n", - " row_norms = np.sqrt(csr_mat.multiply(csr_mat).sum(axis=1))\n", - "\n", - " # Avoid division by zero by setting zero norms to one (rows with all zeros)\n", - " row_norms[row_norms == 0] = 1\n", - "\n", - " # Normalize each row by its L2 norm\n", - " row_indices, col_indices = csr_mat.nonzero()\n", - " for i, j in zip(row_indices, col_indices):\n", - " csr_mat[i, j] /= row_norms[i, 0]\n", - "\n", - " return csr_mat\n", - "\n", - "\n", - "# Example usage:\n", - "# Create a sample CSR matrix\n", - "sample_data = np.array([1, 2, 3, 4, 5, 6])\n", - "sample_indices = np.array([0, 2, 2, 0, 1, 2])\n", - "sample_indptr = np.array([0, 2, 3, 6])\n", - "sample_csr_matrix = csr_matrix(\n", - " (sample_data, sample_indices, sample_indptr), shape=(3, 3)\n", - ")\n", - "\n", - "# Normalize the matrix\n", - "normalized_csr_matrix = normalize_csr_matrix(sample_csr_matrix)\n", - "\n", - "# Convert to dense format to check the result (optional)\n", - "print(normalized_csr_matrix.todense())" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "matrix([[0, 0, 0],\n", - " [0, 0, 1],\n", - " [0, 0, 0]])" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sample_csr_matrix.todense()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "antilopa", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.0" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/readme.md b/readme.md new file mode 100644 index 0000000..e69de29 diff --git a/setup.py b/setup.py index 11794fd..258be55 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ from lenlp.__version__ import __version__ -with open("README.md", "r", encoding="utf-8") as fh: +with open("readme.md", "r", encoding="utf-8") as fh: long_description = fh.read() base_packages = ["scikit-learn >= 1.5.0", "scipy >= 1.13.1"] @@ -28,7 +28,7 @@ "Programming Language :: Rust", "Operating System :: OS Independent", ], - python_requires=">=3.9", + python_requires=">=3.8", rust_extensions=[RustExtension("rslenlp", binding=Binding.PyO3)], - setup_requires=["setuptools-rust>=1.4.0"], + setup_requires=["setuptools-rust>=1.4.0", "maturin >= 1.5.1"], )