diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index 3180c29..d9744af 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -25,7 +25,8 @@ jobs: - name: Publish run: | - poetry publish -u ${{ secrets.PYPI_UNAME }} -p ${{ secrets.PYPI_PWD }} + poetry config pypi-token.pypi ${{ secrets.PYPI_TOKEN }} + poetry publish - name: Upload binaries to release uses: softprops/action-gh-release@v1 if: ${{startsWith(github.ref, 'refs/tags/') }} diff --git a/README.rst b/README.rst index f30a808..3c9cc75 100644 --- a/README.rst +++ b/README.rst @@ -1,7 +1,7 @@ ===== SINr ===== -|languages| |downloads| |license| |version| |cpython| |wheel| |python| |docs| |activity| |contributors| |quality| |build| +|languages| |downloads| |license| |version| |cpython| |wheel| |python| |activity| |contributors| *SINr* is an open-source tool to efficiently compute graph and word embeddings. Its aim is to provide sparse interpretable vectors from a @@ -50,7 +50,8 @@ Usage example ============= To get started using *SINr* to build graph and word embeddings, have a -look at the `notebook <./notebooks>`__ directory. +look at the `notebook `_ +directory. Here is a minimum working example of *SINr* @@ -132,7 +133,7 @@ to disccus the changes to be made. License ======= -Released under `CeCILL 2.1 `__, see `LICENSE <./LICENSE>`__ for more details. +Released under `CeCILL 2.1 `__, see `LICENSE `__ for more details. Publications ============ @@ -141,7 +142,7 @@ Publications find *SINr* useful for your own research, please cite the appropriate papers from the list below. Publications can also be found on `publications page in the -documentation `__. +documentation `__. **Initial SINr paper, 2021** @@ -184,8 +185,6 @@ documentation `__\ ⟩. + `⟨hal-03197434⟩ `__ + +**Interpretability of SINr embedding** + +- Thibault Prouteau, Nicolas Dugué, Nathalie Camelin, Sylvain Meignier. + Are Embedding Spaces Interpretable? 
Results of an Intrusion Detection + Evaluation on a Large French Corpus. LREC 2022, Jun 2022, Marseille, + France. `⟨hal-03770444⟩ `__ +**Sparsity of SINr embedding** -- Thibault Prouteau, Victor Connes, Nicolas Dugué, Anthony Perez, Jean-Charles Lamirel, et al.. SINr: Fast Computing of Sparse Interpretable Node Representations is not a Sin!. Advances in Intelligent Data Analysis XIX, 19th International Symposium on Intelligent Data Analysis, IDA 2021, Apr 2021, Porto, Portugal. pp.325-337, ⟨`10.1007/978-3-030-74251-5_26 `_⟩. `⟨hal-03197434⟩ `_ +- Simon Guillot, Thibault Prouteau, Nicolas Dugué. + Sparser is better: one step closer to word embedding interpretability. + IWCS 2023, Nancy, France. + `⟨hal-04321407⟩ `__ -**Interpretability of SINr embeddings, 2022** +**Filtering dimensions of SINr embedding** -- Thibault Prouteau, Nicolas Dugué, Nathalie Camelin, Sylvain Meignier. Are Embedding Spaces Interpretable? Results of an Intrusion Detection Evaluation on a Large French Corpus. LREC 2022, Jun 2022, Marseille, France. `⟨hal-03770444⟩ `_ +- Anna Béranger, Nicolas Dugué, Simon Guillot, Thibault Prouteau. + Filtering communities in word co-occurrence networks to foster the + emergence of meaning. Complex Networks 2023, Menton, France. + `⟨hal-04398742⟩ `__ .. |languages| image:: https://img.shields.io/github/languages/count/SINr-Embeddings/sinr .. |downloads| image:: https://img.shields.io/pypi/dm/sinr @@ -130,8 +176,5 @@ Publications can also be found on :ref:`Publications`. .. |cpython| image:: https://img.shields.io/pypi/implementation/sinr .. |wheel| image:: https://img.shields.io/pypi/wheel/sinr .. |python| image:: https://img.shields.io/pypi/pyversions/sinr -.. |docs| image:: https://img.shields.io/website?url=https%3A%2F%2Fsinr-embeddings.github.io%2Fsinr%2F_build%2Fhtml%2Findex.html .. |activity| image:: https://img.shields.io/github/commit-activity/y/SINr-Embeddings/sinr .. 
|contributors| image:: https://img.shields.io/github/contributors/SINr-Embeddings/sinr -.. |quality| image:: https://scrutinizer-ci.com/g/SINr-Embeddings/sinr/badges/quality-score.png?b=main -.. |build| image:: https://scrutinizer-ci.com/g/SINr-Embeddings/sinr/badges/build.png?b=main diff --git a/docs/source/publications.rst b/docs/source/publications.rst index f16fc72..9d21b3c 100644 --- a/docs/source/publications.rst +++ b/docs/source/publications.rst @@ -6,10 +6,20 @@ Publications **Initial SINr paper, 2021** -- Thibault Prouteau, Victor Connes, Nicolas Dugué, Anthony Perez, Jean-Charles Lamirel, et al.. SINr: Fast Computing of Sparse Interpretable Node Representations is not a Sin!. Advances in Intelligent Data Analysis XIX, 19th International Symposium on Intelligent Data Analysis, IDA 2021, Apr 2021, Porto, Portugal. pp.325-337, ⟨`10.1007/978-3-030-74251-5_26 `_⟩. `⟨hal-03197434⟩ `_ +- Thibault Prouteau, Victor Connes, Nicolas Dugué, Anthony Perez, Jean-Charles Lamirel, et al.. SINr: Fast Computing of Sparse Interpretable Node Representations is not a Sin!. Advances in Intelligent Data Analysis XIX, 19th International Symposium on Intelligent Data Analysis, IDA 2021, Apr 2021, Porto, Portugal. pp.325-337, ⟨\ `10.1007/978-3-030-74251-5_26 `__\ ⟩. + `⟨hal-03197434⟩ `__ +**Interpretability of SINr embedding** -**Interpretability of SINr embeddings, 2022** +- Thibault Prouteau, Nicolas Dugué, Nathalie Camelin, Sylvain Meignier. Are Embedding Spaces Interpretable? Results of an Intrusion Detection Evaluation on a Large French Corpus. LREC 2022, Jun 2022, Marseille, France. `⟨hal-03770444⟩ `__ -- Thibault Prouteau, Nicolas Dugué, Nathalie Camelin, Sylvain Meignier. Are Embedding Spaces Interpretable? Results of an Intrusion Detection Evaluation on a Large French Corpus. LREC 2022, Jun 2022, Marseille, France. `⟨hal-03770444⟩ `_ \ No newline at end of file +**Sparsity of SINr embedding** + + +- Simon Guillot, Thibault Prouteau, Nicolas Dugué. 
Sparser is better: one step closer to word embedding interpretability. IWCS 2023, Nancy, France. `⟨hal-04321407⟩ `__ + +**Filtering dimensions of SINr embedding** + + +- Anna Béranger, Nicolas Dugué, Simon Guillot, Thibault Prouteau. Filtering communities in word co-occurrence networks to foster the emergence of meaning. Complex Networks 2023, Menton, France. `⟨hal-04398742v1⟩ `__ diff --git a/docs/source/sinr.text.rst b/docs/source/sinr.text.rst index 3f2de04..0e51a6a 100644 --- a/docs/source/sinr.text.rst +++ b/docs/source/sinr.text.rst @@ -33,6 +33,14 @@ Preprocess Text :members: :undoc-members: :show-inheritance: + +Evaluate +--------------------------- + +.. automodule:: sinr.text.evaluate + :members: + :undoc-members: + :show-inheritance: Module contents --------------- diff --git a/notebooks/TransfertExample.ipynb b/notebooks/TransfertExample.ipynb new file mode 100644 index 0000000..1158cb4 --- /dev/null +++ b/notebooks/TransfertExample.ipynb @@ -0,0 +1,855 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "f38ae951-486e-4a47-9eac-af96a4bff6d4", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "8ccf10dc-c324-4146-9801-e2a7685a0ae2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\t\t\t\n", + "\t\t" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/lium/buster1/dugue/miniconda3-2023/envs/sinr-dev/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "2024-07-19 17:48:01,117 - load_from_cooc_pkl - INFO - Building Graph.\n", + "2024-07-19 17:48:01,118 - load_pkl_text - INFO - Loading cooccurrence matrix and dictionary.\n", + "2024-07-19 17:48:01,152 - load_pkl_text - INFO - Finished loading data.\n", + "2024-07-19 17:48:03,631 - load_from_cooc_pkl - INFO - Finished building graph.\n" + ] + } + ], + "source": [ + "import sinr.graph_embeddings as ge\n", + "import sinr.text.evaluate as ev\n", + "#Subsample of OANC corpus as a co-occurrence matrix\n", + "sinr = ge.SINr.load_from_cooc_pkl(\"oanc_extracted_min_freq20_max_freq5900_prct_sampling1_matrix.pkl\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "a3ce6ca7-a4b9-4539-8850-09829157afc3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(17706, 'words of vocabulary in this subsample')" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(sinr.wrd_to_idx), \"words of vocabulary in this subsample\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "ff36b06d-a6f7-487e-84bb-481d8065dfb6", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-07-19 17:48:03,755 - detect_communities - INFO - Detecting communities.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Gamma for louvain : 80\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-07-19 17:48:04,714 - detect_communities - INFO - Finished detecting communities.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Communities detected in 0.90354 [s]\n", + "solution properties:\n", + "------------------- ------------\n", + "# communities 4933\n", + "min community size 1\n", + "max community size 32\n", + "avg. 
community size 3.5893\n", + "modularity 0.0345125\n", + "------------------- ------------\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sinr.detect_communities(gamma=80)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "903e3116-25e3-4feb-a4d8-521732aef5bf", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-07-19 17:48:04,767 - extract_embeddings - INFO - Extracting embeddings.\n", + "2024-07-19 17:48:04,767 - extract_embeddings - INFO - Applying NFM.\n", + "2024-07-19 17:48:04,769 - get_nfm_embeddings - INFO - Starting NFM\n", + "2024-07-19 17:48:30,492 - extract_embeddings - INFO - NFM successfully applied.\n", + "2024-07-19 17:48:30,494 - extract_embeddings - INFO - Finished extracting embeddings.\n" + ] + } + ], + "source": [ + "sinr.extract_embeddings()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "c36d77c4-9bed-484b-8933-f3b80c87d626", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "94 missing words\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "27 missing words\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "235 missing words\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r" + ] + }, + { + "data": { + "text/plain": [ + "{'MEN': 0.46094523263674375,\n", + " 'WS353': 0.4636897182566408,\n", + " 'SCWS': 0.42946339109424253}" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "modele = 
ge.InterpretableWordsModelBuilder(sinr, \"modele\", n_jobs=8, n_neighbors=4).build()\n", + "ev.similarity_MEN_WS353_SCWS(modele)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "14fa05a2-a6f2-4f1c-be06-45a0868fe538", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "((17706, 4933), '(nb of words, dimensions)')" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "modele.vectors.shape, \"(nb of words, dimensions)\"" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "c315614c-fa14-4c3f-bcfe-1315e762f25f", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-07-19 17:48:39,768 - load_from_cooc_pkl - INFO - Building Graph.\n", + "2024-07-19 17:48:39,768 - load_pkl_text - INFO - Loading cooccurrence matrix and dictionary.\n", + "2024-07-19 17:48:39,787 - load_pkl_text - INFO - Finished loading data.\n", + "2024-07-19 17:48:41,251 - load_from_cooc_pkl - INFO - Finished building graph.\n" + ] + } + ], + "source": [ + "#Small Subsample of OANC corpus as a co-occurrence matrix\n", + "sinr_small = ge.SINr.load_from_cooc_pkl(\"oanc_extracted_min_freq50_max_freq3000_prct_sampling08_matrix.pkl\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "f06e8450-0489-41fe-b026-3ff891d75e8a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(9494, 'words of vocabulary')" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(sinr_small.wrd_to_idx), \"words of vocabulary\"" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "63bbbfa5-5ead-46d3-9911-635b10a60243", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(8212, 'words are present in the big subsample, but absent in the small one')" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + 
"source": [ + "len(sinr.wrd_to_idx) - len(sinr_small.wrd_to_idx), \"words are present in the big subsample, but absent in the small one\"" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "5abeefd5-4a8f-48e3-925f-80ff3bbc6a67", + "metadata": {}, + "outputs": [], + "source": [ + "#Transferring the communities of the bigger model to the small one\n", + "sinr_small.transfert_communities_labels(modele.get_communities_as_labels_sets())" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "06808028-4389-4af3-99c0-d35a815ba5d2", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-07-19 17:48:41,443 - extract_embeddings - INFO - Extracting embeddings.\n", + "2024-07-19 17:48:41,443 - extract_embeddings - INFO - Applying NFM.\n", + "2024-07-19 17:48:41,445 - get_nfm_embeddings - INFO - Starting NFM\n", + "2024-07-19 17:48:56,453 - extract_embeddings - INFO - NFM successfully applied.\n", + "2024-07-19 17:48:56,454 - extract_embeddings - INFO - Finished extracting embeddings.\n" + ] + } + ], + "source": [ + "sinr_small.extract_embeddings()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "b9a8fccd-d4db-43a5-9dbd-0031a00c08e9", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "205 missing words\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "86 missing words\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " " + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "482 missing words\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r" + ] + }, + { + "data": { + "text/plain": [ + "{'MEN': 0.3092365066427464,\n", + " 'WS353': 0.37938716672217637,\n", + " 
'SCWS': 0.3762601899389452}" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "modele_small = ge.InterpretableWordsModelBuilder(sinr_small, \"modele\", n_jobs=8, n_neighbors=4).build()\n", + "ev.similarity_MEN_WS353_SCWS(modele_small)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "164a049b-cc48-4798-9a53-b8cc2793b1a5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "('Small model : ', (9494, 4700), '(nb of words, dimensions)')" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\"Small model : \", modele_small.vectors.shape, \"(nb of words, dimensions)\"" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "68946821-0157-4e6a-807a-a00040cc962a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "('Big modele : ', (17706, 4933), '(nb of words, dimensions)')" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\"Big modele : \", modele.vectors.shape, \"(nb of words, dimensions)\"" + ] + }, + { + "cell_type": "markdown", + "id": "2be5f6b1-4c51-43b6-8071-5e78f842addc", + "metadata": {}, + "source": [ + "### Interpreting the small model in the big one's referential\n", + "\n", + "The small model and its projected version into the big one's referential have the same neighborings, but their neighboring may be different from the big one." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "2ea44036-bffd-4fc7-add8-34efb881070e", + "metadata": {}, + "outputs": [], + "source": [ + "modele_small_bigasaref = modele.get_vectors_using_self_space(modele_small)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "6bad3dda-bf1b-4f9b-9b5e-42120f3e3c7c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'object ': 'cheese',\n", + " 'neighbors ': [('bread', 0.37), ('wine', 0.35), ('bottle', 0.32)]}" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "modele_small_bigasaref.most_similar(\"cheese\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "aa0f8771-d871-4c22-a2ea-9e89849e9b52", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'object ': 'cheese',\n", + " 'neighbors ': [('bread', 0.37), ('wine', 0.35), ('bottle', 0.32)]}" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "modele_small.most_similar(\"cheese\")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "bb693b80-d6b6-45f3-802f-9de397dd71b9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'object ': 'cheese',\n", + " 'neighbors ': [('foodstuff', 0.49), ('sausage', 0.45), ('bread', 0.41)]}" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "modele.most_similar(\"cheese\")" + ] + }, + { + "cell_type": "markdown", + "id": "bc020130-d66c-4693-8c45-4977c16907ad", + "metadata": {}, + "source": [ + "The small model and its projected version into the big one's referential have the same stereotypes, but the dimensions ids are different. The dimensions ids of the projected versions are aligned with the big model." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "ec58906c-a556-4bcd-8115-5864af8734a1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'dimension': 769,\n", + " 'value': True,\n", + " 'stereotypes': [(0.23, 'hippocampus'),\n", + " (0.22, 'amygdala'),\n", + " (0.21, 'furrow')]},\n", + " {'dimension': 1968,\n", + " 'value': True,\n", + " 'stereotypes': [(0.38, 'tclr'), (0.1, 'spleen'), (0.09, 'intestine')]},\n", + " {'dimension': 1944,\n", + " 'value': True,\n", + " 'stereotypes': [(0.34, 'cord'), (0.32, 'l6-s1'), (0.3, 'spinal')]},\n", + " {'dimension': 1196,\n", + " 'value': True,\n", + " 'stereotypes': [(0.1, 'autistic'), (0.06, 'amyloid'), (0.06, 'amygdala')]},\n", + " {'dimension': 291,\n", + " 'value': True,\n", + " 'stereotypes': [(0.47, 'rdta'), (0.36, 'scid'), (0.3, 'balb')]}]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "modele.get_obj_stereotypes(\"hippocampus\")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "2543b046-e608-45e1-a4a2-1e3aa2e7ab14", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'dimension': 1812,\n", + " 'value': True,\n", + " 'stereotypes': [(0.26, 'furrow'), (0.23, 'hippocampus'), (0.2, 'cortex')]},\n", + " {'dimension': 757,\n", + " 'value': True,\n", + " 'stereotypes': [(0.11, 'cirrhosis'), (0.11, 'intestine'), (0.09, 'spleen')]},\n", + " {'dimension': 1494,\n", + " 'value': True,\n", + " 'stereotypes': [(0.35, 'cord'), (0.29, 'spinal'), (0.28, 'dorsal')]},\n", + " {'dimension': 1673,\n", + " 'value': True,\n", + " 'stereotypes': [(0.03, 'hippocampus'), (0.03, 'lifelong'), (0.03, 'diet')]},\n", + " {'dimension': 2862,\n", + " 'value': True,\n", + " 'stereotypes': [(0.12, 'translocation'),\n", + " (0.07, 'localization'),\n", + " (0.07, 'chloroplast')]}]" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"modele_small.get_obj_stereotypes(\"hippocampus\")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "d7097aeb-9e59-4f0f-a229-58b5a48d837b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'dimension': 769,\n", + " 'value': True,\n", + " 'stereotypes': [(0.26, 'furrow'), (0.23, 'hippocampus'), (0.2, 'cortex')]},\n", + " {'dimension': 1968,\n", + " 'value': True,\n", + " 'stereotypes': [(0.11, 'cirrhosis'), (0.11, 'intestine'), (0.09, 'spleen')]},\n", + " {'dimension': 3455,\n", + " 'value': True,\n", + " 'stereotypes': [(0.03, 'hippocampus'), (0.03, 'lifelong'), (0.03, 'diet')]},\n", + " {'dimension': 1944,\n", + " 'value': True,\n", + " 'stereotypes': [(0.35, 'cord'), (0.29, 'spinal'), (0.28, 'dorsal')]},\n", + " {'dimension': 1670,\n", + " 'value': True,\n", + " 'stereotypes': [(0.12, 'phytoestrogen'), (0.1, 'rodent'), (0.1, 'pufa')]}]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "modele_small_bigasaref.get_obj_stereotypes(\"hippocampus\")" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "235e87c1-6d7f-4cae-96d5-0539d6eb0d06", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'dimension': 640,\n", + " 'value': True,\n", + " 'stereotypes': [(0.12, 'caloric'), (0.11, 'beverage'), (0.08, 'intake')]},\n", + " {'dimension': 641,\n", + " 'value': True,\n", + " 'stereotypes': [(0.1, 'beverage'), (0.08, 'alcoholic'), (0.08, 'foc')]},\n", + " {'dimension': 4784,\n", + " 'value': True,\n", + " 'stereotypes': [(0.18, 'overdominance'),\n", + " (0.16, 'k1'),\n", + " (0.11, 'phosphatidylinositol')]},\n", + " {'dimension': 713,\n", + " 'value': True,\n", + " 'stereotypes': [(0.1, 'bottled'), (0.05, 'gin'), (0.04, 'vodka')]},\n", + " {'dimension': 439,\n", + " 'value': True,\n", + " 'stereotypes': [(0.17, 'parenchyma'),\n", + " (0.15, 'extravascular'),\n", + " (0.13, 'artifactual')]}]" + ] + }, + "execution_count": 23, + 
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "modele.get_obj_stereotypes(\"beverage\")" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "8d1883f7-5e87-4ba1-be06-6b3c20cb80e8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'dimension': 2220,\n", + " 'value': True,\n", + " 'stereotypes': [(0.14, 'beverage'), (0.09, 'intake'), (0.06, 'calorie')]},\n", + " {'dimension': 183,\n", + " 'value': True,\n", + " 'stereotypes': [(0.11, 'beverage'), (0.07, 'alcoholic'), (0.02, 'citrus')]},\n", + " {'dimension': 1362,\n", + " 'value': True,\n", + " 'stereotypes': [(0.04, 'alcoholic'), (0.03, 'beverage'), (0.02, 'liquor')]},\n", + " {'dimension': 524,\n", + " 'value': True,\n", + " 'stereotypes': [(0.05, 'antioxidant'), (0.04, 'fortify'), (0.04, 'bottle')]},\n", + " {'dimension': 2511,\n", + " 'value': True,\n", + " 'stereotypes': [(0.07, 'adherent'),\n", + " (0.07, 'emphysema'),\n", + " (0.06, 'neutrophil')]}]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "modele_small.get_obj_stereotypes(\"beverage\")" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "18193578-e3a1-4c37-a600-64ce76df0feb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'dimension': 640,\n", + " 'value': True,\n", + " 'stereotypes': [(0.14, 'beverage'), (0.09, 'intake'), (0.06, 'calorie')]},\n", + " {'dimension': 641,\n", + " 'value': True,\n", + " 'stereotypes': [(0.11, 'beverage'), (0.07, 'alcoholic'), (0.02, 'citrus')]},\n", + " {'dimension': 713,\n", + " 'value': True,\n", + " 'stereotypes': [(0.04, 'alcoholic'), (0.03, 'beverage'), (0.02, 'liquor')]},\n", + " {'dimension': 1618,\n", + " 'value': True,\n", + " 'stereotypes': [(0.05, 'antioxidant'), (0.04, 'fortify'), (0.04, 'bottle')]},\n", + " {'dimension': 439,\n", + " 'value': True,\n", + " 'stereotypes': [(0.07, 'adherent'),\n", + " (0.07, 'emphysema'),\n", + " (0.06, 
'neutrophil')]}]" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "modele_small_bigasaref.get_obj_stereotypes(\"beverage\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6154d09c-56f5-4690-9125-90a996e41f1e", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa124f22-269f-438e-9725-f418055e6510", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "sinr-dev", + "language": "python", + "name": "sinr-dev" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/oanc_extracted_min_freq20_max_freq5900_prct_sampling1_matrix.pkl b/notebooks/oanc_extracted_min_freq20_max_freq5900_prct_sampling1_matrix.pkl new file mode 100644 index 0000000..c5c1cd4 Binary files /dev/null and b/notebooks/oanc_extracted_min_freq20_max_freq5900_prct_sampling1_matrix.pkl differ diff --git a/notebooks/oanc_extracted_min_freq50_max_freq3000_prct_sampling08_matrix.pkl b/notebooks/oanc_extracted_min_freq50_max_freq3000_prct_sampling08_matrix.pkl new file mode 100644 index 0000000..552aaf7 Binary files /dev/null and b/notebooks/oanc_extracted_min_freq50_max_freq3000_prct_sampling08_matrix.pkl differ diff --git a/pyproject.toml b/pyproject.toml index 6f7eba3..aed3bd5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "sinr" -version = "v1.2.0" +version = "v1.3.1" description = "Build word and graph embeddings based on community detection in graphs." 
def get_matching_communities(self, sinr_vector):
    """Get the matching between two partitions with common vocabularies.

    A community of the self object (src) matches a community of
    ``sinr_vector`` (tgt) when their label sets share at least one label.
    When several communities qualify, the one with the highest index wins.

    :param sinr_vector: Small model (target)
    :type sinr_vector: SINrVectors

    :returns: Two lists. The first gives, for each community index of the
        self object (src), the index of its matching community in
        ``sinr_vector`` (tgt), or -1 when there is none. The second gives,
        for each community index of ``sinr_vector``, the index of its
        matching community in the self object, or -1 when there is none.
    :rtype: (list[int],list[int])
    """
    src_communities = self.get_communities_as_labels_sets()
    tgt_communities = sinr_vector.get_communities_as_labels_sets()

    # Inverted index: label -> index of the last target community holding it.
    # Later communities overwrite earlier ones, so a lookup yields the highest
    # target index containing that label.
    label_to_tgt = {
        label: id_tgt
        for id_tgt, labels in enumerate(tgt_communities)
        for label in labels
    }

    # The highest target index sharing a label with the source community is
    # exactly the "last non-empty intersection" of a pairwise scan, without
    # intersecting every (src, tgt) pair.
    src_to_tgt = [
        max(
            (label_to_tgt[label] for label in labels if label in label_to_tgt),
            default=-1,
        )
        for labels in src_communities
    ]

    tgt_to_src = [-1] * len(tgt_communities)
    for id_src, id_tgt in enumerate(src_to_tgt):
        # Skip unmatched source communities: using -1 as an index would
        # silently overwrite the entry of the last target community.
        if id_tgt != -1:
            tgt_to_src[id_tgt] = id_src

    return src_to_tgt, tgt_to_src


def get_vectors_using_self_space(self, sinr_vector):
    """Transpose the vectors of ``sinr_vector`` into the embedding space of
    the self object, using matching communities.

    Each stored value of ``sinr_vector.vectors`` is moved from its own
    dimension (community) to the dimension of the matching community of the
    self object. Values on dimensions without a matching community are
    dropped.

    :param sinr_vector: Small model (target)
    :type sinr_vector: SINrVectors

    :returns: Copy of the self model (the big one) with vectors of the
        parameter (small one) transposed to its referential
    :rtype: SINrVectors
    """
    import copy

    import numpy as np
    from scipy.sparse import coo_matrix

    _, matching_ts = self.get_matching_communities(sinr_vector)

    vectors = sinr_vector.vectors.tocoo()
    # Re-index every stored column (community of sinr_vector) to the matching
    # community index of the self object.
    col = np.asarray(matching_ts, dtype=np.intp)[vectors.col]
    # Unmatched communities are marked -1; coo_matrix rejects negative
    # indices, so those entries are filtered out.
    keep = col >= 0

    matrix = coo_matrix(
        (vectors.data[keep], (vectors.row[keep], col[keep])),
        shape=(sinr_vector.vectors.shape[0], self.vectors.shape[1]),
    )

    self_copy = copy.deepcopy(self)
    self_copy.set_vectors(matrix.tocsr())
    # The rows now describe the objects of sinr_vector, so its vocabulary
    # replaces the copied one.
    self_copy.vocab = sinr_vector.vocab
    return self_copy