From e5441fc2ea279f75f1560cee3757246144e11f30 Mon Sep 17 00:00:00 2001 From: Hans Chalupsky Date: Tue, 10 Jan 2023 13:54:31 -0800 Subject: [PATCH 1/5] Do not set/change DB journal_mode in readonly mode This caused an error, since it is a DB write operation. --- kgtk/kypher/sqlstore.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/kgtk/kypher/sqlstore.py b/kgtk/kypher/sqlstore.py index dc7833e26..ec184fc67 100644 --- a/kgtk/kypher/sqlstore.py +++ b/kgtk/kypher/sqlstore.py @@ -196,12 +196,15 @@ def configure(self): #self.pragma('main.page_size = 65536') # for zfs only self.pragma('main.cache_size = %d' % int(self.CACHE_SIZE / self.pragma('page_size'))) self.pragma('busy_timeout = %d' % int(self.LOCK_TIMEOUT * 1000)) - if self.single_user: - # prevents concurrent readers while DB is modified, but speeds up imports by 1.5-2x: - self.pragma('main.journal_mode=delete') - else: - # WAL-mode allows one writer and multiple readers, but slows down large data imports: - self.pragma('main.journal_mode=wal') + if not self.readonly: + # Skip this if the DB is readonly, since this change amounts to a write OP: + # TO DO: also consider/test write-protected DB file which amounts to read-only + if self.single_user: + # prevents concurrent readers while DB is modified, but speeds up imports by 1.5-2x: + self.pragma('main.journal_mode=delete') + else: + # WAL-mode allows one writer and multiple readers, but slows down large data imports + self.pragma('main.journal_mode=wal') def configure_temp_dir(self): """Configure the SQLite temp directory to be in the same location as the database file, From ac5145e02643e6833e90a1bcba68ad871e33cfbb Mon Sep 17 00:00:00 2001 From: Hans Chalupsky Date: Fri, 13 Jan 2023 15:18:11 -0800 Subject: [PATCH 2/5] Initial revision --- examples/kypherv-similarity-queries.ipynb | 4436 +++++++++++++++++++++ examples/kypherv-similarity-queries.py | 707 ++++ 2 files changed, 5143 insertions(+) create mode 100644 
examples/kypherv-similarity-queries.ipynb create mode 100644 examples/kypherv-similarity-queries.py diff --git a/examples/kypherv-similarity-queries.ipynb b/examples/kypherv-similarity-queries.ipynb new file mode 100644 index 000000000..918366c69 --- /dev/null +++ b/examples/kypherv-similarity-queries.ipynb @@ -0,0 +1,4436 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "9ef09c46", + "metadata": {}, + "source": [ + "# Query knowledge graphs and embeddings with KGTK Kypher-V" + ] + }, + { + "cell_type": "markdown", + "id": "3977040d", + "metadata": {}, + "source": [ + "Kypher-V supports import and queries over vector data. Kypher-V extends\n", + "Kypher to allow work with unstructured data such as text, images, and so\n", + "on, represented by embedding vectors. Kypher-V provides efficient storage,\n", + "indexing and querying of large-scale vector data on a laptop. It is fully\n", + "integrated into Kypher to enable expressive hybrid queries over\n", + "Wikidata-size structured and unstructured data. To the best of our\n", + "knowledge, this is the first system providing such a functionality in a\n", + "query language for knowledge graphs." + ] + }, + { + "cell_type": "markdown", + "id": "77e2c3c7", + "metadata": { + "lines_to_next_cell": 2 + }, + "source": [ + "Please see the [**Kypher-V Manual**](https://kgtk.readthedocs.io/en/latest/transform/query/#kypher-v)\n", + "for an introduction to the basic concepts and usage." 
+ ] + }, + { + "cell_type": "markdown", + "id": "fc8a63d6", + "metadata": {}, + "source": [ + "\n", + "### Setup\n", + "\n", + "Some preliminaries to facilitate command invocation and result formatting:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "a2d9faa5", + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "from IPython.display import display, HTML\n", + "from kgtk.functions import kgtk\n", + "\n", + "def show_html(img_width=150):\n", + " \"\"\"Display command output in 'out' as HTML after munging image links for inline display.\"\"\"\n", + " output = '\\n'.join(out)\n", + " html = re.sub(r'"(https?://upload.wikimedia.org/[^<]+)"', \n", + " f'', \n", + " output)\n", + " display(HTML(html))" + ] + }, + { + "cell_type": "markdown", + "id": "aae41154", + "metadata": {}, + "source": [ + "This notebook contains a number of example queries using Kypher-V. The queries assume the existence of a number of similarity graph caches in the DB directory which are defined here via shell variables:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "cc177681", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "env: DB=/kgtk-data/kypherv\n", + "env: MAIN=/kgtk-data/kypherv/wikidata-20221102-dwd-v8-main.sqlite3.db\n", + "env: COMPLEX=/kgtk-data/kypherv/wikidata-20221102-dwd-v8-complex-embeddings.sqlite3.db\n", + "env: TRANSE=/kgtk-data/kypherv/wikidata-20221102-dwd-v8-transe-embeddings.sqlite3.db\n", + "env: ABSTRACT=/kgtk-data/kypherv/wikidata-20221102-dwd-v8-abstract-embeddings.sqlite3.db\n", + "env: IMAGE=/kgtk-data/kypherv/wikimedia-capcom-image-embeddings-v2.sqlite3.db\n" + ] + } + ], + "source": [ + "DB=\"/kgtk-data/kypherv\"\n", + "%env DB={DB}\n", + "%env MAIN={DB}/wikidata-20221102-dwd-v8-main.sqlite3.db\n", + "%env COMPLEX={DB}/wikidata-20221102-dwd-v8-complex-embeddings.sqlite3.db\n", + "%env TRANSE={DB}/wikidata-20221102-dwd-v8-transe-embeddings.sqlite3.db\n", + 
"%env ABSTRACT={DB}/wikidata-20221102-dwd-v8-abstract-embeddings.sqlite3.db\n", + "%env IMAGE={DB}/wikimedia-capcom-image-embeddings-v2.sqlite3.db" + ] + }, + { + "cell_type": "markdown", + "id": "98a8081a", + "metadata": {}, + "source": [ + "If you copied the graph caches to a different location, please adjust the\n", + "paths and definitions accordingly." + ] + }, + { + "cell_type": "markdown", + "id": "3fad6635", + "metadata": { + "lines_to_next_cell": 2 + }, + "source": [ + "Throughout the notebook we use a number of different invocation styles for\n", + "the `kgtk` command to better control the appearance of the generated output.\n", + "We either use it via the `!kgtk ...` syntax directly, use the `kgtk(...)`\n", + "function which produces an HTML rendering of a Pandas frame containing the\n", + "result, or we use the `show_html` function for some additional control on\n", + "how long texts and inline images are displayed. All of these incantations\n", + "should be straightforward to translate into a shell environment if needed." + ] + }, + { + "cell_type": "markdown", + "id": "0a8425f0", + "metadata": {}, + "source": [ + "\n", + "### Similarity graph caches\n", + "\n", + "The examples in this notebook use a number of different standard and similarity\n", + "graph caches based on `wikidata-20221102-dwd-v8`. These graph caches are\n", + "available in the `DB` directory of the `ckg06` server from where they can be\n", + "copied or accessed directly in example queries. It will generally not be\n", + "possible to run the notebook directly from that server, so if you want to\n", + "run and experiment with the notebook in a Jupyter environment, you have to\n", + "copy the graph caches to a different location where a notebook server can be run.\n", + "Make sure to also include the associated ANNS index files that end in\n", + "a `.faiss.idx` extension." 
+ ] + }, + { + "cell_type": "markdown", + "id": "f0f027c2", + "metadata": {}, + "source": [ + "This notebook also does not show how the individual similarity caches were\n", + "constructed. To see how that can be done, please consult\n", + "the [**Kypher-V Manual**](https://kgtk.readthedocs.io/en/latest/transform/query/#kypher-v)\n", + "or look at the respective `*.db.build.txt` files in the `DB` directory. For reference,\n", + "we show just one incantation here on how the `COMPLEX` graph cache was built. Other\n", + "graph caches were built similarly with some modifications to adjust for differences in\n", + "the embedding data used (for `COMPLEX` this takes about 3 hours to run):" + ] + }, + { + "cell_type": "markdown", + "id": "f959fe37", + "metadata": {}, + "source": [ + "```\n", + "$ export WD=.../datasets/wikidata-20221102-dwd-v8\n", + "\n", + "$ cat $WD/wikidatadwd.complEx.graph-embeddings.txt | sed -e 's/ /\\t/' \\\n", + " | kgtk --debug add-id --no-input-header=False --input-column-names node1 node2 \\\n", + " --implied-label emb \\\n", + " / query --gc $DB/wikidata-20221102-dwd-v8-complex-embeddings.sqlite3.db \\\n", + " -i - --as complex \\\n", + " --idx vector:node2/nn/ram=25g/nlist=16k mode:valuegraph \\\n", + " --single-user --limit 5\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "21192630", + "metadata": {}, + "source": [ + "We use the following similarity graph caches which can be combined\n", + "with a main graph cache using one or more `--auxiliary-cache` or `--ac`\n", + "options. 
The `COMPLEX` graph cache contains 59M 100-D ComplEx\n", + "graph embeddings:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "0fd79247", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Graph Cache:\r\n", + "DB file: /kgtk-data/kypherv/wikidata-20221102-dwd-v8-complex-embeddings.sqlite3.db\r\n", + " size: 28.92 GB \tfree: 0 Bytes \tmodified: 2022-12-15 20:40:26\r\n", + "\r\n", + "KGTK File Information:\r\n", + "complex:\r\n", + " size: 0 Bytes \tmodified: 2022-12-15 17:55:31 \tgraph: graph_1\r\n", + "\r\n", + "Graph Table Information:\r\n", + "graph_1:\r\n", + " size: 29.76 GB \tcreated: 2022-12-15 17:55:31\r\n", + " header: ['node1', 'label', 'node2', 'id']\r\n" + ] + } + ], + "source": [ + "!kgtk query --gc $COMPLEX --sc" + ] + }, + { + "cell_type": "markdown", + "id": "f62a256f", + "metadata": {}, + "source": [ + "The `TRANSE` graph cache contains 59M 100-D TransE graph embeddings:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "a0640c6a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Graph Cache:\r\n", + "DB file: /kgtk-data/kypherv/wikidata-20221102-dwd-v8-transe-embeddings.sqlite3.db\r\n", + " size: 28.92 GB \tfree: 0 Bytes \tmodified: 2022-12-17 11:39:02\r\n", + "\r\n", + "KGTK File Information:\r\n", + "transe:\r\n", + " size: 0 Bytes \tmodified: 2022-12-16 14:09:02 \tgraph: graph_1\r\n", + "\r\n", + "Graph Table Information:\r\n", + "graph_1:\r\n", + " size: 29.76 GB \tcreated: 2022-12-16 14:09:02\r\n", + " header: ['node1', 'node2', 'label', 'id']\r\n" + ] + } + ], + "source": [ + "!kgtk query --gc $TRANSE --sc" + ] + }, + { + "cell_type": "markdown", + "id": "bde8f7a0", + "metadata": {}, + "source": [ + "The `ABSTRACT` graph cache contains the sentences and embedding vectors\n", + "generated from the first sentences of Wikipedia short abstracts. 
It\n", + "contains about 6M 768-D Roberta base vectors:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "96060121", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Graph Cache:\r\n", + "DB file: /kgtk-data/kypherv/wikidata-20221102-dwd-v8-abstract-embeddings.sqlite3.db\r\n", + " size: 26.32 GB \tfree: 0 Bytes \tmodified: 2023-01-09 18:14:00\r\n", + "\r\n", + "KGTK File Information:\r\n", + "sentence:\r\n", + " size: 256.32 MB \tmodified: 2023-01-04 13:53:44 \tgraph: graph_2\r\n", + "abstract:\r\n", + " size: 0 Bytes \tmodified: 2023-01-09 13:45:47 \tgraph: graph_1\r\n", + "\r\n", + "Graph Table Information:\r\n", + "graph_1:\r\n", + " size: 25.16 GB \tcreated: 2023-01-09 13:45:47\r\n", + " header: ['node1', 'label', 'node2', 'id']\r\n", + "graph_2:\r\n", + " size: 1.23 GB \tcreated: 2023-01-09 18:13:31\r\n", + " header: ['node1', 'label', 'node2', 'id']\r\n" + ] + } + ], + "source": [ + "!kgtk query --gc $ABSTRACT --sc" + ] + }, + { + "cell_type": "markdown", + "id": "a1b44a10", + "metadata": {}, + "source": [ + "The `IMAGE` graph cache contains image embeddings published by the\n", + "\n", + "Wikipedia image/caption matching challenge. The embeddings are 2048-D vectors\n", + "taken from the second-to-last layer of a ResNet-50 neural network trained with\n", + "Imagenet data. We only use the 2.7M images associated with English Wikipedia\n", + "pages. 
The resulting vector graph cache is shown here:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "f5e848a8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Graph Cache:\r\n", + "DB file: /kgtk-data/kypherv/wikimedia-capcom-image-embeddings-v2.sqlite3.db\r\n", + " size: 24.39 GB \tfree: 0 Bytes \tmodified: 2023-01-11 14:10:32\r\n", + "\r\n", + "KGTK File Information:\r\n", + "wiki_image:\r\n", + " size: 0 Bytes \tmodified: 2023-01-11 12:54:36 \tgraph: graph_1\r\n", + "\r\n", + "Graph Table Information:\r\n", + "graph_1:\r\n", + " size: 24.42 GB \tcreated: 2023-01-11 12:54:36\r\n", + " header: ['node1', 'label', 'node2', 'id', 'page_url', 'qnode']\r\n" + ] + } + ], + "source": [ + "!kgtk query --gc $IMAGE --sc" + ] + }, + { + "cell_type": "markdown", + "id": "bb1f5d25", + "metadata": { + "lines_to_next_cell": 2 + }, + "source": [ + "Finally, we also use a standard Wikidata graph cache for the claims and\n", + "labels of `wikidata-20221102-dwd-v8`. It is called `MAIN` below." + ] + }, + { + "cell_type": "markdown", + "id": "5705925c", + "metadata": {}, + "source": [ + "\n", + "### Vector tables are regular KGTK files\n", + "\n", + "Any KGTK representation that associates a node or edge ID with a vector\n", + "will work. A format we commonly use is where a `node1` points to a vector\n", + "literal in `node2` via an `emb` edge (but any label will do). For example,\n", + "here we show the first three embedding edges in `COMPLEX` (the `node2;_kgtk_vec_qcell`\n", + "column is an auxiliary column automatically computed by ANNS indexing):" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "17b70af7", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
node1labelnode2idnode2;_kgtk_vec_qcell
0Q102108199embb'x13x99x13?x96xb7xf9xbdxb0x99x0fxbexf1xd4|>&x...E4650080
1Q28980109embb'xa1xdax8e=xdfx17x1e>xffxa4y=xf8+(xbeaxb5!xbd...E6863370
2Q42012492embb'txb8xe4xbexfcR;?x00xd6xd1>x87x1fxcdxbeTIx88x...E17629360
\n", + "
" + ], + "text/plain": [ + " node1 label node2 \\\n", + "0 Q102108199 emb b'x13x99x13?x96xb7xf9xbdxb0x99x0fxbexf1xd4|>&x... \n", + "1 Q28980109 emb b'xa1xdax8e=xdfx17x1e>xffxa4y=xf8+(xbeaxb5!xbd... \n", + "2 Q42012492 emb b'txb8xe4xbexfcR;?x00xd6xd1>x87x1fxcdxbeTIx88x... \n", + "\n", + " id node2;_kgtk_vec_qcell \n", + "0 E465008 0 \n", + "1 E686337 0 \n", + "2 E1762936 0 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kgtk(\"\"\"query --gc $COMPLEX -i complex --limit 3\"\"\")" + ] + }, + { + "cell_type": "markdown", + "id": "77531c4a", + "metadata": {}, + "source": [ + "\n", + "### Vector computation" + ] + }, + { + "cell_type": "markdown", + "id": "97e3c057", + "metadata": {}, + "source": [ + "The simplest operation in Kypher-V is a similarity computation between two vectors\n", + "which we perform here using the `ABSTRACT` graph cache:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "10524248", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
xlabelylabelsim
0'Aristotle'@en'Socrates'@en0.908608
\n", + "
" + ], + "text/plain": [ + " xlabel ylabel sim\n", + "0 'Aristotle'@en 'Socrates'@en 0.908608" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kgtk(\"\"\" \n", + " query --gc $MAIN --ac $ABSTRACT\n", + " -i abstract -i labels\n", + " --match 'abstract: (x:Q868)-[]->(xv),\n", + " (y:Q913)-[]->(yv),\n", + " labels: (x)-[]->(xl), (y)-[]->(yl)'\n", + " --return 'xl as xlabel, yl as ylabel, kvec_cos_sim(xv, yv) as sim'\n", + " \"\"\")" + ] + }, + { + "cell_type": "markdown", + "id": "d859d976", + "metadata": {}, + "source": [ + "\n", + "### Brute-force similarity search" + ] + }, + { + "cell_type": "markdown", + "id": "4e95a7a2", + "metadata": {}, + "source": [ + "A more interesting operation is *similarity search* where we look\n", + "for the most similar matches for a given seed. In the query below, we\n", + "use a simple but expensive brute-force search over about 10,000 input\n", + "vectors by computing similarities between `x` and each possible `y`,\n", + "then sorting and returning the top-10. This is still pretty fast\n", + "given that the set of inputs is fairly small:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "d7ac35fc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
xlabelylabelsim
0'Socrates'@en'Socrates'@en1.000000
1'Socrates'@en'early life of Plato'@en0.938260
2'Socrates'@en'Aristippus'@en0.934973
3'Socrates'@en'Empedocles'@en0.930798
4'Socrates'@en'Adamantios Korais'@en0.928561
5'Socrates'@en'Menedemus'@en0.928002
6'Socrates'@en'Plato'@en0.926748
7'Socrates'@en'Eubulides'@en0.925711
8'Socrates'@en'Iosipos Moisiodax'@en0.924585
9'Socrates'@en'Henry Oldenburg'@en0.923927
\n", + "
" + ], + "text/plain": [ + " xlabel ylabel sim\n", + "0 'Socrates'@en 'Socrates'@en 1.000000\n", + "1 'Socrates'@en 'early life of Plato'@en 0.938260\n", + "2 'Socrates'@en 'Aristippus'@en 0.934973\n", + "3 'Socrates'@en 'Empedocles'@en 0.930798\n", + "4 'Socrates'@en 'Adamantios Korais'@en 0.928561\n", + "5 'Socrates'@en 'Menedemus'@en 0.928002\n", + "6 'Socrates'@en 'Plato'@en 0.926748\n", + "7 'Socrates'@en 'Eubulides'@en 0.925711\n", + "8 'Socrates'@en 'Iosipos Moisiodax'@en 0.924585\n", + "9 'Socrates'@en 'Henry Oldenburg'@en 0.923927" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kgtk(\"\"\"\n", + " query --gc $MAIN --ac $ABSTRACT\n", + " -i abstract -i labels -i claims\n", + " --match 'abstract: (x:Q913)-[]->(xv), (y)-[]->(yv),\n", + " claims: (y)-[:P106]->(:Q4964182),\n", + " labels: (x)-[]->(xl), (y)-[]->(yl)'\n", + " --return 'xl as xlabel, yl as ylabel, kvec_cos_sim(xv, yv) as sim'\n", + " --order 'sim desc'\n", + " --limit 10\n", + " \"\"\")" + ] + }, + { + "cell_type": "markdown", + "id": "7cef3f07", + "metadata": {}, + "source": [ + "There are about 9M Q5's (humans) that have short abstract vectors:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "d4a26fd8", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
count(DISTINCT graph_1_c2.\"node1\")
08944218
\n", + "
" + ], + "text/plain": [ + " count(DISTINCT graph_1_c2.\"node1\")\n", + "0 8944218" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kgtk(\"\"\"\n", + " query --gc $MAIN --ac $ABSTRACT\n", + " -i abstract -i labels -i claims\n", + " --match 'abstract: (x:Q913)-[]->(xv),\n", + " claims: (y)-[:P31]->(:Q5)'\n", + " --return 'count(distinct y)' --force\n", + " \"\"\")" + ] + }, + { + "cell_type": "markdown", + "id": "2dbc73f5", + "metadata": {}, + "source": [ + "If we used the same brute-force search from above on this much larger set,\n", + "it would take about 5 min to run (which is why this command is disabled):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "70592e3a", + "metadata": {}, + "outputs": [], + "source": [ + "!time DISABLED kgtk query --gc $MAIN \\\n", + " --ac $ABSTRACT \\\n", + " -i abstract -i labels -i claims \\\n", + " --match 'abstract: (x:Q913)-[]->(xv), (y)-[]->(yv), \\\n", + " claims: (y)-[:P31]->(:Q5), \\\n", + " labels: (x)-[]->(xl), (y)-[]->(yl)' \\\n", + " --return 'xl as xlabel, yl as ylabel, kvec_cos_sim(xv, yv) as sim' \\\n", + " --order 'sim desc' \\\n", + " --limit 10" + ] + }, + { + "cell_type": "markdown", + "id": "cf153415", + "metadata": {}, + "source": [ + "```\n", + "xlabel\tylabel\tsim\n", + "'Socrates'@en\t'Socrates'@en\t1.0000001192092896\n", + "'Socrates'@en\t'Anytus'@en\t0.9346579909324646\n", + "'Socrates'@en\t'Heraclitus'@en\t0.9344534277915955\n", + "'Socrates'@en\t'Hippocrates'@en\t0.9304061532020569\n", + "'Socrates'@en\t'Cleisthenes'@en\t0.9292828440666199\n", + "'Socrates'@en\t'Aristides'@en\t0.9283562898635864\n", + "'Socrates'@en\t'Yannis Xirotiris'@en\t0.926308274269104\n", + "'Socrates'@en\t'Sotiris Trivizas'@en\t0.9255445003509521\n", + "'Socrates'@en\t'Aris Maragkopoulos'@en\t0.9234243035316467\n", + "'Socrates'@en\t'Valerios Stais'@en\t0.919943630695343\n", + "93.859u 38.640s 4:49.84 45.7%\t0+0k 18782808+8io 0pf+0w\n", + 
"```" + ] + }, + { + "cell_type": "markdown", + "id": "952b21f2", + "metadata": {}, + "source": [ + "\n", + "### Indexed similarity search" + ] + }, + { + "cell_type": "markdown", + "id": "82a9b508", + "metadata": {}, + "source": [ + "For much faster search, we use an ANNS index constructed when the vector data\n", + "was imported which now runs in less than a second compared to 5 minutes before.\n", + "Results here are slightly different from above, since it does not restrict on\n", + "occupation = philosopher (we will address that later):" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "2aab7ebe", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
xlabelylabelsim
0'Socrates'@en'Socrates'@en1.000000
1'Socrates'@en'Histories'@en0.937620
2'Socrates'@en'Cadmus'@en0.915083
3'Socrates'@en'Eudorus of Alexandria'@en0.914027
4'Socrates'@en'John Wilkins'@en0.913926
\n", + "
" + ], + "text/plain": [ + " xlabel ylabel sim\n", + "0 'Socrates'@en 'Socrates'@en 1.000000\n", + "1 'Socrates'@en 'Histories'@en 0.937620\n", + "2 'Socrates'@en 'Cadmus'@en 0.915083\n", + "3 'Socrates'@en 'Eudorus of Alexandria'@en 0.914027\n", + "4 'Socrates'@en 'John Wilkins'@en 0.913926" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kgtk(\"\"\"\n", + " query --gc $MAIN --ac $ABSTRACT\n", + " -i abstract -i labels -i claims\n", + " --match 'abstract: (x:Q913)-[]->(xv),\n", + " (xv)-[r:kvec_topk_cos_sim {k: 5, nprobe: 4}]->(y),\n", + " labels: (x)-[]->(xl), (y)-[]->(yl)'\n", + " --return 'xl as xlabel, yl as ylabel, r.similarity as sim'\n", + " --limit 10\n", + " \"\"\")" + ] + }, + { + "cell_type": "markdown", + "id": "fdafba56", + "metadata": {}, + "source": [ + "\n", + "### Full similarity join\n", + "\n", + "Below we query for three philosophers' top-k similar neighbors that are also humans and have\n", + "occupation (`P106`) philosopher. Dynamic scaling ensures that `k` gets increased dynamically\n", + "up to `maxk` until we've found enough qualifying results for each:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "29dc4f04", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
xlabelylabelsim
0'Plato'@en'Aenesidemus'@en0.965394
1'Plato'@en'Hicetas'@en0.964990
2'Plato'@en'Empedocles'@en0.962913
3'Plato'@en'Eubulides'@en0.962904
4'Plato'@en'Aristotle'@en0.961594
5'Aristotle'@en'Bryson of Achaea'@en0.974303
6'Aristotle'@en'Michael Papageorgiou'@en0.970041
7'Aristotle'@en'Hicetas'@en0.967692
8'Aristotle'@en'Anaxarchus'@en0.967682
9'Aristotle'@en'Metrodorus of Lampsacus'@en0.967349
10'Socrates'@en'Eudorus of Alexandria'@en0.914027
11'Socrates'@en'John Wilkins'@en0.913926
12'Socrates'@en'Eurytus'@en0.911165
13'Socrates'@en'Syrianus'@en0.908286
14'Socrates'@en'Peter the Iberian'@en0.907799
\n", + "
" + ], + "text/plain": [ + " xlabel ylabel sim\n", + "0 'Plato'@en 'Aenesidemus'@en 0.965394\n", + "1 'Plato'@en 'Hicetas'@en 0.964990\n", + "2 'Plato'@en 'Empedocles'@en 0.962913\n", + "3 'Plato'@en 'Eubulides'@en 0.962904\n", + "4 'Plato'@en 'Aristotle'@en 0.961594\n", + "5 'Aristotle'@en 'Bryson of Achaea'@en 0.974303\n", + "6 'Aristotle'@en 'Michael Papageorgiou'@en 0.970041\n", + "7 'Aristotle'@en 'Hicetas'@en 0.967692\n", + "8 'Aristotle'@en 'Anaxarchus'@en 0.967682\n", + "9 'Aristotle'@en 'Metrodorus of Lampsacus'@en 0.967349\n", + "10 'Socrates'@en 'Eudorus of Alexandria'@en 0.914027\n", + "11 'Socrates'@en 'John Wilkins'@en 0.913926\n", + "12 'Socrates'@en 'Eurytus'@en 0.911165\n", + "13 'Socrates'@en 'Syrianus'@en 0.908286\n", + "14 'Socrates'@en 'Peter the Iberian'@en 0.907799" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kgtk(\"\"\"\n", + " query --gc $MAIN --ac $ABSTRACT\n", + " -i abstract -i labels -i claims\n", + " --match 'abstract: (x)-[]->(xv),\n", + " (xv)-[r:kvec_topk_cos_sim {k: 5, maxk: 1024, nprobe: 4}]->(y),\n", + " claims: (y)-[:P106]->(:Q4964182),\n", + " (y)-[:P31]->(:Q5),\n", + " labels: (x)-[]->(xl), (y)-[]->(yl)'\n", + " --where 'x in [\"Q859\", \"Q868\", \"Q913\"] and x != y'\n", + " --return 'xl as xlabel, yl as ylabel, r.similarity as sim'\n", + " \"\"\")" + ] + }, + { + "cell_type": "markdown", + "id": "0a1b627c", + "metadata": {}, + "source": [ + "For comparison, here is a run without dynamic scaling which returns much fewer results, since\n", + "only a small number of the top-5 similar results for each input also satisfy the post conditions:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "17cdf5f7", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
xlabelylabelsim
0'Plato'@en'Aenesidemus'@en0.965394
1'Plato'@en'Hicetas'@en0.964990
2'Plato'@en'Empedocles'@en0.962913
3'Plato'@en'Eubulides'@en0.962904
4'Aristotle'@en'Bryson of Achaea'@en0.974303
5'Aristotle'@en'Michael Papageorgiou'@en0.970041
6'Aristotle'@en'Hicetas'@en0.967692
7'Aristotle'@en'Anaxarchus'@en0.967682
8'Socrates'@en'Eudorus of Alexandria'@en0.914027
9'Socrates'@en'John Wilkins'@en0.913926
\n", + "
" + ], + "text/plain": [ + " xlabel ylabel sim\n", + "0 'Plato'@en 'Aenesidemus'@en 0.965394\n", + "1 'Plato'@en 'Hicetas'@en 0.964990\n", + "2 'Plato'@en 'Empedocles'@en 0.962913\n", + "3 'Plato'@en 'Eubulides'@en 0.962904\n", + "4 'Aristotle'@en 'Bryson of Achaea'@en 0.974303\n", + "5 'Aristotle'@en 'Michael Papageorgiou'@en 0.970041\n", + "6 'Aristotle'@en 'Hicetas'@en 0.967692\n", + "7 'Aristotle'@en 'Anaxarchus'@en 0.967682\n", + "8 'Socrates'@en 'Eudorus of Alexandria'@en 0.914027\n", + "9 'Socrates'@en 'John Wilkins'@en 0.913926" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kgtk(\"\"\"\n", + " query --gc $MAIN --ac $ABSTRACT\n", + " -i abstract -i labels -i claims\n", + " --match 'abstract: (x)-[]->(xv),\n", + " (xv)-[r:kvec_topk_cos_sim {k: 5, nprobe: 4}]->(y),\n", + " claims: (y)-[:P106]->(:Q4964182),\n", + " (y)-[:P31]->(:Q5),\n", + " labels: (x)-[]->(xl), (y)-[]->(yl)'\n", + " --where 'x in [\"Q859\", \"Q868\", \"Q913\"] and x != y'\n", + " --return 'xl as xlabel, yl as ylabel, r.similarity as sim'\n", + " \"\"\")" + ] + }, + { + "cell_type": "markdown", + "id": "17354570", + "metadata": {}, + "source": [ + "\n", + "## Example applications" + ] + }, + { + "cell_type": "markdown", + "id": "2580e356", + "metadata": {}, + "source": [ + "### Image search" + ] + }, + { + "cell_type": "markdown", + "id": "48725c8b", + "metadata": {}, + "source": [ + "In the examples below, we use image similarity to link QNodes in Wikidata. We\n", + "use the precomputed `IMAGE` graph cache (see above) which contains embeddings\n", + "for about 2.7M images linked to their respective Wikipedia pages and Wikidata\n", + "QNodes. 
" + ] + }, + { + "cell_type": "markdown", + "id": "882d2c4a", + "metadata": {}, + "source": [ + "We start with a QNode (such as the one for Barack Obama below), find one or more\n", + "images associated with that QNode, look up their image embeddings and then find\n", + "other similar images and their associated QNodes." + ] + }, + { + "cell_type": "markdown", + "id": "12333d3d", + "metadata": {}, + "source": [ + "We do not compute any image embeddings on the fly here; we simply link nodes based\n", + "on similarity of images they are associated with. Note that this will often not\n", + "preserve the type of the source node as can be seen in the result for Barack Obama.\n", + "To enforce such type or other restrictions, additional clauses can be added.\n", + "Since there are multiple images associated with Barack Obama, we use a `not exists`\n", + "clause to only look at the first one to make the results less cluttered:\n", + "\n", + "Barack Obama:" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "b2385005", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
qnodelabelsimimage
Q76'Barack Obama'@en1
Q567497'France–Germany relations'@en0.77576
Q27804564'Wahidullah Waissi'@en0.75814
Q7747'Vladimir Putin'@en0.75264
Q188888'Teachers\\' Day'@en0.75262
Q702725'Shirani Bandaranayake'@en0.75063
Q18274595'list of international presidential trips made by Serzh Sargsyan'@en0.74954
Q1151352'John Piper'@en0.74702
Q170645'2018 FIFA World Cup'@en0.74702
Q381157'Orrin Hatch'@en0.74424
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "out = !kgtk query --gc $IMAGE --ac $MAIN \\\n", + " -i wiki_image -i labels \\\n", + " --match 'image: (ximg)-[rx {qnode: $SEED}]->(xiv), \\\n", + " (xiv)-[r:kvec_topk_cos_sim {k: 10, nprobe: 8}]->(yimg), \\\n", + " (yimg)-[ry {qnode: y}]->(), \\\n", + " labels: (y)-[]->(ylabel)' \\\n", + " --where 'not exists {image: (ximg2)-[{qnode: $SEED}]->() WHERE rowid(ximg2) < rowid(ximg) }' \\\n", + " --return 'y as qnode, ylabel as label, printf(\"%.5g\", r.similarity) as sim, yimg as image' \\\n", + " --para SEED=Q76 \\\n", + " / html\n", + "\n", + "show_html(img_width=200)" + ] + }, + { + "cell_type": "markdown", + "id": "d843c115", + "metadata": {}, + "source": [ + "To get more type appropriate matches, we can add a restriction to only return matches of\n", + "type human (`Q5`):" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "bbdb90b8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
qnodelabelsimimage
Q76'Barack Obama'@en1
Q27804564'Wahidullah Waissi'@en0.75814
Q7747'Vladimir Putin'@en0.75264
Q702725'Shirani Bandaranayake'@en0.75063
Q1151352'John Piper'@en0.74702
Q381157'Orrin Hatch'@en0.74424
Q2339668'Twan Huys'@en0.749
Q128949'Miri Regev'@en0.73791
Q160157'Joe Lieberman'@en0.7345
Q355130'Richard Petty'@en0.75015
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "out = !kgtk query --gc $IMAGE --ac $MAIN \\\n", + " -i wiki_image -i labels -i claims \\\n", + " --match 'image: (ximg)-[rx {qnode: $SEED}]->(xiv), \\\n", + " (xiv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(yimg), \\\n", + " (yimg)-[ry {qnode: y}]->(), \\\n", + " claims: (y)-[:P31]->(:Q5), \\\n", + " labels: (y)-->(ylabel)' \\\n", + " --where 'not exists {image: (ximg2)-[{qnode: $SEED}]->() WHERE rowid(ximg2) < rowid(ximg) }' \\\n", + " --return 'y as qnode, ylabel as label, printf(\"%.5g\", r.similarity) as sim, yimg as image' \\\n", + " --para SEED=Q76 \\\n", + " / html\n", + "\n", + "show_html(img_width=200)" + ] + }, + { + "cell_type": "markdown", + "id": "23953f81", + "metadata": {}, + "source": [ + "Charles Dadant: again, note that some of the results are not of type human but are\n", + "just linked to a similar image:" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "965f657a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
qnodelabelsimimage
Q582964'Charles Dadant'@en1
Q5956831'Hymns Ancient and Modern'@en0.84983
Q3759575'list of American Civil War generals (Confederate)'@en0.84305
Q6084534'Ismael Cerna'@en0.832
Q26003'Sergey Botkin'@en0.82388
Q5494660'Fred Bonsor'@en0.81946
Q3303297'ironmaster'@en0.81704
Q4631421'22nd Regiment Alabama Infantry'@en0.80955
Q4641399'5th North Carolina Regiment'@en0.80858
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "out = !kgtk query --gc $IMAGE --ac $MAIN \\\n", + " -i wiki_image -i labels \\\n", + " --match 'image: (ximg)-[rx {qnode: $SEED}]->(xiv), \\\n", + " (xiv)-[r:kvec_topk_cos_sim {k: 10, nprobe: 8}]->(yimg), \\\n", + " (yimg)-[ry {qnode: y}]->(), \\\n", + " labels: (y)-[]->(ylabel)' \\\n", + " --where 'not exists {image: (ximg2)-[{qnode: $SEED}]->() WHERE rowid(ximg2) < rowid(ximg) }' \\\n", + " --return 'y as qnode, ylabel as label, printf(\"%.5g\", r.similarity) as sim, yimg as image' \\\n", + " --para SEED=Q582964 \\\n", + " --limit 20 \\\n", + " / html\n", + "\n", + "show_html(img_width=100)" + ] + }, + { + "cell_type": "markdown", + "id": "933c92d5", + "metadata": {}, + "source": [ + "Beaumaris Castle in Wales:" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "fe02b490", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [ + { + "data": { + "text/html": [ + "
qnodelabelsimimage
Q756815'Beaumaris Castle'@en1
Q267153'list of monasteries dissolved by Henry VIII of England'@en0.79353
Q6566349'list of Category A listed buildings in Dumfries and Galloway'@en0.79212
Q40889043'Scheduled monuments in Renfrewshire'@en0.7897
Q912664'Clan MacDougall'@en0.78582
Q922422'Warkworth Castle'@en0.78453
Q6566359'list of Category A listed buildings in Fife'@en0.78269
Q16148507'list of Historic Scotland properties'@en0.78237
Q11808'castles in Great Britain and Ireland'@en0.78151
Q16148507'list of Historic Scotland properties'@en0.78122
Q733902'Pirou'@en0.77984
Q2705370'Middleham Castle'@en0.77753
Q941276'Warwick Castle'@en0.77307
Q1043908'Carnasserie Castle'@en0.77267
Q2970999'Château de Villandraut'@en0.7721
Q4185866'Ford Castle'@en0.77163
Q1477839'Tautra Abbey'@en0.7709
Q4434870'list of museums in the Republic of Ireland'@en0.77058
Q42646'Windsor Castle'@en0.76578
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "out = !kgtk query --gc $IMAGE --ac $MAIN \\\n", + " -i wiki_image -i labels \\\n", + " --match 'image: (ximg)-[rx {qnode: $SEED}]->(xiv), \\\n", + " (xiv)-[r:kvec_topk_cos_sim {k: 20, nprobe: 8}]->(yimg), \\\n", + " (yimg)-[ry {qnode: y}]->(), \\\n", + " labels: (y)-[]->(ylabel)' \\\n", + " --where 'not exists {image: (ximg2)-[{qnode: $SEED}]->() WHERE rowid(ximg2) < rowid(ximg) }' \\\n", + " --return 'y as qnode, ylabel as label, printf(\"%.5g\", r.similarity) as sim, yimg as image' \\\n", + " --para SEED=Q756815 \\\n", + " / html\n", + "\n", + "show_html()" + ] + }, + { + "cell_type": "markdown", + "id": "503ecae3", + "metadata": {}, + "source": [ + "" + ] + }, + { + "cell_type": "markdown", + "id": "0635480c", + "metadata": {}, + "source": [ + "Castles similar to Beaumaris Castle but that are located in Austria (with\n", + "country (`P17`) equal to `Q40`). We use a full vector join to get relevant\n", + "results further down the similarity list. Note that even with `maxk=1024` we only\n", + "get a few results, and that the similarities are significantly lower than in the\n", + "previous example:" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "6a7e72cc", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [ + { + "data": { + "text/html": [ + "
qnodelabelsimimage
Q1012592'Burgruine Kaja'@en0.72402
Q15954565'Austrian walled towns'@en0.74951
Q1015533'Burgruine Steuerberg'@en0.70776
Q1015457'Prandegg Castle'@en0.70276
Q188358'Burgruine Dürnstein'@en0.70275
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "out = !kgtk query --gc $IMAGE --ac $MAIN \\\n", + " -i wiki_image -i labels -i claims \\\n", + " --match 'image: (ximg)-[rx {qnode: $SEED}]->(xiv), \\\n", + " (xiv)-[r:kvec_topk_cos_sim {k: 20, nprobe: 4, maxk: 1024}]->(yimg), \\\n", + " (yimg)-[ry {qnode: y}]->(), \\\n", + " labels: (y)-[]->(ylabel), \\\n", + " claims: (y)-[:P17]->(c:Q40)' \\\n", + " --where 'not exists {image: (ximg2)-[{qnode: $SEED}]->() WHERE rowid(ximg2) < rowid(ximg) }' \\\n", + " --return 'y as qnode, ylabel as label, printf(\"%.5g\", r.similarity) as sim, yimg as image' \\\n", + " --para SEED=Q756815 \\\n", + " --limit 20 \\\n", + " / html\n", + "\n", + "show_html()" + ] + }, + { + "cell_type": "markdown", + "id": "a4547c45", + "metadata": {}, + "source": [ + "\n", + "## Text embedding queries:" + ] + }, + { + "cell_type": "markdown", + "id": "ce74f23f", + "metadata": {}, + "source": [ + "In the following example we dynamically compute an embedding vector\n", + "for a text query and then use the similarity machinery to query for\n", + "matching QNodes. The basic story here is the following:" + ] + }, + { + "cell_type": "markdown", + "id": "33444394", + "metadata": {}, + "source": [ + "- formulate a simple textual query such as 'Ancient Greek philosopher'\n", + "- create a KGTK input file for it/them and run them through the 'text-embedding' command\n", + "- query WD by finding top-k matches based on short abstract text embedding vectors\n", + "- then filter with additional restrictions to get more relevant results." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "cc90540b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Running with logging level 30\n", + "2023-01-13 13:53:13.932934: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory\n", + "2023-01-13 13:53:13.932961: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\n", + "/home/hans/.local/share/virtualenvs/ksink39/lib/python3.9/site-packages/torch/cuda/__init__.py:52: UserWarning: CUDA initialization: The NVIDIA driver on your system is too old (found version 9010). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver. 
(Triggered internally at /pytorch/c10/cuda/CUDAFunctions.cpp:115.)\n", + " return torch._C._cuda_getDeviceCount() > 0\n", + "Batches: 100%|████████████████████████████████████| 1/1 [00:00<00:00, 16.01it/s]\n", + "node1\n", + "q1\n", + "q2\n", + "q3\n" + ] + } + ], + "source": [ + "!echo '\\\n", + "q1\tAncient Greek philosopher\\n\\\n", + "q2\tcastle in Austria\\n\\\n", + "q3\taward-winning actor and comedian' | \\\n", + "sed -e 's/^ *//' | \\\n", + "kgtk cat --no-input-header --input-column-names node1 node2 --implied-label sentence \\\n", + " / add-id \\\n", + " / text-embedding -i - --model roberta-base-nli-mean-tokens \\\n", + " --output-data-format kgtk --output-property emb -o - \\\n", + " / query -i - --idx vector:node2 --as text_emb_queries --match '(x)' --return x" + ] + }, + { + "cell_type": "markdown", + "id": "e79d64e4", + "metadata": {}, + "source": [ + "The above created 768-D text embedding vectors for the three short queries\n", + "using the same text embedding type as used in our `ABSTRACT` embeddings.\n", + "Now we find Wikidata QNodes whose short-abstract embedding vector is most similar\n", + "to the queries, and that satisfy any additional conditions we might have.\n", + "Note that the queries in this example are much shorter than the first sentences\n", + "of our Wikipedia abstracts, thus the similarity matching is not very good, but\n", + "we can compensate for some of that by adding additional restrictions:" + ] + }, + { + "cell_type": "markdown", + "id": "888c6c20", + "metadata": {}, + "source": [ + "Matches for \"Ancient Greek philosopher\" that have occupation (`P106`) philosopher:" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "a98a939d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<table border=\"1\" class=\"dataframe\">
yylabelsimysent
Q325955'Speusippus'@en0.9440442323684692Speusippus (/spjuːˈsɪpəs/; Greek: Σπεύσιππος; c. 408 – 339/8 BC) was an ancient Greek philosopher.
Q1200209'Dercil·lides'@en0.9357015490531921Dercyllides was an ancient Greek Platonist philosopher.
Q2927235'Bryson of Achaea'@en0.9300292134284973Bryson of Achaea (or Bryson the Achaean; Greek: Βρύσων ὁ Ἀχαιός Vryson o Acheos, gen.: Βρύσωνος Vrysonos; fl. 330 BC) was an ancient Greek philosopher.
Q9250176'Echecratides'@en0.9262670874595642Echecratides (Ancient Greek: Ἐχεκρατίδης) was an Ancient Greek Peripatetic philosopher who is mentioned among the disciples of Aristotle.
Q668009'Aristotelis the Dialectician'@en0.9235112071037292Aristotle the Dialectician (or Aristoteles of Argos, Greek: Ἀριστοτέλης; fl. 3rd century BC), was an ancient Greek dialectic philosopher from Argos.
Q366031'Anaxarchus'@en0.9216422438621521Anaxarchus (/ˌænəɡˈzɑːrkəs/; Ancient Greek: Ἀνάξαρχος; c. 380 – c. 320 BC) was a Greek philosopher of the school of Democritus.
Q297420'Panaetius'@en0.9199343919754028Panaetius (/pəˈniːʃiəs/; Greek: Παναίτιος, translit. Panetios; c.  185 – c.  110/109 BC) of Rhodes was an ancient Greek Stoic philosopher.
Q962486'Echecrates of Flius'@en0.9173671007156372Echecrates (Greek: Ἐχεκράτης) was a Pythagorean philosopher from the ancient Greek town of Phlius.
Q365977'Bias of Priene'@en0.9115197658538818Bias (/ˈbaɪəs/; Greek: Βίας ὁ Πριηνεύς; fl. 6th century BC) of Priene was a Greek sage.
Q13634113'Michael Papageorgiou'@en0.9098575115203857Michail Papageorgiou (Greek: Μιχαήλ Παπαγεωργίου; 1727–1796) was a Greek philosopher.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "out = !kgtk query --ac $MAIN --ac $ABSTRACT \\\n", + " -i text_emb_queries -i abstract -i labels -i claims -i sentence \\\n", + " --match 'queries: (x:q1)-[]->(xv), \\\n", + " abstract: (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 4}]->(y), \\\n", + " claims: (y)-[:P106]->(:Q4964182), \\\n", + " labels: (y)-->(yl), \\\n", + " sentence: (y)-->(ys)' \\\n", + " --return 'y as y, yl as ylabel, r.similarity as sim, kgtk_lqstring_text(ys) as ysent' \\\n", + " / html\n", + "\n", + "show_html()" + ] + }, + { + "cell_type": "markdown", + "id": "ef9fb4af", + "metadata": {}, + "source": [ + "Matches for \"castle in Austria\" that have country (`P17`) Austria:" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "3e6c1169", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
yylabelsimysent
Q673952'Haidershofen'@en0.9632641077041626Haidershofen is a town located in Austria.
Q256996'Grieskirchen'@en0.9585073590278625Grieskirchen is a town in Austria.
Q2240044'Annabichl Castle'@en0.9552702307701111Annabichl Castle is a castle in Austria.
Q7378773'Ruine Hauenstein'@en0.9487172365188599Ruine Hauenstein is a castle in Styria, Austria.
Q37809497'Ruine Neudeck'@en0.9469427466392517Ruine Neudeck is a castle in Styria, Austria.
Q7378781'Ruine Raabeck'@en0.946899950504303Ruine Raabeck is a castle in Styria, Austria.
Q4998499'Burg Kaisersberg'@en0.9449542760848999Burg Kaisersberg is a castle in Styria, Austria.
Q674097'Mannersdorf am Leithagebirge'@en0.9442192316055298Mannersdorf am Leithagebirge is a town in Austria.
Q7378769'Ruine Kalsberg'@en0.943941056728363Ruine Kalsberg is a castle in Styria, Austria.
Q1012734'Burg Krems'@en0.942907452583313Burg Krems is a castle in Styria, Austria.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "out = !kgtk query --ac $MAIN --ac $ABSTRACT \\\n", + " -i text_emb_queries -i abstract -i labels -i claims -i sentence \\\n", + " --match 'queries: (x:q2)-[]->(xv), \\\n", + " abstract: (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), \\\n", + " claims: (y)-[:P17]->(:Q40), \\\n", + " labels: (y)-->(yl), \\\n", + " sentence: (y)-->(ys)' \\\n", + " --return 'y as y, yl as ylabel, r.similarity as sim, kgtk_lqstring_text(ys) as ysent' \\\n", + " / html\n", + "\n", + "show_html()" + ] + }, + { + "cell_type": "markdown", + "id": "0a0999da", + "metadata": {}, + "source": [ + "Matches for \"award-winning actor and comedian\" that are of type human\n", + "and have country of citizenship (`P27`) UK:" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "3bc0fe55", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [ + { + "data": { + "text/html": [ + "
yylabelsimysent
Q27924985'Toby Williams'@en0.9054120779037476Toby Williams is a British actor, writer and award-winning stand-up comedian performing both as himself and Dr George Ryegold.
Q7087463'Oliver Cotton'@en0.8896428942680359Oliver Charles Cotton (born 20 June 1944) is an English actor, comedian and playwright, known for his prolific work on stage, TV and film.
Q7704327'Terry Duggan'@en0.8872928619384766Terence A. Duggan (15 April 1932 – 1 May 2008) was a British comedian and actor who had a successful career in cabaret and variety, and played numerous character roles on television.
Q23772268'Guz Khan'@en0.8805128335952759Ghulam Dustgir \\"Guz\\" Khan (born 1986) is a British comedian, impressionist, and actor best known for his work in the TV show Man Like Mobeen and stand up appearances in Live at the Apollo.
Q6988861'Neil Linpow'@en0.8776082992553711Neil Linpow is a multi-award-winning English actor, writer and filmmaker.
Q7320263'Rhashan Stone'@en0.8773206472396851Rhashan Stone is an American actor and comedian based in the UK. He is best known for appearing in many comedy shows such as Desmond\\'s and Mutual Friends.
Q7608608'Stephen Ashfield'@en0.8739212155342102Stephen Ashfield is an Olivier Award-winning Scottish actor.
Q5290454'Dominic Anciano'@en0.8687206506729126Dominic Anciano (born 1959) is an English producer, actor, director, writer and comedian best known for his role as Sgt.
Q7626524'Stuart Fell'@en0.8686633110046387Stuart Fell is a professional actor and stuntman.
Q5534773'Geoffrey McGivern'@en0.8672927021980286Geoffrey M. McGivern is a British actor in film, radio, stage and television, as well as a comedian.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "out = !kgtk query --ac $MAIN --ac $ABSTRACT \\\n", + " -i text_emb_queries -i abstract -i labels -i claims -i sentence \\\n", + " --match 'queries: (x:q3)-[]->(xv), \\\n", + " abstract: (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), \\\n", + " claims: (y)-[:P31]->(:Q5), \\\n", + " (y)-[:P27]->(:Q145), \\\n", + " labels: (y)-->(yl), \\\n", + " sentence: (y)-->(ys)' \\\n", + " --return 'y as y, yl as ylabel, r.similarity as sim, kgtk_lqstring_text(ys) as ysent' \\\n", + " / html\n", + "\n", + "show_html()" + ] + }, + { + "cell_type": "markdown", + "id": "5aa4d67d", + "metadata": {}, + "source": [ + "\n", + "## Comparing different types of embeddings" + ] + }, + { + "cell_type": "markdown", + "id": "58ee0ab0", + "metadata": {}, + "source": [ + "Below we run a number of similarity queries for each of our various types of\n", + "embeddings to see how they behave relative to each other. Note how they\n", + "behave quite differently, reasonable for some use cases but not so much for others:" + ] + }, + { + "cell_type": "markdown", + "id": "27d5f099", + "metadata": {}, + "source": [ + "### Philosophers:" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "1b1ee570", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
xlabelylabelsim
0'Plato'@en'Plato'@en1.000000
1'Plato'@en'Socrates'@en0.778851
2'Plato'@en'Epicurus'@en0.768200
3'Plato'@en'Aratus'@en0.744131
4'Plato'@en'Hippocrates'@en0.742684
5'Plato'@en'Theophrastus'@en0.732886
6'Plato'@en'Aeschines'@en0.727185
7'Plato'@en'Antiphon of Rhamnus'@en0.725084
8'Plato'@en'Gorgias'@en0.724764
9'Plato'@en'Antisthenes'@en0.723077
10'Aristotle'@en'Aristotle'@en1.000000
11'Aristotle'@en'Isocrates'@en0.750148
12'Aristotle'@en'Theophrastus'@en0.738264
13'Aristotle'@en'Augustin-Jean Fresnel'@en0.733388
14'Aristotle'@en'Epicurus'@en0.732325
15'Aristotle'@en'Democritus'@en0.725204
16'Aristotle'@en'Philostratus'@en0.724171
17'Aristotle'@en'Thalis'@en0.722407
18'Aristotle'@en'Simplicius of Cilicia'@en0.721479
19'Aristotle'@en'Apollonius of Tyana'@en0.719047
20'Socrates'@en'Socrates'@en1.000000
21'Socrates'@en'Isocrates'@en0.823131
22'Socrates'@en'Xenocrates'@en0.817216
23'Socrates'@en'Antisthenes'@en0.805441
24'Socrates'@en'Seneca'@en0.789611
25'Socrates'@en'Xenophon'@en0.780991
26'Socrates'@en'Theopompus'@en0.777682
27'Socrates'@en'Clitomachus'@en0.774966
28'Socrates'@en'Aeschines'@en0.773590
29'Socrates'@en'Antiphon of Rhamnus'@en0.767029
\n", + "
" + ], + "text/plain": [ + " xlabel ylabel sim\n", + "0 'Plato'@en 'Plato'@en 1.000000\n", + "1 'Plato'@en 'Socrates'@en 0.778851\n", + "2 'Plato'@en 'Epicurus'@en 0.768200\n", + "3 'Plato'@en 'Aratus'@en 0.744131\n", + "4 'Plato'@en 'Hippocrates'@en 0.742684\n", + "5 'Plato'@en 'Theophrastus'@en 0.732886\n", + "6 'Plato'@en 'Aeschines'@en 0.727185\n", + "7 'Plato'@en 'Antiphon of Rhamnus'@en 0.725084\n", + "8 'Plato'@en 'Gorgias'@en 0.724764\n", + "9 'Plato'@en 'Antisthenes'@en 0.723077\n", + "10 'Aristotle'@en 'Aristotle'@en 1.000000\n", + "11 'Aristotle'@en 'Isocrates'@en 0.750148\n", + "12 'Aristotle'@en 'Theophrastus'@en 0.738264\n", + "13 'Aristotle'@en 'Augustin-Jean Fresnel'@en 0.733388\n", + "14 'Aristotle'@en 'Epicurus'@en 0.732325\n", + "15 'Aristotle'@en 'Democritus'@en 0.725204\n", + "16 'Aristotle'@en 'Philostratus'@en 0.724171\n", + "17 'Aristotle'@en 'Thalis'@en 0.722407\n", + "18 'Aristotle'@en 'Simplicius of Cilicia'@en 0.721479\n", + "19 'Aristotle'@en 'Apollonius of Tyana'@en 0.719047\n", + "20 'Socrates'@en 'Socrates'@en 1.000000\n", + "21 'Socrates'@en 'Isocrates'@en 0.823131\n", + "22 'Socrates'@en 'Xenocrates'@en 0.817216\n", + "23 'Socrates'@en 'Antisthenes'@en 0.805441\n", + "24 'Socrates'@en 'Seneca'@en 0.789611\n", + "25 'Socrates'@en 'Xenophon'@en 0.780991\n", + "26 'Socrates'@en 'Theopompus'@en 0.777682\n", + "27 'Socrates'@en 'Clitomachus'@en 0.774966\n", + "28 'Socrates'@en 'Aeschines'@en 0.773590\n", + "29 'Socrates'@en 'Antiphon of Rhamnus'@en 0.767029" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kgtk(\"\"\"\n", + " query --gc $MAIN --ac $COMPLEX\n", + " -i complex -i labels\n", + " --match 'complex: (x)-[]->(xv),\n", + " (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y),\n", + " labels: (x)-[]->(xl), (y)-[]->(yl)'\n", + " --where 'x in [\"Q859\", \"Q868\", \"Q913\"]'\n", + " --return 'xl as xlabel, yl as ylabel, r.similarity as sim'\n", + " \"\"\")" + 
] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "b70c4344", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
xlabelylabelsim
0'Plato'@en'Plato'@en1.000000
1'Plato'@en'Plotinus'@en0.752719
2'Plato'@en'Cornelius Nepos'@en0.723320
3'Plato'@en'Bret Harte'@en0.706325
4'Plato'@en'Federico Caffè'@en0.702316
5'Plato'@en'Marcel Duchamp'@en0.677284
6'Plato'@en'Quintus Julius Balbus'@en0.662613
7'Plato'@en'Laurentius Abstemius'@en0.662188
8'Plato'@en'Celso Lucio'@en0.654929
9'Plato'@en'Peter von Cornelius'@en0.684013
10'Aristotle'@en'Aristotle'@en1.000000
11'Aristotle'@en'Hans Christian Andersen'@en0.838684
12'Aristotle'@en'Søren Kierkegaard'@en0.819337
13'Aristotle'@en'Jean-Paul Sartre'@en0.809694
14'Aristotle'@en'Ibn Taymiyyah'@en0.797813
15'Aristotle'@en'Carl Friedrich Gauss'@en0.795596
16'Aristotle'@en'Louis Agassiz'@en0.788264
17'Aristotle'@en'Sigmund Freud'@en0.783095
18'Aristotle'@en'Baháʼu'lláh'@en0.777348
19'Aristotle'@en'Alexis Clairaut'@en0.740817
20'Socrates'@en'Socrates'@en1.000000
21'Socrates'@en'Euclid'@en0.822714
22'Socrates'@en'Menexenus'@en0.819183
23'Socrates'@en'Epicurus'@en0.800473
24'Socrates'@en'Titus Pomponius Atticus'@en0.784178
25'Socrates'@en'Hippias'@en0.781964
26'Socrates'@en'Phaenarete'@en0.778690
27'Socrates'@en'Lamprocles'@en0.776861
28'Socrates'@en'Antisthenes'@en0.773629
29'Socrates'@en'Hermogenes'@en0.773619
\n", + "
" + ], + "text/plain": [ + " xlabel ylabel sim\n", + "0 'Plato'@en 'Plato'@en 1.000000\n", + "1 'Plato'@en 'Plotinus'@en 0.752719\n", + "2 'Plato'@en 'Cornelius Nepos'@en 0.723320\n", + "3 'Plato'@en 'Bret Harte'@en 0.706325\n", + "4 'Plato'@en 'Federico Caffè'@en 0.702316\n", + "5 'Plato'@en 'Marcel Duchamp'@en 0.677284\n", + "6 'Plato'@en 'Quintus Julius Balbus'@en 0.662613\n", + "7 'Plato'@en 'Laurentius Abstemius'@en 0.662188\n", + "8 'Plato'@en 'Celso Lucio'@en 0.654929\n", + "9 'Plato'@en 'Peter von Cornelius'@en 0.684013\n", + "10 'Aristotle'@en 'Aristotle'@en 1.000000\n", + "11 'Aristotle'@en 'Hans Christian Andersen'@en 0.838684\n", + "12 'Aristotle'@en 'Søren Kierkegaard'@en 0.819337\n", + "13 'Aristotle'@en 'Jean-Paul Sartre'@en 0.809694\n", + "14 'Aristotle'@en 'Ibn Taymiyyah'@en 0.797813\n", + "15 'Aristotle'@en 'Carl Friedrich Gauss'@en 0.795596\n", + "16 'Aristotle'@en 'Louis Agassiz'@en 0.788264\n", + "17 'Aristotle'@en 'Sigmund Freud'@en 0.783095\n", + "18 'Aristotle'@en 'Baháʼu'lláh'@en 0.777348\n", + "19 'Aristotle'@en 'Alexis Clairaut'@en 0.740817\n", + "20 'Socrates'@en 'Socrates'@en 1.000000\n", + "21 'Socrates'@en 'Euclid'@en 0.822714\n", + "22 'Socrates'@en 'Menexenus'@en 0.819183\n", + "23 'Socrates'@en 'Epicurus'@en 0.800473\n", + "24 'Socrates'@en 'Titus Pomponius Atticus'@en 0.784178\n", + "25 'Socrates'@en 'Hippias'@en 0.781964\n", + "26 'Socrates'@en 'Phaenarete'@en 0.778690\n", + "27 'Socrates'@en 'Lamprocles'@en 0.776861\n", + "28 'Socrates'@en 'Antisthenes'@en 0.773629\n", + "29 'Socrates'@en 'Hermogenes'@en 0.773619" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kgtk(\"\"\"\n", + " query --gc $MAIN --ac $TRANSE\n", + " -i transe -i labels\n", + " --match 'transe: (x)-[]->(xv),\n", + " (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y),\n", + " labels: (x)-[]->(xl), (y)-[]->(yl)'\n", + " --where 'x in [\"Q859\", \"Q868\", \"Q913\"]'\n", + " --return 'xl as 
xlabel, yl as ylabel, r.similarity as sim'\n", + " \"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "042784ec", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [ + { + "data": { + "text/html": [ + "
xlabelylabelsimysent
'Plato'@en'Plato'@en1.0Plato (/ˈpleɪtoʊ/ PLAY-toe; Greek: Πλάτων Plátōn; 428/427 or 424/423 – 348/347 BC) was a Greek philosopher born in Athens during the Classical period in Ancient Greece.
'Plato'@en'Aenesidemus'@en0.965393602848053Aenesidemus (Ancient Greek: Αἰνησίδημος or Αἰνεσίδημος) was a Greek Pyrrhonist philosopher, born in Knossos on the island of Crete.
'Plato'@en'Hicetas'@en0.9649903178215027Hicetas (Ancient Greek: Ἱκέτας or Ἱκέτης; c. 400 – c. 335 BC) was a Greek philosopher of the Pythagorean School.
'Plato'@en'Empedocles'@en0.9629127979278564Empedocles (/ɛmˈpɛdəkliːz/; Greek: Ἐμπεδοκλῆς; c. 494 – c. 434 BC, fl. 444–443 BC) was a Greek pre-Socratic philosopher and a native citizen of Akragas, a Greek city in Sicily.
'Plato'@en'Eubulides'@en0.9629042744636536Eubulides of Miletus (Ancient Greek: Εὐβουλίδης; fl. 4th century BCE) was a Greek philosopher of the Megarian school, a pupil of Euclid of Megara and a contemporary of Aristotle.
'Plato'@en'Aristotle'@en0.9615942239761353Aristotle (/ˈærɪstɒtəl/; Greek: Ἀριστοτέλης Aristotélēs, pronounced [aristotélɛːs]; 384–322 BC) was a Greek philosopher and polymath during the Classical period in Ancient Greece.
'Plato'@en'Metrodorus of Lampsacus'@en0.9613872766494751Metrodorus of Lampsacus (Greek: Μητρόδωρος Λαμψακηνός, Mētrodōros Lampsakēnos; 331/0–278/7 BC) was a Greek philosopher of the Epicurean school.
'Plato'@en'Xenophon'@en0.960830569267273Xenophon of Athens (/ˈzɛnəfən, zi-, -fɒn/; Ancient Greek: Ξενοφῶν [ksenopʰɔ̂ːn]; c. 430 – probably 355 or 354 BC) was a Greek military leader, philosopher, and historian, born in Athens.
'Plato'@en'Anaxarchus'@en0.9582780599594116Anaxarchus (/ˌænəɡˈzɑːrkəs/; Ancient Greek: Ἀνάξαρχος; c. 380 – c. 320 BC) was a Greek philosopher of the school of Democritus.
'Plato'@en'Clearchus of Soli'@en0.957090437412262Clearchus of Soli (Greek: Kλέαρχoς ὁ Σολεύς, Klearkhos ho Soleus) was a Greek philosopher of the 4th–3rd century BCE, belonging to Aristotle\\'s Peripatetic school.
'Aristotle'@en'Aristotle'@en0.9999998807907104Aristotle (/ˈærɪstɒtəl/; Greek: Ἀριστοτέλης Aristotélēs, pronounced [aristotélɛːs]; 384–322 BC) was a Greek philosopher and polymath during the Classical period in Ancient Greece.
'Aristotle'@en'Bryson of Achaea'@en0.9743033647537231Bryson of Achaea (or Bryson the Achaean; Greek: Βρύσων ὁ Ἀχαιός Vryson o Acheos, gen.: Βρύσωνος Vrysonos; fl. 330 BC) was an ancient Greek philosopher.
'Aristotle'@en'Michael Papageorgiou'@en0.9700412750244141Michail Papageorgiou (Greek: Μιχαήλ Παπαγεωργίου; 1727–1796) was a Greek philosopher.
'Aristotle'@en'Hicetas'@en0.9676922559738159Hicetas (Ancient Greek: Ἱκέτας or Ἱκέτης; c. 400 – c. 335 BC) was a Greek philosopher of the Pythagorean School.
'Aristotle'@en'Anaxarchus'@en0.9676817655563354Anaxarchus (/ˌænəɡˈzɑːrkəs/; Ancient Greek: Ἀνάξαρχος; c. 380 – c. 320 BC) was a Greek philosopher of the school of Democritus.
'Aristotle'@en'Metrodorus of Lampsacus'@en0.9673494100570679Metrodorus of Lampsacus (Greek: Μητρόδωρος Λαμψακηνός, Mētrodōros Lampsakēnos; 331/0–278/7 BC) was a Greek philosopher of the Epicurean school.
'Aristotle'@en'Philolaus'@en0.9670777916908264Philolaus (/ˌfɪləˈleɪəs/; Ancient Greek: Φιλόλαος, Philólaos; c. 470 – c. 385 BCE) was a Greek Pythagorean and pre-Socratic philosopher.
'Aristotle'@en'Phaedo of Elis'@en0.9669275283813477Phaedo of Elis (/ˈfiːdoʊ/; also Phaedon; Greek: Φαίδων ὁ Ἠλεῖος, gen.: Φαίδωνος; fl. 4th century BCE) was a Greek philosopher.
'Aristotle'@en'Asclepiades of Phlius'@en0.965190589427948Asclepiades of Phlius (Greek: Ἀσκληπιάδης ὁ Φλιάσιος; c. 350 – c. 270 BC) was a Greek philosopher in the Eretrian school of philosophy.
'Aristotle'@en'Dionysius of Chalcedon'@en0.965139627456665Dionysius of Chalcedon (Greek: Διονύσιος; fl. 320 BC) was a Greek philosopher and dialectician connected with the Megarian school.
'Socrates'@en'Socrates'@en0.9999998807907104Socrates (/ˈsɒkrətiːz/; Greek: Σωκράτης; c. 470–399 BC) was a Greek philosopher from Athens who is credited as the founder of Western philosophy and among the first moral philosophers of the ethical tradition of thought.
'Socrates'@en'Histories'@en0.9376196265220642The Histories (Greek: Ἱστορίαι, Ancient Greek: [historíai̯]; also known as The History) of Herodotus is considered the founding work of history in Western literature.
'Socrates'@en'Károly Kerényi'@en0.930176854133606Károly (Carl, Karl) Kerényi (Hungarian: Kerényi Károly, pronounced [ˈkɛreːɲi ˈkaːroj]; 19 January 1897 – 14 April 1973) was a Hungarian scholar in classical philology and one of the founders of modern studies of Greek mythology.
'Socrates'@en'Xenokrates of Sicyon'@en0.9301142692565918Xenokrates of Athens or of Sicyon (Greek: Ξενοκράτης; fl. c. 280 BC) was an ancient Greek sculptor and writer, and one of the world\\'s first art historians.
'Socrates'@en'Iosipos Moisiodax'@en0.9245849847793579Iosipos Moisiodax or Moesiodax (/ˈmiːsiədæks/; Greek: Ιώσηπος Μοισιόδαξ; 1725–1800) was a Greek philosopher, an Eastern Orthodox deacon, and one of the greatest exponents of the modern Greek Enlightenment.
'Socrates'@en'Hippodamus of Miletus'@en0.9245232343673706Hippodamus of Miletus (/hɪˈpɒdəməs/; Greek: Ἱππόδαμος ὁ Μιλήσιος, Hippodamos ho Milesios; 498 – 408 BC) was an ancient Greek architect, urban planner, physician, mathematician, meteorologist and philosopher, who is considered to be \\"the father of European urban planning\\", and the namesake of the \\"Hippodamian plan\\" (grid plan) of city layout.
'Socrates'@en'Henry Oldenburg'@en0.9239271879196167Henry Oldenburg (also Henry Oldenbourg) FRS (c. 1618 as Heinrich Oldenburg – 5 September 1677). was a German theologian, diplomat, and natural philosopher, known as one of the creators of modern scientific peer review.
'Socrates'@en'Inachus'@en0.9215598702430725In Greek mythology, Inachus, Inachos or Inakhos (Ancient Greek: Ἴναχος) was the first king of Argos after whom a river was called Inachus River, the modern Panitsa that drains the western margin of the Argive plain.
'Socrates'@en'Hippobotus'@en0.917095959186554Hippobotus (/hɪˈpɒbətəs/; Ancient Greek: Ἱππόβοτος; fl. c. 200 BC) was a Greek historian of philosophers and philosophical schools.
'Socrates'@en'Georgios Hatzidakis'@en0.9140840172767639Georgios Nicolaou Hatzidakis, aka Georgios Nikolaou Chatzidakis (Greek: Γεώργιος Νικολάου Χατζιδάκις; 23 November [O.S. 11 November] 1843, in Myrthios, Ottoman Crete – 28 June 1941, in Athens) was a Greek philologist, who is regarded as the father of linguistics in Greece.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "out = !kgtk query --gc $MAIN --ac $ABSTRACT \\\n", + " -i abstract -i labels -i sentence \\\n", + " --match 'abstract: (x)-[]->(xv), \\\n", + " (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), \\\n", + " labels: (x)-[]->(xl), (y)-[]->(yl), \\\n", + " sent: (y)-[]->(ys)' \\\n", + " --where 'x in [\"Q859\", \"Q868\", \"Q913\"]' \\\n", + " --return 'xl as xlabel, yl as ylabel, r.similarity as sim, kgtk_lqstring_text(ys) as ysent' \\\n", + " / html\n", + "\n", + "show_html()" + ] + }, + { + "cell_type": "markdown", + "id": "39b00728", + "metadata": {}, + "source": [ + "### Countries:" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "ef957685", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
xlabelylabelsim
0'United States of America'@en'United States of America'@en1.000000
1'United States of America'@en'United Kingdom'@en0.819738
2'United States of America'@en'France'@en0.810034
3'United States of America'@en'Canada'@en0.793150
4'United States of America'@en'Spain'@en0.791431
5'United States of America'@en'Australia'@en0.780531
6'United States of America'@en'Thailand'@en0.742816
7'United States of America'@en'South Korea'@en0.734353
8'United States of America'@en'India'@en0.730247
9'United States of America'@en'Mexico'@en0.717486
10'Austria'@en'Austria'@en1.000000
11'Austria'@en'Poland'@en0.776092
12'Austria'@en'Hungary'@en0.772626
13'Austria'@en'Germany'@en0.766456
14'Austria'@en'Australia'@en0.762866
15'Austria'@en'United States of America'@en0.759724
16'Austria'@en'Spain'@en0.753217
17'Austria'@en'Iceland'@en0.748018
18'Austria'@en'United Kingdom'@en0.744055
19'Austria'@en'Guatemala'@en0.724540
20'Greece'@en'Greece'@en1.000000
21'Greece'@en'Turkey'@en0.777969
22'Greece'@en'Romania'@en0.742026
23'Greece'@en'Ukraine'@en0.734370
24'Greece'@en'Hungary'@en0.733742
25'Greece'@en'Poland'@en0.733643
26'Greece'@en'Austria'@en0.733425
27'Greece'@en'Thailand'@en0.732736
28'Greece'@en'Australia'@en0.725093
29'Greece'@en'South Korea'@en0.722797
\n", + "
" + ], + "text/plain": [ + " xlabel ylabel sim\n", + "0 'United States of America'@en 'United States of America'@en 1.000000\n", + "1 'United States of America'@en 'United Kingdom'@en 0.819738\n", + "2 'United States of America'@en 'France'@en 0.810034\n", + "3 'United States of America'@en 'Canada'@en 0.793150\n", + "4 'United States of America'@en 'Spain'@en 0.791431\n", + "5 'United States of America'@en 'Australia'@en 0.780531\n", + "6 'United States of America'@en 'Thailand'@en 0.742816\n", + "7 'United States of America'@en 'South Korea'@en 0.734353\n", + "8 'United States of America'@en 'India'@en 0.730247\n", + "9 'United States of America'@en 'Mexico'@en 0.717486\n", + "10 'Austria'@en 'Austria'@en 1.000000\n", + "11 'Austria'@en 'Poland'@en 0.776092\n", + "12 'Austria'@en 'Hungary'@en 0.772626\n", + "13 'Austria'@en 'Germany'@en 0.766456\n", + "14 'Austria'@en 'Australia'@en 0.762866\n", + "15 'Austria'@en 'United States of America'@en 0.759724\n", + "16 'Austria'@en 'Spain'@en 0.753217\n", + "17 'Austria'@en 'Iceland'@en 0.748018\n", + "18 'Austria'@en 'United Kingdom'@en 0.744055\n", + "19 'Austria'@en 'Guatemala'@en 0.724540\n", + "20 'Greece'@en 'Greece'@en 1.000000\n", + "21 'Greece'@en 'Turkey'@en 0.777969\n", + "22 'Greece'@en 'Romania'@en 0.742026\n", + "23 'Greece'@en 'Ukraine'@en 0.734370\n", + "24 'Greece'@en 'Hungary'@en 0.733742\n", + "25 'Greece'@en 'Poland'@en 0.733643\n", + "26 'Greece'@en 'Austria'@en 0.733425\n", + "27 'Greece'@en 'Thailand'@en 0.732736\n", + "28 'Greece'@en 'Australia'@en 0.725093\n", + "29 'Greece'@en 'South Korea'@en 0.722797" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kgtk(\"\"\"\n", + " query --gc $MAIN --ac $COMPLEX\n", + " -i complex -i labels\n", + " --match 'complex: (x)-[]->(xv),\n", + " (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y),\n", + " labels: (x)-[]->(xl), (y)-[]->(yl)'\n", + " --where 'x in [\"Q40\", \"Q41\", 
\"Q30\"]'\n", + " --return 'xl as xlabel, yl as ylabel, r.similarity as sim'\n", + " \"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "086b299e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
xlabelylabelsim
0'United States of America'@en'United States of America'@en1.000000
1'United States of America'@en'State of Scott'@en0.790001
2'United States of America'@en'.سورية'@en0.781829
3'United States of America'@en'Republic of South Carolina'@en0.776540
4'United States of America'@en'State of Kanawha'@en0.770454
5'United States of America'@en'Wedge: The Secret War between the FBI and CIA...0.762854
6'United States of America'@en'Marin County'@en0.741442
7'United States of America'@en'Light Stations of the United States MPS'@en0.731502
8'United States of America'@en'Republic of Florida'@en0.730721
9'United States of America'@en'Women's Professional Racquetball Organization...0.737493
10'Austria'@en'Austria'@en1.000000
11'Austria'@en'Ostösterreich'@en0.821322
12'Austria'@en'Westösterreich'@en0.792200
13'Austria'@en'WikiProject Austria'@en0.787478
14'Austria'@en'Südösterreich'@en0.778928
15'Austria'@en'Lethal Records'@en0.767960
16'Austria'@en'Spitzer Riegel'@en0.762144
17'Austria'@en'International Association of Judges'@en0.756613
18'Austria'@en'Lackenkar'@en0.753951
19'Austria'@en'World Sustainable Energy Days'@en0.733782
20'Greece'@en'Greece'@en1.000000
21'Greece'@en'Third Hellenic Republic'@en0.854657
22'Greece'@en'Greece'@en0.836717
23'Greece'@en'Dropull municipality'@en0.800688
24'Greece'@en'hockey in Greece'@en0.786580
25'Greece'@en'Kalamatianos'@en0.764173
26'Greece'@en'baseball in Greece'@en0.753401
27'Greece'@en'motorsports in Greece'@en0.724663
28'Greece'@en'Operation Manna'@en0.722541
29'Greece'@en'Hierissos (Greece)'@en0.721703
\n", + "
" + ], + "text/plain": [ + " xlabel \\\n", + "0 'United States of America'@en \n", + "1 'United States of America'@en \n", + "2 'United States of America'@en \n", + "3 'United States of America'@en \n", + "4 'United States of America'@en \n", + "5 'United States of America'@en \n", + "6 'United States of America'@en \n", + "7 'United States of America'@en \n", + "8 'United States of America'@en \n", + "9 'United States of America'@en \n", + "10 'Austria'@en \n", + "11 'Austria'@en \n", + "12 'Austria'@en \n", + "13 'Austria'@en \n", + "14 'Austria'@en \n", + "15 'Austria'@en \n", + "16 'Austria'@en \n", + "17 'Austria'@en \n", + "18 'Austria'@en \n", + "19 'Austria'@en \n", + "20 'Greece'@en \n", + "21 'Greece'@en \n", + "22 'Greece'@en \n", + "23 'Greece'@en \n", + "24 'Greece'@en \n", + "25 'Greece'@en \n", + "26 'Greece'@en \n", + "27 'Greece'@en \n", + "28 'Greece'@en \n", + "29 'Greece'@en \n", + "\n", + " ylabel sim \n", + "0 'United States of America'@en 1.000000 \n", + "1 'State of Scott'@en 0.790001 \n", + "2 '.سورية'@en 0.781829 \n", + "3 'Republic of South Carolina'@en 0.776540 \n", + "4 'State of Kanawha'@en 0.770454 \n", + "5 'Wedge: The Secret War between the FBI and CIA... 0.762854 \n", + "6 'Marin County'@en 0.741442 \n", + "7 'Light Stations of the United States MPS'@en 0.731502 \n", + "8 'Republic of Florida'@en 0.730721 \n", + "9 'Women's Professional Racquetball Organization... 
0.737493 \n", + "10 'Austria'@en 1.000000 \n", + "11 'Ostösterreich'@en 0.821322 \n", + "12 'Westösterreich'@en 0.792200 \n", + "13 'WikiProject Austria'@en 0.787478 \n", + "14 'Südösterreich'@en 0.778928 \n", + "15 'Lethal Records'@en 0.767960 \n", + "16 'Spitzer Riegel'@en 0.762144 \n", + "17 'International Association of Judges'@en 0.756613 \n", + "18 'Lackenkar'@en 0.753951 \n", + "19 'World Sustainable Energy Days'@en 0.733782 \n", + "20 'Greece'@en 1.000000 \n", + "21 'Third Hellenic Republic'@en 0.854657 \n", + "22 'Greece'@en 0.836717 \n", + "23 'Dropull municipality'@en 0.800688 \n", + "24 'hockey in Greece'@en 0.786580 \n", + "25 'Kalamatianos'@en 0.764173 \n", + "26 'baseball in Greece'@en 0.753401 \n", + "27 'motorsports in Greece'@en 0.724663 \n", + "28 'Operation Manna'@en 0.722541 \n", + "29 'Hierissos (Greece)'@en 0.721703 " + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kgtk(\"\"\"\n", + " query --gc $MAIN --ac $TRANSE\n", + " -i transe -i labels\n", + " --match 'transe: (x)-[]->(xv),\n", + " (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y),\n", + " labels: (x)-[]->(xl), (y)-[]->(yl)'\n", + " --where 'x in [\"Q40\", \"Q41\", \"Q30\"]'\n", + " --return 'xl as xlabel, yl as ylabel, r.similarity as sim'\n", + " \"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "11b54d06", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [ + { + "data": { + "text/html": [ + "
xlabelylabelsimysent
'United States of America'@en'United States of America'@en1.0000001192092896The United States of America (U.S.A. or USA), commonly known as the United States (U.S. or US) or America, is a transcontinental country located primarily in North America.
'United States of America'@en'North African American'@en0.9427798390388489North African Americans are Americans with origins in the region of North Africa.
'United States of America'@en'Central America'@en0.9361214637756348Central America (Spanish: América Central [aˈmeɾika senˈtɾal] or Centroamérica [sentɾoaˈmeɾika]) is a subcontinent of North America.
'United States of America'@en'Northern United States'@en0.9349914789199829The Northern United States, commonly referred to as the American North, the Northern States, or simply the North, is a geographical or historical region of the United States.
'United States of America'@en'Episcopal Diocese of Atlanta'@en0.9307031035423279The Episcopal Diocese of Atlanta is the diocese of the Episcopal Church in the United States of America, with jurisdiction over middle and north Georgia.
'United States of America'@en'Tidewater region of Virginia'@en0.9302077293395996Tidewater refers to the north Atlantic coastal plain region of the United States of America.
'United States of America'@en'Great Northern Railway'@en0.9266144037246704The Great Northern Railway (reporting mark GN) was an American Class I railroad.
'United States of America'@en'American people of North American descent'@en0.9230941534042358American people of North American descent refers to inhabitants of the United States with lineage tracing to other North American countries.
'United States of America'@en'Episcopal Diocese of Northern Michigan'@en0.922684907913208The Episcopal Diocese of Northern Michigan is the diocese of the Episcopal Church in the United States of America (TEC) with canonical jurisdiction in the Upper Peninsula of Michigan.
'United States of America'@en'list of metropolitan areas in Northern America'@en0.9206945300102234This is a list of metropolitan areas in Northern America, typically defined to include Canada and the United States as well as Bermuda (UK), Greenland (Denmark), and St. Pierre and Miquelon (France).
'Austria'@en'Austria'@en0.9999998807907104Austria, officially the Republic of Austria, is a landlocked country in the southern part of Central Europe, situated at Eastern Alps.
'Austria'@en'Mount Royal'@en0.9394936561584473Mount Royal is situated at the southern end of the Mount Royal Range in the Barrington Tops region of eastern Australia.
'Austria'@en'Theresienfeld'@en0.9383996725082397Theresienfeld is a town in the Wiener Neustadt-Land district of Lower Austria, in eastern Austria.
'Austria'@en'Paruna'@en0.9354041814804077Paruna is a town in eastern South Australia.
'Austria'@en'Middleton'@en0.9351267218589783Middleton is a town in South Australia on the eastern end of the south coast of the Fleurieu Peninsula.
'Austria'@en'Lavanttal'@en0.9330481290817261The Lavant Valley (German: Lavanttal, Slovene: Labotska dolina or Laboška dolina; Southern Bavarian: Lovnthol) lies in the Lavanttal Alps in southern Austria in the eastern part of the state of Carinthia.
'Austria'@en'Belarus'@en0.9315851926803589Belarus, officially the Republic of Belarus, is a landlocked country in Eastern Europe.
'Austria'@en'Towitta'@en0.9289759397506714Towitta is a locality in the Murray Mallee region of South Australia at the foot of the eastern side of the Mount Lofty Ranges.
'Austria'@en'Port Elliot'@en0.9283883571624756Port Elliot is a town in South Australia toward the eastern end of the south coast of the Fleurieu Peninsula.
'Austria'@en'list of companies of Andorra'@en0.928240180015564Andorra is a sovereign landlocked microstate in Southwestern Europe, located in the eastern Pyrenees mountains and bordered by Spain and France.
'Greece'@en'Greece'@en1.0Greece or Hellas (Greek: Ελλάδα, romanized: Elláda or Ελλάς, romanized: Ellas), officially the Hellenic Republic (Greek: Ελληνική Δημοκρατία, romanized: Elliniki Dimokratia) is a country in Southeast Europe.
'Greece'@en'Bulgaria'@en0.9444983005523682Bulgaria (/bʌlˈɡɛəriə, bʊl-/; Bulgarian: България, romanized: Balgariya), officially the Republic of Bulgaria, is a country in Southeast Europe.
'Greece'@en'Croatia'@en0.9344392418861389Croatia (/kroʊˈeɪʃə/, kroh-AY-shə; Croatian: Hrvatska, pronounced [xř̩ʋaːtskaː]), officially the Republic of Croatia (Croatian: Republika Hrvatska,), is a country at the crossroads of Central and Southeast Europe.
'Greece'@en'Caenophrurium'@en0.9327539205551147Caenophrurium (also written as Cenophrurium and Coenophrurium; Greek: Καινοφρούριον Kainophrurion) was a settlement in the Roman province of Europa (the southeasternmost part of Thrace), between Byzantium and Heraclea Perinthus.
'Greece'@en'Chech'@en0.9299066662788391Chech (Bulgarian: Чеч, Greek: Τσέτσι) or Chechko (Bulgarian: Чечко) is a geographical and historical region of the Balkan peninsula in southeastern Europe in modern-day Bulgaria and Greece.
'Greece'@en'history of Bosnia and Herzegovina'@en0.926729142665863Bosnia and Herzegovina, sometimes referred to simply as Bosnia, is a country in Southeast Europe on the Balkan Peninsula.
'Greece'@en'Zagem'@en0.9253758788108826Zagem or Bazari (Georgian: ბაზარი) was a town in the southeast Caucasus, in the eastern Georgian kingdom of Kakheti.
'Greece'@en'Barony of Gritzena'@en0.9247817993164062The Barony of Gritzena or Gritsena was a medieval Frankish fiefdom of the Principality of Achaea, located in eastern Messenia, in the Peloponnese peninsula in Greece, centred on the settlement of Gritzena (Greek: Γρίτζενα/Γρίτσενα; French: La Grite).
'Greece'@en'Maroneia-Sapes Municipality'@en0.924449622631073Maroneia-Sapes (Greek: Μαρώνεια-Σάπες) is a municipality in the Rhodope regional unit, East Macedonia and Thrace, Greece.
'Greece'@en'Globočice pri Kostanjevici'@en0.9231210947036743Globočice pri Kostanjevici (pronounced [ɡlɔbɔˈtʃiːtsɛ pɾi kɔˈstaːnjɛʋitsa]; in older sources also Globočica, German: Globoschitz) is a settlement southeast of Kostanjevica na Krki in eastern Slovenia.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "out = !kgtk query --gc $MAIN --ac $ABSTRACT \\\n", + " -i abstract -i labels -i sentence \\\n", + " --match 'abstract: (x)-[]->(xv), \\\n", + " (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), \\\n", + " labels: (x)-[]->(xl), (y)-[]->(yl), \\\n", + " sent: (y)-[]->(ys)' \\\n", + " --where 'x in [\"Q40\", \"Q41\", \"Q30\"]' \\\n", + " --return 'xl as xlabel, yl as ylabel, r.similarity as sim, kgtk_lqstring_text(ys) as ysent' \\\n", + " / html\n", + "\n", + "show_html()" + ] + }, + { + "cell_type": "markdown", + "id": "d890577a", + "metadata": {}, + "source": [ + "### Types of animals:" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "5aea5d3e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
xlabelylabelsim
0'dog'@en'dog'@en1.000000
1'dog'@en'hat'@en0.721310
2'dog'@en'house cat'@en0.706597
3'dog'@en'body armor'@en0.692382
4'dog'@en'woman'@en0.687660
5'dog'@en'peafowl'@en0.686448
6'dog'@en'bouquet'@en0.678995
7'dog'@en'hunting dog'@en0.667602
8'dog'@en'logo'@en0.647672
9'dog'@en'sceptre'@en0.644042
10'house cat'@en'house cat'@en1.000000
11'house cat'@en'monkey'@en0.721723
12'house cat'@en'scarf'@en0.717398
13'house cat'@en'old woman'@en0.699330
14'house cat'@en'mythology'@en0.689698
15'house cat'@en'caress'@en0.686948
16'house cat'@en'maternal bond'@en0.681039
17'house cat'@en'body armor'@en0.678056
18'house cat'@en'soap bubble'@en0.674951
19'house cat'@en'lioness'@en0.671338
20'horse'@en'horse'@en1.000000
21'horse'@en'racehorse'@en0.768213
22'horse'@en'dog'@en0.719937
23'horse'@en'standing'@en0.696390
24'horse'@en'house cat'@en0.682117
25'horse'@en'broad-leaved tree'@en0.663244
26'horse'@en'sky'@en0.662595
27'horse'@en'horse-drawn vehicle'@en0.641660
28'horse'@en'drapery'@en0.628287
29'horse'@en'sceptre'@en0.618746
\n", + "
" + ], + "text/plain": [ + " xlabel ylabel sim\n", + "0 'dog'@en 'dog'@en 1.000000\n", + "1 'dog'@en 'hat'@en 0.721310\n", + "2 'dog'@en 'house cat'@en 0.706597\n", + "3 'dog'@en 'body armor'@en 0.692382\n", + "4 'dog'@en 'woman'@en 0.687660\n", + "5 'dog'@en 'peafowl'@en 0.686448\n", + "6 'dog'@en 'bouquet'@en 0.678995\n", + "7 'dog'@en 'hunting dog'@en 0.667602\n", + "8 'dog'@en 'logo'@en 0.647672\n", + "9 'dog'@en 'sceptre'@en 0.644042\n", + "10 'house cat'@en 'house cat'@en 1.000000\n", + "11 'house cat'@en 'monkey'@en 0.721723\n", + "12 'house cat'@en 'scarf'@en 0.717398\n", + "13 'house cat'@en 'old woman'@en 0.699330\n", + "14 'house cat'@en 'mythology'@en 0.689698\n", + "15 'house cat'@en 'caress'@en 0.686948\n", + "16 'house cat'@en 'maternal bond'@en 0.681039\n", + "17 'house cat'@en 'body armor'@en 0.678056\n", + "18 'house cat'@en 'soap bubble'@en 0.674951\n", + "19 'house cat'@en 'lioness'@en 0.671338\n", + "20 'horse'@en 'horse'@en 1.000000\n", + "21 'horse'@en 'racehorse'@en 0.768213\n", + "22 'horse'@en 'dog'@en 0.719937\n", + "23 'horse'@en 'standing'@en 0.696390\n", + "24 'horse'@en 'house cat'@en 0.682117\n", + "25 'horse'@en 'broad-leaved tree'@en 0.663244\n", + "26 'horse'@en 'sky'@en 0.662595\n", + "27 'horse'@en 'horse-drawn vehicle'@en 0.641660\n", + "28 'horse'@en 'drapery'@en 0.628287\n", + "29 'horse'@en 'sceptre'@en 0.618746" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kgtk(\"\"\"\n", + " query --gc $MAIN --ac $COMPLEX\n", + " -i complex -i labels\n", + " --match 'complex: (x)-[]->(xv),\n", + " (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y),\n", + " labels: (x)-[]->(xl), (y)-[]->(yl)'\n", + " --where 'x in [\"Q144\", \"Q146\", \"Q726\"]'\n", + " --return 'xl as xlabel, yl as ylabel, r.similarity as sim'\n", + " \"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "7b3a139c", + "metadata": {}, + "outputs": [ + { + "data": { + 
"text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
xlabelylabelsim
0'dog'@en'dog'@en1.000000
1'dog'@en'Salty and Roselle'@en0.840819
2'dog'@en'Fred Basset'@en0.831087
3'dog'@en'Theo'@en0.821203
4'dog'@en'Heaven Sent Brandy'@en0.820738
5'dog'@en'Old Hemp'@en0.818933
6'dog'@en'Rubia'@en0.816860
7'dog'@en'Alcmène'@en0.815260
8'dog'@en'Alex the Dog'@en0.813087
9'dog'@en'Edda'@en0.810013
10'house cat'@en'house cat'@en1.000000
11'house cat'@en'Mike'@en0.813615
12'house cat'@en'Tiddles'@en0.796653
13'house cat'@en'Merlin'@en0.788791
14'house cat'@en'goat'@en0.785772
15'house cat'@en'Luca'@en0.774328
16'house cat'@en'Musashi'@en0.770614
17'house cat'@en'Kotora'@en0.792088
18'house cat'@en'Liv'@en0.779756
19'house cat'@en'Pixel'@en0.779563
20'horse'@en'horse'@en1.000000
21'horse'@en'Sindhi horse'@en0.839297
22'horse'@en'male organism'@en0.742897
23'horse'@en'allergy to horses'@en0.713499
24'horse'@en'Thoroughbred'@en0.712364
25'horse'@en'racehorse'@en0.711143
26'horse'@en'female organism'@en0.680730
27'horse'@en'gelding'@en0.653782
28'horse'@en'Forehand'@en0.678915
29'horse'@en'Pierata'@en0.673963
\n", + "
" + ], + "text/plain": [ + " xlabel ylabel sim\n", + "0 'dog'@en 'dog'@en 1.000000\n", + "1 'dog'@en 'Salty and Roselle'@en 0.840819\n", + "2 'dog'@en 'Fred Basset'@en 0.831087\n", + "3 'dog'@en 'Theo'@en 0.821203\n", + "4 'dog'@en 'Heaven Sent Brandy'@en 0.820738\n", + "5 'dog'@en 'Old Hemp'@en 0.818933\n", + "6 'dog'@en 'Rubia'@en 0.816860\n", + "7 'dog'@en 'Alcmène'@en 0.815260\n", + "8 'dog'@en 'Alex the Dog'@en 0.813087\n", + "9 'dog'@en 'Edda'@en 0.810013\n", + "10 'house cat'@en 'house cat'@en 1.000000\n", + "11 'house cat'@en 'Mike'@en 0.813615\n", + "12 'house cat'@en 'Tiddles'@en 0.796653\n", + "13 'house cat'@en 'Merlin'@en 0.788791\n", + "14 'house cat'@en 'goat'@en 0.785772\n", + "15 'house cat'@en 'Luca'@en 0.774328\n", + "16 'house cat'@en 'Musashi'@en 0.770614\n", + "17 'house cat'@en 'Kotora'@en 0.792088\n", + "18 'house cat'@en 'Liv'@en 0.779756\n", + "19 'house cat'@en 'Pixel'@en 0.779563\n", + "20 'horse'@en 'horse'@en 1.000000\n", + "21 'horse'@en 'Sindhi horse'@en 0.839297\n", + "22 'horse'@en 'male organism'@en 0.742897\n", + "23 'horse'@en 'allergy to horses'@en 0.713499\n", + "24 'horse'@en 'Thoroughbred'@en 0.712364\n", + "25 'horse'@en 'racehorse'@en 0.711143\n", + "26 'horse'@en 'female organism'@en 0.680730\n", + "27 'horse'@en 'gelding'@en 0.653782\n", + "28 'horse'@en 'Forehand'@en 0.678915\n", + "29 'horse'@en 'Pierata'@en 0.673963" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kgtk(\"\"\"\n", + " query --gc $MAIN --ac $TRANSE\n", + " -i transe -i labels\n", + " --match 'transe: (x)-[]->(xv),\n", + " (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y),\n", + " labels: (x)-[]->(xl), (y)-[]->(yl)'\n", + " --where 'x in [\"Q144\", \"Q146\", \"Q726\"]'\n", + " --return 'xl as xlabel, yl as ylabel, r.similarity as sim'\n", + " \"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "28194492", + "metadata": { + "lines_to_next_cell": 2 + }, + 
"outputs": [ + { + "data": { + "text/html": [ + "
xlabelylabelsimysent
'dog'@en'dog'@en1.0000001192092896The dog or domestic dog (Canis familiaris or Canis lupus familiaris) is a domesticated descendant of the wolf.
'dog'@en'Canis simensis'@en0.9140985012054443The Ethiopian wolf (Canis simensis), also called the Simien jackal and Simien fox, is a canine native to the Ethiopian Highlands.
'dog'@en'Bucovina Shepherd Dog'@en0.8995509743690491The Romanian Bucovina Shepherd (Romanian: Ciobănesc Românesc de Bucovina) is a breed of livestock guardian dogs native to historical Bukovina (Bucovina) region.
'dog'@en'Karst Shepherd'@en0.8969720602035522The Karst Shepherd (Slovene: kraški ovčar or kraševec ) is a breed of dog of the livestock guardian type, originating in Slovenia.
'dog'@en'Majorca Shepherd Dog'@en0.8946498036384583The Majorca Shepherd Dog (Catalan: Ca de bestiar, Spanish: Perro de pastor mallorquín) is a domesticated breed of dog, used in the Balearic Islands of Spain, both for guarding sheep and as a general purpose farm dog.
'dog'@en'Tornjak'@en0.8933776021003723The Tornjak (pronounced [torɲâk]), is a breed of livestock guardian dog native to Bosnia and Herzegovina and Croatia.
'dog'@en'Schapendoes'@en0.8902978301048279The Schapendoes (Dutch pronunciation: [ˈsxaːpəndus]) or Dutch Sheepdog, is a breed of dog originating in the Netherlands.
'dog'@en'Native American dogs'@en0.8873973488807678Native American dogs, or Pre-Columbian dogs, were dogs living with people indigenous to the Americas.
'dog'@en'Mozart family'@en0.8867671489715576The Mozart family were the ancestors, relatives, and descendants of Wolfgang Amadeus Mozart.
'dog'@en'Hare Indian Dog'@en0.88480544090271The Hare Indian dog is an extinct domesticated canine; possibly a breed of domestic dog, coydog, or domesticated coyote; formerly found and originally bred in northern Canada by the Hare Indians for coursing.
'house cat'@en'house cat'@en1.0000001192092896The cat (Felis catus) is a domestic species of small carnivorous mammal.
'house cat'@en'Ragamuffin'@en0.9178828597068787The Ragamuffin is a breed of domestic cat.
'house cat'@en'Cymric'@en0.9060893058776855The Cymric (/ˈkɪmrɪk/ KIM-rik, /ˈkʌmrɪk/ KUM-rik) is a breed of domestic cat.
'house cat'@en'Turkish Angora'@en0.9040964841842651The Turkish Angora (Turkish: Ankara kedisi, \\"Ankara cat\\") is a breed of domestic cat.
'house cat'@en'Cornish Rex'@en0.9037503004074097The Cornish Rex is a breed of domestic cat.
'house cat'@en'Flat-headed cat'@en0.8956901431083679The flat-headed cat (Prionailurus planiceps) is a small wild cat native to the Thai-Malay Peninsula, Borneo, and Sumatra.
'house cat'@en'Oriental Longhair'@en0.8942360877990723The Oriental Longhair is a variety of domestic cat.
'house cat'@en'Prionailurus'@en0.8825944662094116Prionailurus is a genus of spotted, small wild cats native to Asia.
'house cat'@en'German Rex'@en0.8820987343788147The German Rex is a breed of domestic cat.
'house cat'@en'Cheyletiella blakei'@en0.8800439238548279Cheyletiella blakei is a small mite and ectoparasitic of domestic cats.
'horse'@en'horse'@en0.9999999403953552The horse (Equus ferus caballus) is a domesticated, one-toed, hoofed mammal.
'horse'@en'Equus ferus'@en0.8957256078720093The wild horse (Equus ferus) is a species of the genus Equus, which includes as subspecies the modern domesticated horse (Equus ferus caballus) as well as the endangered Przewalski\\'s horse (Equus ferus przewalskii).
'horse'@en'Tolfetano'@en0.8931700587272644The Tolfetano or Cavallo Tolfetano is a breed of horse from the northern part of the Lazio region of Italy.
'horse'@en'Asturcón'@en0.8833141922950745The Asturcón is an ancient breed of small horse or pony from the autonomous region of Asturias in northern Spain.
'horse'@en'Riwoche horse'@en0.8793144822120667The Riwoche horse /ˈriːwoʊtʃeɪ/ is a dun-colored, pony-sized horse indigenous to northeastern Tibet.
'horse'@en'Catria horse'@en0.8720530867576599The Catria Horse (Italian: Cavallo del Catria) is a breed of horse originating in the mountainous area of the massif of Monte Catria in the Marche region of Italy, and surrounding areas in the provinces of Ancona, Perugia and Pesaro.
'horse'@en'Hequ horse'@en0.8712781071662903The Hequ horse, previously called the Nanfan, is a horse breed native to the northwestern Tibetan plateau.
'horse'@en'Pentro horse'@en0.8711110949516296The Pentro Horse (Italian: Cavallo Pentro) is a breed of horse originating in the area of Isernia, in the Molise region of Italy.
'horse'@en'Dasyrhamphis'@en0.871029794216156Dasyrhamphis is a species of \\'horse fly\\' belonging to the family Tabanidae subfamily Tabaninae.
'horse'@en'Akhal-Teke'@en0.8686932325363159The Akhal-Teke (/ˌækəlˈtɛk/ or /ˌækəlˈtɛki/; from Turkmen Ahalteke, [axalˈteke]) is a Turkmen horse breed.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "out = !kgtk query --gc $MAIN --ac $ABSTRACT \\\n", + " -i abstract -i labels -i sentence \\\n", + " --match 'abstract: (x)-[]->(xv), \\\n", + " (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), \\\n", + " labels: (x)-[]->(xl), (y)-[]->(yl), \\\n", + " sent: (y)-[]->(ys)' \\\n", + " --where 'x in [\"Q144\", \"Q146\", \"Q726\"]' \\\n", + " --return 'xl as xlabel, yl as ylabel, r.similarity as sim, kgtk_lqstring_text(ys) as ysent' \\\n", + " / html\n", + "\n", + "show_html()" + ] + }, + { + "cell_type": "markdown", + "id": "19e07eb9", + "metadata": {}, + "source": [ + "### Handball:" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "2d601069", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
xlabelylabelsim
0'handball'@en'handball'@en1.000000
1'handball'@en'beach handball'@en0.755496
2'handball'@en'field hockey'@en0.747243
3'handball'@en'korfball'@en0.735095
4'handball'@en'indoor handball'@en0.729936
5'handball'@en'biathlon'@en0.705106
6'handball'@en'volleyball'@en0.704901
7'handball'@en'softball'@en0.686220
8'handball'@en'field handball'@en0.683182
9'handball'@en'orienteering'@en0.666485
\n", + "
" + ], + "text/plain": [ + " xlabel ylabel sim\n", + "0 'handball'@en 'handball'@en 1.000000\n", + "1 'handball'@en 'beach handball'@en 0.755496\n", + "2 'handball'@en 'field hockey'@en 0.747243\n", + "3 'handball'@en 'korfball'@en 0.735095\n", + "4 'handball'@en 'indoor handball'@en 0.729936\n", + "5 'handball'@en 'biathlon'@en 0.705106\n", + "6 'handball'@en 'volleyball'@en 0.704901\n", + "7 'handball'@en 'softball'@en 0.686220\n", + "8 'handball'@en 'field handball'@en 0.683182\n", + "9 'handball'@en 'orienteering'@en 0.666485" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kgtk(\"\"\"\n", + " query --gc $MAIN --ac $COMPLEX\n", + " -i complex -i labels\n", + " --match 'complex: (x)-[]->(xv),\n", + " (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y),\n", + " labels: (x)-[]->(xl), (y)-[]->(yl)'\n", + " --where 'x in [\"Q8418\"]'\n", + " --return 'xl as xlabel, yl as ylabel, r.similarity as sim'\n", + " \"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "c08e155a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
xlabelylabelsim
0'handball'@en'handball'@en1.000000
1'handball'@en'Wikipedia:WikiProject Handball'@en0.830904
2'handball'@en'women's beach handball'@en0.714068
3'handball'@en'ski jumping'@en0.700670
4'handball'@en'futsal'@en0.689599
5'handball'@en'indoor handball'@en0.687324
6'handball'@en'biathlon'@en0.683123
7'handball'@en'women's association football'@en0.678200
8'handball'@en'Qatch'@en0.659170
9'handball'@en'hockey'@en0.649488
\n", + "
" + ], + "text/plain": [ + " xlabel ylabel sim\n", + "0 'handball'@en 'handball'@en 1.000000\n", + "1 'handball'@en 'Wikipedia:WikiProject Handball'@en 0.830904\n", + "2 'handball'@en 'women's beach handball'@en 0.714068\n", + "3 'handball'@en 'ski jumping'@en 0.700670\n", + "4 'handball'@en 'futsal'@en 0.689599\n", + "5 'handball'@en 'indoor handball'@en 0.687324\n", + "6 'handball'@en 'biathlon'@en 0.683123\n", + "7 'handball'@en 'women's association football'@en 0.678200\n", + "8 'handball'@en 'Qatch'@en 0.659170\n", + "9 'handball'@en 'hockey'@en 0.649488" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kgtk(\"\"\"\n", + " query --gc $MAIN --ac $TRANSE\n", + " -i transe -i labels\n", + " --match 'transe: (x)-[]->(xv),\n", + " (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y),\n", + " labels: (x)-[]->(xl), (y)-[]->(yl)'\n", + " --where 'x in [\"Q8418\"]'\n", + " --return 'xl as xlabel, yl as ylabel, r.similarity as sim'\n", + " \"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "ce658ebb", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [ + { + "data": { + "text/html": [ + "
xlabelylabelsimysent
'handball'@en'handball'@en1.0Handball (also known as team handball, European handball or Olympic handball) is a team sport in which two teams of seven players each (six outcourt players and a goalkeeper) pass a ball using their hands with the aim of throwing it into the goal of the other team.
'handball'@en'beach handball'@en0.8905559182167053Beach handball is a team sport where two teams pass and bounce or roll a ball, trying to throw it in the goal of the opposing team.
'handball'@en'volleyball injury'@en0.8481535315513611Volleyball is a game played between two opposing sides, with six players on each team, where the players use mainly their hands to hit the ball over a net and try to make the ball land on the opposing team\\'s side of the court.
'handball'@en'ball boy'@en0.8265464305877686Ball boys and ball girls, also known as ball kids are individuals, usually human youths but sometimes dogs, who retrieve and supply balls for players or officials in sports such as association football, American football, bandy, cricket, tennis, baseball and basketball.
'handball'@en'Balonpesado'@en0.8249591588973999The balonpesado is a team sport, devised for both open field as closed, in which two sets of five players each try to score goals within circles drawn on the ground of each end of the field.
'handball'@en'Screwball Scramble'@en0.8165739178657532Screwball Scramble is a toy made by Tomy that involves guiding a 14-millimeter-diameter chrome steel ball bearing around an obstacle course.
'handball'@en'sepak takraw'@en0.808526337146759Sepak takraw, or Sepaktakraw, also called kick volleyball, is a team sport played with a ball made of rattan or synthetic plastic between two teams of two to four players on a court resembling a badminton court.
'handball'@en'muggle quidditch'@en0.7998005151748657Quidditch, also known as quadball, is a sport of two teams of seven players each mounted on a broomstick, and is played on a hockey rink-sized pitch.
'handball'@en'tag'@en0.7951781749725342Tag (also called tig, it, tiggy, tips, tick, tip) is a playground game involving two or more players chasing other players in an attempt to \\"tag\\" and mark them out of play, usually by touching with a hand.
'handball'@en'dodgeball'@en0.7933317422866821Dodgeball is a team sport in which players on two teams try to throw balls and hit opponents, while avoiding being hit themselves.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "out = !kgtk query --gc $MAIN --ac $ABSTRACT \\\n", + " -i abstract -i labels -i sentence \\\n", + " --match 'abstract: (x)-[]->(xv), \\\n", + " (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), \\\n", + " labels: (x)-[]->(xl), (y)-[]->(yl), \\\n", + " sent: (y)-[]->(ys)' \\\n", + " --where 'x in [\"Q8418\"]' \\\n", + " --return 'xl as xlabel, yl as ylabel, r.similarity as sim, kgtk_lqstring_text(ys) as ysent' \\\n", + " / html\n", + "\n", + "show_html()" + ] + }, + { + "cell_type": "markdown", + "id": "161d0c0d", + "metadata": {}, + "source": [ + "### Journalist:" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "1dcb4117", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
xlabelylabelsim
0'journalist'@en'journalist'@en1.000000
1'journalist'@en'television presenter'@en0.806589
2'journalist'@en'writer'@en0.794790
3'journalist'@en'poet'@en0.785165
4'journalist'@en'playwright'@en0.775690
5'journalist'@en'politician'@en0.756889
6'journalist'@en'short story writer'@en0.756759
7'journalist'@en'actor'@en0.751951
8'journalist'@en'film critic'@en0.743542
9'journalist'@en'teacher'@en0.743155
\n", + "
" + ], + "text/plain": [ + " xlabel ylabel sim\n", + "0 'journalist'@en 'journalist'@en 1.000000\n", + "1 'journalist'@en 'television presenter'@en 0.806589\n", + "2 'journalist'@en 'writer'@en 0.794790\n", + "3 'journalist'@en 'poet'@en 0.785165\n", + "4 'journalist'@en 'playwright'@en 0.775690\n", + "5 'journalist'@en 'politician'@en 0.756889\n", + "6 'journalist'@en 'short story writer'@en 0.756759\n", + "7 'journalist'@en 'actor'@en 0.751951\n", + "8 'journalist'@en 'film critic'@en 0.743542\n", + "9 'journalist'@en 'teacher'@en 0.743155" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kgtk(\"\"\"\n", + " query --gc $MAIN --ac $COMPLEX\n", + " -i complex -i labels\n", + " --match 'complex: (x)-[]->(xv),\n", + " (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y),\n", + " labels: (x)-[]->(xl), (y)-[]->(yl)'\n", + " --where 'x in [\"Q1930187\"]'\n", + " --return 'xl as xlabel, yl as ylabel, r.similarity as sim'\n", + " \"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "6571a57e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
xlabelylabelsim
0'journalist'@en'journalist'@en1.000000
1'journalist'@en'Category:Journalists'@en0.719601
2'journalist'@en'journalistic scandal'@en0.693972
3'journalist'@en'children's writer'@en0.667045
4'journalist'@en'László Török'@en0.658004
5'journalist'@en'columnist'@en0.650447
6'journalist'@en'novelist'@en0.644886
7'journalist'@en'foreign correspondent'@en0.626349
8'journalist'@en'Category:Journalists of Ceará'@en0.650367
9'journalist'@en'business journalist'@en0.640736
\n", + "
" + ], + "text/plain": [ + " xlabel ylabel sim\n", + "0 'journalist'@en 'journalist'@en 1.000000\n", + "1 'journalist'@en 'Category:Journalists'@en 0.719601\n", + "2 'journalist'@en 'journalistic scandal'@en 0.693972\n", + "3 'journalist'@en 'children's writer'@en 0.667045\n", + "4 'journalist'@en 'László Török'@en 0.658004\n", + "5 'journalist'@en 'columnist'@en 0.650447\n", + "6 'journalist'@en 'novelist'@en 0.644886\n", + "7 'journalist'@en 'foreign correspondent'@en 0.626349\n", + "8 'journalist'@en 'Category:Journalists of Ceará'@en 0.650367\n", + "9 'journalist'@en 'business journalist'@en 0.640736" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kgtk(\"\"\"\n", + " query --gc $MAIN --ac $TRANSE\n", + " -i transe -i labels\n", + " --match 'transe: (x)-[]->(xv),\n", + " (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y),\n", + " labels: (x)-[]->(xl), (y)-[]->(yl)'\n", + " --where 'x in [\"Q1930187\"]'\n", + " --return 'xl as xlabel, yl as ylabel, r.similarity as sim'\n", + " \"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "57ed28f9", + "metadata": { + "lines_to_next_cell": 2 + }, + "outputs": [ + { + "data": { + "text/html": [ + "
xlabelylabelsimysent
'journalist'@en'journalist'@en0.9999999403953552A journalist is an individual that collects/gathers information in form of text, audio, or pictures, processes them into a news-worthy form, and disseminates it to the public.
'journalist'@en'technology journalism'@en0.8715080618858337Technology journalism is the activity, or product, of journalists engaged in the preparation of written, visual, audio or multi-media material intended for dissemination through public media, focusing on technology-related subjects.
'journalist'@en'Information subsidy'@en0.8664877414703369An information subsidy is the provision of ready-to-use newsworthy information to the news media by various sources interested in gaining access to media time and space.
'journalist'@en'media relations'@en0.8634818196296692Media Relations involves working with media for the purpose of informing the public of an organization\\'s mission, policies and practices in a positive, consistent and credible manner.
'journalist'@en'journalism'@en0.8631913065910339Journalism is the production and distribution of reports on the interaction of events, facts, ideas, and people that are the \\"news of the day\\" and that informs society to at least some degree.
'journalist'@en'Mediated deliberation'@en0.851024329662323Mediated deliberation is a form of deliberation that is achieved through the media which acts as a mediator between the mass public and elected officials.
'journalist'@en'news conference'@en0.8386815190315247A press conference or news conference is a media event in which notable individuals or organizations invite journalists to hear them speak and ask questions.
'journalist'@en'Media pilgrimage'@en0.8380250930786133A media pilgrimage refers to visits made to the sites mentioned in popular media.
'journalist'@en'press kit'@en0.836276650428772A press kit, often referred to as a media kit in business environments, is a pre-packaged set of promotional materials that provide information about a person, company, organization or cause and which is distributed to members of the media for promotional use.
'journalist'@en'multimedia journalism'@en0.8353787064552307Multimedia journalism is the practice of contemporary journalism that distributes news content either using two or more media formats via the Internet, or disseminating news report via multiple media platforms.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "out = !kgtk query --gc $MAIN --ac $ABSTRACT \\\n", + " -i abstract -i labels -i sentence \\\n", + " --match 'abstract: (x)-[]->(xv), \\\n", + " (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), \\\n", + " labels: (x)-[]->(xl), (y)-[]->(yl), \\\n", + " sent: (y)-[]->(ys)' \\\n", + " --where 'x in [\"Q1930187\"]' \\\n", + " --return 'xl as xlabel, yl as ylabel, r.similarity as sim, kgtk_lqstring_text(ys) as ysent' \\\n", + " / html\n", + "\n", + "show_html()" + ] + }, + { + "cell_type": "markdown", + "id": "395888d4", + "metadata": {}, + "source": [ + "### Head of state:" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "31aa9ffc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
xlabelylabelsim
0'head of state'@en'head of state'@en1.000000
1'head of state'@en'head of government'@en0.831704
2'head of state'@en'leader of organisation'@en0.715446
3'head of state'@en'governor'@en0.669001
4'head of state'@en'consul general'@en0.642665
5'head of state'@en'Floor leader'@en0.605335
6'head of state'@en'defence minister'@en0.691881
7'head of state'@en'French ambassador'@en0.636968
8'head of state'@en'supreme court justice'@en0.635844
9'head of state'@en'Executive Secretary of the Secretariat'@en0.568668
\n", + "
" + ], + "text/plain": [ + " xlabel ylabel sim\n", + "0 'head of state'@en 'head of state'@en 1.000000\n", + "1 'head of state'@en 'head of government'@en 0.831704\n", + "2 'head of state'@en 'leader of organisation'@en 0.715446\n", + "3 'head of state'@en 'governor'@en 0.669001\n", + "4 'head of state'@en 'consul general'@en 0.642665\n", + "5 'head of state'@en 'Floor leader'@en 0.605335\n", + "6 'head of state'@en 'defence minister'@en 0.691881\n", + "7 'head of state'@en 'French ambassador'@en 0.636968\n", + "8 'head of state'@en 'supreme court justice'@en 0.635844\n", + "9 'head of state'@en 'Executive Secretary of the Secretariat'@en 0.568668" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kgtk(\"\"\"\n", + " query --gc $MAIN --ac $COMPLEX\n", + " -i complex -i labels\n", + " --match 'complex: (x)-[]->(xv),\n", + " (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y),\n", + " labels: (x)-[]->(xl), (y)-[]->(yl)'\n", + " --where 'x in [\"Q48352\"]'\n", + " --return 'xl as xlabel, yl as ylabel, r.similarity as sim'\n", + " \"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "f67324bf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
xlabelylabelsim
0'head of state'@en'head of state'@en1.000000
1'head of state'@en'head of government'@en0.861243
2'head of state'@en'governor'@en0.792502
3'head of state'@en'prime minister'@en0.789789
4'head of state'@en'speaker'@en0.781205
5'head of state'@en'Governor-general'@en0.730634
6'head of state'@en'attorney general'@en0.713970
7'head of state'@en'colonial governor'@en0.707370
8'head of state'@en'foreign minister'@en0.703298
9'head of state'@en'minister'@en0.765075
\n", + "
" + ], + "text/plain": [ + " xlabel ylabel sim\n", + "0 'head of state'@en 'head of state'@en 1.000000\n", + "1 'head of state'@en 'head of government'@en 0.861243\n", + "2 'head of state'@en 'governor'@en 0.792502\n", + "3 'head of state'@en 'prime minister'@en 0.789789\n", + "4 'head of state'@en 'speaker'@en 0.781205\n", + "5 'head of state'@en 'Governor-general'@en 0.730634\n", + "6 'head of state'@en 'attorney general'@en 0.713970\n", + "7 'head of state'@en 'colonial governor'@en 0.707370\n", + "8 'head of state'@en 'foreign minister'@en 0.703298\n", + "9 'head of state'@en 'minister'@en 0.765075" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "kgtk(\"\"\"\n", + " query --gc $MAIN --ac $TRANSE\n", + " -i transe -i labels\n", + " --match 'transe: (x)-[]->(xv),\n", + " (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y),\n", + " labels: (x)-[]->(xl), (y)-[]->(yl)'\n", + " --where 'x in [\"Q48352\"]'\n", + " --return 'xl as xlabel, yl as ylabel, r.similarity as sim'\n", + " \"\"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "5033d596", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
xlabelylabelsimysent
'head of state'@en'head of state'@en1.0A head of state (or chief of state) is the public persona who officially embodies a state in its unity and legitimacy.
'head of state'@en'state religion'@en0.9016953706741333A state religion (also called an established religion or official religion) is a religion or creed officially endorsed by a sovereign state.
'head of state'@en'nation state'@en0.8929967284202576A nation state is a political unit where the state and nation are congruent.
'head of state'@en'Iman, Ittihad, Nazm'@en0.8915225863456726Faith, Unity, Discipline (Urdu: ایمان، اتحاد، نظم) is the national motto of Pakistan.
'head of state'@en'Freedom and Unity'@en0.890255331993103\\"Freedom and Unity\\" is the official motto of the U.S. state of Vermont.
'head of state'@en'Ukrainian nationalism'@en0.8886831402778625Ukrainian nationalism refers to the promotion of the unity of Ukrainians and the titular Ukraine nation state (and in a modern sense, also the \\"people of Ukraine\\" in a constitutionally mandated \\"territorial-civic\\" sense), as well as nation building as a means of strengthening and protecting state sovereignty within the international system of states.
'head of state'@en'Department for Constitutional Affairs'@en0.8855394124984741The Department for Constitutional Affairs (DCA) was a United Kingdom government department.
'head of state'@en'Most Excellent Majesty'@en0.8850839734077454Most Excellent Majesty is a form of address in the United Kingdom.
'head of state'@en'Official culture'@en0.8842212557792664Official culture is the culture that receives social legitimation or institutional support in a given society.
'head of state'@en'National Enterprise Board'@en0.8837820887565613The National Enterprise Board (NEB) was a United Kingdom government body.
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "out = !kgtk query --gc $MAIN --ac $ABSTRACT \\\n", + " -i abstract -i labels -i sentence \\\n", + " --match 'abstract: (x)-[]->(xv), \\\n", + " (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), \\\n", + " labels: (x)-[]->(xl), (y)-[]->(yl), \\\n", + " sent: (y)-[]->(ys)' \\\n", + " --where 'x in [\"Q48352\"]' \\\n", + " --return 'xl as xlabel, yl as ylabel, r.similarity as sim, kgtk_lqstring_text(ys) as ysent' \\\n", + " / html\n", + "\n", + "show_html()" + ] + } + ], + "metadata": { + "jupytext": { + "formats": "ipynb,py:light" + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/kypherv-similarity-queries.py b/examples/kypherv-similarity-queries.py new file mode 100644 index 000000000..d3ec0a252 --- /dev/null +++ b/examples/kypherv-similarity-queries.py @@ -0,0 +1,707 @@ +# --- +# jupyter: +# jupytext: +# formats: ipynb,py:light +# text_representation: +# extension: .py +# format_name: light +# format_version: '1.5' +# jupytext_version: 1.13.0 +# kernelspec: +# display_name: Python 3 (ipykernel) +# language: python +# name: python3 +# --- + +# # Query knowledge graphs and embeddings with KGTK Kypher-V + +# Kypher-V supports import and queries over vector data. Kypher-V extends +# Kypher to allow work with unstructured data such as text, images, and so +# on, represented by embedding vectors. Kypher-V provides efficient storage, +# indexing and querying of large-scale vector data on a laptop. 
It is fully +# integrated into Kypher to enable expressive hybrid queries over +# Wikidata-size structured and unstructured data. To the best of our +# knowledge, this is the first system providing such a functionality in a +# query language for knowledge graphs. + +# Please see the [**Kypher-V Manual**](https://kgtk.readthedocs.io/en/latest/transform/query/#kypher-v) +# for an introduction to the basic concepts and usage. + + +# +# ### Setup +# +# Some preliminaries to facilitate command invocation and result formatting: + +# + +import re +from IPython.display import display, HTML +from kgtk.functions import kgtk + +def show_html(img_width=150): + """Display command output in 'out' as HTML after munging image links for inline display.""" + output = '\n'.join(out) + html = re.sub(r'"(https?://upload.wikimedia.org/[^<]+)"', + f'', + output) + display(HTML(html)) + + +# - + +# This notebook contains a number of example queries using Kypher-V. The queries assume the existence of a number of similarity graph caches in the DB directory which are defined here via shell variables: + +DB="/kgtk-data/kypherv" +# %env DB={DB} +# %env MAIN={DB}/wikidata-20221102-dwd-v8-main.sqlite3.db +# %env COMPLEX={DB}/wikidata-20221102-dwd-v8-complex-embeddings.sqlite3.db +# %env TRANSE={DB}/wikidata-20221102-dwd-v8-transe-embeddings.sqlite3.db +# %env ABSTRACT={DB}/wikidata-20221102-dwd-v8-abstract-embeddings.sqlite3.db +# %env IMAGE={DB}/wikimedia-capcom-image-embeddings-v2.sqlite3.db + +# If you copied the graph caches to a different location, please adjust the +# paths and definitions accordingly. + +# Throughout the notebook we use a number of different invocation styles for +# the `kgtk` command to better control the appearance of the generated output. 
+# We either use it via the `!kgtk ...` syntax directly, use the `kgtk(...)` +# function which produces an HTML rendering of a Pandas frame containing the +# result, or we use the `show_html` function for some additional control on +# how long texts and inline images are displayed. All of these incantations +# should be straightforward to translate into a shell environment if needed. + + +# +# ### Similarity graph caches +# +# The examples in this notebook use a number of different standard and similarity +# graph caches based on `wikidata-20221102-dwd-v8`. These graph caches are +# available in the `DB` directory of the `ckg06` server from where they can be +# copied or accessed directly in example queries. It will generally not be +# possible to run the notebook directly from that server, so if you want to +# run and experiment with the notebook in a Jupyter environment, you have to +# copy the graph caches to a different location where a notebook server can be run. +# Make sure to also include the associated ANNS index files that end in +# a `.faiss.idx` extension. + +# This notebook also does not show how the individual similarity caches were +# constructed. To see how that can be done, please consult +# the [**Kypher-V Manual**](https://kgtk.readthedocs.io/en/latest/transform/query/#kypher-v) +# or look at the respective `*.db.build.txt` files in the `DB` directory. For reference, +# we show just one incantation here on how the `COMPLEX` graph cache was built.
Other +# graph caches were built similarly with some modifications to adjust for differences in +# the embedding data used (for `COMPLEX` this takes about 3 hours to run): + +# ``` +# $ export WD=.../datasets/wikidata-20221102-dwd-v8 +# +# $ cat $WD/wikidatadwd.complEx.graph-embeddings.txt | sed -e 's/ /\t/' \ +# | kgtk --debug add-id --no-input-header=False --input-column-names node1 node2 \ +# --implied-label emb \ +# / query --gc $DB/wikidata-20221102-dwd-v8-complex-embeddings.sqlite3.db \ +# -i - --as complex \ +# --idx vector:node2/nn/ram=25g/nlist=16k mode:valuegraph \ +# --single-user --limit 5 +# ``` + +# We use the following similarity graph caches which can be combined +# with a main graph cache using one or more `--auxiliary-cache` or `--ac` +# options. The `COMPLEX` graph cache contains 59M 100-D ComplEx +# graph embeddings: + +# !kgtk query --gc $COMPLEX --sc + +# The `TRANSE` graph cache contains 59M 100-D TransE graph embeddings: + +# !kgtk query --gc $TRANSE --sc + +# The `ABSTRACT` graph cache contains the sentences and embedding vectors +# generated from the first sentences of Wikipedia short abstracts. It +# contains about 6M 768-D Roberta base vectors: + +# !kgtk query --gc $ABSTRACT --sc + +# The `IMAGE` graph cache contains image embeddings published by the +# +# Wikipedia image/caption matching challenge. The embeddings are 2048-D vectors +# taken from the second-to-last layer of a ResNet-50 neural network trained with +# Imagenet data. We only use the 2.7M images associated with English Wikipedia +# pages. The resulting vector graph cache is shown here: + +# !kgtk query --gc $IMAGE --sc + +# Finally, we also use a standard Wikidata graph cache for the claims and +# labels of `wikidata-20221102-dwd-v8`. It is called `MAIN` below. + + +# +# ### Vector tables are regular KGTK files +# +# Any KGTK representation that associates a node or edge ID with a vector +# will work. 
A format we commonly use is where a `node1` points to a vector +# literal in `node2` via an `emb` edge (but any label will do). For example, +# here we show the first three embedding edges in `COMPLEX` (the `node2;_kgtk_vec_qcell` +# column is an auxiliary column automatically computed by ANNS indexing): + +kgtk("""query --gc $COMPLEX -i complex --limit 3""") + + +# +# ### Vector computation + +# The simplest operation in Kypher-V is a similarity computation between two vectors +# which we perform here using the `ABSTRACT` graph cache: + +kgtk(""" + query --gc $MAIN --ac $ABSTRACT + -i abstract -i labels + --match 'abstract: (x:Q868)-[]->(xv), + (y:Q913)-[]->(yv), + labels: (x)-[]->(xl), (y)-[]->(yl)' + --return 'xl as xlabel, yl as ylabel, kvec_cos_sim(xv, yv) as sim' + """) + + +# +# ### Brute-force similarity search + +# A more interesting operation is *similarity search* where we look +# for the most similar matches for a given seed. In the query below, we +# use a simple but expensive brute-force search over about 10,000 input +# vectors by computing similarities between `x` and each possible `y`, +# then sorting and returning the top-10. 
This is still pretty fast +# given that the set of inputs is fairly small: + +kgtk(""" + query --gc $MAIN --ac $ABSTRACT + -i abstract -i labels -i claims + --match 'abstract: (x:Q913)-[]->(xv), (y)-[]->(yv), + claims: (y)-[:P106]->(:Q4964182), + labels: (x)-[]->(xl), (y)-[]->(yl)' + --return 'xl as xlabel, yl as ylabel, kvec_cos_sim(xv, yv) as sim' + --order 'sim desc' + --limit 10 + """) + +# There are about 9M Q5's (humans) that have short abstract vectors: + +kgtk(""" + query --gc $MAIN --ac $ABSTRACT + -i abstract -i labels -i claims + --match 'abstract: (x:Q913)-[]->(xv), + claims: (y)-[:P31]->(:Q5)' + --return 'count(distinct y)' --force + """) + + +# If we used the same brute-force search from above on this much larger set, +# it would take about 5 min to run (which is why this command is disabled): + +# !time DISABLED kgtk query --gc $MAIN \ +# --ac $ABSTRACT \ +# -i abstract -i labels -i claims \ +# --match 'abstract: (x:Q913)-[]->(xv), (y)-[]->(yv), \ +# claims: (y)-[:P31]->(:Q5), \ +# labels: (x)-[]->(xl), (y)-[]->(yl)' \ +# --return 'xl as xlabel, yl as ylabel, kvec_cos_sim(xv, yv) as sim' \ +# --order 'sim desc' \ +# --limit 10 + +# ``` +# xlabel ylabel sim +# 'Socrates'@en 'Socrates'@en 1.0000001192092896 +# 'Socrates'@en 'Anytus'@en 0.9346579909324646 +# 'Socrates'@en 'Heraclitus'@en 0.9344534277915955 +# 'Socrates'@en 'Hippocrates'@en 0.9304061532020569 +# 'Socrates'@en 'Cleisthenes'@en 0.9292828440666199 +# 'Socrates'@en 'Aristides'@en 0.9283562898635864 +# 'Socrates'@en 'Yannis Xirotiris'@en 0.926308274269104 +# 'Socrates'@en 'Sotiris Trivizas'@en 0.9255445003509521 +# 'Socrates'@en 'Aris Maragkopoulos'@en 0.9234243035316467 +# 'Socrates'@en 'Valerios Stais'@en 0.919943630695343 +# 93.859u 38.640s 4:49.84 45.7% 0+0k 18782808+8io 0pf+0w +# ``` + +# +# ### Indexed similarity search + +# For much faster search, we use an ANNS index constructed when the vector data +# was imported which now runs in less than a second compared to 5 minutes before. 
+# Results here are slightly different from above, since it does not restrict on +# occupation = philosopher (we will address that later): + +kgtk(""" + query --gc $MAIN --ac $ABSTRACT + -i abstract -i labels -i claims + --match 'abstract: (x:Q913)-[]->(xv), + (xv)-[r:kvec_topk_cos_sim {k: 5, nprobe: 4}]->(y), + labels: (x)-[]->(xl), (y)-[]->(yl)' + --return 'xl as xlabel, yl as ylabel, r.similarity as sim' + --limit 10 + """) + + +# +# ### Full similarity join +# +# Below we query for three philosophers' top-k similar neighbors that are also humans and have +# occupation (`P106`) philosopher. Dynamic scaling ensures that `k` gets increased dynamically +# up to `maxk` until we've found enough qualifying results for each: + +kgtk(""" + query --gc $MAIN --ac $ABSTRACT + -i abstract -i labels -i claims + --match 'abstract: (x)-[]->(xv), + (xv)-[r:kvec_topk_cos_sim {k: 5, maxk: 1024, nprobe: 4}]->(y), + claims: (y)-[:P106]->(:Q4964182), + (y)-[:P31]->(:Q5), + labels: (x)-[]->(xl), (y)-[]->(yl)' + --where 'x in ["Q859", "Q868", "Q913"] and x != y' + --return 'xl as xlabel, yl as ylabel, r.similarity as sim' + """) + + +# For comparison, here is a run without dynamic scaling which returns much fewer results, since +# only a small number of the top-5 similar results for each input also satisfy the post conditions: + +kgtk(""" + query --gc $MAIN --ac $ABSTRACT + -i abstract -i labels -i claims + --match 'abstract: (x)-[]->(xv), + (xv)-[r:kvec_topk_cos_sim {k: 5, nprobe: 4}]->(y), + claims: (y)-[:P106]->(:Q4964182), + (y)-[:P31]->(:Q5), + labels: (x)-[]->(xl), (y)-[]->(yl)' + --where 'x in ["Q859", "Q868", "Q913"] and x != y' + --return 'xl as xlabel, yl as ylabel, r.similarity as sim' + """) + + +# +# ## Example applications + +# ### Image search + +# In the examples below, we use image similarity to link QNodes in Wikidata. 
We +# use the precomputed `IMAGE` graph cache (see above) which contains embeddings +# for about 2.7M images linked to their respective Wikipedia pages and Wikidata +# QNodes. + +# We start with a QNode (such as the one for Barack Obama below), find one or more +# images associated with that QNode, look up their image embeddings and then find +# other similar images and their associated QNodes. + +# We do not compute any image embeddings on the fly here, we simply link nodes based +# on similarity of images they are associated with. Note that this will often not +# preserve the type of the source node as can be seen in the result for Barack Obama. +# To enforce such type or other restrictions, additional clauses can be added. +# Since there are multiple images associated with Barack Obama, we use a `not exists` +# clause to only look at the first one to make the results less cluttered: +# +# Barack Obama: + +# + +# out = !kgtk query --gc $IMAGE --ac $MAIN \ +# -i wiki_image -i labels \ +# --match 'image: (ximg)-[rx {qnode: $SEED}]->(xiv), \ +# (xiv)-[r:kvec_topk_cos_sim {k: 10, nprobe: 8}]->(yimg), \ +# (yimg)-[ry {qnode: y}]->(), \ +# labels: (y)-[]->(ylabel)' \ +# --where 'not exists {image: (ximg2)-[{qnode: $SEED}]->() WHERE rowid(ximg2) < rowid(ximg) }' \ +# --return 'y as qnode, ylabel as label, printf("%.5g", r.similarity) as sim, yimg as image' \ +# --para SEED=Q76 \ +# / html + +show_html(img_width=200) +# - + +# To get more type appropriate matches, we can add a restriction to only return matches of +# type human (`Q5`): + +# + +# out = !kgtk query --gc $IMAGE --ac $MAIN \ +# -i wiki_image -i labels -i claims \ +# --match 'image: (ximg)-[rx {qnode: $SEED}]->(xiv), \ +# (xiv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(yimg), \ +# (yimg)-[ry {qnode: y}]->(), \ +# claims: (y)-[:P31]->(:Q5), \ +# labels: (y)-->(ylabel)' \ +# --where 'not exists {image: (ximg2)-[{qnode: $SEED}]->() WHERE rowid(ximg2) < rowid(ximg) }' \ +# --return 'y as qnode, ylabel
as label, printf("%.5g", r.similarity) as sim, yimg as image' \ +# --para SEED=Q76 \ +# / html + +show_html(img_width=200) +# - + +# Charles Dadant: again, note that some of the results are not of type human but are +# just linked to a similar image: + +# + +# out = !kgtk query --gc $IMAGE --ac $MAIN \ +# -i wiki_image -i labels \ +# --match 'image: (ximg)-[rx {qnode: $SEED}]->(xiv), \ +# (xiv)-[r:kvec_topk_cos_sim {k: 10, nprobe: 8}]->(yimg), \ +# (yimg)-[ry {qnode: y}]->(), \ +# labels: (y)-[]->(ylabel)' \ +# --where 'not exists {image: (ximg2)-[{qnode: $SEED}]->() WHERE rowid(ximg2) < rowid(ximg) }' \ +# --return 'y as qnode, ylabel as label, printf("%.5g", r.similarity) as sim, yimg as image' \ +# --para SEED=Q582964 \ +# --limit 20 \ +# / html + +show_html(img_width=100) +# - + +# Beaumaris Castle in Wales: + +# + +# out = !kgtk query --gc $IMAGE --ac $MAIN \ +# -i wiki_image -i labels \ +# --match 'image: (ximg)-[rx {qnode: $SEED}]->(xiv), \ +# (xiv)-[r:kvec_topk_cos_sim {k: 20, nprobe: 8}]->(yimg), \ +# (yimg)-[ry {qnode: y}]->(), \ +# labels: (y)-[]->(ylabel)' \ +# --where 'not exists {image: (ximg2)-[{qnode: $SEED}]->() WHERE rowid(ximg2) < rowid(ximg) }' \ +# --return 'y as qnode, ylabel as label, printf("%.5g", r.similarity) as sim, yimg as image' \ +# --para SEED=Q756815 \ +# / html + +show_html() +# - + + +# + +# Castles similar to Beaumaris Castle but that are located in Austria (with +# country (`P17`) equal to `Q40`). We use a full vector join to get relevant +# results further down the similarity list. 
Note that even with `maxk=1024` we only +# get a few results, and that the similarities are significantly lower than in the +# previous example: + +# + +# out = !kgtk query --gc $IMAGE --ac $MAIN \ +# -i wiki_image -i labels -i claims \ +# --match 'image: (ximg)-[rx {qnode: $SEED}]->(xiv), \ +# (xiv)-[r:kvec_topk_cos_sim {k: 20, nprobe: 4, maxk: 1024}]->(yimg), \ +# (yimg)-[ry {qnode: y}]->(), \ +# labels: (y)-[]->(ylabel), \ +# claims: (y)-[:P17]->(c:Q40)' \ +# --where 'not exists {image: (ximg2)-[{qnode: $SEED}]->() WHERE rowid(ximg2) < rowid(ximg) }' \ +# --return 'y as qnode, ylabel as label, printf("%.5g", r.similarity) as sim, yimg as image' \ +# --para SEED=Q756815 \ +# --limit 20 \ +# / html + +show_html() +# - + + +# +# ## Text embedding queries: + +# In the following example we dynamically compute an embedding vector +# for a text query and then use the similarity machinery to query for +# matching QNodes. The basic story here is the following: + +# - formulate a simple textual query such as 'Ancient Greek philosopher' +# - create a KGTK input file for it/them and run them through the 'text-embedding' command +# - query WD by finding top-k matches based on short abstract text embedding vectors +# - then filter with additional restrictions to get more relevant results. + +# !echo '\ +# q1 Ancient Greek philosopher\n\ +# q2 castle in Austria\n\ +# q3 award-winning actor and comedian' | \ +# sed -e 's/^ *//' | \ +# kgtk cat --no-input-header --input-column-names node1 node2 --implied-label sentence \ +# / add-id \ +# / text-embedding -i - --model roberta-base-nli-mean-tokens \ +# --output-data-format kgtk --output-property emb -o - \ +# / query -i - --idx vector:node2 --as text_emb_queries --match '(x)' --return x + +# The above created 768-D text embedding vectors for three short queries +# using the same text embedding type as used in our `ABSTRACT` embeddings.
+# Now we find Wikidata QNodes whose short-abstract embedding vector is most similar +# to the queries, and that satisfy any additional conditions we might have. +# Note that the queries in this example are much shorter than the first sentences +# of our Wikipedia abstracts, thus the similarity matching is not very good, but +# we can compensate for some of that by adding additional restrictions: + +# Matches for "Ancient Greek philosopher" that have occupation (`P106`) philosopher: + +# + +# out = !kgtk query --ac $MAIN --ac $ABSTRACT \ +# -i text_emb_queries -i abstract -i labels -i claims -i sentence \ +# --match 'queries: (x:q1)-[]->(xv), \ +# abstract: (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 4}]->(y), \ +# claims: (y)-[:P106]->(:Q4964182), \ +# labels: (y)-->(yl), \ +# sentence: (y)-->(ys)' \ +# --return 'y as y, yl as ylabel, r.similarity as sim, kgtk_lqstring_text(ys) as ysent' \ +# / html + +show_html() +# - + +# Matches for "castle in Austria" that have country (`P17`) Austria: + +# + +# out = !kgtk query --ac $MAIN --ac $ABSTRACT \ +# -i text_emb_queries -i abstract -i labels -i claims -i sentence \ +# --match 'queries: (x:q2)-[]->(xv), \ +# abstract: (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), \ +# claims: (y)-[:P17]->(:Q40), \ +# labels: (y)-->(yl), \ +# sentence: (y)-->(ys)' \ +# --return 'y as y, yl as ylabel, r.similarity as sim, kgtk_lqstring_text(ys) as ysent' \ +# / html + +show_html() +# - + +# Matches for "award-winning actor and comedian" that are of type human +# and have country of citizenship (`P27`) UK: + +# + +# out = !kgtk query --ac $MAIN --ac $ABSTRACT \ +# -i text_emb_queries -i abstract -i labels -i claims -i sentence \ +# --match 'queries: (x:q3)-[]->(xv), \ +# abstract: (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), \ +# claims: (y)-[:P31]->(:Q5), \ +# (y)-[:P27]->(:Q145), \ +# labels: (y)-->(yl), \ +# sentence: (y)-->(ys)' \ +# --return 'y as y, yl as ylabel, r.similarity as sim, 
kgtk_lqstring_text(ys) as ysent' \ +# / html + +show_html() +# - + + +# +# ## Comparing different types of embeddings + +# Below we run a number of similarity queries for each of our various types of +# embeddings to see how they behave relative to each other. Note how they +# behave quite differently, reasonable for some use cases but not so much for others: + +# ### Philosophers: + +kgtk(""" + query --gc $MAIN --ac $COMPLEX + -i complex -i labels + --match 'complex: (x)-[]->(xv), + (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), + labels: (x)-[]->(xl), (y)-[]->(yl)' + --where 'x in ["Q859", "Q868", "Q913"]' + --return 'xl as xlabel, yl as ylabel, r.similarity as sim' + """) + +kgtk(""" + query --gc $MAIN --ac $TRANSE + -i transe -i labels + --match 'transe: (x)-[]->(xv), + (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), + labels: (x)-[]->(xl), (y)-[]->(yl)' + --where 'x in ["Q859", "Q868", "Q913"]' + --return 'xl as xlabel, yl as ylabel, r.similarity as sim' + """) + +# + +# out = !kgtk query --gc $MAIN --ac $ABSTRACT \ +# -i abstract -i labels -i sentence \ +# --match 'abstract: (x)-[]->(xv), \ +# (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), \ +# labels: (x)-[]->(xl), (y)-[]->(yl), \ +# sent: (y)-[]->(ys)' \ +# --where 'x in ["Q859", "Q868", "Q913"]' \ +# --return 'xl as xlabel, yl as ylabel, r.similarity as sim, kgtk_lqstring_text(ys) as ysent' \ +# / html + +show_html() +# - + + +# ### Countries: + +kgtk(""" + query --gc $MAIN --ac $COMPLEX + -i complex -i labels + --match 'complex: (x)-[]->(xv), + (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), + labels: (x)-[]->(xl), (y)-[]->(yl)' + --where 'x in ["Q40", "Q41", "Q30"]' + --return 'xl as xlabel, yl as ylabel, r.similarity as sim' + """) + +kgtk(""" + query --gc $MAIN --ac $TRANSE + -i transe -i labels + --match 'transe: (x)-[]->(xv), + (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), + labels: (x)-[]->(xl), (y)-[]->(yl)' + 
--where 'x in ["Q40", "Q41", "Q30"]' + --return 'xl as xlabel, yl as ylabel, r.similarity as sim' + """) + +# + +# out = !kgtk query --gc $MAIN --ac $ABSTRACT \ +# -i abstract -i labels -i sentence \ +# --match 'abstract: (x)-[]->(xv), \ +# (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), \ +# labels: (x)-[]->(xl), (y)-[]->(yl), \ +# sent: (y)-[]->(ys)' \ +# --where 'x in ["Q40", "Q41", "Q30"]' \ +# --return 'xl as xlabel, yl as ylabel, r.similarity as sim, kgtk_lqstring_text(ys) as ysent' \ +# / html + +show_html() +# - + + +# ### Types of animals: + +kgtk(""" + query --gc $MAIN --ac $COMPLEX + -i complex -i labels + --match 'complex: (x)-[]->(xv), + (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), + labels: (x)-[]->(xl), (y)-[]->(yl)' + --where 'x in ["Q144", "Q146", "Q726"]' + --return 'xl as xlabel, yl as ylabel, r.similarity as sim' + """) + +kgtk(""" + query --gc $MAIN --ac $TRANSE + -i transe -i labels + --match 'transe: (x)-[]->(xv), + (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), + labels: (x)-[]->(xl), (y)-[]->(yl)' + --where 'x in ["Q144", "Q146", "Q726"]' + --return 'xl as xlabel, yl as ylabel, r.similarity as sim' + """) + +# + +# out = !kgtk query --gc $MAIN --ac $ABSTRACT \ +# -i abstract -i labels -i sentence \ +# --match 'abstract: (x)-[]->(xv), \ +# (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), \ +# labels: (x)-[]->(xl), (y)-[]->(yl), \ +# sent: (y)-[]->(ys)' \ +# --where 'x in ["Q144", "Q146", "Q726"]' \ +# --return 'xl as xlabel, yl as ylabel, r.similarity as sim, kgtk_lqstring_text(ys) as ysent' \ +# / html + +show_html() +# - + + +# ### Handball: + +kgtk(""" + query --gc $MAIN --ac $COMPLEX + -i complex -i labels + --match 'complex: (x)-[]->(xv), + (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), + labels: (x)-[]->(xl), (y)-[]->(yl)' + --where 'x in ["Q8418"]' + --return 'xl as xlabel, yl as ylabel, r.similarity as sim' + """) + +kgtk(""" + query --gc $MAIN --ac 
$TRANSE + -i transe -i labels + --match 'transe: (x)-[]->(xv), + (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), + labels: (x)-[]->(xl), (y)-[]->(yl)' + --where 'x in ["Q8418"]' + --return 'xl as xlabel, yl as ylabel, r.similarity as sim' + """) + +# + +# out = !kgtk query --gc $MAIN --ac $ABSTRACT \ +# -i abstract -i labels -i sentence \ +# --match 'abstract: (x)-[]->(xv), \ +# (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), \ +# labels: (x)-[]->(xl), (y)-[]->(yl), \ +# sent: (y)-[]->(ys)' \ +# --where 'x in ["Q8418"]' \ +# --return 'xl as xlabel, yl as ylabel, r.similarity as sim, kgtk_lqstring_text(ys) as ysent' \ +# / html + +show_html() +# - + + +# ### Journalist: + +kgtk(""" + query --gc $MAIN --ac $COMPLEX + -i complex -i labels + --match 'complex: (x)-[]->(xv), + (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), + labels: (x)-[]->(xl), (y)-[]->(yl)' + --where 'x in ["Q1930187"]' + --return 'xl as xlabel, yl as ylabel, r.similarity as sim' + """) + +kgtk(""" + query --gc $MAIN --ac $TRANSE + -i transe -i labels + --match 'transe: (x)-[]->(xv), + (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), + labels: (x)-[]->(xl), (y)-[]->(yl)' + --where 'x in ["Q1930187"]' + --return 'xl as xlabel, yl as ylabel, r.similarity as sim' + """) + +# + +# out = !kgtk query --gc $MAIN --ac $ABSTRACT \ +# -i abstract -i labels -i sentence \ +# --match 'abstract: (x)-[]->(xv), \ +# (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), \ +# labels: (x)-[]->(xl), (y)-[]->(yl), \ +# sent: (y)-[]->(ys)' \ +# --where 'x in ["Q1930187"]' \ +# --return 'xl as xlabel, yl as ylabel, r.similarity as sim, kgtk_lqstring_text(ys) as ysent' \ +# / html + +show_html() +# - + + +# ### Head of state: + +kgtk(""" + query --gc $MAIN --ac $COMPLEX + -i complex -i labels + --match 'complex: (x)-[]->(xv), + (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), + labels: (x)-[]->(xl), (y)-[]->(yl)' + --where 'x in 
["Q48352"]' + --return 'xl as xlabel, yl as ylabel, r.similarity as sim' + """) + +kgtk(""" + query --gc $MAIN --ac $TRANSE + -i transe -i labels + --match 'transe: (x)-[]->(xv), + (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), + labels: (x)-[]->(xl), (y)-[]->(yl)' + --where 'x in ["Q48352"]' + --return 'xl as xlabel, yl as ylabel, r.similarity as sim' + """) + +# + +# out = !kgtk query --gc $MAIN --ac $ABSTRACT \ +# -i abstract -i labels -i sentence \ +# --match 'abstract: (x)-[]->(xv), \ +# (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), \ +# labels: (x)-[]->(xl), (y)-[]->(yl), \ +# sent: (y)-[]->(ys)' \ +# --where 'x in ["Q48352"]' \ +# --return 'xl as xlabel, yl as ylabel, r.similarity as sim, kgtk_lqstring_text(ys) as ysent' \ +# / html + +show_html() From c06318870ffe123b2e1b38ebdb44953858f569fc Mon Sep 17 00:00:00 2001 From: Hans Chalupsky Date: Sat, 14 Jan 2023 12:39:51 -0800 Subject: [PATCH 3/5] Fixed Q5 count query Various other language and query syntax cleanups. --- examples/kypherv-similarity-queries.ipynb | 169 +++++++++++----------- examples/kypherv-similarity-queries.py | 157 ++++++++++---------- 2 files changed, 164 insertions(+), 162 deletions(-) diff --git a/examples/kypherv-similarity-queries.ipynb b/examples/kypherv-similarity-queries.ipynb index 918366c69..b6788dfa5 100644 --- a/examples/kypherv-similarity-queries.ipynb +++ b/examples/kypherv-similarity-queries.ipynb @@ -70,7 +70,8 @@ "id": "aae41154", "metadata": {}, "source": [ - "This notebook contains a number of example queries using Kypher-V. 
The queries assume the existence of a number of similarity graph caches in the DB directory which are defined here via shell variables:" + "The Kypher-V example queries in this notebook assume the existence of a number of similarity\n", + "graph caches in the `DB` directory, which are all defined here via shell variables:" ] }, { @@ -107,8 +108,8 @@ "id": "98a8081a", "metadata": {}, "source": [ - "If you copied the graph caches to a different location, please adjust the\n", - "paths and definitions accordingly." + "If you copied the graph caches and their associated `.faiss.idx` ANNS index files\n", + "to a different location, please adjust the paths and definitions accordingly." ] }, { @@ -118,7 +119,7 @@ "lines_to_next_cell": 2 }, "source": [ - "Throughout the notebook we use a number of different invocation styles for\n", + "Throughout the notebook we use three different invocation styles for\n", "the `kgtk` command to better control the appearance of the generated output.\n", "We either use it via the `!kgtk ...` syntax directly, use the `kgtk(...)`\n", "function which produces an HTML rendering of a Pandas frame containing the\n", @@ -135,14 +136,14 @@ "\n", "### Similarity graph caches\n", "\n", - "The examples in this notebook use a number of different standard and similarity\n", + "The examples in this notebook rely on several standard and similarity\n", "graph caches based on `wikidata-20221102-dwd-v8`. These graph caches are\n", "available in the `DB` directory of the `ckg06` server from where they can be\n", "copied or accessed directly in example queries. 
It will generally not be\n", "possible to run the notebook directly from that server, so if you want to\n", "run and experiment with the notebook in a Jupyter environment, you have to\n", "copy the graph caches to a different location where a notebook server can be run.\n", - "Make sure to also include the associated ANNS index files that end in\n", + "In this case, make sure to also copy the associated ANNS index files that end in\n", "a `.faiss.idx` extension." ] }, @@ -151,13 +152,13 @@ "id": "f0f027c2", "metadata": {}, "source": [ - "This notebook also does not show how the individual similarity caches were\n", + "This notebook does not show how the individual similarity caches were\n", "constructed. To see how that can be done, please consult\n", "the [**Kypher-V Manual**](https://kgtk.readthedocs.io/en/latest/transform/query/#kypher-v)\n", "or look at the respective `*.db.build.txt` files in the `DB` directory. For reference,\n", "we show just one incantation here on how the `COMPLEX` graph cache was built. Other\n", "graph caches were built similarly with some modifications to adjust for differences in\n", - "the embedding data used (for `COMPLEX` this takes about 3 hours to run):" + "the embedding data used (for `COMPLEX` this takes about 2.5-3 hours to run on a laptop):" ] }, { @@ -185,8 +186,8 @@ "source": [ "We use the following similarity graph caches which can be combined\n", "with a main graph cache using one or more `--auxiliary-cache` or `--ac`\n", - "options. The `COMPLEX` graph cache contains 59M 100-D ComplEx\n", - "graph embeddings:" + "options to the `query` command. The `COMPLEX` graph cache contains\n", + "59M 100-D ComplEx graph embeddings:" ] }, { @@ -361,7 +362,7 @@ "### Vector tables are regular KGTK files\n", "\n", "Any KGTK representation that associates a node or edge ID with a vector\n", - "will work. A format we commonly use is where a `node1` points to a vector\n", + "will work. 
An edge format we commonly use is a `node1` pointing to a vector\n", "literal in `node2` via an `emb` edge (but any label will do). For example,\n", "here we show the first three embedding edges in `COMPLEX` (the `node2;_kgtk_vec_qcell`\n", "column is an auxiliary column automatically computed by ANNS indexing):" @@ -530,9 +531,9 @@ "kgtk(\"\"\" \n", " query --gc $MAIN --ac $ABSTRACT\n", " -i abstract -i labels\n", - " --match 'abstract: (x:Q868)-[]->(xv),\n", - " (y:Q913)-[]->(yv),\n", - " labels: (x)-[]->(xl), (y)-[]->(yl)'\n", + " --match 'abstract: (x:Q868)-->(xv),\n", + " (y:Q913)-->(yv),\n", + " labels: (x)-->(xl), (y)-->(yl)'\n", " --return 'xl as xlabel, yl as ylabel, kvec_cos_sim(xv, yv) as sim'\n", " \"\"\")" ] @@ -679,9 +680,9 @@ "kgtk(\"\"\"\n", " query --gc $MAIN --ac $ABSTRACT\n", " -i abstract -i labels -i claims\n", - " --match 'abstract: (x:Q913)-[]->(xv), (y)-[]->(yv),\n", + " --match 'abstract: (x:Q913)-->(xv), (y)-->(yv),\n", " claims: (y)-[:P106]->(:Q4964182),\n", - " labels: (x)-[]->(xl), (y)-[]->(yl)'\n", + " labels: (x)-->(xl), (y)-->(yl)'\n", " --return 'xl as xlabel, yl as ylabel, kvec_cos_sim(xv, yv) as sim'\n", " --order 'sim desc'\n", " --limit 10\n", @@ -693,12 +694,12 @@ "id": "7cef3f07", "metadata": {}, "source": [ - "There are about 9M Q5's (humans) that have short abstract vectors:" + "There are about 9M Q5's (humans) in Wikidata, 1.8M of which have short abstract vectors:" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 5, "id": "d4a26fd8", "metadata": { "lines_to_next_cell": 2 @@ -725,24 +726,24 @@ " \n", " \n", " \n", - " count(DISTINCT graph_1_c2.\"node1\")\n", + " count(DISTINCT db1_graph_1_c1.\"node1\")\n", " \n", " \n", " \n", " \n", " 0\n", - " 8944218\n", + " 1801483\n", " \n", " \n", "\n", "" ], "text/plain": [ - " count(DISTINCT graph_1_c2.\"node1\")\n", - "0 8944218" + " count(DISTINCT db1_graph_1_c1.\"node1\")\n", + "0 1801483" ] }, - "execution_count": 10, + "execution_count": 5, "metadata": 
{}, "output_type": "execute_result" } @@ -750,10 +751,10 @@ "source": [ "kgtk(\"\"\"\n", " query --gc $MAIN --ac $ABSTRACT\n", - " -i abstract -i labels -i claims\n", - " --match 'abstract: (x:Q913)-[]->(xv),\n", - " claims: (y)-[:P31]->(:Q5)'\n", - " --return 'count(distinct y)' --force\n", + " -i abstract -i claims\n", + " --match 'abstract: (x)-->(),\n", + " claims: (x)-[:P31]->(:Q5)'\n", + " --return 'count(distinct x)'\n", " \"\"\")" ] }, @@ -776,9 +777,9 @@ "!time DISABLED kgtk query --gc $MAIN \\\n", " --ac $ABSTRACT \\\n", " -i abstract -i labels -i claims \\\n", - " --match 'abstract: (x:Q913)-[]->(xv), (y)-[]->(yv), \\\n", + " --match 'abstract: (x:Q913)-->(xv), (y)-->(yv), \\\n", " claims: (y)-[:P31]->(:Q5), \\\n", - " labels: (x)-[]->(xl), (y)-[]->(yl)' \\\n", + " labels: (x)-->(xl), (y)-->(yl)' \\\n", " --return 'xl as xlabel, yl as ylabel, kvec_cos_sim(xv, yv) as sim' \\\n", " --order 'sim desc' \\\n", " --limit 10" @@ -912,9 +913,9 @@ "kgtk(\"\"\"\n", " query --gc $MAIN --ac $ABSTRACT\n", " -i abstract -i labels -i claims\n", - " --match 'abstract: (x:Q913)-[]->(xv),\n", + " --match 'abstract: (x:Q913)-->(xv),\n", " (xv)-[r:kvec_topk_cos_sim {k: 5, nprobe: 4}]->(y),\n", - " labels: (x)-[]->(xl), (y)-[]->(yl)'\n", + " labels: (x)-->(xl), (y)-->(yl)'\n", " --return 'xl as xlabel, yl as ylabel, r.similarity as sim'\n", " --limit 10\n", " \"\"\")" @@ -1090,11 +1091,11 @@ "kgtk(\"\"\"\n", " query --gc $MAIN --ac $ABSTRACT\n", " -i abstract -i labels -i claims\n", - " --match 'abstract: (x)-[]->(xv),\n", + " --match 'abstract: (x)-->(xv),\n", " (xv)-[r:kvec_topk_cos_sim {k: 5, maxk: 1024, nprobe: 4}]->(y),\n", " claims: (y)-[:P106]->(:Q4964182),\n", " (y)-[:P31]->(:Q5),\n", - " labels: (x)-[]->(xl), (y)-[]->(yl)'\n", + " labels: (x)-->(xl), (y)-->(yl)'\n", " --where 'x in [\"Q859\", \"Q868\", \"Q913\"] and x != y'\n", " --return 'xl as xlabel, yl as ylabel, r.similarity as sim'\n", " \"\"\")" @@ -1231,11 +1232,11 @@ "kgtk(\"\"\"\n", " query --gc $MAIN --ac 
$ABSTRACT\n", " -i abstract -i labels -i claims\n", - " --match 'abstract: (x)-[]->(xv),\n", + " --match 'abstract: (x)-->(xv),\n", " (xv)-[r:kvec_topk_cos_sim {k: 5, nprobe: 4}]->(y),\n", " claims: (y)-[:P106]->(:Q4964182),\n", " (y)-[:P31]->(:Q5),\n", - " labels: (x)-[]->(xl), (y)-[]->(yl)'\n", + " labels: (x)-->(xl), (y)-->(yl)'\n", " --where 'x in [\"Q859\", \"Q868\", \"Q913\"] and x != y'\n", " --return 'xl as xlabel, yl as ylabel, r.similarity as sim'\n", " \"\"\")" @@ -1319,7 +1320,7 @@ " --match 'image: (ximg)-[rx {qnode: $SEED}]->(xiv), \\\n", " (xiv)-[r:kvec_topk_cos_sim {k: 10, nprobe: 8}]->(yimg), \\\n", " (yimg)-[ry {qnode: y}]->(), \\\n", - " labels: (y)-[]->(ylabel)' \\\n", + " labels: (y)-->(ylabel)' \\\n", " --where 'not exists {image: (ximg2)-[{qnode: $SEED}]->() WHERE rowid(ximg2) < rowid(ximg) }' \\\n", " --return 'y as qnode, ylabel as label, printf(\"%.5g\", r.similarity) as sim, yimg as image' \\\n", " --para SEED=Q76 \\\n", @@ -1406,7 +1407,7 @@ " --match 'image: (ximg)-[rx {qnode: $SEED}]->(xiv), \\\n", " (xiv)-[r:kvec_topk_cos_sim {k: 10, nprobe: 8}]->(yimg), \\\n", " (yimg)-[ry {qnode: y}]->(), \\\n", - " labels: (y)-[]->(ylabel)' \\\n", + " labels: (y)-->(ylabel)' \\\n", " --where 'not exists {image: (ximg2)-[{qnode: $SEED}]->() WHERE rowid(ximg2) < rowid(ximg) }' \\\n", " --return 'y as qnode, ylabel as label, printf(\"%.5g\", r.similarity) as sim, yimg as image' \\\n", " --para SEED=Q582964 \\\n", @@ -1451,7 +1452,7 @@ " --match 'image: (ximg)-[rx {qnode: $SEED}]->(xiv), \\\n", " (xiv)-[r:kvec_topk_cos_sim {k: 20, nprobe: 8}]->(yimg), \\\n", " (yimg)-[ry {qnode: y}]->(), \\\n", - " labels: (y)-[]->(ylabel)' \\\n", + " labels: (y)-->(ylabel)' \\\n", " --where 'not exists {image: (ximg2)-[{qnode: $SEED}]->() WHERE rowid(ximg2) < rowid(ximg) }' \\\n", " --return 'y as qnode, ylabel as label, printf(\"%.5g\", r.similarity) as sim, yimg as image' \\\n", " --para SEED=Q756815 \\\n", @@ -1507,7 +1508,7 @@ " --match 'image: (ximg)-[rx {qnode: 
$SEED}]->(xiv), \\\n", " (xiv)-[r:kvec_topk_cos_sim {k: 20, nprobe: 4, maxk: 1024}]->(yimg), \\\n", " (yimg)-[ry {qnode: y}]->(), \\\n", - " labels: (y)-[]->(ylabel), \\\n", + " labels: (y)-->(ylabel), \\\n", " claims: (y)-[:P17]->(c:Q40)' \\\n", " --where 'not exists {image: (ximg2)-[{qnode: $SEED}]->() WHERE rowid(ximg2) < rowid(ximg) }' \\\n", " --return 'y as qnode, ylabel as label, printf(\"%.5g\", r.similarity) as sim, yimg as image' \\\n", @@ -1628,7 +1629,7 @@ "source": [ "out = !kgtk query --ac $MAIN --ac $ABSTRACT \\\n", " -i text_emb_queries -i abstract -i labels -i claims -i sentence \\\n", - " --match 'queries: (x:q1)-[]->(xv), \\\n", + " --match 'queries: (x:q1)-->(xv), \\\n", " abstract: (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 4}]->(y), \\\n", " claims: (y)-[:P106]->(:Q4964182), \\\n", " labels: (y)-->(yl), \\\n", @@ -1669,7 +1670,7 @@ "source": [ "out = !kgtk query --ac $MAIN --ac $ABSTRACT \\\n", " -i text_emb_queries -i abstract -i labels -i claims -i sentence \\\n", - " --match 'queries: (x:q2)-[]->(xv), \\\n", + " --match 'queries: (x:q2)-->(xv), \\\n", " abstract: (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), \\\n", " claims: (y)-[:P17]->(:Q40), \\\n", " labels: (y)-->(yl), \\\n", @@ -1713,7 +1714,7 @@ "source": [ "out = !kgtk query --ac $MAIN --ac $ABSTRACT \\\n", " -i text_emb_queries -i abstract -i labels -i claims -i sentence \\\n", - " --match 'queries: (x:q3)-[]->(xv), \\\n", + " --match 'queries: (x:q3)-->(xv), \\\n", " abstract: (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), \\\n", " claims: (y)-[:P31]->(:Q5), \\\n", " (y)-[:P27]->(:Q145), \\\n", @@ -2012,9 +2013,9 @@ "kgtk(\"\"\"\n", " query --gc $MAIN --ac $COMPLEX\n", " -i complex -i labels\n", - " --match 'complex: (x)-[]->(xv),\n", + " --match 'complex: (x)-->(xv),\n", " (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y),\n", - " labels: (x)-[]->(xl), (y)-[]->(yl)'\n", + " labels: (x)-->(xl), (y)-->(yl)'\n", " --where 'x 
in [\"Q859\", \"Q868\", \"Q913\"]'\n", " --return 'xl as xlabel, yl as ylabel, r.similarity as sim'\n", " \"\"\")" @@ -2280,9 +2281,9 @@ "kgtk(\"\"\"\n", " query --gc $MAIN --ac $TRANSE\n", " -i transe -i labels\n", - " --match 'transe: (x)-[]->(xv),\n", + " --match 'transe: (x)-->(xv),\n", " (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y),\n", - " labels: (x)-[]->(xl), (y)-[]->(yl)'\n", + " labels: (x)-->(xl), (y)-->(yl)'\n", " --where 'x in [\"Q859\", \"Q868\", \"Q913\"]'\n", " --return 'xl as xlabel, yl as ylabel, r.similarity as sim'\n", " \"\"\")" @@ -2312,10 +2313,10 @@ "source": [ "out = !kgtk query --gc $MAIN --ac $ABSTRACT \\\n", " -i abstract -i labels -i sentence \\\n", - " --match 'abstract: (x)-[]->(xv), \\\n", + " --match 'abstract: (x)-->(xv), \\\n", " (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), \\\n", - " labels: (x)-[]->(xl), (y)-[]->(yl), \\\n", - " sent: (y)-[]->(ys)' \\\n", + " labels: (x)-->(xl), (y)-->(yl), \\\n", + " sent: (y)-->(ys)' \\\n", " --where 'x in [\"Q859\", \"Q868\", \"Q913\"]' \\\n", " --return 'xl as xlabel, yl as ylabel, r.similarity as sim, kgtk_lqstring_text(ys) as ysent' \\\n", " / html\n", @@ -2591,9 +2592,9 @@ "kgtk(\"\"\"\n", " query --gc $MAIN --ac $COMPLEX\n", " -i complex -i labels\n", - " --match 'complex: (x)-[]->(xv),\n", + " --match 'complex: (x)-->(xv),\n", " (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y),\n", - " labels: (x)-[]->(xl), (y)-[]->(yl)'\n", + " labels: (x)-->(xl), (y)-->(yl)'\n", " --where 'x in [\"Q40\", \"Q41\", \"Q30\"]'\n", " --return 'xl as xlabel, yl as ylabel, r.similarity as sim'\n", " \"\"\")" @@ -2891,9 +2892,9 @@ "kgtk(\"\"\"\n", " query --gc $MAIN --ac $TRANSE\n", " -i transe -i labels\n", - " --match 'transe: (x)-[]->(xv),\n", + " --match 'transe: (x)-->(xv),\n", " (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y),\n", - " labels: (x)-[]->(xl), (y)-[]->(yl)'\n", + " labels: (x)-->(xl), (y)-->(yl)'\n", " --where 'x in [\"Q40\", 
\"Q41\", \"Q30\"]'\n", " --return 'xl as xlabel, yl as ylabel, r.similarity as sim'\n", " \"\"\")" @@ -2923,10 +2924,10 @@ "source": [ "out = !kgtk query --gc $MAIN --ac $ABSTRACT \\\n", " -i abstract -i labels -i sentence \\\n", - " --match 'abstract: (x)-[]->(xv), \\\n", + " --match 'abstract: (x)-->(xv), \\\n", " (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), \\\n", - " labels: (x)-[]->(xl), (y)-[]->(yl), \\\n", - " sent: (y)-[]->(ys)' \\\n", + " labels: (x)-->(xl), (y)-->(yl), \\\n", + " sent: (y)-->(ys)' \\\n", " --where 'x in [\"Q40\", \"Q41\", \"Q30\"]' \\\n", " --return 'xl as xlabel, yl as ylabel, r.similarity as sim, kgtk_lqstring_text(ys) as ysent' \\\n", " / html\n", @@ -3202,9 +3203,9 @@ "kgtk(\"\"\"\n", " query --gc $MAIN --ac $COMPLEX\n", " -i complex -i labels\n", - " --match 'complex: (x)-[]->(xv),\n", + " --match 'complex: (x)-->(xv),\n", " (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y),\n", - " labels: (x)-[]->(xl), (y)-[]->(yl)'\n", + " labels: (x)-->(xl), (y)-->(yl)'\n", " --where 'x in [\"Q144\", \"Q146\", \"Q726\"]'\n", " --return 'xl as xlabel, yl as ylabel, r.similarity as sim'\n", " \"\"\")" @@ -3470,9 +3471,9 @@ "kgtk(\"\"\"\n", " query --gc $MAIN --ac $TRANSE\n", " -i transe -i labels\n", - " --match 'transe: (x)-[]->(xv),\n", + " --match 'transe: (x)-->(xv),\n", " (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y),\n", - " labels: (x)-[]->(xl), (y)-[]->(yl)'\n", + " labels: (x)-->(xl), (y)-->(yl)'\n", " --where 'x in [\"Q144\", \"Q146\", \"Q726\"]'\n", " --return 'xl as xlabel, yl as ylabel, r.similarity as sim'\n", " \"\"\")" @@ -3502,10 +3503,10 @@ "source": [ "out = !kgtk query --gc $MAIN --ac $ABSTRACT \\\n", " -i abstract -i labels -i sentence \\\n", - " --match 'abstract: (x)-[]->(xv), \\\n", + " --match 'abstract: (x)-->(xv), \\\n", " (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), \\\n", - " labels: (x)-[]->(xl), (y)-[]->(yl), \\\n", - " sent: (y)-[]->(ys)' \\\n", + " 
labels: (x)-->(xl), (y)-->(yl), \\\n", + " sent: (y)-->(ys)' \\\n", " --where 'x in [\"Q144\", \"Q146\", \"Q726\"]' \\\n", " --return 'xl as xlabel, yl as ylabel, r.similarity as sim, kgtk_lqstring_text(ys) as ysent' \\\n", " / html\n", @@ -3641,9 +3642,9 @@ "kgtk(\"\"\"\n", " query --gc $MAIN --ac $COMPLEX\n", " -i complex -i labels\n", - " --match 'complex: (x)-[]->(xv),\n", + " --match 'complex: (x)-->(xv),\n", " (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y),\n", - " labels: (x)-[]->(xl), (y)-[]->(yl)'\n", + " labels: (x)-->(xl), (y)-->(yl)'\n", " --where 'x in [\"Q8418\"]'\n", " --return 'xl as xlabel, yl as ylabel, r.similarity as sim'\n", " \"\"\")" @@ -3769,9 +3770,9 @@ "kgtk(\"\"\"\n", " query --gc $MAIN --ac $TRANSE\n", " -i transe -i labels\n", - " --match 'transe: (x)-[]->(xv),\n", + " --match 'transe: (x)-->(xv),\n", " (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y),\n", - " labels: (x)-[]->(xl), (y)-[]->(yl)'\n", + " labels: (x)-->(xl), (y)-->(yl)'\n", " --where 'x in [\"Q8418\"]'\n", " --return 'xl as xlabel, yl as ylabel, r.similarity as sim'\n", " \"\"\")" @@ -3801,10 +3802,10 @@ "source": [ "out = !kgtk query --gc $MAIN --ac $ABSTRACT \\\n", " -i abstract -i labels -i sentence \\\n", - " --match 'abstract: (x)-[]->(xv), \\\n", + " --match 'abstract: (x)-->(xv), \\\n", " (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), \\\n", - " labels: (x)-[]->(xl), (y)-[]->(yl), \\\n", - " sent: (y)-[]->(ys)' \\\n", + " labels: (x)-->(xl), (y)-->(yl), \\\n", + " sent: (y)-->(ys)' \\\n", " --where 'x in [\"Q8418\"]' \\\n", " --return 'xl as xlabel, yl as ylabel, r.similarity as sim, kgtk_lqstring_text(ys) as ysent' \\\n", " / html\n", @@ -3940,9 +3941,9 @@ "kgtk(\"\"\"\n", " query --gc $MAIN --ac $COMPLEX\n", " -i complex -i labels\n", - " --match 'complex: (x)-[]->(xv),\n", + " --match 'complex: (x)-->(xv),\n", " (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y),\n", - " labels: (x)-[]->(xl), 
(y)-[]->(yl)'\n", + " labels: (x)-->(xl), (y)-->(yl)'\n", " --where 'x in [\"Q1930187\"]'\n", " --return 'xl as xlabel, yl as ylabel, r.similarity as sim'\n", " \"\"\")" @@ -4068,9 +4069,9 @@ "kgtk(\"\"\"\n", " query --gc $MAIN --ac $TRANSE\n", " -i transe -i labels\n", - " --match 'transe: (x)-[]->(xv),\n", + " --match 'transe: (x)-->(xv),\n", " (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y),\n", - " labels: (x)-[]->(xl), (y)-[]->(yl)'\n", + " labels: (x)-->(xl), (y)-->(yl)'\n", " --where 'x in [\"Q1930187\"]'\n", " --return 'xl as xlabel, yl as ylabel, r.similarity as sim'\n", " \"\"\")" @@ -4100,10 +4101,10 @@ "source": [ "out = !kgtk query --gc $MAIN --ac $ABSTRACT \\\n", " -i abstract -i labels -i sentence \\\n", - " --match 'abstract: (x)-[]->(xv), \\\n", + " --match 'abstract: (x)-->(xv), \\\n", " (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), \\\n", - " labels: (x)-[]->(xl), (y)-[]->(yl), \\\n", - " sent: (y)-[]->(ys)' \\\n", + " labels: (x)-->(xl), (y)-->(yl), \\\n", + " sent: (y)-->(ys)' \\\n", " --where 'x in [\"Q1930187\"]' \\\n", " --return 'xl as xlabel, yl as ylabel, r.similarity as sim, kgtk_lqstring_text(ys) as ysent' \\\n", " / html\n", @@ -4239,9 +4240,9 @@ "kgtk(\"\"\"\n", " query --gc $MAIN --ac $COMPLEX\n", " -i complex -i labels\n", - " --match 'complex: (x)-[]->(xv),\n", + " --match 'complex: (x)-->(xv),\n", " (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y),\n", - " labels: (x)-[]->(xl), (y)-[]->(yl)'\n", + " labels: (x)-->(xl), (y)-->(yl)'\n", " --where 'x in [\"Q48352\"]'\n", " --return 'xl as xlabel, yl as ylabel, r.similarity as sim'\n", " \"\"\")" @@ -4367,9 +4368,9 @@ "kgtk(\"\"\"\n", " query --gc $MAIN --ac $TRANSE\n", " -i transe -i labels\n", - " --match 'transe: (x)-[]->(xv),\n", + " --match 'transe: (x)-->(xv),\n", " (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y),\n", - " labels: (x)-[]->(xl), (y)-[]->(yl)'\n", + " labels: (x)-->(xl), (y)-->(yl)'\n", " --where 'x in 
[\"Q48352\"]'\n", " --return 'xl as xlabel, yl as ylabel, r.similarity as sim'\n", " \"\"\")" @@ -4397,10 +4398,10 @@ "source": [ "out = !kgtk query --gc $MAIN --ac $ABSTRACT \\\n", " -i abstract -i labels -i sentence \\\n", - " --match 'abstract: (x)-[]->(xv), \\\n", + " --match 'abstract: (x)-->(xv), \\\n", " (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), \\\n", - " labels: (x)-[]->(xl), (y)-[]->(yl), \\\n", - " sent: (y)-[]->(ys)' \\\n", + " labels: (x)-->(xl), (y)-->(yl), \\\n", + " sent: (y)-->(ys)' \\\n", " --where 'x in [\"Q48352\"]' \\\n", " --return 'xl as xlabel, yl as ylabel, r.similarity as sim, kgtk_lqstring_text(ys) as ysent' \\\n", " / html\n", diff --git a/examples/kypherv-similarity-queries.py b/examples/kypherv-similarity-queries.py index d3ec0a252..aed3a61d3 100644 --- a/examples/kypherv-similarity-queries.py +++ b/examples/kypherv-similarity-queries.py @@ -49,7 +49,8 @@ def show_html(img_width=150): # - -# This notebook contains a number of example queries using Kypher-V. The queries assume the existence of a number of similarity graph caches in the DB directory which are defined here via shell variables: +# The Kypher-V example queries in this notebook assume the existence of a number of similarity +# graph caches in the `DB` directory, which are all defined here via shell variables: DB="/kgtk-data/kypherv" # %env DB={DB} @@ -59,10 +60,10 @@ def show_html(img_width=150): # %env ABSTRACT={DB}/wikidata-20221102-dwd-v8-abstract-embeddings.sqlite3.db # %env IMAGE={DB}/wikimedia-capcom-image-embeddings-v2.sqlite3.db -# If you copied the graph caches to a different location, please adjust the -# paths and definitions accordingly. +# If you copied the graph caches and their associated `.faiss.idx` ANNS index files +# to a different location, please adjust the paths and definitions accordingly. 
-# Throughout the notebook we use a number of different invocation styles for +# Throughout the notebook we use three different invocation styles for # the `kgtk` command to better control the appearance of the generated output. # We either use it via the `!kgtk ...` syntax directly, use the `kgtk(...)` # function which produces an HTML rendering of a Pandas frame containing the @@ -74,23 +75,23 @@ def show_html(img_width=150): # # ### Similarity graph caches # -# The examples in this notebook use a number of different standard and similarity +# The examples in this notebook rely on several standard and similarity # graph caches based on `wikidata-20221102-dwd-v8`. These graph caches are # available in the `DB` directory of the `ckg06` server from where they can be # copied or accessed directly in example queries. It will generally not be # possible to run the notebook directly from that server, so if you want to # run and experiment with the notebook in a Jupyter environment, you have to # # copy the graph caches to a different location where a notebook server can be run. -# Make sure to also include the associated ANNS index files that end in +# In this case, make sure to also copy the associated ANNS index files that end in # a `.faiss.idx` extension. -# This notebook also does not show how the individual similarity caches were +# This notebook does not show how the individual similarity caches were # constructed. To see how that can be done, please consult # the [**Kypher-V Manual**](https://kgtk.readthedocs.io/en/latest/transform/query/#kypher-v) # or look at the respective `*.db.build.txt` files in the `DB` directory. For reference, # we show just one incantation here on how the `COMPLEX` graph cache was built. 
Other # graph caches were built similarly with some modifications to adjust for differences in -# the embedding data used (for `COMPLEX` this takes about 3 hours to run): +# the embedding data used (for `COMPLEX` this takes about 2.5-3 hours to run on a laptop): # ``` # $ export WD=.../datasets/wikidata-20221102-dwd-v8 @@ -106,8 +107,8 @@ def show_html(img_width=150): # We use the following similarity graph caches which can be combined # with a main graph cache using one or more `--auxiliary-cache` or `--ac` -# options. The `COMPLEX` graph cache contains 59M 100-D ComplEx -# graph embeddings: +# options to the `query` command. The `COMPLEX` graph cache contains +# 59M 100-D ComplEx graph embeddings: # !kgtk query --gc $COMPLEX --sc @@ -138,7 +139,7 @@ def show_html(img_width=150): # ### Vector tables are regular KGTK files # # Any KGTK representation that associates a node or edge ID with a vector -# will work. A format we commonly use is where a `node1` points to a vector +# will work. An edge format we commonly use is a `node1` pointing to a vector # literal in `node2` via an `emb` edge (but any label will do). 
For example, # here we show the first three embedding edges in `COMPLEX` (the `node2;_kgtk_vec_qcell` # column is an auxiliary column automatically computed by ANNS indexing): @@ -155,9 +156,9 @@ def show_html(img_width=150): kgtk(""" query --gc $MAIN --ac $ABSTRACT -i abstract -i labels - --match 'abstract: (x:Q868)-[]->(xv), - (y:Q913)-[]->(yv), - labels: (x)-[]->(xl), (y)-[]->(yl)' + --match 'abstract: (x:Q868)-->(xv), + (y:Q913)-->(yv), + labels: (x)-->(xl), (y)-->(yl)' --return 'xl as xlabel, yl as ylabel, kvec_cos_sim(xv, yv) as sim' """) @@ -175,22 +176,22 @@ def show_html(img_width=150): kgtk(""" query --gc $MAIN --ac $ABSTRACT -i abstract -i labels -i claims - --match 'abstract: (x:Q913)-[]->(xv), (y)-[]->(yv), + --match 'abstract: (x:Q913)-->(xv), (y)-->(yv), claims: (y)-[:P106]->(:Q4964182), - labels: (x)-[]->(xl), (y)-[]->(yl)' + labels: (x)-->(xl), (y)-->(yl)' --return 'xl as xlabel, yl as ylabel, kvec_cos_sim(xv, yv) as sim' --order 'sim desc' --limit 10 """) -# There are about 9M Q5's (humans) that have short abstract vectors: +# There are about 9M Q5's (humans) in Wikidata, 1.8M of which have short abstract vectors: kgtk(""" query --gc $MAIN --ac $ABSTRACT - -i abstract -i labels -i claims - --match 'abstract: (x:Q913)-[]->(xv), - claims: (y)-[:P31]->(:Q5)' - --return 'count(distinct y)' --force + -i abstract -i claims + --match 'abstract: (x)-->(), + claims: (x)-[:P31]->(:Q5)' + --return 'count(distinct x)' """) @@ -200,9 +201,9 @@ def show_html(img_width=150): # !time DISABLED kgtk query --gc $MAIN \ # --ac $ABSTRACT \ # -i abstract -i labels -i claims \ -# --match 'abstract: (x:Q913)-[]->(xv), (y)-[]->(yv), \ +# --match 'abstract: (x:Q913)-->(xv), (y)-->(yv), \ # claims: (y)-[:P31]->(:Q5), \ -# labels: (x)-[]->(xl), (y)-[]->(yl)' \ +# labels: (x)-->(xl), (y)-->(yl)' \ # --return 'xl as xlabel, yl as ylabel, kvec_cos_sim(xv, yv) as sim' \ # --order 'sim desc' \ # --limit 10 @@ -233,9 +234,9 @@ def show_html(img_width=150): kgtk(""" query --gc 
$MAIN --ac $ABSTRACT -i abstract -i labels -i claims - --match 'abstract: (x:Q913)-[]->(xv), + --match 'abstract: (x:Q913)-->(xv), (xv)-[r:kvec_topk_cos_sim {k: 5, nprobe: 4}]->(y), - labels: (x)-[]->(xl), (y)-[]->(yl)' + labels: (x)-->(xl), (y)-->(yl)' --return 'xl as xlabel, yl as ylabel, r.similarity as sim' --limit 10 """) @@ -251,11 +252,11 @@ def show_html(img_width=150): kgtk(""" query --gc $MAIN --ac $ABSTRACT -i abstract -i labels -i claims - --match 'abstract: (x)-[]->(xv), + --match 'abstract: (x)-->(xv), (xv)-[r:kvec_topk_cos_sim {k: 5, maxk: 1024, nprobe: 4}]->(y), claims: (y)-[:P106]->(:Q4964182), (y)-[:P31]->(:Q5), - labels: (x)-[]->(xl), (y)-[]->(yl)' + labels: (x)-->(xl), (y)-->(yl)' --where 'x in ["Q859", "Q868", "Q913"] and x != y' --return 'xl as xlabel, yl as ylabel, r.similarity as sim' """) @@ -267,11 +268,11 @@ def show_html(img_width=150): kgtk(""" query --gc $MAIN --ac $ABSTRACT -i abstract -i labels -i claims - --match 'abstract: (x)-[]->(xv), + --match 'abstract: (x)-->(xv), (xv)-[r:kvec_topk_cos_sim {k: 5, nprobe: 4}]->(y), claims: (y)-[:P106]->(:Q4964182), (y)-[:P31]->(:Q5), - labels: (x)-[]->(xl), (y)-[]->(yl)' + labels: (x)-->(xl), (y)-->(yl)' --where 'x in ["Q859", "Q868", "Q913"] and x != y' --return 'xl as xlabel, yl as ylabel, r.similarity as sim' """) @@ -306,7 +307,7 @@ def show_html(img_width=150): # --match 'image: (ximg)-[rx {qnode: $SEED}]->(xiv), \ # (xiv)-[r:kvec_topk_cos_sim {k: 10, nprobe: 8}]->(yimg), \ # (yimg)-[ry {qnode: y}]->(), \ -# labels: (y)-[]->(ylabel)' \ +# labels: (y)-->(ylabel)' \ # --where 'not exists {image: (ximg2)-[{qnode: $SEED}]->() WHERE rowid(ximg2) < rowid(ximg) }' \ # --return 'y as qnode, ylabel as label, printf("%.5g", r.similarity) as sim, yimg as image' \ # --para SEED=Q76 \ @@ -343,7 +344,7 @@ def show_html(img_width=150): # --match 'image: (ximg)-[rx {qnode: $SEED}]->(xiv), \ # (xiv)-[r:kvec_topk_cos_sim {k: 10, nprobe: 8}]->(yimg), \ # (yimg)-[ry {qnode: y}]->(), \ -# labels: 
(y)-[]->(ylabel)' \ +# labels: (y)-->(ylabel)' \ # --where 'not exists {image: (ximg2)-[{qnode: $SEED}]->() WHERE rowid(ximg2) < rowid(ximg) }' \ # --return 'y as qnode, ylabel as label, printf("%.5g", r.similarity) as sim, yimg as image' \ # --para SEED=Q582964 \ @@ -361,7 +362,7 @@ def show_html(img_width=150): # --match 'image: (ximg)-[rx {qnode: $SEED}]->(xiv), \ # (xiv)-[r:kvec_topk_cos_sim {k: 20, nprobe: 8}]->(yimg), \ # (yimg)-[ry {qnode: y}]->(), \ -# labels: (y)-[]->(ylabel)' \ +# labels: (y)-->(ylabel)' \ # --where 'not exists {image: (ximg2)-[{qnode: $SEED}]->() WHERE rowid(ximg2) < rowid(ximg) }' \ # --return 'y as qnode, ylabel as label, printf("%.5g", r.similarity) as sim, yimg as image' \ # --para SEED=Q756815 \ @@ -385,7 +386,7 @@ def show_html(img_width=150): # --match 'image: (ximg)-[rx {qnode: $SEED}]->(xiv), \ # (xiv)-[r:kvec_topk_cos_sim {k: 20, nprobe: 4, maxk: 1024}]->(yimg), \ # (yimg)-[ry {qnode: y}]->(), \ -# labels: (y)-[]->(ylabel), \ +# labels: (y)-->(ylabel), \ # claims: (y)-[:P17]->(c:Q40)' \ # --where 'not exists {image: (ximg2)-[{qnode: $SEED}]->() WHERE rowid(ximg2) < rowid(ximg) }' \ # --return 'y as qnode, ylabel as label, printf("%.5g", r.similarity) as sim, yimg as image' \ @@ -433,7 +434,7 @@ def show_html(img_width=150): # + # out = !kgtk query --ac $MAIN --ac $ABSTRACT \ # -i text_emb_queries -i abstract -i labels -i claims -i sentence \ -# --match 'queries: (x:q1)-[]->(xv), \ +# --match 'queries: (x:q1)-->(xv), \ # abstract: (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 4}]->(y), \ # claims: (y)-[:P106]->(:Q4964182), \ # labels: (y)-->(yl), \ @@ -449,7 +450,7 @@ def show_html(img_width=150): # + # out = !kgtk query --ac $MAIN --ac $ABSTRACT \ # -i text_emb_queries -i abstract -i labels -i claims -i sentence \ -# --match 'queries: (x:q2)-[]->(xv), \ +# --match 'queries: (x:q2)-->(xv), \ # abstract: (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), \ # claims: (y)-[:P17]->(:Q40), \ # labels: (y)-->(yl), 
\ @@ -466,7 +467,7 @@ def show_html(img_width=150): # + # out = !kgtk query --ac $MAIN --ac $ABSTRACT \ # -i text_emb_queries -i abstract -i labels -i claims -i sentence \ -# --match 'queries: (x:q3)-[]->(xv), \ +# --match 'queries: (x:q3)-->(xv), \ # abstract: (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), \ # claims: (y)-[:P31]->(:Q5), \ # (y)-[:P27]->(:Q145), \ @@ -491,9 +492,9 @@ def show_html(img_width=150): kgtk(""" query --gc $MAIN --ac $COMPLEX -i complex -i labels - --match 'complex: (x)-[]->(xv), + --match 'complex: (x)-->(xv), (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), - labels: (x)-[]->(xl), (y)-[]->(yl)' + labels: (x)-->(xl), (y)-->(yl)' --where 'x in ["Q859", "Q868", "Q913"]' --return 'xl as xlabel, yl as ylabel, r.similarity as sim' """) @@ -501,9 +502,9 @@ def show_html(img_width=150): kgtk(""" query --gc $MAIN --ac $TRANSE -i transe -i labels - --match 'transe: (x)-[]->(xv), + --match 'transe: (x)-->(xv), (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), - labels: (x)-[]->(xl), (y)-[]->(yl)' + labels: (x)-->(xl), (y)-->(yl)' --where 'x in ["Q859", "Q868", "Q913"]' --return 'xl as xlabel, yl as ylabel, r.similarity as sim' """) @@ -511,10 +512,10 @@ def show_html(img_width=150): # + # out = !kgtk query --gc $MAIN --ac $ABSTRACT \ # -i abstract -i labels -i sentence \ -# --match 'abstract: (x)-[]->(xv), \ +# --match 'abstract: (x)-->(xv), \ # (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), \ -# labels: (x)-[]->(xl), (y)-[]->(yl), \ -# sent: (y)-[]->(ys)' \ +# labels: (x)-->(xl), (y)-->(yl), \ +# sent: (y)-->(ys)' \ # --where 'x in ["Q859", "Q868", "Q913"]' \ # --return 'xl as xlabel, yl as ylabel, r.similarity as sim, kgtk_lqstring_text(ys) as ysent' \ # / html @@ -528,9 +529,9 @@ def show_html(img_width=150): kgtk(""" query --gc $MAIN --ac $COMPLEX -i complex -i labels - --match 'complex: (x)-[]->(xv), + --match 'complex: (x)-->(xv), (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, 
nprobe: 8}]->(y), - labels: (x)-[]->(xl), (y)-[]->(yl)' + labels: (x)-->(xl), (y)-->(yl)' --where 'x in ["Q40", "Q41", "Q30"]' --return 'xl as xlabel, yl as ylabel, r.similarity as sim' """) @@ -538,9 +539,9 @@ def show_html(img_width=150): kgtk(""" query --gc $MAIN --ac $TRANSE -i transe -i labels - --match 'transe: (x)-[]->(xv), + --match 'transe: (x)-->(xv), (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), - labels: (x)-[]->(xl), (y)-[]->(yl)' + labels: (x)-->(xl), (y)-->(yl)' --where 'x in ["Q40", "Q41", "Q30"]' --return 'xl as xlabel, yl as ylabel, r.similarity as sim' """) @@ -548,10 +549,10 @@ def show_html(img_width=150): # + # out = !kgtk query --gc $MAIN --ac $ABSTRACT \ # -i abstract -i labels -i sentence \ -# --match 'abstract: (x)-[]->(xv), \ +# --match 'abstract: (x)-->(xv), \ # (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), \ -# labels: (x)-[]->(xl), (y)-[]->(yl), \ -# sent: (y)-[]->(ys)' \ +# labels: (x)-->(xl), (y)-->(yl), \ +# sent: (y)-->(ys)' \ # --where 'x in ["Q40", "Q41", "Q30"]' \ # --return 'xl as xlabel, yl as ylabel, r.similarity as sim, kgtk_lqstring_text(ys) as ysent' \ # / html @@ -565,9 +566,9 @@ def show_html(img_width=150): kgtk(""" query --gc $MAIN --ac $COMPLEX -i complex -i labels - --match 'complex: (x)-[]->(xv), + --match 'complex: (x)-->(xv), (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), - labels: (x)-[]->(xl), (y)-[]->(yl)' + labels: (x)-->(xl), (y)-->(yl)' --where 'x in ["Q144", "Q146", "Q726"]' --return 'xl as xlabel, yl as ylabel, r.similarity as sim' """) @@ -575,9 +576,9 @@ def show_html(img_width=150): kgtk(""" query --gc $MAIN --ac $TRANSE -i transe -i labels - --match 'transe: (x)-[]->(xv), + --match 'transe: (x)-->(xv), (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), - labels: (x)-[]->(xl), (y)-[]->(yl)' + labels: (x)-->(xl), (y)-->(yl)' --where 'x in ["Q144", "Q146", "Q726"]' --return 'xl as xlabel, yl as ylabel, r.similarity as sim' """) @@ -585,10 
+586,10 @@ def show_html(img_width=150): # + # out = !kgtk query --gc $MAIN --ac $ABSTRACT \ # -i abstract -i labels -i sentence \ -# --match 'abstract: (x)-[]->(xv), \ +# --match 'abstract: (x)-->(xv), \ # (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), \ -# labels: (x)-[]->(xl), (y)-[]->(yl), \ -# sent: (y)-[]->(ys)' \ +# labels: (x)-->(xl), (y)-->(yl), \ +# sent: (y)-->(ys)' \ # --where 'x in ["Q144", "Q146", "Q726"]' \ # --return 'xl as xlabel, yl as ylabel, r.similarity as sim, kgtk_lqstring_text(ys) as ysent' \ # / html @@ -602,9 +603,9 @@ def show_html(img_width=150): kgtk(""" query --gc $MAIN --ac $COMPLEX -i complex -i labels - --match 'complex: (x)-[]->(xv), + --match 'complex: (x)-->(xv), (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), - labels: (x)-[]->(xl), (y)-[]->(yl)' + labels: (x)-->(xl), (y)-->(yl)' --where 'x in ["Q8418"]' --return 'xl as xlabel, yl as ylabel, r.similarity as sim' """) @@ -612,9 +613,9 @@ def show_html(img_width=150): kgtk(""" query --gc $MAIN --ac $TRANSE -i transe -i labels - --match 'transe: (x)-[]->(xv), + --match 'transe: (x)-->(xv), (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), - labels: (x)-[]->(xl), (y)-[]->(yl)' + labels: (x)-->(xl), (y)-->(yl)' --where 'x in ["Q8418"]' --return 'xl as xlabel, yl as ylabel, r.similarity as sim' """) @@ -622,10 +623,10 @@ def show_html(img_width=150): # + # out = !kgtk query --gc $MAIN --ac $ABSTRACT \ # -i abstract -i labels -i sentence \ -# --match 'abstract: (x)-[]->(xv), \ +# --match 'abstract: (x)-->(xv), \ # (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), \ -# labels: (x)-[]->(xl), (y)-[]->(yl), \ -# sent: (y)-[]->(ys)' \ +# labels: (x)-->(xl), (y)-->(yl), \ +# sent: (y)-->(ys)' \ # --where 'x in ["Q8418"]' \ # --return 'xl as xlabel, yl as ylabel, r.similarity as sim, kgtk_lqstring_text(ys) as ysent' \ # / html @@ -639,9 +640,9 @@ def show_html(img_width=150): kgtk(""" query --gc $MAIN --ac $COMPLEX -i complex -i 
labels - --match 'complex: (x)-[]->(xv), + --match 'complex: (x)-->(xv), (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), - labels: (x)-[]->(xl), (y)-[]->(yl)' + labels: (x)-->(xl), (y)-->(yl)' --where 'x in ["Q1930187"]' --return 'xl as xlabel, yl as ylabel, r.similarity as sim' """) @@ -649,9 +650,9 @@ def show_html(img_width=150): kgtk(""" query --gc $MAIN --ac $TRANSE -i transe -i labels - --match 'transe: (x)-[]->(xv), + --match 'transe: (x)-->(xv), (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), - labels: (x)-[]->(xl), (y)-[]->(yl)' + labels: (x)-->(xl), (y)-->(yl)' --where 'x in ["Q1930187"]' --return 'xl as xlabel, yl as ylabel, r.similarity as sim' """) @@ -659,10 +660,10 @@ def show_html(img_width=150): # + # out = !kgtk query --gc $MAIN --ac $ABSTRACT \ # -i abstract -i labels -i sentence \ -# --match 'abstract: (x)-[]->(xv), \ +# --match 'abstract: (x)-->(xv), \ # (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), \ -# labels: (x)-[]->(xl), (y)-[]->(yl), \ -# sent: (y)-[]->(ys)' \ +# labels: (x)-->(xl), (y)-->(yl), \ +# sent: (y)-->(ys)' \ # --where 'x in ["Q1930187"]' \ # --return 'xl as xlabel, yl as ylabel, r.similarity as sim, kgtk_lqstring_text(ys) as ysent' \ # / html @@ -676,9 +677,9 @@ def show_html(img_width=150): kgtk(""" query --gc $MAIN --ac $COMPLEX -i complex -i labels - --match 'complex: (x)-[]->(xv), + --match 'complex: (x)-->(xv), (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), - labels: (x)-[]->(xl), (y)-[]->(yl)' + labels: (x)-->(xl), (y)-->(yl)' --where 'x in ["Q48352"]' --return 'xl as xlabel, yl as ylabel, r.similarity as sim' """) @@ -686,9 +687,9 @@ def show_html(img_width=150): kgtk(""" query --gc $MAIN --ac $TRANSE -i transe -i labels - --match 'transe: (x)-[]->(xv), + --match 'transe: (x)-->(xv), (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), - labels: (x)-[]->(xl), (y)-[]->(yl)' + labels: (x)-->(xl), (y)-->(yl)' --where 'x in ["Q48352"]' --return 'xl 
as xlabel, yl as ylabel, r.similarity as sim' """) @@ -696,10 +697,10 @@ def show_html(img_width=150): # + # out = !kgtk query --gc $MAIN --ac $ABSTRACT \ # -i abstract -i labels -i sentence \ -# --match 'abstract: (x)-[]->(xv), \ +# --match 'abstract: (x)-->(xv), \ # (xv)-[r:kvec_topk_cos_sim {k: 10, maxk: 1024, nprobe: 8}]->(y), \ -# labels: (x)-[]->(xl), (y)-[]->(yl), \ -# sent: (y)-[]->(ys)' \ +# labels: (x)-->(xl), (y)-->(yl), \ +# sent: (y)-->(ys)' \ # --where 'x in ["Q48352"]' \ # --return 'xl as xlabel, yl as ylabel, r.similarity as sim, kgtk_lqstring_text(ys) as ysent' \ # / html From c3a999d88fd04d44d7e022ef7110d7f5e9e5410a Mon Sep 17 00:00:00 2001 From: Hans Chalupsky Date: Fri, 20 Jan 2023 13:19:11 -0800 Subject: [PATCH 4/5] Updated to use 1024-D embeddings on unquoted LQ-strings These provide somewhat better separation and matches and are not influenced by quotes and language tags. --- examples/kypherv-similarity-queries.ipynb | 374 +++++++++++----------- examples/kypherv-similarity-queries.py | 36 ++- 2 files changed, 214 insertions(+), 196 deletions(-) diff --git a/examples/kypherv-similarity-queries.ipynb b/examples/kypherv-similarity-queries.ipynb index b6788dfa5..f697d2175 100644 --- a/examples/kypherv-similarity-queries.ipynb +++ b/examples/kypherv-similarity-queries.ipynb @@ -76,7 +76,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, "id": "cc177681", "metadata": {}, "outputs": [ @@ -88,7 +88,7 @@ "env: MAIN=/kgtk-data/kypherv/wikidata-20221102-dwd-v8-main.sqlite3.db\n", "env: COMPLEX=/kgtk-data/kypherv/wikidata-20221102-dwd-v8-complex-embeddings.sqlite3.db\n", "env: TRANSE=/kgtk-data/kypherv/wikidata-20221102-dwd-v8-transe-embeddings.sqlite3.db\n", - "env: ABSTRACT=/kgtk-data/kypherv/wikidata-20221102-dwd-v8-abstract-embeddings.sqlite3.db\n", + "env: ABSTRACT=/kgtk-data/kypherv/wikidata-20221102-dwd-v8-abstract-embeddings-large.sqlite3.db\n", "env: 
IMAGE=/kgtk-data/kypherv/wikimedia-capcom-image-embeddings-v2.sqlite3.db\n" ] } @@ -99,7 +99,7 @@ "%env MAIN={DB}/wikidata-20221102-dwd-v8-main.sqlite3.db\n", "%env COMPLEX={DB}/wikidata-20221102-dwd-v8-complex-embeddings.sqlite3.db\n", "%env TRANSE={DB}/wikidata-20221102-dwd-v8-transe-embeddings.sqlite3.db\n", - "%env ABSTRACT={DB}/wikidata-20221102-dwd-v8-abstract-embeddings.sqlite3.db\n", + "%env ABSTRACT={DB}/wikidata-20221102-dwd-v8-abstract-embeddings-large.sqlite3.db\n", "%env IMAGE={DB}/wikimedia-capcom-image-embeddings-v2.sqlite3.db" ] }, @@ -263,7 +263,9 @@ "source": [ "The `ABSTRACT` graph cache contains the sentences and embedding vectors\n", "generated from the first sentences of Wikipedia short abstracts. It\n", - "contains about 6M 768-D Roberta base vectors:" + "contains about 6M 1024-D Roberta large vectors (**Note**: these are different\n", + "embeddings than the ones used and reported on in the 2022 Wikidata Workshop paper,\n", + "therefore, the query results in this notebook are somewhat different):" ] }, { @@ -277,21 +279,21 @@ "output_type": "stream", "text": [ "Graph Cache:\r\n", - "DB file: /kgtk-data/kypherv/wikidata-20221102-dwd-v8-abstract-embeddings.sqlite3.db\r\n", - " size: 26.32 GB \tfree: 0 Bytes \tmodified: 2023-01-09 18:14:00\r\n", + "DB file: /kgtk-data/kypherv/wikidata-20221102-dwd-v8-abstract-embeddings-large.sqlite3.db\r\n", + " size: 29.37 GB \tfree: 0 Bytes \tmodified: 2023-01-19 15:02:30\r\n", "\r\n", "KGTK File Information:\r\n", + "abstract:\r\n", + " size: 0 Bytes \tmodified: 2023-01-19 13:24:19 \tgraph: graph_1\r\n", "sentence:\r\n", " size: 256.32 MB \tmodified: 2023-01-04 13:53:44 \tgraph: graph_2\r\n", - "abstract:\r\n", - " size: 0 Bytes \tmodified: 2023-01-09 13:45:47 \tgraph: graph_1\r\n", "\r\n", "Graph Table Information:\r\n", "graph_1:\r\n", - " size: 25.16 GB \tcreated: 2023-01-09 13:45:47\r\n", + " size: 28.21 GB \tcreated: 2023-01-19 13:24:19\r\n", " header: ['node1', 'label', 'node2', 'id']\r\n", 
"graph_2:\r\n", - " size: 1.23 GB \tcreated: 2023-01-09 18:13:31\r\n", + " size: 1.23 GB \tcreated: 2023-01-19 15:01:41\r\n", " header: ['node1', 'label', 'node2', 'id']\r\n" ] } @@ -474,7 +476,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 6, "id": "10524248", "metadata": { "lines_to_next_cell": 2 @@ -511,7 +513,7 @@ " 0\n", " 'Aristotle'@en\n", " 'Socrates'@en\n", - " 0.908608\n", + " 0.816283\n", " \n", " \n", "\n", @@ -519,10 +521,10 @@ ], "text/plain": [ " xlabel ylabel sim\n", - "0 'Aristotle'@en 'Socrates'@en 0.908608" + "0 'Aristotle'@en 'Socrates'@en 0.816283" ] }, - "execution_count": 8, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -562,7 +564,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "id": "d7ac35fc", "metadata": {}, "outputs": [ @@ -602,76 +604,76 @@ " \n", " 1\n", " 'Socrates'@en\n", - " 'early life of Plato'@en\n", - " 0.938260\n", + " 'Adamantios Korais'@en\n", + " 0.873166\n", " \n", " \n", " 2\n", " 'Socrates'@en\n", - " 'Aristippus'@en\n", - " 0.934973\n", + " 'Prodicus'@en\n", + " 0.872791\n", " \n", " \n", " 3\n", " 'Socrates'@en\n", - " 'Empedocles'@en\n", - " 0.930798\n", + " 'Protagoras'@en\n", + " 0.870216\n", " \n", " \n", " 4\n", " 'Socrates'@en\n", - " 'Adamantios Korais'@en\n", - " 0.928561\n", + " 'Manuel Chrysoloras'@en\n", + " 0.868033\n", " \n", " \n", " 5\n", " 'Socrates'@en\n", - " 'Menedemus'@en\n", - " 0.928002\n", + " 'Cebes'@en\n", + " 0.867012\n", " \n", " \n", " 6\n", " 'Socrates'@en\n", - " 'Plato'@en\n", - " 0.926748\n", + " 'Pyrrho'@en\n", + " 0.866274\n", " \n", " \n", " 7\n", " 'Socrates'@en\n", - " 'Eubulides'@en\n", - " 0.925711\n", + " 'Menedemus'@en\n", + " 0.863220\n", " \n", " \n", " 8\n", " 'Socrates'@en\n", - " 'Iosipos Moisiodax'@en\n", - " 0.924585\n", + " 'Epicurus'@en\n", + " 0.861731\n", " \n", " \n", " 9\n", " 'Socrates'@en\n", - " 'Henry Oldenburg'@en\n", - " 0.923927\n", + " 'Xenophon'@en\n", + " 0.860759\n", 
" \n", " \n", "\n", "" ], "text/plain": [ - " xlabel ylabel sim\n", - "0 'Socrates'@en 'Socrates'@en 1.000000\n", - "1 'Socrates'@en 'early life of Plato'@en 0.938260\n", - "2 'Socrates'@en 'Aristippus'@en 0.934973\n", - "3 'Socrates'@en 'Empedocles'@en 0.930798\n", - "4 'Socrates'@en 'Adamantios Korais'@en 0.928561\n", - "5 'Socrates'@en 'Menedemus'@en 0.928002\n", - "6 'Socrates'@en 'Plato'@en 0.926748\n", - "7 'Socrates'@en 'Eubulides'@en 0.925711\n", - "8 'Socrates'@en 'Iosipos Moisiodax'@en 0.924585\n", - "9 'Socrates'@en 'Henry Oldenburg'@en 0.923927" + " xlabel ylabel sim\n", + "0 'Socrates'@en 'Socrates'@en 1.000000\n", + "1 'Socrates'@en 'Adamantios Korais'@en 0.873166\n", + "2 'Socrates'@en 'Prodicus'@en 0.872791\n", + "3 'Socrates'@en 'Protagoras'@en 0.870216\n", + "4 'Socrates'@en 'Manuel Chrysoloras'@en 0.868033\n", + "5 'Socrates'@en 'Cebes'@en 0.867012\n", + "6 'Socrates'@en 'Pyrrho'@en 0.866274\n", + "7 'Socrates'@en 'Menedemus'@en 0.863220\n", + "8 'Socrates'@en 'Epicurus'@en 0.861731\n", + "9 'Socrates'@en 'Xenophon'@en 0.860759" ] }, - "execution_count": 9, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -699,7 +701,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 9, "id": "d4a26fd8", "metadata": { "lines_to_next_cell": 2 @@ -732,7 +734,7 @@ " \n", " \n", " 0\n", - " 1801483\n", + " 1801484\n", " \n", " \n", "\n", @@ -740,10 +742,10 @@ ], "text/plain": [ " count(DISTINCT db1_graph_1_c1.\"node1\")\n", - "0 1801483" + "0 1801484" ] }, - "execution_count": 5, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -764,7 +766,7 @@ "metadata": {}, "source": [ "If we used the same brute-force search from above on this much larger set,\n", - "it would take about 5 min to run (which is why this command is disabled):" + "it would take about 2 min to run (which is why this command is disabled):" ] }, { @@ -793,16 +795,16 @@ "```\n", "xlabel\tylabel\tsim\n", 
"'Socrates'@en\t'Socrates'@en\t1.0000001192092896\n", - "'Socrates'@en\t'Anytus'@en\t0.9346579909324646\n", - "'Socrates'@en\t'Heraclitus'@en\t0.9344534277915955\n", - "'Socrates'@en\t'Hippocrates'@en\t0.9304061532020569\n", - "'Socrates'@en\t'Cleisthenes'@en\t0.9292828440666199\n", - "'Socrates'@en\t'Aristides'@en\t0.9283562898635864\n", - "'Socrates'@en\t'Yannis Xirotiris'@en\t0.926308274269104\n", - "'Socrates'@en\t'Sotiris Trivizas'@en\t0.9255445003509521\n", - "'Socrates'@en\t'Aris Maragkopoulos'@en\t0.9234243035316467\n", - "'Socrates'@en\t'Valerios Stais'@en\t0.919943630695343\n", - "93.859u 38.640s 4:49.84 45.7%\t0+0k 18782808+8io 0pf+0w\n", + "'Socrates'@en\t'Adamantios Korais'@en\t0.8731658458709717\n", + "'Socrates'@en\t'Prodicus'@en\t0.872790515422821\n", + "'Socrates'@en\t'Protagoras'@en\t0.8702158331871033\n", + "'Socrates'@en\t'Manuel Chrysoloras'@en\t0.8680326342582703\n", + "'Socrates'@en\t'Cebes'@en\t0.8670117259025574\n", + "'Socrates'@en\t'Pyrrho'@en\t0.8662737011909485\n", + "'Socrates'@en\t'Menedemus'@en\t0.8632197380065918\n", + "'Socrates'@en\t'Epicurus'@en\t0.8617314696311951\n", + "'Socrates'@en\t'Xenophon'@en\t0.8607585430145264\n", + "52.997u 15.548s 1:50.53 62.0%\t0+0k 19477248+136io 0pf+0w\n", "```" ] }, @@ -828,7 +830,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "id": "2aab7ebe", "metadata": { "lines_to_next_cell": 2 @@ -870,41 +872,41 @@ " \n", " 1\n", " 'Socrates'@en\n", - " 'Histories'@en\n", - " 0.937620\n", + " 'Adamantios Korais'@en\n", + " 0.873166\n", " \n", " \n", " 2\n", " 'Socrates'@en\n", - " 'Cadmus'@en\n", - " 0.915083\n", + " 'Prodicus'@en\n", + " 0.872791\n", " \n", " \n", " 3\n", " 'Socrates'@en\n", - " 'Eudorus of Alexandria'@en\n", - " 0.914027\n", + " 'Manuel Chrysoloras'@en\n", + " 0.868033\n", " \n", " \n", " 4\n", " 'Socrates'@en\n", - " 'John Wilkins'@en\n", - " 0.913926\n", + " 'Cebes'@en\n", + " 0.867012\n", " \n", " \n", "\n", "" ], "text/plain": [ - " xlabel ylabel 
sim\n", - "0 'Socrates'@en 'Socrates'@en 1.000000\n", - "1 'Socrates'@en 'Histories'@en 0.937620\n", - "2 'Socrates'@en 'Cadmus'@en 0.915083\n", - "3 'Socrates'@en 'Eudorus of Alexandria'@en 0.914027\n", - "4 'Socrates'@en 'John Wilkins'@en 0.913926" + " xlabel ylabel sim\n", + "0 'Socrates'@en 'Socrates'@en 1.000000\n", + "1 'Socrates'@en 'Adamantios Korais'@en 0.873166\n", + "2 'Socrates'@en 'Prodicus'@en 0.872791\n", + "3 'Socrates'@en 'Manuel Chrysoloras'@en 0.868033\n", + "4 'Socrates'@en 'Cebes'@en 0.867012" ] }, - "execution_count": 12, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -936,7 +938,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "id": "29dc4f04", "metadata": { "lines_to_next_cell": 2 @@ -973,116 +975,116 @@ " 0\n", " 'Plato'@en\n", " 'Aenesidemus'@en\n", - " 0.965394\n", + " 0.936797\n", " \n", " \n", " 1\n", " 'Plato'@en\n", - " 'Hicetas'@en\n", - " 0.964990\n", + " 'Aristotle'@en\n", + " 0.928277\n", " \n", " \n", " 2\n", " 'Plato'@en\n", - " 'Empedocles'@en\n", - " 0.962913\n", + " 'Menedemus'@en\n", + " 0.926272\n", " \n", " \n", " 3\n", " 'Plato'@en\n", - " 'Eubulides'@en\n", - " 0.962904\n", + " 'Hicetas'@en\n", + " 0.923380\n", " \n", " \n", " 4\n", " 'Plato'@en\n", - " 'Aristotle'@en\n", - " 0.961594\n", + " 'Philo of Larissa'@en\n", + " 0.921708\n", " \n", " \n", " 5\n", " 'Aristotle'@en\n", - " 'Bryson of Achaea'@en\n", - " 0.974303\n", + " 'Philo of Larissa'@en\n", + " 0.931010\n", " \n", " \n", " 6\n", " 'Aristotle'@en\n", - " 'Michael Papageorgiou'@en\n", - " 0.970041\n", + " 'Speusippus'@en\n", + " 0.930034\n", " \n", " \n", " 7\n", " 'Aristotle'@en\n", - " 'Hicetas'@en\n", - " 0.967692\n", + " 'Plato'@en\n", + " 0.928277\n", " \n", " \n", " 8\n", " 'Aristotle'@en\n", - " 'Anaxarchus'@en\n", - " 0.967682\n", + " 'Hicetas'@en\n", + " 0.927509\n", " \n", " \n", " 9\n", " 'Aristotle'@en\n", - " 'Metrodorus of Lampsacus'@en\n", - " 0.967349\n", + " 'Bryson of 
Achaea'@en\n", + " 0.927930\n", " \n", " \n", " 10\n", " 'Socrates'@en\n", - " 'Eudorus of Alexandria'@en\n", - " 0.914027\n", + " 'Adamantios Korais'@en\n", + " 0.873166\n", " \n", " \n", " 11\n", " 'Socrates'@en\n", - " 'John Wilkins'@en\n", - " 0.913926\n", + " 'Prodicus'@en\n", + " 0.872791\n", " \n", " \n", " 12\n", " 'Socrates'@en\n", - " 'Eurytus'@en\n", - " 0.911165\n", + " 'Manuel Chrysoloras'@en\n", + " 0.868033\n", " \n", " \n", " 13\n", " 'Socrates'@en\n", - " 'Syrianus'@en\n", - " 0.908286\n", + " 'Cebes'@en\n", + " 0.867012\n", " \n", " \n", " 14\n", " 'Socrates'@en\n", - " 'Peter the Iberian'@en\n", - " 0.907799\n", + " 'Pyrrho'@en\n", + " 0.866274\n", " \n", " \n", "\n", "" ], "text/plain": [ - " xlabel ylabel sim\n", - "0 'Plato'@en 'Aenesidemus'@en 0.965394\n", - "1 'Plato'@en 'Hicetas'@en 0.964990\n", - "2 'Plato'@en 'Empedocles'@en 0.962913\n", - "3 'Plato'@en 'Eubulides'@en 0.962904\n", - "4 'Plato'@en 'Aristotle'@en 0.961594\n", - "5 'Aristotle'@en 'Bryson of Achaea'@en 0.974303\n", - "6 'Aristotle'@en 'Michael Papageorgiou'@en 0.970041\n", - "7 'Aristotle'@en 'Hicetas'@en 0.967692\n", - "8 'Aristotle'@en 'Anaxarchus'@en 0.967682\n", - "9 'Aristotle'@en 'Metrodorus of Lampsacus'@en 0.967349\n", - "10 'Socrates'@en 'Eudorus of Alexandria'@en 0.914027\n", - "11 'Socrates'@en 'John Wilkins'@en 0.913926\n", - "12 'Socrates'@en 'Eurytus'@en 0.911165\n", - "13 'Socrates'@en 'Syrianus'@en 0.908286\n", - "14 'Socrates'@en 'Peter the Iberian'@en 0.907799" + " xlabel ylabel sim\n", + "0 'Plato'@en 'Aenesidemus'@en 0.936797\n", + "1 'Plato'@en 'Aristotle'@en 0.928277\n", + "2 'Plato'@en 'Menedemus'@en 0.926272\n", + "3 'Plato'@en 'Hicetas'@en 0.923380\n", + "4 'Plato'@en 'Philo of Larissa'@en 0.921708\n", + "5 'Aristotle'@en 'Philo of Larissa'@en 0.931010\n", + "6 'Aristotle'@en 'Speusippus'@en 0.930034\n", + "7 'Aristotle'@en 'Plato'@en 0.928277\n", + "8 'Aristotle'@en 'Hicetas'@en 0.927509\n", + "9 'Aristotle'@en 'Bryson of Achaea'@en 0.927930\n", + 
"10 'Socrates'@en 'Adamantios Korais'@en 0.873166\n", + "11 'Socrates'@en 'Prodicus'@en 0.872791\n", + "12 'Socrates'@en 'Manuel Chrysoloras'@en 0.868033\n", + "13 'Socrates'@en 'Cebes'@en 0.867012\n", + "14 'Socrates'@en 'Pyrrho'@en 0.866274" ] }, - "execution_count": 13, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -1106,8 +1108,8 @@ "id": "0a1b627c", "metadata": {}, "source": [ - "For comparison, here is a run without dynamic scaling which returns much fewer results, since\n", - "only a small number of the top-5 similar results for each input also satisfy the post conditions:" + "For comparison, here is a run without dynamic scaling which returns fewer results, since\n", + "not all of the top-5 similar results for each input also satisfy the post conditions:" ] }, { @@ -1149,78 +1151,92 @@ " 0\n", " 'Plato'@en\n", " 'Aenesidemus'@en\n", - " 0.965394\n", + " 0.936797\n", " \n", " \n", " 1\n", " 'Plato'@en\n", - " 'Hicetas'@en\n", - " 0.964990\n", + " 'Aristotle'@en\n", + " 0.928277\n", " \n", " \n", " 2\n", " 'Plato'@en\n", - " 'Empedocles'@en\n", - " 0.962913\n", + " 'Menedemus'@en\n", + " 0.926272\n", " \n", " \n", " 3\n", " 'Plato'@en\n", - " 'Eubulides'@en\n", - " 0.962904\n", + " 'Hicetas'@en\n", + " 0.923380\n", " \n", " \n", " 4\n", " 'Aristotle'@en\n", - " 'Bryson of Achaea'@en\n", - " 0.974303\n", + " 'Philo of Larissa'@en\n", + " 0.931010\n", " \n", " \n", " 5\n", " 'Aristotle'@en\n", - " 'Michael Papageorgiou'@en\n", - " 0.970041\n", + " 'Speusippus'@en\n", + " 0.930034\n", " \n", " \n", " 6\n", " 'Aristotle'@en\n", - " 'Hicetas'@en\n", - " 0.967692\n", + " 'Plato'@en\n", + " 0.928277\n", " \n", " \n", " 7\n", " 'Aristotle'@en\n", - " 'Anaxarchus'@en\n", - " 0.967682\n", + " 'Hicetas'@en\n", + " 0.927509\n", " \n", " \n", " 8\n", " 'Socrates'@en\n", - " 'Eudorus of Alexandria'@en\n", - " 0.914027\n", + " 'Adamantios Korais'@en\n", + " 0.873166\n", " \n", " \n", " 9\n", " 'Socrates'@en\n", - " 'John Wilkins'@en\n", - " 
0.913926\n", + " 'Prodicus'@en\n", + " 0.872791\n", + " \n", + " \n", + " 10\n", + " 'Socrates'@en\n", + " 'Manuel Chrysoloras'@en\n", + " 0.868033\n", + " \n", + " \n", + " 11\n", + " 'Socrates'@en\n", + " 'Cebes'@en\n", + " 0.867012\n", " \n", " \n", "\n", "" ], "text/plain": [ - " xlabel ylabel sim\n", - "0 'Plato'@en 'Aenesidemus'@en 0.965394\n", - "1 'Plato'@en 'Hicetas'@en 0.964990\n", - "2 'Plato'@en 'Empedocles'@en 0.962913\n", - "3 'Plato'@en 'Eubulides'@en 0.962904\n", - "4 'Aristotle'@en 'Bryson of Achaea'@en 0.974303\n", - "5 'Aristotle'@en 'Michael Papageorgiou'@en 0.970041\n", - "6 'Aristotle'@en 'Hicetas'@en 0.967692\n", - "7 'Aristotle'@en 'Anaxarchus'@en 0.967682\n", - "8 'Socrates'@en 'Eudorus of Alexandria'@en 0.914027\n", - "9 'Socrates'@en 'John Wilkins'@en 0.913926" + " xlabel ylabel sim\n", + "0 'Plato'@en 'Aenesidemus'@en 0.936797\n", + "1 'Plato'@en 'Aristotle'@en 0.928277\n", + "2 'Plato'@en 'Menedemus'@en 0.926272\n", + "3 'Plato'@en 'Hicetas'@en 0.923380\n", + "4 'Aristotle'@en 'Philo of Larissa'@en 0.931010\n", + "5 'Aristotle'@en 'Speusippus'@en 0.930034\n", + "6 'Aristotle'@en 'Plato'@en 0.928277\n", + "7 'Aristotle'@en 'Hicetas'@en 0.927509\n", + "8 'Socrates'@en 'Adamantios Korais'@en 0.873166\n", + "9 'Socrates'@en 'Prodicus'@en 0.872791\n", + "10 'Socrates'@en 'Manuel Chrysoloras'@en 0.868033\n", + "11 'Socrates'@en 'Cebes'@en 0.867012" ] }, "execution_count": 14, @@ -1551,7 +1567,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 15, "id": "cc90540b", "metadata": {}, "outputs": [ @@ -1560,11 +1576,11 @@ "output_type": "stream", "text": [ "Running with logging level 30\n", - "2023-01-13 13:53:13.932934: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory\n", - "2023-01-13 13:53:13.932961: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above 
cudart dlerror if you do not have a GPU set up on your machine.\n", + "2023-01-20 12:48:51.752147: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory\n", + "2023-01-20 12:48:51.752170: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\n", "/home/hans/.local/share/virtualenvs/ksink39/lib/python3.9/site-packages/torch/cuda/__init__.py:52: UserWarning: CUDA initialization: The NVIDIA driver on your system is too old (found version 9010). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver. (Triggered internally at /pytorch/c10/cuda/CUDAFunctions.cpp:115.)\n", " return torch._C._cuda_getDeviceCount() > 0\n", - "Batches: 100%|████████████████████████████████████| 1/1 [00:00<00:00, 16.01it/s]\n", + "Batches: 100%|████████████████████████████████████| 1/1 [00:00<00:00, 4.12it/s]\n", "node1\n", "q1\n", "q2\n", @@ -1580,7 +1596,7 @@ "sed -e 's/^ *//' | \\\n", "kgtk cat --no-input-header --input-column-names node1 node2 --implied-label sentence \\\n", " / add-id \\\n", - " / text-embedding -i - --model roberta-base-nli-mean-tokens \\\n", + " / text-embedding -i - --model roberta-large-nli-mean-tokens \\\n", " --output-data-format kgtk --output-property emb -o - \\\n", " / query -i - --idx vector:node2 --as text_emb_queries --match '(x)' --return x" ] @@ -1590,7 +1606,7 @@ "id": "e79d64e4", "metadata": {}, "source": [ - "The above created 768-D text embedding vector for three short queries\n", + "The above created 1024-D text embedding vector for three short queries\n", "using the same text embedding type as used in our `ABSTRACT` 
embeddings.\n", "Now we find Wikidata QNodes whose short-abstract embedding vector is most similar\n", "to the queries, and that satisfy any additional conditions we might have.\n", @@ -1609,14 +1625,14 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 17, "id": "a98a939d", "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
yylabelsimysent
Q325955'Speusippus'@en0.9440442323684692Speusippus (/spjuːˈsɪpəs/; Greek: Σπεύσιππος; c. 408 – 339/8 BC) was an ancient Greek philosopher.
Q1200209'Dercil·lides'@en0.9357015490531921Dercyllides was an ancient Greek Platonist philosopher.
Q2927235'Bryson of Achaea'@en0.9300292134284973Bryson of Achaea (or Bryson the Achaean; Greek: Βρύσων ὁ Ἀχαιός Vryson o Acheos, gen.: Βρύσωνος Vrysonos; fl. 330 BC) was an ancient Greek philosopher.
Q9250176'Echecratides'@en0.9262670874595642Echecratides (Ancient Greek: Ἐχεκρατίδης) was an Ancient Greek Peripatetic philosopher who is mentioned among the disciples of Aristotle.
Q668009'Aristotelis the Dialectician'@en0.9235112071037292Aristotle the Dialectician (or Aristoteles of Argos, Greek: Ἀριστοτέλης; fl. 3rd century BC), was an ancient Greek dialectic philosopher from Argos.
Q366031'Anaxarchus'@en0.9216422438621521Anaxarchus (/ˌænəɡˈzɑːrkəs/; Ancient Greek: Ἀνάξαρχος; c. 380 – c. 320 BC) was a Greek philosopher of the school of Democritus.
Q297420'Panaetius'@en0.9199343919754028Panaetius (/pəˈniːʃiəs/; Greek: Παναίτιος, translit. Panetios; c.  185 – c.  110/109 BC) of Rhodes was an ancient Greek Stoic philosopher.
Q962486'Echecrates of Flius'@en0.9173671007156372Echecrates (Greek: Ἐχεκράτης) was a Pythagorean philosopher from the ancient Greek town of Phlius.
Q365977'Bias of Priene'@en0.9115197658538818Bias (/ˈbaɪəs/; Greek: Βίας ὁ Πριηνεύς; fl. 6th century BC) of Priene was a Greek sage.
Q13634113'Michael Papageorgiou'@en0.9098575115203857Michail Papageorgiou (Greek: Μιχαήλ Παπαγεωργίου; 1727–1796) was a Greek philosopher.
" + "
yylabelsimysent
Q1200209'Dercil·lides'@en0.9270190596580505Dercyllides was an ancient Greek Platonist philosopher.
Q12901192'Nessas of Chios'@en0.9017531871795654Nessos of Chios (Ancient Greek: Νεσσᾶς or Νέσσος ὁ Χῖος) was a pre-Socratic ancient Greek philosopher from the island of Chios.
Q962486'Echecrates of Flius'@en0.8998405933380127Echecrates (Greek: Ἐχεκράτης) was a Pythagorean philosopher from the ancient Greek town of Phlius.
Q20379195'Nestor of Tarsus'@en0.8979542255401611Nestor of Tarsus (Ancient Greek: Νέστωρ) was an ancient Greek philosopher of the Stoic school of thought.
Q3780759'Patro the Epicurean'@en0.8746783137321472Patro (Greek: Πάτρων) was an Epicurean philosopher.
Q2397427'Heraclides Lembus'@en0.8738521933555603Heraclides Lembus (Greek: Ἡρακλείδης Λέμβος, Hērakleidēs Lembos) was an Ancient Greek statesman, historian and philosophical writer.
Q992324'Eudorus of Alexandria'@en0.8724848031997681Eudorus of Alexandria (Greek: Εὔδωρος ὁ Ἀλεξανδρεύς; 1st century BC) was an ancient Greek philosopher, and a representative of Middle Platonism.
Q373042'Onasander'@en0.8724181056022644Onasander or Onosander (Greek: Ὀνήσανδρος Onesandros or Ὀνόσανδρος Onosandros; fl. 1st century AD) was a Greek philosopher.
Q924215'Hecato of Rhodes'@en0.8698219060897827Hecato or Hecaton of Rhodes (Greek: Ἑκάτων; fl. c. 100 BC) was a Greek Stoic philosopher.
Q325955'Speusippus'@en0.869309663772583Speusippus (/spjuːˈsɪpəs/; Greek: Σπεύσιππος; c. 408 – 339/8 BC) was an ancient Greek philosopher.
" ], "text/plain": [ "" @@ -1650,14 +1666,14 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 18, "id": "3e6c1169", "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
yylabelsimysent
Q673952'Haidershofen'@en0.9632641077041626Haidershofen is a town located in Austria.
Q256996'Grieskirchen'@en0.9585073590278625Grieskirchen is a town in Austria.
Q2240044'Annabichl Castle'@en0.9552702307701111Annabichl Castle is a castle in Austria.
Q7378773'Ruine Hauenstein'@en0.9487172365188599Ruine Hauenstein is a castle in Styria, Austria.
Q37809497'Ruine Neudeck'@en0.9469427466392517Ruine Neudeck is a castle in Styria, Austria.
Q7378781'Ruine Raabeck'@en0.946899950504303Ruine Raabeck is a castle in Styria, Austria.
Q4998499'Burg Kaisersberg'@en0.9449542760848999Burg Kaisersberg is a castle in Styria, Austria.
Q674097'Mannersdorf am Leithagebirge'@en0.9442192316055298Mannersdorf am Leithagebirge is a town in Austria.
Q7378769'Ruine Kalsberg'@en0.943941056728363Ruine Kalsberg is a castle in Styria, Austria.
Q1012734'Burg Krems'@en0.942907452583313Burg Krems is a castle in Styria, Austria.
" + "
yylabelsimysent
Q2240044'Annabichl Castle'@en0.9592803120613098Annabichl Castle is a castle in Austria.
Q7378781'Ruine Raabeck'@en0.9573754072189331Ruine Raabeck is a castle in Styria, Austria.
Q7431733'Schloss Frondsberg'@en0.9572933316230774Schloss Frondsberg is a castle in Styria, Austria.
Q1379421'Burg Bideneck'@en0.9563704133033752Burg Bideneck is a castle in Tyrol, Austria.
Q1013482'Burgruine Pfannberg'@en0.9561655521392822Burgruine Pfannberg is a castle in Styria, Austria.
Q7378775'Ruine Ligist'@en0.9561580419540405Ruine Ligist is a castle in Styria, Austria.
Q7378769'Ruine Kalsberg'@en0.956127405166626Ruine Kalsberg is a castle in Styria, Austria.
Q4998492'Burg Baiersdorf'@en0.9559961557388306Burg Baiersdorf is a castle in Styria, Austria.
Q7378780'Ruine Pernegg'@en0.9556364417076111Ruine Pernegg is a castle in Styria, Austria.
Q7378773'Ruine Hauenstein'@en0.9551136493682861Ruine Hauenstein is a castle in Styria, Austria.
" ], "text/plain": [ "" @@ -1692,7 +1708,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 19, "id": "3bc0fe55", "metadata": { "lines_to_next_cell": 2 @@ -1701,7 +1717,7 @@ { "data": { "text/html": [ - "
yylabelsimysent
Q27924985'Toby Williams'@en0.9054120779037476Toby Williams is a British actor, writer and award-winning stand-up comedian performing both as himself and Dr George Ryegold.
Q7087463'Oliver Cotton'@en0.8896428942680359Oliver Charles Cotton (born 20 June 1944) is an English actor, comedian and playwright, known for his prolific work on stage, TV and film.
Q7704327'Terry Duggan'@en0.8872928619384766Terence A. Duggan (15 April 1932 – 1 May 2008) was a British comedian and actor who had a successful career in cabaret and variety, and played numerous character roles on television.
Q23772268'Guz Khan'@en0.8805128335952759Ghulam Dustgir \\"Guz\\" Khan (born 1986) is a British comedian, impressionist, and actor best known for his work in the TV show Man Like Mobeen and stand up appearances in Live at the Apollo.
Q6988861'Neil Linpow'@en0.8776082992553711Neil Linpow is a multi-award-winning English actor, writer and filmmaker.
Q7320263'Rhashan Stone'@en0.8773206472396851Rhashan Stone is an American actor and comedian based in the UK. He is best known for appearing in many comedy shows such as Desmond\\'s and Mutual Friends.
Q7608608'Stephen Ashfield'@en0.8739212155342102Stephen Ashfield is an Olivier Award-winning Scottish actor.
Q5290454'Dominic Anciano'@en0.8687206506729126Dominic Anciano (born 1959) is an English producer, actor, director, writer and comedian best known for his role as Sgt.
Q7626524'Stuart Fell'@en0.8686633110046387Stuart Fell is a professional actor and stuntman.
Q5534773'Geoffrey McGivern'@en0.8672927021980286Geoffrey M. McGivern is a British actor in film, radio, stage and television, as well as a comedian.
" + "
yylabelsimysent
Q7803499'Tim FitzHigham'@en0.866931676864624Tim FitzHigham FRSA FRGS, is an English comedian, author, artist and world record holder.
Q5561891'Gill Isles'@en0.8463524580001831Gill Isles is a BAFTA winning TV comedy producer.
Q6988861'Neil Linpow'@en0.8445479869842529Neil Linpow is a multi-award-winning English actor, writer and filmmaker.
Q16210661'Philip Bulcock'@en0.8129876255989075Philip Bulcock is an English actor who has appeared in numerous award-winning film and theatre productions.
Q7626524'Stuart Fell'@en0.8109696507453918Stuart Fell is a professional actor and stuntman.
Q4424886'Hedrick Smith'@en0.801676869392395Hedrick Smith is a Pulitzer Prize-winning former New York Times reporter and Emmy award-winning producer and correspondent.
Q6137252'James Kenny'@en0.7837412357330322James Kenny is a professional photographer based in the United Kingdom, best known for his fashion, celebrity portrait, and documentary work.
Q8002945'Will Lyons'@en0.781859815120697Will Lyons is a journalist, newspaper columnist, award-winning wine writer and broadcaster.
Q6229086'John Deery'@en0.7798240780830383John Deery is a British award-winning film and television drama director.
Q5213767'Dan Jones'@en0.7785274386405945Dan Jones is a BAFTA and Ivor Novello Award winning composer and sound designer working in film and theatre.
" ], "text/plain": [ "" @@ -2291,7 +2307,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 20, "id": "042784ec", "metadata": { "lines_to_next_cell": 2 @@ -2300,7 +2316,7 @@ { "data": { "text/html": [ - "
xlabelylabelsimysent
'Plato'@en'Plato'@en1.0Plato (/ˈpleɪtoʊ/ PLAY-toe; Greek: Πλάτων Plátōn; 428/427 or 424/423 – 348/347 BC) was a Greek philosopher born in Athens during the Classical period in Ancient Greece.
'Plato'@en'Aenesidemus'@en0.965393602848053Aenesidemus (Ancient Greek: Αἰνησίδημος or Αἰνεσίδημος) was a Greek Pyrrhonist philosopher, born in Knossos on the island of Crete.
'Plato'@en'Hicetas'@en0.9649903178215027Hicetas (Ancient Greek: Ἱκέτας or Ἱκέτης; c. 400 – c. 335 BC) was a Greek philosopher of the Pythagorean School.
'Plato'@en'Empedocles'@en0.9629127979278564Empedocles (/ɛmˈpɛdəkliːz/; Greek: Ἐμπεδοκλῆς; c. 494 – c. 434 BC, fl. 444–443 BC) was a Greek pre-Socratic philosopher and a native citizen of Akragas, a Greek city in Sicily.
'Plato'@en'Eubulides'@en0.9629042744636536Eubulides of Miletus (Ancient Greek: Εὐβουλίδης; fl. 4th century BCE) was a Greek philosopher of the Megarian school, a pupil of Euclid of Megara and a contemporary of Aristotle.
'Plato'@en'Aristotle'@en0.9615942239761353Aristotle (/ˈærɪstɒtəl/; Greek: Ἀριστοτέλης Aristotélēs, pronounced [aristotélɛːs]; 384–322 BC) was a Greek philosopher and polymath during the Classical period in Ancient Greece.
'Plato'@en'Metrodorus of Lampsacus'@en0.9613872766494751Metrodorus of Lampsacus (Greek: Μητρόδωρος Λαμψακηνός, Mētrodōros Lampsakēnos; 331/0–278/7 BC) was a Greek philosopher of the Epicurean school.
'Plato'@en'Xenophon'@en0.960830569267273Xenophon of Athens (/ˈzɛnəfən, zi-, -fɒn/; Ancient Greek: Ξενοφῶν [ksenopʰɔ̂ːn]; c. 430 – probably 355 or 354 BC) was a Greek military leader, philosopher, and historian, born in Athens.
'Plato'@en'Anaxarchus'@en0.9582780599594116Anaxarchus (/ˌænəɡˈzɑːrkəs/; Ancient Greek: Ἀνάξαρχος; c. 380 – c. 320 BC) was a Greek philosopher of the school of Democritus.
'Plato'@en'Clearchus of Soli'@en0.957090437412262Clearchus of Soli (Greek: Kλέαρχoς ὁ Σολεύς, Klearkhos ho Soleus) was a Greek philosopher of the 4th–3rd century BCE, belonging to Aristotle\\'s Peripatetic school.
'Aristotle'@en'Aristotle'@en0.9999998807907104Aristotle (/ˈærɪstɒtəl/; Greek: Ἀριστοτέλης Aristotélēs, pronounced [aristotélɛːs]; 384–322 BC) was a Greek philosopher and polymath during the Classical period in Ancient Greece.
'Aristotle'@en'Bryson of Achaea'@en0.9743033647537231Bryson of Achaea (or Bryson the Achaean; Greek: Βρύσων ὁ Ἀχαιός Vryson o Acheos, gen.: Βρύσωνος Vrysonos; fl. 330 BC) was an ancient Greek philosopher.
'Aristotle'@en'Michael Papageorgiou'@en0.9700412750244141Michail Papageorgiou (Greek: Μιχαήλ Παπαγεωργίου; 1727–1796) was a Greek philosopher.
'Aristotle'@en'Hicetas'@en0.9676922559738159Hicetas (Ancient Greek: Ἱκέτας or Ἱκέτης; c. 400 – c. 335 BC) was a Greek philosopher of the Pythagorean School.
'Aristotle'@en'Anaxarchus'@en0.9676817655563354Anaxarchus (/ˌænəɡˈzɑːrkəs/; Ancient Greek: Ἀνάξαρχος; c. 380 – c. 320 BC) was a Greek philosopher of the school of Democritus.
'Aristotle'@en'Metrodorus of Lampsacus'@en0.9673494100570679Metrodorus of Lampsacus (Greek: Μητρόδωρος Λαμψακηνός, Mētrodōros Lampsakēnos; 331/0–278/7 BC) was a Greek philosopher of the Epicurean school.
'Aristotle'@en'Philolaus'@en0.9670777916908264Philolaus (/ˌfɪləˈleɪəs/; Ancient Greek: Φιλόλαος, Philólaos; c. 470 – c. 385 BCE) was a Greek Pythagorean and pre-Socratic philosopher.
'Aristotle'@en'Phaedo of Elis'@en0.9669275283813477Phaedo of Elis (/ˈfiːdoʊ/; also Phaedon; Greek: Φαίδων ὁ Ἠλεῖος, gen.: Φαίδωνος; fl. 4th century BCE) was a Greek philosopher.
'Aristotle'@en'Asclepiades of Phlius'@en0.965190589427948Asclepiades of Phlius (Greek: Ἀσκληπιάδης ὁ Φλιάσιος; c. 350 – c. 270 BC) was a Greek philosopher in the Eretrian school of philosophy.
'Aristotle'@en'Dionysius of Chalcedon'@en0.965139627456665Dionysius of Chalcedon (Greek: Διονύσιος; fl. 320 BC) was a Greek philosopher and dialectician connected with the Megarian school.
'Socrates'@en'Socrates'@en0.9999998807907104Socrates (/ˈsɒkrətiːz/; Greek: Σωκράτης; c. 470–399 BC) was a Greek philosopher from Athens who is credited as the founder of Western philosophy and among the first moral philosophers of the ethical tradition of thought.
'Socrates'@en'Histories'@en0.9376196265220642The Histories (Greek: Ἱστορίαι, Ancient Greek: [historíai̯]; also known as The History) of Herodotus is considered the founding work of history in Western literature.
'Socrates'@en'Károly Kerényi'@en0.930176854133606Károly (Carl, Karl) Kerényi (Hungarian: Kerényi Károly, pronounced [ˈkɛreːɲi ˈkaːroj]; 19 January 1897 – 14 April 1973) was a Hungarian scholar in classical philology and one of the founders of modern studies of Greek mythology.
'Socrates'@en'Xenokrates of Sicyon'@en0.9301142692565918Xenokrates of Athens or of Sicyon (Greek: Ξενοκράτης; fl. c. 280 BC) was an ancient Greek sculptor and writer, and one of the world\\'s first art historians.
'Socrates'@en'Iosipos Moisiodax'@en0.9245849847793579Iosipos Moisiodax or Moesiodax (/ˈmiːsiədæks/; Greek: Ιώσηπος Μοισιόδαξ; 1725–1800) was a Greek philosopher, an Eastern Orthodox deacon, and one of the greatest exponents of the modern Greek Enlightenment.
'Socrates'@en'Hippodamus of Miletus'@en0.9245232343673706Hippodamus of Miletus (/hɪˈpɒdəməs/; Greek: Ἱππόδαμος ὁ Μιλήσιος, Hippodamos ho Milesios; 498 – 408 BC) was an ancient Greek architect, urban planner, physician, mathematician, meteorologist and philosopher, who is considered to be \\"the father of European urban planning\\", and the namesake of the \\"Hippodamian plan\\" (grid plan) of city layout.
'Socrates'@en'Henry Oldenburg'@en0.9239271879196167Henry Oldenburg (also Henry Oldenbourg) FRS (c. 1618 as Heinrich Oldenburg – 5 September 1677). was a German theologian, diplomat, and natural philosopher, known as one of the creators of modern scientific peer review.
'Socrates'@en'Inachus'@en0.9215598702430725In Greek mythology, Inachus, Inachos or Inakhos (Ancient Greek: Ἴναχος) was the first king of Argos after whom a river was called Inachus River, the modern Panitsa that drains the western margin of the Argive plain.
'Socrates'@en'Hippobotus'@en0.917095959186554Hippobotus (/hɪˈpɒbətəs/; Ancient Greek: Ἱππόβοτος; fl. c. 200 BC) was a Greek historian of philosophers and philosophical schools.
'Socrates'@en'Georgios Hatzidakis'@en0.9140840172767639Georgios Nicolaou Hatzidakis, aka Georgios Nikolaou Chatzidakis (Greek: Γεώργιος Νικολάου Χατζιδάκις; 23 November [O.S. 11 November] 1843, in Myrthios, Ottoman Crete – 28 June 1941, in Athens) was a Greek philologist, who is regarded as the father of linguistics in Greece.
" + "
xlabelylabelsimysent
'Plato'@en'Plato'@en1.0Plato (/ˈpleɪtoʊ/ PLAY-toe; Greek: Πλάτων Plátōn; 428/427 or 424/423 – 348/347 BC) was a Greek philosopher born in Athens during the Classical period in Ancient Greece.
'Plato'@en'Aenesidemus'@en0.936797022819519Aenesidemus (Ancient Greek: Αἰνησίδημος or Αἰνεσίδημος) was a Greek Pyrrhonist philosopher, born in Knossos on the island of Crete.
'Plato'@en'Aristotle'@en0.928276777267456Aristotle (/ˈærɪstɒtəl/; Greek: Ἀριστοτέλης Aristotélēs, pronounced [aristotélɛːs]; 384–322 BC) was a Greek philosopher and polymath during the Classical period in Ancient Greece.
'Plato'@en'Menedemus'@en0.9262720942497253Menedemus of Eretria (Greek: Μενέδημος ὁ Ἐρετριεύς; 345/44 – 261/60 BC) was a Greek philosopher and founder of the Eretrian school.
'Plato'@en'Hicetas'@en0.9233798980712891Hicetas (Ancient Greek: Ἱκέτας or Ἱκέτης; c. 400 – c. 335 BC) was a Greek philosopher of the Pythagorean School.
'Plato'@en'Philo of Larissa'@en0.9217081069946289Philo of Larissa (Greek: Φίλων ὁ Λαρισσαῖος Philon ho Larissaios; 159/8–84/3 BC) was a Greek philosopher.
'Plato'@en'Metrodorus of Lampsacus'@en0.917860209941864Metrodorus of Lampsacus (Greek: Μητρόδωρος Λαμψακηνός, Mētrodōros Lampsakēnos; 331/0–278/7 BC) was a Greek philosopher of the Epicurean school.
'Plato'@en'Speusippus'@en0.9144768714904785Speusippus (/spjuːˈsɪpəs/; Greek: Σπεύσιππος; c. 408 – 339/8 BC) was an ancient Greek philosopher.
'Plato'@en'Echecrates of Flius'@en0.9101097583770752Echecrates (Greek: Ἐχεκράτης) was a Pythagorean philosopher from the ancient Greek town of Phlius.
'Plato'@en'Philolaus'@en0.9096436500549316Philolaus (/ˌfɪləˈleɪəs/; Ancient Greek: Φιλόλαος, Philólaos; c. 470 – c. 385 BCE) was a Greek Pythagorean and pre-Socratic philosopher.
'Aristotle'@en'Aristotle'@en1.0Aristotle (/ˈærɪstɒtəl/; Greek: Ἀριστοτέλης Aristotélēs, pronounced [aristotélɛːs]; 384–322 BC) was a Greek philosopher and polymath during the Classical period in Ancient Greece.
'Aristotle'@en'Philo of Larissa'@en0.9310098886489868Philo of Larissa (Greek: Φίλων ὁ Λαρισσαῖος Philon ho Larissaios; 159/8–84/3 BC) was a Greek philosopher.
'Aristotle'@en'Speusippus'@en0.9300340414047241Speusippus (/spjuːˈsɪpəs/; Greek: Σπεύσιππος; c. 408 – 339/8 BC) was an ancient Greek philosopher.
'Aristotle'@en'Plato'@en0.928276777267456Plato (/ˈpleɪtoʊ/ PLAY-toe; Greek: Πλάτων Plátōn; 428/427 or 424/423 – 348/347 BC) was a Greek philosopher born in Athens during the Classical period in Ancient Greece.
'Aristotle'@en'Bryson of Achaea'@en0.9279296398162842Bryson of Achaea (or Bryson the Achaean; Greek: Βρύσων ὁ Ἀχαιός Vryson o Acheos, gen.: Βρύσωνος Vrysonos; fl. 330 BC) was an ancient Greek philosopher.
'Aristotle'@en'Hicetas'@en0.927509069442749Hicetas (Ancient Greek: Ἱκέτας or Ἱκέτης; c. 400 – c. 335 BC) was a Greek philosopher of the Pythagorean School.
'Aristotle'@en'Dimitrios Pepagomenos'@en0.9206493496894836Demetrios Pepagomenos or Demetrius Pepagomenus (Greek: Δημήτριος Πεπαγωμένος, 1200–1300) was a Byzantine Greek savant who resided in Constantinople.
'Aristotle'@en'Aristippus'@en0.9205538630485535Aristippus of Cyrene (/ˌærəˈstɪpəs/; Ancient Greek: Ἀρίστιππος ὁ Κυρηναῖος; c. 435 – c. 356 BCE) was a hedonistic Greek philosopher and the founder of the Cyrenaic school of philosophy.
'Aristotle'@en'Asclepigenia'@en0.9197463989257812Asclepigenia (Greek: Ἀσκληπιγένεια; 430 – 485 AD) was an Athenian philosopher and mystic.
'Aristotle'@en'Dicaearchus'@en0.9195908904075623Dicaearchus of Messana (/ˌdɪkeɪˈɑːrkəs ... məˈsɑːnə/; Greek: Δικαίαρχος Dikaiarkhos; c. 370/350 – c. post 323 BC), also written Dikaiarchos (/ˈdɪkaɪɑːrk/), was a Greek philosopher, geographer and author.
'Socrates'@en'Socrates'@en1.0000001192092896Socrates (/ˈsɒkrətiːz/; Greek: Σωκράτης; c. 470–399 BC) was a Greek philosopher from Athens who is credited as the founder of Western philosophy and among the first moral philosophers of the ethical tradition of thought.
'Socrates'@en'Adamantios Korais'@en0.8731658458709717Adamantios Korais or Koraïs (Greek: Ἀδαμάντιος Κοραῆς [aðaˈmandi.os koraˈis]; Latin: Adamantius Coraes; French: Adamance Coray; 27 April 1748 – 6 April 1833) was a Greek scholar credited with laying the foundations of modern Greek literature and a major figure in the Greek Enlightenment.
'Socrates'@en'Prodicus'@en0.872790515422821Prodicus of Ceos (/ˈproʊdɪkəs/; Greek: Πρόδικος ὁ Κεῖος, Pródikos ho Keios; c. 465 BC – c. 395 BC) was a Greek philosopher, and part of the first generation of Sophists.
'Socrates'@en'Manuel Chrysoloras'@en0.8680326342582703Manuel (or Emmanuel) Chrysoloras (Greek: Μανουὴλ Χρυσολωρᾶς; c. 1355 – 15 April 1415) was Greek scholar and a pioneer in the introduction of Greek literature to Western Europe during the Late Middle Ages.
'Socrates'@en'Cebes'@en0.8670117259025574Cebes of Thebes (Greek: Κέβης Θηβαῖος, gen.: Κέβητος; c. 430 – 350 BCE) was an Ancient Greek philosopher from Thebes remembered as a disciple of Socrates.
'Socrates'@en'Pyrrho'@en0.8662737011909485Pyrrho of Elis (/ˈpɪroʊ/; Ancient Greek: Πύρρων ὁ Ἠλεῖος, romanized: Pyrrhо̄n ho Ēleios; c. 360 – c. 270 BC), born in Elis, Greece, was a Greek philosopher of Classical antiquity, credited as being the first Greek skeptic philosopher and founder of Pyrrhonism.
'Socrates'@en'Menedemus'@en0.8632197380065918Menedemus of Eretria (Greek: Μενέδημος ὁ Ἐρετριεύς; 345/44 – 261/60 BC) was a Greek philosopher and founder of the Eretrian school.
'Socrates'@en'Xenophon'@en0.8607585430145264Xenophon of Athens (/ˈzɛnəfən, zi-, -fɒn/; Ancient Greek: Ξενοφῶν [ksenopʰɔ̂ːn]; c. 430 – probably 355 or 354 BC) was a Greek military leader, philosopher, and historian, born in Athens.
'Socrates'@en'Theodoros Gaza'@en0.8601701855659485Theodorus Gaza (Greek: Θεόδωρος Γαζῆς, Theodoros Gazis; Italian: Teodoro Gaza; Latin: Theodorus Gazes), also called Theodore Gazis or by the epithet Thessalonicensis (in Latin) and Thessalonikeus (in Greek) (c. 1398 – c. 1475), was a Greek humanist and translator of Aristotle, one of the Greek scholars who were the leaders of the revival of learning in the 15th century (the Palaeologan Renaissance).
'Socrates'@en'Daniel Philippidis'@en0.8599057793617249Daniel Philippidis (Greek: Δανιήλ Φιλιππίδης; Romanian: Dimitrie Daniil Philippide; c. 1750 – 1832) was a Greek scholar, figure of the modern Greek Enlightenment and member of the patriotic organization Filiki Etaireia.
" ], "text/plain": [ "" @@ -2902,7 +2918,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 21, "id": "11b54d06", "metadata": { "lines_to_next_cell": 2 @@ -2911,7 +2927,7 @@ { "data": { "text/html": [ - "
xlabelylabelsimysent
'United States of America'@en'United States of America'@en1.0000001192092896The United States of America (U.S.A. or USA), commonly known as the United States (U.S. or US) or America, is a transcontinental country located primarily in North America.
'United States of America'@en'North African American'@en0.9427798390388489North African Americans are Americans with origins in the region of North Africa.
'United States of America'@en'Central America'@en0.9361214637756348Central America (Spanish: América Central [aˈmeɾika senˈtɾal] or Centroamérica [sentɾoaˈmeɾika]) is a subcontinent of North America.
'United States of America'@en'Northern United States'@en0.9349914789199829The Northern United States, commonly referred to as the American North, the Northern States, or simply the North, is a geographical or historical region of the United States.
'United States of America'@en'Episcopal Diocese of Atlanta'@en0.9307031035423279The Episcopal Diocese of Atlanta is the diocese of the Episcopal Church in the United States of America, with jurisdiction over middle and north Georgia.
'United States of America'@en'Tidewater region of Virginia'@en0.9302077293395996Tidewater refers to the north Atlantic coastal plain region of the United States of America.
'United States of America'@en'Great Northern Railway'@en0.9266144037246704The Great Northern Railway (reporting mark GN) was an American Class I railroad.
'United States of America'@en'American people of North American descent'@en0.9230941534042358American people of North American descent refers to inhabitants of the United States with lineage tracing to other North American countries.
'United States of America'@en'Episcopal Diocese of Northern Michigan'@en0.922684907913208The Episcopal Diocese of Northern Michigan is the diocese of the Episcopal Church in the United States of America (TEC) with canonical jurisdiction in the Upper Peninsula of Michigan.
'United States of America'@en'list of metropolitan areas in Northern America'@en0.9206945300102234This is a list of metropolitan areas in Northern America, typically defined to include Canada and the United States as well as Bermuda (UK), Greenland (Denmark), and St. Pierre and Miquelon (France).
'Austria'@en'Austria'@en0.9999998807907104Austria, officially the Republic of Austria, is a landlocked country in the southern part of Central Europe, situated at Eastern Alps.
'Austria'@en'Mount Royal'@en0.9394936561584473Mount Royal is situated at the southern end of the Mount Royal Range in the Barrington Tops region of eastern Australia.
'Austria'@en'Theresienfeld'@en0.9383996725082397Theresienfeld is a town in the Wiener Neustadt-Land district of Lower Austria, in eastern Austria.
'Austria'@en'Paruna'@en0.9354041814804077Paruna is a town in eastern South Australia.
'Austria'@en'Middleton'@en0.9351267218589783Middleton is a town in South Australia on the eastern end of the south coast of the Fleurieu Peninsula.
'Austria'@en'Lavanttal'@en0.9330481290817261The Lavant Valley (German: Lavanttal, Slovene: Labotska dolina or Laboška dolina; Southern Bavarian: Lovnthol) lies in the Lavanttal Alps in southern Austria in the eastern part of the state of Carinthia.
'Austria'@en'Belarus'@en0.9315851926803589Belarus, officially the Republic of Belarus, is a landlocked country in Eastern Europe.
'Austria'@en'Towitta'@en0.9289759397506714Towitta is a locality in the Murray Mallee region of South Australia at the foot of the eastern side of the Mount Lofty Ranges.
'Austria'@en'Port Elliot'@en0.9283883571624756Port Elliot is a town in South Australia toward the eastern end of the south coast of the Fleurieu Peninsula.
'Austria'@en'list of companies of Andorra'@en0.928240180015564Andorra is a sovereign landlocked microstate in Southwestern Europe, located in the eastern Pyrenees mountains and bordered by Spain and France.
'Greece'@en'Greece'@en1.0Greece or Hellas (Greek: Ελλάδα, romanized: Elláda or Ελλάς, romanized: Ellas), officially the Hellenic Republic (Greek: Ελληνική Δημοκρατία, romanized: Elliniki Dimokratia) is a country in Southeast Europe.
'Greece'@en'Bulgaria'@en0.9444983005523682Bulgaria (/bʌlˈɡɛəriə, bʊl-/; Bulgarian: България, romanized: Balgariya), officially the Republic of Bulgaria, is a country in Southeast Europe.
'Greece'@en'Croatia'@en0.9344392418861389Croatia (/kroʊˈeɪʃə/, kroh-AY-shə; Croatian: Hrvatska, pronounced [xř̩ʋaːtskaː]), officially the Republic of Croatia (Croatian: Republika Hrvatska,), is a country at the crossroads of Central and Southeast Europe.
'Greece'@en'Caenophrurium'@en0.9327539205551147Caenophrurium (also written as Cenophrurium and Coenophrurium; Greek: Καινοφρούριον Kainophrurion) was a settlement in the Roman province of Europa (the southeasternmost part of Thrace), between Byzantium and Heraclea Perinthus.
'Greece'@en'Chech'@en0.9299066662788391Chech (Bulgarian: Чеч, Greek: Τσέτσι) or Chechko (Bulgarian: Чечко) is a geographical and historical region of the Balkan peninsula in southeastern Europe in modern-day Bulgaria and Greece.
'Greece'@en'history of Bosnia and Herzegovina'@en0.926729142665863Bosnia and Herzegovina, sometimes referred to simply as Bosnia, is a country in Southeast Europe on the Balkan Peninsula.
'Greece'@en'Zagem'@en0.9253758788108826Zagem or Bazari (Georgian: ბაზარი) was a town in the southeast Caucasus, in the eastern Georgian kingdom of Kakheti.
'Greece'@en'Barony of Gritzena'@en0.9247817993164062The Barony of Gritzena or Gritsena was a medieval Frankish fiefdom of the Principality of Achaea, located in eastern Messenia, in the Peloponnese peninsula in Greece, centred on the settlement of Gritzena (Greek: Γρίτζενα/Γρίτσενα; French: La Grite).
'Greece'@en'Maroneia-Sapes Municipality'@en0.924449622631073Maroneia-Sapes (Greek: Μαρώνεια-Σάπες) is a municipality in the Rhodope regional unit, East Macedonia and Thrace, Greece.
'Greece'@en'Globočice pri Kostanjevici'@en0.9231210947036743Globočice pri Kostanjevici (pronounced [ɡlɔbɔˈtʃiːtsɛ pɾi kɔˈstaːnjɛʋitsa]; in older sources also Globočica, German: Globoschitz) is a settlement southeast of Kostanjevica na Krki in eastern Slovenia.
" + "
xlabelylabelsimysent
'United States of America'@en'United States of America'@en1.0000001192092896The United States of America (U.S.A. or USA), commonly known as the United States (U.S. or US) or America, is a transcontinental country located primarily in North America.
'United States of America'@en'Flora of Arkansas'@en0.9252734184265137Geobotanically, Arkansas belongs to the North American Atlantic Region.
'United States of America'@en'Northern United States'@en0.9048947095870972The Northern United States, commonly referred to as the American North, the Northern States, or simply the North, is a geographical or historical region of the United States.
'United States of America'@en'Backcountry'@en0.9047999382019043The Backcountry was a region in North America.
'United States of America'@en'Canada'@en0.9028735160827637Canada is a country in North America.
'United States of America'@en'Medfield'@en0.9025246500968933Medfield is a neighborhood located in north Baltimore, Maryland, United States of America.
'United States of America'@en'Northwest Georgia'@en0.901907742023468Northwest Georgia is a region of the state of Georgia in the United States.
'United States of America'@en'North America'@en0.9008367657661438North America is a continent in the Northern Hemisphere and almost entirely within the Western Hemisphere.
'United States of America'@en'Tidewater region of Virginia'@en0.9006068110466003Tidewater refers to the north Atlantic coastal plain region of the United States of America.
'United States of America'@en'Oreamnos'@en0.8993678689002991Oreamnos is a genus of North American caprines.
'Austria'@en'Austria'@en1.0Austria, officially the Republic of Austria, is a landlocked country in the southern part of Central Europe, situated at Eastern Alps.
'Austria'@en'list of companies of Andorra'@en0.8954368233680725Andorra is a sovereign landlocked microstate in Southwestern Europe, located in the eastern Pyrenees mountains and bordered by Spain and France.
'Austria'@en'history of Andorra'@en0.8863847851753235Andorra, officially the Principality of Andorra (Catalan: Principat d\\'Andorra), also called the Principality of the Valleys of Andorra (Catalan: Principat de les Valls d\\'Andorra), is a sovereign landlocked microstate in Southwestern Europe, located in the eastern Pyrenees mountains and bordered by Spain and France.
'Austria'@en'Belarus'@en0.8846515417098999Belarus, officially the Republic of Belarus, is a landlocked country in Eastern Europe.
'Austria'@en'Jervois'@en0.8832077383995056The County of Jervois is a cadastral unit in the Australian state of South Australia that covers land on the east coast of the Eyre Peninsula.
'Austria'@en'Venetian Prealps'@en0.8805562853813171The Venetian Prealps (Prealpi Venete in Italian) are a mountain range in the south-eastern part of the Alps.
'Austria'@en'Burg Lockenhaus'@en0.8778088688850403Burg Lockenhaus (Hungarian Léka) is a castle and medieval fortress in the Güns Valley in the southeastern part of Lockenhaus, in Burgenland, eastern Austria.
'Austria'@en'Soria Province'@en0.8766404986381531Soria is a province of central Spain, in the eastern part of the autonomous community of Castile and León.
'Austria'@en'Burra'@en0.8749806880950928County of Burra is a cadastral unit located in the Australian state of South Australia which covers land located in the state’s east associated with the town of Burra.
'Austria'@en'Apševci'@en0.8717129230499268Apševci (Hungarian: Halápfalva) is a village in Syrmia in easternmost part of Croatia along the state border with Serbia.
'Greece'@en'Greece'@en1.0Greece or Hellas (Greek: Ελλάδα, romanized: Elláda or Ελλάς, romanized: Ellas), officially the Hellenic Republic (Greek: Ελληνική Δημοκρατία, romanized: Elliniki Dimokratia) is a country in Southeast Europe.
'Greece'@en'Peloponnese Region'@en0.9604872465133667The Peloponnese Region (Greek: Περιφέρεια Πελοποννήσου, romanized: Periféria Peloponnísou, [periˈferia pelopoˈnisu]) is a region in southern Greece.
'Greece'@en'Kalyvia Sochas'@en0.9564719796180725Κalyvia Sochas (Greek: Καλύβια Σοχάς) is a village in Laconia, in southern Greece.
'Greece'@en'Parrhasia'@en0.9560511708259583Parrhasia (Greek: Παρρασία) was a region in south Arcadia, Greece.
'Greece'@en'Hora Sfakion'@en0.9531674385070801Hóra Sfakíon (Greek: Χώρα Σφακίων) or Sfakia (Σφακιά [sfaˈca]) is a town on the south coast of Crete, Greece.
'Greece'@en'Trikala, Corinthia'@en0.9519431591033936Trikala (Greek: Τρίκαλα Κορινθίας) is a village in Corinthia, in the Peloponnese peninsula of southern Greece.
'Greece'@en'Terpsithea, Messenia'@en0.9517883658409119Terpsithea (Greek: Τερψιθέα) is a village in Messenia, southern Greece.
'Greece'@en'Laconian Gulf'@en0.9507078528404236The Laconian Gulf (Greek: Λακωνικός Κόλπος, romanized: Lakonikos Kolpos), is a gulf in the south-eastern Peloponnese, in Greece.
'Greece'@en'Kalyvia, Laconia'@en0.9503564834594727Κalyvia (Greek: Καλύβια) is a village in Laconia, southern Greece.
'Greece'@en'Kremasti, Laconia'@en0.9496901035308838Kremasti (Greek: Κρεμαστή) is a village in Laconia, southern Greece.
" ], "text/plain": [ "" @@ -3481,7 +3497,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 22, "id": "28194492", "metadata": { "lines_to_next_cell": 2 @@ -3490,7 +3506,7 @@ { "data": { "text/html": [ - "
xlabelylabelsimysent
'dog'@en'dog'@en1.0000001192092896The dog or domestic dog (Canis familiaris or Canis lupus familiaris) is a domesticated descendant of the wolf.
'dog'@en'Canis simensis'@en0.9140985012054443The Ethiopian wolf (Canis simensis), also called the Simien jackal and Simien fox, is a canine native to the Ethiopian Highlands.
'dog'@en'Bucovina Shepherd Dog'@en0.8995509743690491The Romanian Bucovina Shepherd (Romanian: Ciobănesc Românesc de Bucovina) is a breed of livestock guardian dogs native to historical Bukovina (Bucovina) region.
'dog'@en'Karst Shepherd'@en0.8969720602035522The Karst Shepherd (Slovene: kraški ovčar or kraševec ) is a breed of dog of the livestock guardian type, originating in Slovenia.
'dog'@en'Majorca Shepherd Dog'@en0.8946498036384583The Majorca Shepherd Dog (Catalan: Ca de bestiar, Spanish: Perro de pastor mallorquín) is a domesticated breed of dog, used in the Balearic Islands of Spain, both for guarding sheep and as a general purpose farm dog.
'dog'@en'Tornjak'@en0.8933776021003723The Tornjak (pronounced [torɲâk]), is a breed of livestock guardian dog native to Bosnia and Herzegovina and Croatia.
'dog'@en'Schapendoes'@en0.8902978301048279The Schapendoes (Dutch pronunciation: [ˈsxaːpəndus]) or Dutch Sheepdog, is a breed of dog originating in the Netherlands.
'dog'@en'Native American dogs'@en0.8873973488807678Native American dogs, or Pre-Columbian dogs, were dogs living with people indigenous to the Americas.
'dog'@en'Mozart family'@en0.8867671489715576The Mozart family were the ancestors, relatives, and descendants of Wolfgang Amadeus Mozart.
'dog'@en'Hare Indian Dog'@en0.88480544090271The Hare Indian dog is an extinct domesticated canine; possibly a breed of domestic dog, coydog, or domesticated coyote; formerly found and originally bred in northern Canada by the Hare Indians for coursing.
'house cat'@en'house cat'@en1.0000001192092896The cat (Felis catus) is a domestic species of small carnivorous mammal.
'house cat'@en'Ragamuffin'@en0.9178828597068787The Ragamuffin is a breed of domestic cat.
'house cat'@en'Cymric'@en0.9060893058776855The Cymric (/ˈkɪmrɪk/ KIM-rik, /ˈkʌmrɪk/ KUM-rik) is a breed of domestic cat.
'house cat'@en'Turkish Angora'@en0.9040964841842651The Turkish Angora (Turkish: Ankara kedisi, \\"Ankara cat\\") is a breed of domestic cat.
'house cat'@en'Cornish Rex'@en0.9037503004074097The Cornish Rex is a breed of domestic cat.
'house cat'@en'Flat-headed cat'@en0.8956901431083679The flat-headed cat (Prionailurus planiceps) is a small wild cat native to the Thai-Malay Peninsula, Borneo, and Sumatra.
'house cat'@en'Oriental Longhair'@en0.8942360877990723The Oriental Longhair is a variety of domestic cat.
'house cat'@en'Prionailurus'@en0.8825944662094116Prionailurus is a genus of spotted, small wild cats native to Asia.
'house cat'@en'German Rex'@en0.8820987343788147The German Rex is a breed of domestic cat.
'house cat'@en'Cheyletiella blakei'@en0.8800439238548279Cheyletiella blakei is a small mite and ectoparasitic of domestic cats.
'horse'@en'horse'@en0.9999999403953552The horse (Equus ferus caballus) is a domesticated, one-toed, hoofed mammal.
'horse'@en'Equus ferus'@en0.8957256078720093The wild horse (Equus ferus) is a species of the genus Equus, which includes as subspecies the modern domesticated horse (Equus ferus caballus) as well as the endangered Przewalski\\'s horse (Equus ferus przewalskii).
'horse'@en'Tolfetano'@en0.8931700587272644The Tolfetano or Cavallo Tolfetano is a breed of horse from the northern part of the Lazio region of Italy.
'horse'@en'Asturcón'@en0.8833141922950745The Asturcón is an ancient breed of small horse or pony from the autonomous region of Asturias in northern Spain.
'horse'@en'Riwoche horse'@en0.8793144822120667The Riwoche horse /ˈriːwoʊtʃeɪ/ is a dun-colored, pony-sized horse indigenous to northeastern Tibet.
'horse'@en'Catria horse'@en0.8720530867576599The Catria Horse (Italian: Cavallo del Catria) is a breed of horse originating in the mountainous area of the massif of Monte Catria in the Marche region of Italy, and surrounding areas in the provinces of Ancona, Perugia and Pesaro.
'horse'@en'Hequ horse'@en0.8712781071662903The Hequ horse, previously called the Nanfan, is a horse breed native to the northwestern Tibetan plateau.
'horse'@en'Pentro horse'@en0.8711110949516296The Pentro Horse (Italian: Cavallo Pentro) is a breed of horse originating in the area of Isernia, in the Molise region of Italy.
'horse'@en'Dasyrhamphis'@en0.871029794216156Dasyrhamphis is a species of \\'horse fly\\' belonging to the family Tabanidae subfamily Tabaninae.
'horse'@en'Akhal-Teke'@en0.8686932325363159The Akhal-Teke (/ˌækəlˈtɛk/ or /ˌækəlˈtɛki/; from Turkmen Ahalteke, [axalˈteke]) is a Turkmen horse breed.
" + "
xlabelylabelsimysent
'dog'@en'dog'@en0.9999999403953552The dog or domestic dog (Canis familiaris or Canis lupus familiaris) is a domesticated descendant of the wolf.
'dog'@en'wolfdog'@en0.8726404309272766A wolfdog is a canine produced by the mating of a domestic dog (Canis familiaris) with a gray wolf (Canis lupus), eastern wolf (Canis lycaon), red wolf (Canis rufus), or Ethiopian wolf (Canis simensis) to produce a hybrid.
'dog'@en'Garmr'@en0.8452869653701782In Norse mythology, Garmr or Garm (Old Norse: Garmr [ˈɡɑrmz̠]; \\"rag\\") is a wolf or dog associated with both Hel and Ragnarök, and described as a blood-stained guardian of Hel\\'s gate.
'dog'@en'Shaun Ellis'@en0.839640736579895Shaun Ellis is an English animal researcher who lived among wolves, and adopted a pack of abandoned North American timber wolf pups.
'dog'@en'Saarloos wolfdog'@en0.8337552547454834The Saarloos Wolfdog (Dutch: Saarlooswolfhond, German: Saarlooswolfhund) is a wolf-dog breed originating from the Netherlands by the crossing of a German Shepherd with a Siberian grey wolf in 1935.
'dog'@en'Canidae'@en0.8309797048568726Canidae (/ˈkænɪdiː/; from Latin, canis, \\"dog\\") is a biological family of dog-like carnivorans, colloquially referred to as dogs, and constitutes a clade.
'dog'@en'Pembroke Welsh Corgi'@en0.8307735323905945The Pembroke Welsh Corgi (/ˈkɔːrɡi/; Welsh for \\"dwarf dog\\") is a cattle herding dog breed that originated in Pembrokeshire, Wales.
'dog'@en'Wolf distribution'@en0.8283219933509827Wolf distribution is the species distribution of the wolf (Canis lupus).
'dog'@en'Schizocosa stridulans'@en0.8269302248954773Schizocosa stridulans is a sibling species of S. ocreata and S. rovneri and is part of the wolf spider family.
'dog'@en'Himalayan Wolf'@en0.8238919973373413The Himalayan wolf (Canis lupus chanco) is a canine of debated taxonomy.
'house cat'@en'house cat'@en0.9999999403953552The cat (Felis catus) is a domestic species of small carnivorous mammal.
'house cat'@en'Flat-headed cat'@en0.8973938822746277The flat-headed cat (Prionailurus planiceps) is a small wild cat native to the Thai-Malay Peninsula, Borneo, and Sumatra.
'house cat'@en'Ragamuffin'@en0.8963260650634766The Ragamuffin is a breed of domestic cat.
'house cat'@en'Cornish Rex'@en0.8604508638381958The Cornish Rex is a breed of domestic cat.
'house cat'@en'lists of cats'@en0.8595231771469116The cat (Felis silvestris catus), also known as the domestic cat or house cat to distinguish it from other felines, is a small carnivorous species of crepuscular mammal that is often valued by humans for its companionship and its ability to hunt vermin.
'house cat'@en'Oriental Longhair'@en0.8485206961631775The Oriental Longhair is a variety of domestic cat.
'house cat'@en'Cymric'@en0.8457154631614685The Cymric (/ˈkɪmrɪk/ KIM-rik, /ˈkʌmrɪk/ KUM-rik) is a breed of domestic cat.
'house cat'@en'Gogo'@en0.8425351977348328Gogo is a small genus of catfishes (order Siluriformes) of the family Anchariidae.
'house cat'@en'Turkish Angora'@en0.84034264087677The Turkish Angora (Turkish: Ankara kedisi, \\"Ankara cat\\") is a breed of domestic cat.
'house cat'@en'Ancharius'@en0.8346255421638489Ancharius, the Vaonas, is a small genus of catfishes (order Siluriformes) of the family Anchariidae.
'horse'@en'horse'@en1.0000001192092896The horse (Equus ferus caballus) is a domesticated, one-toed, hoofed mammal.
'horse'@en'sheep'@en0.7853900790214539Sheep or domestic sheep (Ovis aries) are domesticated, ruminant mammals typically kept as livestock.
'horse'@en'stable'@en0.7729700207710266A stable is a building in which livestock, especially horses, are kept.
'horse'@en'Bit mouthpiece'@en0.7656224370002747The mouthpiece is the part of a horse\\'s bit that goes into the mouth of a horse, resting on the bars of the mouth in the sensitive interdental space where there are no teeth.
'horse'@en'tack'@en0.755368709564209Tack is equipment or accessories equipped on horses and other equines in the course of their use as domesticated animals.
'horse'@en'grupstal'@en0.7541809678077698Tie stalls, also known as stanchion or stall barn, are a type of stall where animals are tethered at the neck to their stall.
'horse'@en'breed'@en0.7521687150001526A breed is a specific group of domestic animals having homogeneous appearance (phenotype), homogeneous behavior, and/or other characteristics that distinguish it from other organisms of the same species.
'horse'@en'Ergot'@en0.7416918873786926The ergot is a small callosity (Calcar metacarpeum and Calcar metatarseum) on the underside of the fetlock of a horse or other equine.
'horse'@en'Lord Howe Woodhen'@en0.7349467873573303The Lord Howe woodhen (Hypotaenidia sylvestris) also known as the Lord Howe Island woodhen or Lord Howe (Island) rail, is a flightless bird of the rail family, (Rallidae).
'horse'@en'Asinus'@en0.7292249202728271Asinus is a subgenus of Equus (single-toed (hooved) grazing animal) that encompasses several subspecies of the Equidae commonly known as wild asses, characterized by long ears, a lean, straight-backed build, lack of a true withers, a coarse mane and tail, and a reputation for considerable toughness and endurance.
" ], "text/plain": [ "" @@ -3780,7 +3796,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 23, "id": "ce658ebb", "metadata": { "lines_to_next_cell": 2 @@ -3789,7 +3805,7 @@ { "data": { "text/html": [ - "
xlabelylabelsimysent
'handball'@en'handball'@en1.0Handball (also known as team handball, European handball or Olympic handball) is a team sport in which two teams of seven players each (six outcourt players and a goalkeeper) pass a ball using their hands with the aim of throwing it into the goal of the other team.
'handball'@en'beach handball'@en0.8905559182167053Beach handball is a team sport where two teams pass and bounce or roll a ball, trying to throw it in the goal of the opposing team.
'handball'@en'volleyball injury'@en0.8481535315513611Volleyball is a game played between two opposing sides, with six players on each team, where the players use mainly their hands to hit the ball over a net and try to make the ball land on the opposing team\\'s side of the court.
'handball'@en'ball boy'@en0.8265464305877686Ball boys and ball girls, also known as ball kids are individuals, usually human youths but sometimes dogs, who retrieve and supply balls for players or officials in sports such as association football, American football, bandy, cricket, tennis, baseball and basketball.
'handball'@en'Balonpesado'@en0.8249591588973999The balonpesado is a team sport, devised for both open field as closed, in which two sets of five players each try to score goals within circles drawn on the ground of each end of the field.
'handball'@en'Screwball Scramble'@en0.8165739178657532Screwball Scramble is a toy made by Tomy that involves guiding a 14-millimeter-diameter chrome steel ball bearing around an obstacle course.
'handball'@en'sepak takraw'@en0.808526337146759Sepak takraw, or Sepaktakraw, also called kick volleyball, is a team sport played with a ball made of rattan or synthetic plastic between two teams of two to four players on a court resembling a badminton court.
'handball'@en'muggle quidditch'@en0.7998005151748657Quidditch, also known as quadball, is a sport of two teams of seven players each mounted on a broomstick, and is played on a hockey rink-sized pitch.
'handball'@en'tag'@en0.7951781749725342Tag (also called tig, it, tiggy, tips, tick, tip) is a playground game involving two or more players chasing other players in an attempt to \\"tag\\" and mark them out of play, usually by touching with a hand.
'handball'@en'dodgeball'@en0.7933317422866821Dodgeball is a team sport in which players on two teams try to throw balls and hit opponents, while avoiding being hit themselves.
" + "
xlabelylabelsimysent
'handball'@en'handball'@en0.9999999403953552Handball (also known as team handball, European handball or Olympic handball) is a team sport in which two teams of seven players each (six outcourt players and a goalkeeper) pass a ball using their hands with the aim of throwing it into the goal of the other team.
'handball'@en'beach handball'@en0.8959406614303589Beach handball is a team sport where two teams pass and bounce or roll a ball, trying to throw it in the goal of the opposing team.
'handball'@en'Gaelic handball'@en0.8065397143363953Gaelic handball (known in Ireland simply as handball; Irish: liathróid láimhe) is a sport where players hit a ball with a hand or fist against a wall in such a way as to make a shot the opposition cannot return, and that may be played with two (singles) or four players (doubles).
'handball'@en'Tennis polo'@en0.80338454246521Tennis polo (or toccer) is a field sport where two teams of ten players (nine field players and one goalkeeper) use a tennis ball to score goals by throwing the ball into a goal defended by a keeper who holds a racket.
'handball'@en'Balonpesado'@en0.7840337157249451The balonpesado is a team sport, devised for both open field as closed, in which two sets of five players each try to score goals within circles drawn on the ground of each end of the field.
'handball'@en'Harrison Hoist'@en0.776003897190094The Harrison Hoist, also known as the Chairlift, is a form of goaltending in netball where one defender lifts another defender, rugby union lineout-style, in order to catch the ball and prevent a goal scoring opportunity.
'handball'@en'ball boy'@en0.7694556713104248Ball boys and ball girls, also known as ball kids are individuals, usually human youths but sometimes dogs, who retrieve and supply balls for players or officials in sports such as association football, American football, bandy, cricket, tennis, baseball and basketball.
'handball'@en'dodgeball'@en0.7659051418304443Dodgeball is a team sport in which players on two teams try to throw balls and hit opponents, while avoiding being hit themselves.
'handball'@en'sepak takraw'@en0.7623462677001953Sepak takraw, or Sepaktakraw, also called kick volleyball, is a team sport played with a ball made of rattan or synthetic plastic between two teams of two to four players on a court resembling a badminton court.
'handball'@en'Guts'@en0.7574184536933899Guts or disc guts (sometimes guts Frisbee in reference to the trademarked brand name) is a disc game inspired by dodgeball, involving teams throwing a flying disc (rather than balls) at members of the opposing team.
" ], "text/plain": [ "" @@ -4079,7 +4095,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 24, "id": "57ed28f9", "metadata": { "lines_to_next_cell": 2 @@ -4088,7 +4104,7 @@ { "data": { "text/html": [ - "
xlabelylabelsimysent
'journalist'@en'journalist'@en0.9999999403953552A journalist is an individual that collects/gathers information in form of text, audio, or pictures, processes them into a news-worthy form, and disseminates it to the public.
'journalist'@en'technology journalism'@en0.8715080618858337Technology journalism is the activity, or product, of journalists engaged in the preparation of written, visual, audio or multi-media material intended for dissemination through public media, focusing on technology-related subjects.
'journalist'@en'Information subsidy'@en0.8664877414703369An information subsidy is the provision of ready-to-use newsworthy information to the news media by various sources interested in gaining access to media time and space.
'journalist'@en'media relations'@en0.8634818196296692Media Relations involves working with media for the purpose of informing the public of an organization\\'s mission, policies and practices in a positive, consistent and credible manner.
'journalist'@en'journalism'@en0.8631913065910339Journalism is the production and distribution of reports on the interaction of events, facts, ideas, and people that are the \\"news of the day\\" and that informs society to at least some degree.
'journalist'@en'Mediated deliberation'@en0.851024329662323Mediated deliberation is a form of deliberation that is achieved through the media which acts as a mediator between the mass public and elected officials.
'journalist'@en'news conference'@en0.8386815190315247A press conference or news conference is a media event in which notable individuals or organizations invite journalists to hear them speak and ask questions.
'journalist'@en'Media pilgrimage'@en0.8380250930786133A media pilgrimage refers to visits made to the sites mentioned in popular media.
'journalist'@en'press kit'@en0.836276650428772A press kit, often referred to as a media kit in business environments, is a pre-packaged set of promotional materials that provide information about a person, company, organization or cause and which is distributed to members of the media for promotional use.
'journalist'@en'multimedia journalism'@en0.8353787064552307Multimedia journalism is the practice of contemporary journalism that distributes news content either using two or more media formats via the Internet, or disseminating news report via multiple media platforms.
" + "
xlabelylabelsimysent
'journalist'@en'journalist'@en0.9999999403953552A journalist is an individual that collects/gathers information in form of text, audio, or pictures, processes them into a news-worthy form, and disseminates it to the public.
'journalist'@en'journalism'@en0.8947291374206543Journalism is the production and distribution of reports on the interaction of events, facts, ideas, and people that are the \\"news of the day\\" and that informs society to at least some degree.
'journalist'@en'news analyst'@en0.8600048422813416A news analyst examines, analyses and interprets broadcast news received from various sources.
'journalist'@en'outline of journalism'@en0.8528070449829102The following outline is provided as an overview of and topical guide to journalism: Journalism – investigation and reporting of events, issues and trends to a broad audience.
'journalist'@en'Public editor'@en0.8508998155593872A public editor is a position existing at some news publications; the person holding this position is responsible for supervising the implementation of proper journalism ethics at that publication.
'journalist'@en'Index of journalism articles'@en0.831515371799469Articles related to the field of journalism include:
'journalist'@en'source'@en0.8213073015213013In journalism, a source is a person, publication, or knowledge other record or document that gives timely information.
'journalist'@en'journalism ethics and standards'@en0.8076792359352112Journalistic ethics and standards comprise principles of ethics and good practice applicable to journalists.
'journalist'@en'Assignment editor'@en0.8065866231918335In journalism, an assignment editor is an editor – either at a newspaper or a radio or television station – who selects, develops, and plans reporting assignments, either news events or feature stories, to be covered by reporters.
'journalist'@en'Information subsidy'@en0.8030204176902771An information subsidy is the provision of ready-to-use newsworthy information to the news media by various sources interested in gaining access to media time and space.
" ], "text/plain": [ "" @@ -4378,14 +4394,14 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 25, "id": "5033d596", "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
xlabelylabelsimysent
'head of state'@en'head of state'@en1.0A head of state (or chief of state) is the public persona who officially embodies a state in its unity and legitimacy.
'head of state'@en'state religion'@en0.9016953706741333A state religion (also called an established religion or official religion) is a religion or creed officially endorsed by a sovereign state.
'head of state'@en'nation state'@en0.8929967284202576A nation state is a political unit where the state and nation are congruent.
'head of state'@en'Iman, Ittihad, Nazm'@en0.8915225863456726Faith, Unity, Discipline (Urdu: ایمان، اتحاد، نظم) is the national motto of Pakistan.
'head of state'@en'Freedom and Unity'@en0.890255331993103\\"Freedom and Unity\\" is the official motto of the U.S. state of Vermont.
'head of state'@en'Ukrainian nationalism'@en0.8886831402778625Ukrainian nationalism refers to the promotion of the unity of Ukrainians and the titular Ukraine nation state (and in a modern sense, also the \\"people of Ukraine\\" in a constitutionally mandated \\"territorial-civic\\" sense), as well as nation building as a means of strengthening and protecting state sovereignty within the international system of states.
'head of state'@en'Department for Constitutional Affairs'@en0.8855394124984741The Department for Constitutional Affairs (DCA) was a United Kingdom government department.
'head of state'@en'Most Excellent Majesty'@en0.8850839734077454Most Excellent Majesty is a form of address in the United Kingdom.
'head of state'@en'Official culture'@en0.8842212557792664Official culture is the culture that receives social legitimation or institutional support in a given society.
'head of state'@en'National Enterprise Board'@en0.8837820887565613The National Enterprise Board (NEB) was a United Kingdom government body.
" + "
xlabelylabelsimysent
'head of state'@en'head of state'@en1.0000001192092896A head of state (or chief of state) is the public persona who officially embodies a state in its unity and legitimacy.
'head of state'@en'Justification for the state'@en0.8505247831344604The justification of the state refers to the source of legitimate authority for the state or government.
'head of state'@en'Seal of Tamil Nadu'@en0.8389069437980652The Emblem of Tamil Nadu is the official state emblem of Tamil Nadu and is used as the official state symbol of the Government of Tamil Nadu.
'head of state'@en'Head of Kalmykia'@en0.8310245275497437The Head of Kalmykia is an elected official who serves as the head of state of Kalmykia.
'head of state'@en'Prime Minister of the United Kingdom'@en0.8299723863601685The prime minister of the United Kingdom is the head of government of the United Kingdom.
'head of state'@en'Governor of Kaduna State'@en0.8240423202514648The Kaduna State Governor is the head of .The governor leads the executive branch of the Government.This position places its holder in leadership of the state with command authority over the state affairs.
'head of state'@en'Official culture'@en0.8230085372924805Official culture is the culture that receives social legitimation or institutional support in a given society.
'head of state'@en'President of Tanzania'@en0.8226332664489746The President of the United Republic of Tanzania (Swahili: Rais wa Jamhuri ya Muungano wa Tanzania) is the head of state and head of government of the United Republic of Tanzania.
'head of state'@en'Administrator of Ascension Island'@en0.8202604055404663The Administrator of Ascension is the head of government and representative of the Governor of St Helena, Ascension and Tristan da Cunha in Ascension Island.
'head of state'@en'Contributions Agency'@en0.8172905445098877The Contributions Agency was an executive agency of the United Kingdom government.
" ], "text/plain": [ "" diff --git a/examples/kypherv-similarity-queries.py b/examples/kypherv-similarity-queries.py index aed3a61d3..376ef85f0 100644 --- a/examples/kypherv-similarity-queries.py +++ b/examples/kypherv-similarity-queries.py @@ -57,7 +57,7 @@ def show_html(img_width=150): # %env MAIN={DB}/wikidata-20221102-dwd-v8-main.sqlite3.db # %env COMPLEX={DB}/wikidata-20221102-dwd-v8-complex-embeddings.sqlite3.db # %env TRANSE={DB}/wikidata-20221102-dwd-v8-transe-embeddings.sqlite3.db -# %env ABSTRACT={DB}/wikidata-20221102-dwd-v8-abstract-embeddings.sqlite3.db +# %env ABSTRACT={DB}/wikidata-20221102-dwd-v8-abstract-embeddings-large.sqlite3.db # %env IMAGE={DB}/wikimedia-capcom-image-embeddings-v2.sqlite3.db # If you copied the graph caches and their associated `.faiss.idx` ANNS index files @@ -118,7 +118,9 @@ def show_html(img_width=150): # The `ABSTRACT` graph cache contains the sentences and embedding vectors # generated from the first sentences of Wikipedia short abstracts. It -# contains about 6M 768-D Roberta base vectors: +# contains about 6M 1024-D Roberta large vectors (**Note**: these are different +# embeddings than the ones used and reported on in the 2022 Wikidata Workshop paper, +# therefore, the query results in this notebook are somewhat different): # !kgtk query --gc $ABSTRACT --sc @@ -196,7 +198,7 @@ def show_html(img_width=150): # If we used the same brute-force search from above on this much larger set, -# it would take about 5 min to run (which is why this command is disabled): +# it would take about 2 min to run (which is why this command is disabled): # !time DISABLED kgtk query --gc $MAIN \ # --ac $ABSTRACT \ @@ -211,16 +213,16 @@ def show_html(img_width=150): # ``` # xlabel ylabel sim # 'Socrates'@en 'Socrates'@en 1.0000001192092896 -# 'Socrates'@en 'Anytus'@en 0.9346579909324646 -# 'Socrates'@en 'Heraclitus'@en 0.9344534277915955 -# 'Socrates'@en 'Hippocrates'@en 0.9304061532020569 -# 'Socrates'@en 'Cleisthenes'@en 0.9292828440666199 
-# 'Socrates'@en 'Aristides'@en 0.9283562898635864 -# 'Socrates'@en 'Yannis Xirotiris'@en 0.926308274269104 -# 'Socrates'@en 'Sotiris Trivizas'@en 0.9255445003509521 -# 'Socrates'@en 'Aris Maragkopoulos'@en 0.9234243035316467 -# 'Socrates'@en 'Valerios Stais'@en 0.919943630695343 -# 93.859u 38.640s 4:49.84 45.7% 0+0k 18782808+8io 0pf+0w +# 'Socrates'@en 'Adamantios Korais'@en 0.8731658458709717 +# 'Socrates'@en 'Prodicus'@en 0.872790515422821 +# 'Socrates'@en 'Protagoras'@en 0.8702158331871033 +# 'Socrates'@en 'Manuel Chrysoloras'@en 0.8680326342582703 +# 'Socrates'@en 'Cebes'@en 0.8670117259025574 +# 'Socrates'@en 'Pyrrho'@en 0.8662737011909485 +# 'Socrates'@en 'Menedemus'@en 0.8632197380065918 +# 'Socrates'@en 'Epicurus'@en 0.8617314696311951 +# 'Socrates'@en 'Xenophon'@en 0.8607585430145264 +# 52.997u 15.548s 1:50.53 62.0% 0+0k 19477248+136io 0pf+0w # ``` # @@ -262,8 +264,8 @@ def show_html(img_width=150): """) -# For comparison, here is a run without dynamic scaling which returns much fewer results, since -# only a small number of the top-5 similar results for each input also satisfy the post conditions: +# For comparison, here is a run without dynamic scaling which returns fewer results, since +# not all of the top-5 similar results for each input also satisfy the post conditions: kgtk(""" query --gc $MAIN --ac $ABSTRACT @@ -417,11 +419,11 @@ def show_html(img_width=150): # sed -e 's/^ *//' | \ # kgtk cat --no-input-header --input-column-names node1 node2 --implied-label sentence \ # / add-id \ -# / text-embedding -i - --model roberta-base-nli-mean-tokens \ +# / text-embedding -i - --model roberta-large-nli-mean-tokens \ # --output-data-format kgtk --output-property emb -o - \ # / query -i - --idx vector:node2 --as text_emb_queries --match '(x)' --return x -# The above created 768-D text embedding vector for three short queries +# The above created 1024-D text embedding vector for three short queries # using the same text embedding type as used in our 
`ABSTRACT` embeddings. # Now we find Wikidata QNodes whose short-abstract embedding vector is most similar # to the queries, and that satisfy any additional conditions we might have. From cbb9f1e551144b5b7b4c135615aef127b60bb9cc Mon Sep 17 00:00:00 2001 From: saggu Date: Thu, 9 Mar 2023 10:16:40 -0800 Subject: [PATCH 5/5] freeze sh package version --- kgtk/__init__.py | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kgtk/__init__.py b/kgtk/__init__.py index c3b384154..0bb84ff29 100644 --- a/kgtk/__init__.py +++ b/kgtk/__init__.py @@ -1 +1 @@ -__version__ = '1.5.2' +__version__ = '1.5.3' diff --git a/requirements.txt b/requirements.txt index 35f89a842..e0aae13e0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -sh>=1.13 +sh==1.14.3 attrs lz4 iso-639