Merge pull request #1 from schmidek/baseline
Add baseline code from EMNLP2019 paper
schmidek authored Oct 25, 2019
2 parents 6007819 + 6b077e8 commit 6f4e39c
Showing 25 changed files with 2,167 additions and 0 deletions.
9 changes: 9 additions & 0 deletions baselines/EMNLP2019/.gitignore
@@ -0,0 +1,9 @@
__pycache__
models
data
run
.vscode
.idea
*.db
*.vocab
tmp
63 changes: 63 additions & 0 deletions baselines/EMNLP2019/README.md
@@ -0,0 +1,63 @@
## Requirements
Python 3.7
```
pip install -r requirements.txt
python -m spacy download en_core_web_lg
```

Set the environment variable `DIFFBOT_TOKEN` if you want to use entity linking. We provide cached entity-linking results for the KnowledgeNet documents, but you will need a token to run the system on other documents or to change the NER system. Contact Filipe Mesquita (filipe[at]diffbot.com) for a free research token.
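
The baseline only calls the Diffbot API when a document is not already in its local cache, and `entitylinker.py` raises an error if the token is missing at that point. A minimal sketch of that check (the helper below is illustrative, not part of the baseline code):

```
import os

# Illustrative helper mirroring the check in entitylinker.py: the API is only
# contacted on a cache miss, and then DIFFBOT_TOKEN must be set.
def require_diffbot_token():
    token = os.environ.get("DIFFBOT_TOKEN")
    if token is None:
        raise RuntimeError("Set DIFFBOT_TOKEN to link entities in uncached documents")
    return token
```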

## Using the pretrained model

Get the release from https://github.com/diffbot/knowledge-net/releases, which includes the pretrained baseline 5 model, the vocabulary, and the entity-linking and Wikidata caches.

Run on a single document:

`echo "Butler W. Lampson (born December 23, 1943) is an American computer scientist contributing to the development and implementation of distributed, personal computing. He is a Technical Fellow at Microsoft and an Adjunct Professor at MIT." | python run.py`

Output:
```
Butler W. Lampson| DATE_OF_BIRTH(0.99) December 23, 1943|
Butler W. Lampson|http://www.wikidata.org/entity/Q92644 NATIONALITY(0.88) American|http://www.wikidata.org/entity/Q30
Microsoft|http://www.wikidata.org/entity/Q2283 CEO(0.63) He|http://www.wikidata.org/entity/Q92644
He|http://www.wikidata.org/entity/Q92644 EMPLOYEE_OR_MEMBER_OF(0.88) Microsoft|http://www.wikidata.org/entity/Q2283
He|http://www.wikidata.org/entity/Q92644 EMPLOYEE_OR_MEMBER_OF(0.93) MIT|http://www.wikidata.org/entity/Q49108
He| DATE_OF_BIRTH(1.00) December 23, 1943|
He|http://www.wikidata.org/entity/Q92644 NATIONALITY(0.86) American|http://www.wikidata.org/entity/Q30
Butler W. Lampson|http://www.wikidata.org/entity/Q92644 EMPLOYEE_OR_MEMBER_OF(0.56) Microsoft|http://www.wikidata.org/entity/Q2283
Butler W. Lampson|http://www.wikidata.org/entity/Q92644 EMPLOYEE_OR_MEMBER_OF(0.69) MIT|http://www.wikidata.org/entity/Q49108
```
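
Each output line follows the pattern `subject|subject_uri RELATION(confidence) object|object_uri`, with the URI left empty when no Wikidata entity was linked. A minimal parsing sketch, assuming that format (the `parse_prediction` helper and its regex are illustrative, inferred from the sample above, and not part of `run.py`):

```
import re

# Assumed line format: "subject|subject_uri RELATION(confidence) object|object_uri"
LINE_RE = re.compile(r"^(.*?)\|(\S*)\s+([A-Z_]+)\(([\d.]+)\)\s+(.*?)\|(\S*)$")

def parse_prediction(line):
    """Parse one line of run.py output into a dict (illustrative helper)."""
    match = LINE_RE.match(line.strip())
    if match is None:
        return None
    subj, subj_uri, relation, confidence, obj, obj_uri = match.groups()
    return {
        "subject": subj, "subject_uri": subj_uri or None,
        "relation": relation, "confidence": float(confidence),
        "object": obj, "object_uri": obj_uri or None,
    }

print(parse_prediction("He|http://www.wikidata.org/entity/Q92644 EMPLOYEE_OR_MEMBER_OF(0.93) MIT|http://www.wikidata.org/entity/Q49108"))
```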

## Evaluating

`python evaluate.py [test or dev]`

This creates the analysis files in `tmp` and, when run on `dev`, prints the results. To preserve the integrity of the results, we have released the test set without annotations. See https://github.com/diffbot/knowledge-net#adding-a-system-to-the-leaderboard for more details.

With the pretrained baseline 5 model you should get results similar to the following on the dev set:
```
Evaluation Precision Recall F1
span_overlap 0.718 0.691 0.704
span_exact 0.620 0.599 0.609
uri 0.557 0.472 0.511
```
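
The evaluation reports precision, recall, and F1 for three matching criteria (span overlap, exact span, and Wikidata URI). F1 is the harmonic mean of precision and recall, so the rows above can be sanity-checked with a short helper (illustrative, not part of `evaluate.py`):

```
def f1(precision, recall):
    """Harmonic mean of precision and recall."""
    return 0.0 if precision + recall == 0 else 2 * precision * recall / (precision + recall)

print(round(f1(0.557, 0.472), 3))  # uri row above: 0.511
```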

## Training

Choose which model you would like to train in `config.py`.

Warning: baseline 5 requires ~300GB of disk space to train. The others require much less.

`./train.sh`

## Troubleshooting

`spacy.strings.StringStore size changed` error

If you see an error mentioning `spacy.strings.StringStore size changed, may indicate binary incompatibility` when loading NeuralCoref with `import neuralcoref`, you will need to install NeuralCoref from source instead of from the prebuilt wheels, so that it builds against the version of spaCy installed on your system.

In this case, simply re-install neuralcoref as follows:

`pip uninstall neuralcoref`

`pip install neuralcoref --no-binary neuralcoref`
126 changes: 126 additions & 0 deletions baselines/EMNLP2019/bert_wrapper.py
@@ -0,0 +1,126 @@

from spacy.tokens import Token
import tensorflow as tf
import tensorflow_hub as hub
from bert.tokenization import FullTokenizer
import numpy as np
import logging

Token.set_extension("bert_vector", default=[])
Token.set_extension("bert_layers", default=[])

# This is a path to a cased version of BERT
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1"

def create_tokenizer_from_hub_module():
    """Get the vocab file and casing info from the Hub module."""
    with tf.Graph().as_default():
        bert_module = hub.Module(BERT_MODEL_HUB)
        tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
        with tf.Session() as sess:
            vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],
                                                  tokenization_info["do_lower_case"]])

    return FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)

tokenizer = create_tokenizer_from_hub_module()

LAYERS_TO_USE = 12

bert_outputs_to_use = []
g = tf.Graph()
with g.as_default():
    bert_module = hub.Module(BERT_MODEL_HUB, trainable=False)
    bert_inputs = dict(
        input_ids=tf.placeholder(dtype=tf.int32, shape=[None,None], name="input_ids"),
        input_mask=tf.placeholder(dtype=tf.int32, shape=[None,None], name="input_mask"),
        segment_ids=tf.placeholder(dtype=tf.int32, shape=[None,None], name="segment_ids"))
    bert_outputs = bert_module(bert_inputs, signature="tokens", as_dict=True)
    bert_sequence_output = bert_outputs["sequence_output"]

    def get_intermediate_layer(total_layers, desired_layer):
        intermediate_layer_name = bert_sequence_output.name.replace(str(total_layers + 1),
                                                                    str(desired_layer + 1))
        logging.debug("Intermediate layer name: %s", intermediate_layer_name)
        return g.get_tensor_by_name(intermediate_layer_name)

    for i in range(LAYERS_TO_USE):
        bert_outputs_to_use.append(get_intermediate_layer(12, 12-i))
    init_op = tf.group([tf.global_variables_initializer(), tf.tables_initializer()])
g.finalize()

sess = tf.compat.v1.Session(graph=g)
sess.run(init_op)

def run(doc):

    max_seq_length = 2

    batch_input_ids = []
    batch_input_mask = []
    batch_segment_ids = []
    batch_orig_to_tok_map = []

    for sentence_index, sentence in enumerate(doc.sents):
        bert_tokens = []
        orig_to_tok_map = []
        segment_ids = [] # sentence index, 1 sentence per so always 0 right now

        bert_tokens.append("[CLS]")
        segment_ids.append(0)
        for word in sentence:
            orig_to_tok_map.append(len(bert_tokens))
            ts = tokenizer.tokenize(word.text)
            if len(ts) == 0:
                logging.debug("Token has no bert tokens: %s", word.text)
            for t in ts:
                bert_tokens.append(t)
                segment_ids.append(0)
        bert_tokens.append("[SEP]")
        segment_ids.append(0)
        orig_to_tok_map.append(len(bert_tokens))

        if len(bert_tokens) > max_seq_length:
            max_seq_length = len(bert_tokens)

        input_ids = tokenizer.convert_tokens_to_ids(bert_tokens)
        input_mask = [1] * len(input_ids)

        batch_input_ids.append(input_ids)
        batch_input_mask.append(input_mask)
        batch_orig_to_tok_map.append(orig_to_tok_map)
        batch_segment_ids.append(segment_ids)

    # Zero-pad up to the max sequence length.
    for sentence_index, sentence in enumerate(doc.sents):
        while len(batch_input_ids[sentence_index]) < max_seq_length:
            batch_input_ids[sentence_index].append(0)
            batch_input_mask[sentence_index].append(0)
            batch_segment_ids[sentence_index].append(0)

    outputs = sess.run(bert_outputs_to_use, feed_dict={
        bert_inputs["input_ids"]: batch_input_ids,
        bert_inputs["input_mask"]: batch_input_mask,
        bert_inputs["segment_ids"]: batch_segment_ids
    })

    # translate back from subtokens to spacy tokens and store vectors
    for sentence_index, sentence in enumerate(doc.sents):
        for word_index, word in enumerate(sentence):
            bert_vecs = []
            for output in outputs:
                vecs = output[sentence_index][batch_orig_to_tok_map[sentence_index][word_index]:batch_orig_to_tok_map[sentence_index][word_index+1]]
                if len(vecs) == 0:
                    #print("Error no output for word", len(output[sentence_index]), batch_orig_to_tok_map[sentence_index][word_index], batch_orig_to_tok_map[sentence_index][word_index+1])
                    bert_vecs.append([0]*768)
                else:
                    bert_vecs.append(np.average(vecs, axis=0))
            word._.bert_vector = np.concatenate(bert_vecs)
            word._.bert_layers = bert_vecs

# import spacy
# nlp = spacy.load('en_core_web_sm')
# doc = nlp("Mike Tung's the CEO of Diffbot.")
# run(doc)

# for word in doc:
# print(word._.bert_vector)
19 changes: 19 additions & 0 deletions baselines/EMNLP2019/config.py
@@ -0,0 +1,19 @@
#MODEL = "simple_pipeline"
#MODEL = "pipeline_without_global"
#MODEL = "best_pipeline"
#MODEL = "ours"
MODEL = "bert"

NUMBER_URI_CANDIDATES = 1 if MODEL == "ours" else 1
NUMBER_URI_CANDIDATES_TO_CONSIDER = 1
URI_THRESHOLD = 0.0
SOFT_COREF_CANDIDATES = MODEL == "ours" or MODEL == "bert"
MULTITASK = True
CANDIDATE_RECALL = False

USE_ENTITY_LINKER = True
USE_BERT = MODEL == "bert"

MODELS_DIR = "models"

KNOWLEDGE_NET_DIR = "../../"
24 changes: 24 additions & 0 deletions baselines/EMNLP2019/data.yml
@@ -0,0 +1,24 @@
model_dir: run/

data:
  train_features_file: {data_dir}/train.tfrecord
  train_labels_file: {data_dir}/train.labels
  eval_features_file: {data_dir}/validation.tfrecord
  eval_labels_file: {data_dir}/validation.labels
  labels_vocabulary: labels.txt
  config_file: {data_dir}/header.txt

params:
  optimizer: AdamOptimizer
  learning_rate: 0.001

train:
  batch_size: 128
  save_checkpoints_steps: 200
  train_steps: 2000
  sample_buffer_size: 5000

eval:
  eval_delay: 10
  exporters:
    - best
93 changes: 93 additions & 0 deletions baselines/EMNLP2019/entitylinker.py
@@ -0,0 +1,93 @@
import requests
from requests.adapters import HTTPAdapter
import os
from sqlitedict import SqliteDict
import hashlib
from spacy.tokens import Span
import json
import time
import diffbot_nlapi
import logging

from config import MODEL, NUMBER_URI_CANDIDATES, SOFT_COREF_CANDIDATES

# el_candidate has types, uri, score
Span.set_extension("el_candidates", default=[])
Span.set_extension("uri_candidates", default=[])

db = SqliteDict(os.path.join('tmp','el.db'), autocommit=True)

configuration = diffbot_nlapi.Configuration()
api_instance = diffbot_nlapi.NaturalLanguageApi(diffbot_nlapi.ApiClient(configuration))

def _get_uri_candidates_from_mention_with_score(mention, score):
    return [ { 'types': elc["types"], 'uri': elc["uri"], 'score': (2*score)+elc["score"], 'coref_score':score, 'el_score':elc["score"]} for elc in mention._.el_candidates if SOFT_COREF_CANDIDATES or elc["score"] > 0.5 ]

def _coref_probability_from_score(score):
    # coref scores are not between 0 and 1; clip to [-5.0, 4.5] and rescale to [0, 0.95]
    return (max(-5.0,min(4.5,score)) + 5.0) / 10.0

def _get_uri_candidates(mention):
    ret = _get_uri_candidates_from_mention_with_score(mention, 1.0)
    # Look at all coref clusters and get el_candidates from each;
    # score uri_candidates based on coref and entity linker score
    if MODEL == "best_pipeline" or MODEL == "pipeline_without_global":
        if len(ret) == 0 and mention._.coref_cluster:
            ret = _get_uri_candidates_from_mention_with_score(mention._.coref_cluster.main, 1.0)
    if SOFT_COREF_CANDIDATES:
        if mention._.coref_scores: #TODO we might want to look at overlapping mentions here?
            for coref_mention, score in mention._.coref_scores.items():
                normalized_score = _coref_probability_from_score(score)
                ret.extend(_get_uri_candidates_from_mention_with_score(coref_mention, normalized_score))
                for coref_mention_inner, score_inner in coref_mention._.coref_scores.items():
                    ret.extend(_get_uri_candidates_from_mention_with_score(coref_mention_inner, normalized_score * _coref_probability_from_score(score_inner)))

    # sort and keep top n
    ret.sort(key=lambda x:x['score'], reverse=True)
    return ret[:NUMBER_URI_CANDIDATES]

def link(doc, mentions):
    offsets = []
    mentionDict = dict()
    for mention in mentions:
        if mention._.is_pronoun:
            continue
        offsets.append((mention.start_char, mention.end_char))
        mention_id = str(mention.start_char) + "-" + str(mention.end_char)
        mentionDict[mention_id] = mention
    mentions_str = str(sorted([ (ent.start_char, ent.end_char) for ent in mentions ]))
    cache_key = hashlib.md5((doc.text + mentions_str).encode()).hexdigest()
    el_response = db.get(cache_key, None)
    if el_response is None:
        el_response = _link(doc, offsets)
        db[cache_key] = el_response
    if 'mentions' not in el_response:
        logging.warning("No mentions returned for %s", doc.text.replace('\n', '.'))
        return
    for mention in el_response['mentions']:
        el_candidates = []
        if "entityCandidates" in mention:
            for scored_candidate in mention['entityCandidates']:
                uri = next((u for u in scored_candidate['allUris'] if "wikidata.org" in u), None) if 'allUris' in scored_candidate else None
                types = list(map(lambda x: x["name"], scored_candidate['allTypes'])) if 'allTypes' in scored_candidate else []
                if uri:
                    el_candidates.append({'types': types, 'uri': uri, 'score': scored_candidate['confidence']})

        chunk_id = str(mention['beginOffset']) + "-" + str(mention['endOffset'])
        if chunk_id in mentionDict:
            mentionDict[chunk_id]._.el_candidates = el_candidates
    for mention in mentions:
        mention._.uri_candidates = _get_uri_candidates(mention)

def _link(doc, offsets):
    if 'DIFFBOT_TOKEN' not in os.environ:
        raise Exception("Must define environment variable DIFFBOT_TOKEN")
    DIFFBOT_TOKEN = os.environ['DIFFBOT_TOKEN']

    documents = [
        diffbot_nlapi.Document(content=doc.text.replace('\n', '.'),
                               mentions=[diffbot_nlapi.Span(begin_offset=b, end_offset=e) for (b,e) in offsets])
    ]

    api_response = api_instance.v1_post(documents, DIFFBOT_TOKEN, fields=["mentions"])
    return api_response[0]

