Merge pull request #1 from schmidek/baseline
Add baseline code from EMNLP2019 paper
schmidek authored Oct 25, 2019
2 parents 6007819 + 6b077e8 commit 6f4e39c
Showing 25 changed files with 2,167 additions and 0 deletions.
9 changes: 9 additions & 0 deletions baselines/EMNLP2019/.gitignore
@@ -0,0 +1,9 @@
__pycache__
models
data
run
.vscode
.idea
*.db
*.vocab
tmp
63 changes: 63 additions & 0 deletions baselines/EMNLP2019/README.md
@@ -0,0 +1,63 @@
## Requirements
Python 3.7
```
pip install -r requirements.txt
python -m spacy download en_core_web_lg
```

Set the environment variable `DIFFBOT_TOKEN` if you want to use entity linking. We provide cached entity-linking results for the KnowledgeNet documents, but you will need a token to run the system on other documents or to change the NER system. Contact Filipe Mesquita (filipe[at]diffbot.com) for a free research token.
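
The baseline only calls the Diffbot API when a document is not already in its local cache, and `entitylinker.py` raises an error if the token is missing at that point. A minimal sketch of that check (the helper below is illustrative, not part of the baseline code):

```
import os

# Illustrative helper mirroring the check in entitylinker.py: the API is only
# contacted on a cache miss, and then DIFFBOT_TOKEN must be set.
def require_diffbot_token():
    token = os.environ.get("DIFFBOT_TOKEN")
    if token is None:
        raise RuntimeError("Set DIFFBOT_TOKEN to link entities in uncached documents")
    return token
```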

## Using the pretrained model

Get the release from https://github.com/diffbot/knowledge-net/releases, which includes the pretrained baseline 5 model, the vocabulary, and the entity-linking and Wikidata caches.

Run on a single document:

`echo "Butler W. Lampson (born December 23, 1943) is an American computer scientist contributing to the development and implementation of distributed, personal computing. He is a Technical Fellow at Microsoft and an Adjunct Professor at MIT." | python run.py`

Output:
```
Butler W. Lampson| DATE_OF_BIRTH(0.99) December 23, 1943|
Butler W. Lampson|http://www.wikidata.org/entity/Q92644 NATIONALITY(0.88) American|http://www.wikidata.org/entity/Q30
Microsoft|http://www.wikidata.org/entity/Q2283 CEO(0.63) He|http://www.wikidata.org/entity/Q92644
He|http://www.wikidata.org/entity/Q92644 EMPLOYEE_OR_MEMBER_OF(0.88) Microsoft|http://www.wikidata.org/entity/Q2283
He|http://www.wikidata.org/entity/Q92644 EMPLOYEE_OR_MEMBER_OF(0.93) MIT|http://www.wikidata.org/entity/Q49108
He| DATE_OF_BIRTH(1.00) December 23, 1943|
He|http://www.wikidata.org/entity/Q92644 NATIONALITY(0.86) American|http://www.wikidata.org/entity/Q30
Butler W. Lampson|http://www.wikidata.org/entity/Q92644 EMPLOYEE_OR_MEMBER_OF(0.56) Microsoft|http://www.wikidata.org/entity/Q2283
Butler W. Lampson|http://www.wikidata.org/entity/Q92644 EMPLOYEE_OR_MEMBER_OF(0.69) MIT|http://www.wikidata.org/entity/Q49108
```
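
Each output line follows the pattern `subject|subject_uri RELATION(confidence) object|object_uri`, with the URI left empty when no Wikidata entity was linked. A minimal parsing sketch, assuming that format (the `parse_prediction` helper and its regex are illustrative, inferred from the sample above, and not part of `run.py`):

```
import re

# Assumed line format: "subject|subject_uri RELATION(confidence) object|object_uri"
LINE_RE = re.compile(r"^(.*?)\|(\S*)\s+([A-Z_]+)\(([\d.]+)\)\s+(.*?)\|(\S*)$")

def parse_prediction(line):
    """Parse one line of run.py output into a dict (illustrative helper)."""
    match = LINE_RE.match(line.strip())
    if match is None:
        return None
    subj, subj_uri, relation, confidence, obj, obj_uri = match.groups()
    return {
        "subject": subj, "subject_uri": subj_uri or None,
        "relation": relation, "confidence": float(confidence),
        "object": obj, "object_uri": obj_uri or None,
    }

print(parse_prediction("He|http://www.wikidata.org/entity/Q92644 EMPLOYEE_OR_MEMBER_OF(0.93) MIT|http://www.wikidata.org/entity/Q49108"))
```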

## Evaluating

`python evaluate.py [test or dev]`

This creates the analysis files in `tmp` and, when run on `dev`, prints the results. To preserve the integrity of the results, we have released the test set without annotations. See https://github.com/diffbot/knowledge-net#adding-a-system-to-the-leaderboard for more details.

With the pretrained baseline 5 model you should get results similar to the following on the dev set:
```
Evaluation Precision Recall F1
span_overlap 0.718 0.691 0.704
span_exact 0.620 0.599 0.609
uri 0.557 0.472 0.511
```
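
The evaluation reports precision, recall, and F1 for three matching criteria (span overlap, exact span, and Wikidata URI). F1 is the harmonic mean of precision and recall, so the rows above can be sanity-checked with a short helper (illustrative, not part of `evaluate.py`):

```
def f1(precision, recall):
    """Harmonic mean of precision and recall."""
    return 0.0 if precision + recall == 0 else 2 * precision * recall / (precision + recall)

print(round(f1(0.557, 0.472), 3))  # uri row above: 0.511
```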

## Training

Choose which model you would like to train in `config.py`.

Warning: baseline 5 requires ~300GB of disk space to train. The others require much less.

`./train.sh`

## Troubleshooting

`spacy.strings.StringStore size changed` error

If you see an error mentioning `spacy.strings.StringStore size changed, may indicate binary incompatibility` when loading NeuralCoref with `import neuralcoref`, you will need to install NeuralCoref from source instead of from the prebuilt wheels, so that it builds against the version of spaCy installed on your system.

In this case, simply re-install neuralcoref as follows:

`pip uninstall neuralcoref`

`pip install neuralcoref --no-binary neuralcoref`
126 changes: 126 additions & 0 deletions baselines/EMNLP2019/bert_wrapper.py
@@ -0,0 +1,126 @@

from spacy.tokens import Token
import tensorflow as tf
import tensorflow_hub as hub
from bert.tokenization import FullTokenizer
import numpy as np
import logging

Token.set_extension("bert_vector", default=[])
Token.set_extension("bert_layers", default=[])

# This is a path to a cased version of BERT
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1"

def create_tokenizer_from_hub_module():
    """Get the vocab file and casing info from the Hub module."""
    with tf.Graph().as_default():
        bert_module = hub.Module(BERT_MODEL_HUB)
        tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
        with tf.Session() as sess:
            vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],
                                                  tokenization_info["do_lower_case"]])

    return FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)

tokenizer = create_tokenizer_from_hub_module()

LAYERS_TO_USE = 12

bert_outputs_to_use = []
g = tf.Graph()
with g.as_default():
    bert_module = hub.Module(BERT_MODEL_HUB, trainable=False)
    bert_inputs = dict(
        input_ids=tf.placeholder(dtype=tf.int32, shape=[None,None], name="input_ids"),
        input_mask=tf.placeholder(dtype=tf.int32, shape=[None,None], name="input_mask"),
        segment_ids=tf.placeholder(dtype=tf.int32, shape=[None,None], name="segment_ids"))
    bert_outputs = bert_module(bert_inputs, signature="tokens", as_dict=True)
    bert_sequence_output = bert_outputs["sequence_output"]

    def get_intermediate_layer(total_layers, desired_layer):
        intermediate_layer_name = bert_sequence_output.name.replace(str(total_layers + 1),
                                                                    str(desired_layer + 1))
        logging.debug("Intermediate layer name: %s", intermediate_layer_name)
        return g.get_tensor_by_name(intermediate_layer_name)

    for i in range(LAYERS_TO_USE):
        bert_outputs_to_use.append(get_intermediate_layer(12, 12-i))
    init_op = tf.group([tf.global_variables_initializer(), tf.tables_initializer()])
g.finalize()

sess = tf.compat.v1.Session(graph=g)
sess.run(init_op)

def run(doc):

    max_seq_length = 2

    batch_input_ids = []
    batch_input_mask = []
    batch_segment_ids = []
    batch_orig_to_tok_map = []

    for sentence_index, sentence in enumerate(doc.sents):
        bert_tokens = []
        orig_to_tok_map = []
        segment_ids = [] # sentence index, 1 sentence per so always 0 right now

        bert_tokens.append("[CLS]")
        segment_ids.append(0)
        for word in sentence:
            orig_to_tok_map.append(len(bert_tokens))
            ts = tokenizer.tokenize(word.text)
            if len(ts) == 0:
                logging.debug("Token has no bert tokens: %s", word.text)
            for t in ts:
                bert_tokens.append(t)
                segment_ids.append(0)
        bert_tokens.append("[SEP]")
        segment_ids.append(0)
        orig_to_tok_map.append(len(bert_tokens))

        if len(bert_tokens) > max_seq_length:
            max_seq_length = len(bert_tokens)

        input_ids = tokenizer.convert_tokens_to_ids(bert_tokens)
        input_mask = [1] * len(input_ids)

        batch_input_ids.append(input_ids)
        batch_input_mask.append(input_mask)
        batch_orig_to_tok_map.append(orig_to_tok_map)
        batch_segment_ids.append(segment_ids)

    # Zero-pad up to the max sequence length.
    for sentence_index, sentence in enumerate(doc.sents):
        while len(batch_input_ids[sentence_index]) < max_seq_length:
            batch_input_ids[sentence_index].append(0)
            batch_input_mask[sentence_index].append(0)
            batch_segment_ids[sentence_index].append(0)

    outputs = sess.run(bert_outputs_to_use, feed_dict={
        bert_inputs["input_ids"]: batch_input_ids,
        bert_inputs["input_mask"]: batch_input_mask,
        bert_inputs["segment_ids"]: batch_segment_ids
    })

    # translate back from subtokens to spacy tokens and store vectors
    for sentence_index, sentence in enumerate(doc.sents):
        for word_index, word in enumerate(sentence):
            bert_vecs = []
            for output in outputs:
                vecs = output[sentence_index][batch_orig_to_tok_map[sentence_index][word_index]:batch_orig_to_tok_map[sentence_index][word_index+1]]
                if len(vecs) == 0:
                    #print("Error no output for word", len(output[sentence_index]), batch_orig_to_tok_map[sentence_index][word_index], batch_orig_to_tok_map[sentence_index][word_index+1])
                    bert_vecs.append([0]*768)
                else:
                    bert_vecs.append(np.average(vecs, axis=0))
            word._.bert_vector = np.concatenate(bert_vecs)
            word._.bert_layers = bert_vecs

# import spacy
# nlp = spacy.load('en_core_web_sm')
# doc = nlp("Mike Tung's the CEO of Diffbot.")
# run(doc)

# for word in doc:
# print(word._.bert_vector)
19 changes: 19 additions & 0 deletions baselines/EMNLP2019/config.py
@@ -0,0 +1,19 @@
#MODEL = "simple_pipeline"
#MODEL = "pipeline_without_global"
#MODEL = "best_pipeline"
#MODEL = "ours"
MODEL = "bert"

NUMBER_URI_CANDIDATES = 1 if MODEL == "ours" else 1
NUMBER_URI_CANDIDATES_TO_CONSIDER = 1
URI_THRESHOLD = 0.0
SOFT_COREF_CANDIDATES = MODEL == "ours" or MODEL == "bert"
MULTITASK = True
CANDIDATE_RECALL = False

USE_ENTITY_LINKER = True
USE_BERT = MODEL == "bert"

MODELS_DIR = "models"

KNOWLEDGE_NET_DIR = "../../"
24 changes: 24 additions & 0 deletions baselines/EMNLP2019/data.yml
@@ -0,0 +1,24 @@
model_dir: run/

data:
  train_features_file: {data_dir}/train.tfrecord
  train_labels_file: {data_dir}/train.labels
  eval_features_file: {data_dir}/validation.tfrecord
  eval_labels_file: {data_dir}/validation.labels
  labels_vocabulary: labels.txt
  config_file: {data_dir}/header.txt

params:
  optimizer: AdamOptimizer
  learning_rate: 0.001

train:
  batch_size: 128
  save_checkpoints_steps: 200
  train_steps: 2000
  sample_buffer_size: 5000

eval:
  eval_delay: 10
  exporters:
    - best
93 changes: 93 additions & 0 deletions baselines/EMNLP2019/entitylinker.py
@@ -0,0 +1,93 @@
import requests
from requests.adapters import HTTPAdapter
import os
from sqlitedict import SqliteDict
import hashlib
from spacy.tokens import Span
import json
import time
import diffbot_nlapi
import logging

from config import MODEL, NUMBER_URI_CANDIDATES, SOFT_COREF_CANDIDATES

# el_candidate has types, uri, score
Span.set_extension("el_candidates", default=[])
Span.set_extension("uri_candidates", default=[])

db = SqliteDict(os.path.join('tmp','el.db'), autocommit=True)

configuration = diffbot_nlapi.Configuration()
api_instance = diffbot_nlapi.NaturalLanguageApi(diffbot_nlapi.ApiClient(configuration))

def _get_uri_candidates_from_mention_with_score(mention, score):
    return [ { 'types': elc["types"], 'uri': elc["uri"], 'score': (2*score)+elc["score"], 'coref_score':score, 'el_score':elc["score"]} for elc in mention._.el_candidates if SOFT_COREF_CANDIDATES or elc["score"] > 0.5 ]

def _coref_probability_from_score(score):
    # coref scores are not between 0 and 1; clip to [-5.0, 4.5] and rescale to [0, 0.95]
    return (max(-5.0,min(4.5,score)) + 5.0) / 10.0

def _get_uri_candidates(mention):
    ret = _get_uri_candidates_from_mention_with_score(mention, 1.0)
    # Look at all coref clusters and get el_candidates from each;
    # score uri_candidates based on coref and entity linker score
    if MODEL == "best_pipeline" or MODEL == "pipeline_without_global":
        if len(ret) == 0 and mention._.coref_cluster:
            ret = _get_uri_candidates_from_mention_with_score(mention._.coref_cluster.main, 1.0)
    if SOFT_COREF_CANDIDATES:
        if mention._.coref_scores: #TODO we might want to look at overlapping mentions here?
            for coref_mention, score in mention._.coref_scores.items():
                normalized_score = _coref_probability_from_score(score)
                ret.extend(_get_uri_candidates_from_mention_with_score(coref_mention, normalized_score))
                for coref_mention_inner, score_inner in coref_mention._.coref_scores.items():
                    ret.extend(_get_uri_candidates_from_mention_with_score(coref_mention_inner, normalized_score * _coref_probability_from_score(score_inner)))

    # sort and keep top n
    ret.sort(key=lambda x:x['score'], reverse=True)
    return ret[:NUMBER_URI_CANDIDATES]

def link(doc, mentions):
    offsets = []
    mentionDict = dict()
    for mention in mentions:
        if mention._.is_pronoun:
            continue
        offsets.append((mention.start_char, mention.end_char))
        mention_id = str(mention.start_char) + "-" + str(mention.end_char)
        mentionDict[mention_id] = mention
    mentions_str = str(sorted([ (ent.start_char, ent.end_char) for ent in mentions ]))
    cache_key = hashlib.md5((doc.text + mentions_str).encode()).hexdigest()
    el_response = db.get(cache_key, None)
    if el_response is None:
        el_response = _link(doc, offsets)
        db[cache_key] = el_response
    if 'mentions' not in el_response:
        logging.warning("No mentions returned for %s", doc.text.replace('\n', '.'))
        return
    for mention in el_response['mentions']:
        el_candidates = []
        if "entityCandidates" in mention:
            for scored_candidate in mention['entityCandidates']:
                uri = next((u for u in scored_candidate['allUris'] if "wikidata.org" in u), None) if 'allUris' in scored_candidate else None
                types = list(map(lambda x: x["name"], scored_candidate['allTypes'])) if 'allTypes' in scored_candidate else []
                if uri:
                    el_candidates.append({'types': types, 'uri': uri, 'score': scored_candidate['confidence']})

        chunk_id = str(mention['beginOffset']) + "-" + str(mention['endOffset'])
        if chunk_id in mentionDict:
            mentionDict[chunk_id]._.el_candidates = el_candidates
    for mention in mentions:
        mention._.uri_candidates = _get_uri_candidates(mention)

def _link(doc, offsets):
    if 'DIFFBOT_TOKEN' not in os.environ:
        raise Exception("Must define environment variable DIFFBOT_TOKEN")
    DIFFBOT_TOKEN = os.environ['DIFFBOT_TOKEN']

    documents = [
        diffbot_nlapi.Document(content=doc.text.replace('\n', '.'),
                               mentions=[diffbot_nlapi.Span(begin_offset=b, end_offset=e) for (b,e) in offsets])
    ]

    api_response = api_instance.v1_post(documents, DIFFBOT_TOKEN, fields=["mentions"])
    return api_response[0]

