From 61a33f55d2eec93a335dfecc9c9a5e85c339e00a Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 10 Apr 2019 16:06:09 +0200 Subject: [PATCH 001/148] little fixes --- spacy/kb.pxd | 12 +++++++++--- spacy/kb.pyx | 13 +++++-------- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index e34a0a9ba..e57c162fc 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -13,7 +13,7 @@ from .typedefs cimport hash_t # of bits we need to keep track of the answers. cdef struct _EntryC: - # The hash of this entry's unique ID and name in the kB + # The hash of this entry's unique ID/name in the kB hash_t entity_hash # Allows retrieval of one or more vectors. @@ -99,7 +99,7 @@ cdef class KnowledgeBase: cdef inline int64_t c_add_entity(self, hash_t entity_hash, float prob, int32_t* vector_rows, int feats_row): """Add an entry to the knowledge base.""" - # This is what we'll map the hash key to. It's where the entry will sit + # This is what we'll map the entity hash key to. It's where the entry will sit # in the vector of entries, so we can get it later. cdef int64_t new_index = self._entries.size() self._entries.push_back( @@ -114,6 +114,8 @@ cdef class KnowledgeBase: cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs): """Connect a mention to a list of potential entities with their prior probabilities .""" + # This is what we'll map the alias hash key to. It's where the alias will be defined + # in the vector of aliases. cdef int64_t new_index = self._aliases_table.size() self._aliases_table.push_back( @@ -126,12 +128,14 @@ cdef class KnowledgeBase: cdef inline _create_empty_vectors(self): """ - Making sure the first element of each vector is a dummy, + Initializing the vectors and making sure the first element of each vector is a dummy, because the PreshMap maps pointing to indices in these vectors can not contain 0 as value cf. https://github.com/explosion/preshed/issues/17 """ cdef int32_t dummy_value = 0 self.vocab.strings.add("") + + self._entry_index = PreshMap() self._entries.push_back( _EntryC( entity_hash=self.vocab.strings[""], @@ -139,6 +143,8 @@ cdef class KnowledgeBase: feats_row=dummy_value, prob=dummy_value )) + + self._alias_index = PreshMap() self._aliases_table.push_back( _AliasC( entry_indices=[dummy_value], diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 3a0a8b918..38c393355 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -1,3 +1,4 @@ +# cython: infer_types=True # cython: profile=True # coding: utf8 from spacy.errors import Errors, Warnings, user_warning @@ -19,7 +20,7 @@ cdef class Candidate: @property def entity_(self): """RETURNS (unicode): ID/name of this entity in the KB""" - return self.kb.vocab.strings[self.entity] + return self.kb.vocab.strings[self.entity_hash] @property def alias(self): @@ -29,7 +30,7 @@ cdef class Candidate: @property def alias_(self): """RETURNS (unicode): ID of the original alias""" - return self.kb.vocab.strings[self.alias] + return self.kb.vocab.strings[self.alias_hash] @property def prior_prob(self): @@ -40,8 +41,6 @@ cdef class KnowledgeBase: def __init__(self, Vocab vocab): self.vocab = vocab - self._entry_index = PreshMap() - self._alias_index = PreshMap() self.mem = Pool() self._create_empty_vectors() @@ -56,8 +55,8 @@ cdef class KnowledgeBase: def add_entity(self, unicode entity, float prob=0.5, vectors=None, features=None): """ - Add an entity to the KB. 
- Return the hash of the entity ID at the end + Add an entity to the KB, optionally specifying its log probability based on corpus frequency + Return the hash of the entity ID/name at the end """ cdef hash_t entity_hash = self.vocab.strings.add(entity) @@ -98,8 +97,6 @@ cdef class KnowledgeBase: user_warning(Warnings.W017.format(alias=alias)) return - cdef hash_t entity_hash - cdef vector[int64_t] entry_indices cdef vector[float] probs From 9a7d534b1bc07898d855a254fcbaa39b28023fa3 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 10 Apr 2019 17:25:10 +0200 Subject: [PATCH 002/148] enable nogil for cython functions in kb.pxd --- examples/pipeline/dummy_entity_linking.py | 2 +- spacy/kb.pxd | 74 ++++++++++--------- spacy/kb.pyx | 15 +++- .../{test_el.py => test_entity_linker.py} | 0 4 files changed, 52 insertions(+), 39 deletions(-) rename spacy/tests/pipeline/{test_el.py => test_entity_linker.py} (100%) diff --git a/examples/pipeline/dummy_entity_linking.py b/examples/pipeline/dummy_entity_linking.py index 88415d040..e93e3e20b 100644 --- a/examples/pipeline/dummy_entity_linking.py +++ b/examples/pipeline/dummy_entity_linking.py @@ -28,7 +28,7 @@ def create_kb(vocab): print() alias_0 = "Douglas" print("adding alias", alias_0) - kb.add_alias(alias=alias_0, entities=[entity_0, entity_1, entity_2], probabilities=[0.1, 0.6, 0.2]) + kb.add_alias(alias=alias_0, entities=[entity_0, entity_1, entity_2], probabilities=[0.6, 0.1, 0.2]) alias_1 = "Douglas Adams" print("adding alias", alias_1) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index e57c162fc..3cdf1e07e 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -97,58 +97,64 @@ cdef class KnowledgeBase: cdef object _features_table cdef inline int64_t c_add_entity(self, hash_t entity_hash, float prob, - int32_t* vector_rows, int feats_row): - """Add an entry to the knowledge base.""" + int32_t* vector_rows, int feats_row) nogil: + """Add an entry to the vector of entries. + After calling this method, make sure to update also the _entry_index using the return value""" # This is what we'll map the entity hash key to. It's where the entry will sit # in the vector of entries, so we can get it later. cdef int64_t new_index = self._entries.size() - self._entries.push_back( - _EntryC( - entity_hash=entity_hash, - vector_rows=vector_rows, - feats_row=feats_row, - prob=prob - )) - self._entry_index[entity_hash] = new_index + + # Avoid struct initializer to enable nogil, cf https://github.com/cython/cython/issues/1642 + cdef _EntryC entry + entry.entity_hash = entity_hash + entry.vector_rows = vector_rows + entry.feats_row = feats_row + entry.prob = prob + + self._entries.push_back(entry) return new_index - cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs): - """Connect a mention to a list of potential entities with their prior probabilities .""" + cdef inline int64_t c_add_aliases(self, hash_t alias_hash, vector[int64_t] entry_indices, vector[float] probs) nogil: + """Connect a mention to a list of potential entities with their prior probabilities . + After calling this method, make sure to update also the _alias_index using the return value""" # This is what we'll map the alias hash key to. It's where the alias will be defined # in the vector of aliases. 
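        # A minimal sketch of the expected caller pattern, mirroring what KnowledgeBase.add_alias
        # in kb.pyx does with the return value:
        #     new_index = self.c_add_aliases(alias_hash=alias_hash, entry_indices=entry_indices, probs=probs)
        #     self._alias_index[alias_hash] = new_index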
cdef int64_t new_index = self._aliases_table.size() - self._aliases_table.push_back( - _AliasC( - entry_indices=entry_indices, - probs=probs - )) - self._alias_index[alias_hash] = new_index + # Avoid struct initializer to enable nogil + cdef _AliasC alias + alias.entry_indices = entry_indices + alias.probs = probs + + self._aliases_table.push_back(alias) return new_index - cdef inline _create_empty_vectors(self): + cdef inline void _create_empty_vectors(self, hash_t dummy_hash) nogil: """ Initializing the vectors and making sure the first element of each vector is a dummy, because the PreshMap maps pointing to indices in these vectors can not contain 0 as value cf. https://github.com/explosion/preshed/issues/17 """ cdef int32_t dummy_value = 0 - self.vocab.strings.add("") - self._entry_index = PreshMap() - self._entries.push_back( - _EntryC( - entity_hash=self.vocab.strings[""], - vector_rows=&dummy_value, - feats_row=dummy_value, - prob=dummy_value - )) + # Avoid struct initializer to enable nogil + cdef _EntryC entry + entry.entity_hash = dummy_hash + entry.vector_rows = &dummy_value + entry.feats_row = dummy_value + entry.prob = dummy_value - self._alias_index = PreshMap() - self._aliases_table.push_back( - _AliasC( - entry_indices=[dummy_value], - probs=[dummy_value] - )) + # Avoid struct initializer to enable nogil + cdef vector[int64_t] dummy_entry_indices + dummy_entry_indices.push_back(0) + cdef vector[float] dummy_probs + dummy_probs.push_back(0) + + cdef _AliasC alias + alias.entry_indices = dummy_entry_indices + alias.probs = dummy_probs + + self._entries.push_back(entry) + self._aliases_table.push_back(alias) diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 38c393355..97e86d01f 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -42,7 +42,11 @@ cdef class KnowledgeBase: def __init__(self, Vocab vocab): self.vocab = vocab self.mem = Pool() - self._create_empty_vectors() + self._entry_index = PreshMap() + self._alias_index = PreshMap() + + self.vocab.strings.add("") + self._create_empty_vectors(dummy_hash=self.vocab.strings[""]) def __len__(self): return self.get_size_entities() @@ -66,8 +70,10 @@ cdef class KnowledgeBase: return cdef int32_t dummy_value = 342 - self.c_add_entity(entity_hash=entity_hash, prob=prob, - vector_rows=&dummy_value, feats_row=dummy_value) + new_index = self.c_add_entity(entity_hash=entity_hash, prob=prob, + vector_rows=&dummy_value, feats_row=dummy_value) + self._entry_index[entity_hash] = new_index + # TODO self._vectors_table.get_pointer(vectors), # self._features_table.get(features)) @@ -109,7 +115,8 @@ cdef class KnowledgeBase: entry_indices.push_back(int(entry_index)) probs.push_back(float(prob)) - self.c_add_aliases(alias_hash=alias_hash, entry_indices=entry_indices, probs=probs) + new_index = self.c_add_aliases(alias_hash=alias_hash, entry_indices=entry_indices, probs=probs) + self._alias_index[alias_hash] = new_index return alias_hash diff --git a/spacy/tests/pipeline/test_el.py b/spacy/tests/pipeline/test_entity_linker.py similarity index 100% rename from spacy/tests/pipeline/test_el.py rename to spacy/tests/pipeline/test_entity_linker.py From 6e997be4b4b364583c2e148992756992cd195b43 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 11 Apr 2019 21:08:22 +0200 Subject: [PATCH 003/148] reading wikidata descriptions and aliases --- examples/pipeline/wikidata_entity_linking.py | 94 ++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 examples/pipeline/wikidata_entity_linking.py diff --git 
a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py new file mode 100644 index 000000000..b467a5982 --- /dev/null +++ b/examples/pipeline/wikidata_entity_linking.py @@ -0,0 +1,94 @@ +# coding: utf-8 +from __future__ import unicode_literals + +"""Demonstrate how to build a knowledge base from WikiData and run an Entity Linking algorithm. +""" +import json +import spacy +import bz2 +from spacy.kb import KnowledgeBase + + +def create_kb(vocab): + kb = KnowledgeBase(vocab=vocab) + _read_wikidata() + + # adding entities + # kb.add_entity(entity=entity, prob=prob) + + # adding aliases + # kb.add_alias(alias=alias, entities=[entity_0, entity_1, entity_2], probabilities=[0.6, 0.1, 0.2]) + + print() + print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases()) + + return kb + + +def _read_wikidata(): + """ Read the JSON wiki data """ + # TODO remove hardcoded path + + languages = {'en', 'de'} + + with bz2.open('C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.json.bz2', mode='rb') as file: + line = file.readline() + cnt = 1 + while line and cnt < 10: + clean_line = line.strip() + if clean_line.endswith(b","): + clean_line = clean_line[:-1] + if len(clean_line) > 1: + obj = json.loads(clean_line) + unique_id = obj["id"] + print(unique_id) + + labels = obj["labels"] + if labels: + for lang in languages: + lang_label = labels.get(lang, None) + if lang_label: + print("label (" + lang + "):", lang_label["value"]) + + descriptions = obj["descriptions"] + if descriptions: + for lang in languages: + lang_descr = descriptions.get(lang, None) + if lang_descr: + print("description (" + lang + "):", lang_descr["value"]) + + aliases = obj["aliases"] + if aliases: + for lang in languages: + lang_aliases = aliases.get(lang, None) + if lang_aliases: + for item in lang_aliases: + print("alias (" + lang + "):", item["value"]) + + print() + line = file.readline() + cnt += 1 + + +def add_el(kb, nlp): + el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb}) + nlp.add_pipe(el_pipe, last=True) + + text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \ + "Douglas reminds us to always bring our towel. " \ + "The main character in Doug's novel is called Arthur Dent." 
+ doc = nlp(text) + + print() + for token in doc: + print("token", token.text, token.ent_type_, token.ent_kb_id_) + + print() + for ent in doc.ents: + print("ent", ent.text, ent.label_, ent.kb_id_) + + +if __name__ == "__main__": + nlp = spacy.load('en_core_web_sm') + my_kb = create_kb(nlp.vocab) + # add_el(my_kb, nlp) From b31a390a9aaedccbdc4dc4c7ce62197ef2e9e533 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 11 Apr 2019 21:42:44 +0200 Subject: [PATCH 004/148] reading types, claims and sitelinks --- examples/pipeline/wikidata_entity_linking.py | 21 +++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index b467a5982..11e4cc04c 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -30,6 +30,8 @@ def _read_wikidata(): # TODO remove hardcoded path languages = {'en', 'de'} + properties = {'P31'} + sites = {'enwiki'} with bz2.open('C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.json.bz2', mode='rb') as file: line = file.readline() @@ -40,8 +42,25 @@ def _read_wikidata(): clean_line = clean_line[:-1] if len(clean_line) > 1: obj = json.loads(clean_line) + unique_id = obj["id"] - print(unique_id) + print("ID:", unique_id) + + entry_type = obj["type"] + print("type:", entry_type) + + # TODO: filter on rank: preferred, normal or deprecated + claims = obj["claims"] + for prop in properties: + claim_property = claims.get(prop, None) + if claim_property: + for cp in claim_property: + print(prop, cp['mainsnak']['datavalue']['value']['id']) + + entry_sites = obj["sitelinks"] + for site in sites: + site_value = entry_sites.get(site, None) + print(site, ":", site_value['title']) labels = obj["labels"] if labels: From 3163331b1ee4238265e9584247fc36965fb9da13 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Sun, 14 Apr 2019 21:52:01 +0200 Subject: [PATCH 005/148] wikipedia dump parser and mediawiki format regex cleanup --- examples/pipeline/wikidata_entity_linking.py | 81 +++++++++++++++++++- 1 file changed, 80 insertions(+), 1 deletion(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 11e4cc04c..02106bc31 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals """Demonstrate how to build a knowledge base from WikiData and run an Entity Linking algorithm. 
""" +import re import json import spacy import bz2 @@ -11,7 +12,8 @@ from spacy.kb import KnowledgeBase def create_kb(vocab): kb = KnowledgeBase(vocab=vocab) - _read_wikidata() + # _read_wikidata() + _read_wikipedia() # adding entities # kb.add_entity(entity=entity, prob=prob) @@ -89,6 +91,83 @@ def _read_wikidata(): cnt += 1 +def _read_wikipedia(): + """ Read the XML wikipedia data """ + # TODO remove hardcoded path + + # with bz2.open('C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream-index.txt.bz2', mode='rb') as file: + with bz2.open('C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream.xml.bz2', mode='rb') as file: + line = file.readline() + cnt = 1 + article_text = "" + article_title = None + article_id = None + reading_text = False + while line and cnt < 10000: + clean_line = line.strip().decode("utf-8") + + # Start reading new page + if clean_line == "": + article_text = "" + article_title = None + article_id = 342 + + # finished reading this page + elif clean_line == "": + if article_id: + _store_wp_article(article_id, article_title, article_text.strip()) + + # start reading text within a page + if ")\d*(?=)", clean_line) + if ids: + article_id = ids[0] + + # read the title of this article + titles = re.findall(r"(?<=).*(?=)", clean_line) + if titles: + article_title = titles[0].strip() + + line = file.readline() + cnt += 1 + + +def _store_wp_article(article_id, article_title, article_text): + print("WP article", article_id, ":", article_title) + print(article_text) + print(_get_clean_wp_text(article_text)) + print() + + +def _get_clean_wp_text(article_text): + # remove category statements + clean_text = re.sub('\[\[Category:.*\]\]', '', article_text) + + # remove nested {{info}} statements by removing the inner/smallest ones first and iterating + try_again = True + previous_length = len(clean_text) + while try_again: + clean_text = re.sub('{[^{]*?}', '', clean_text) # non-greedy match + print(clean_text) + if len(clean_text) < previous_length: + try_again = True + else: + try_again = False + previous_length = len(clean_text) + + return clean_text + + def add_el(kb, nlp): el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb}) nlp.add_pipe(el_pipe, last=True) From 6763e025e1f351b5c3f133d5a334217f172867b9 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 15 Apr 2019 11:41:57 +0200 Subject: [PATCH 006/148] parse wp dump for links to determine prior probabilities --- examples/pipeline/wikidata_entity_linking.py | 136 +++++++++++++++++-- 1 file changed, 128 insertions(+), 8 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 02106bc31..5065648ef 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -6,9 +6,27 @@ from __future__ import unicode_literals import re import json import spacy +import datetime import bz2 from spacy.kb import KnowledgeBase +# these will/should be matched ignoring case +wiki_namespaces = ["b", "betawikiversity", "Book", "c", "Category", "Commons", + "d", "dbdump", "download", "Draft", "Education", "Foundation", + "Gadget", "Gadget definition", "gerrit", "File", "Help", "Image", "Incubator", + "m", "mail", "mailarchive", "media", "MediaWiki", "MediaWiki talk", "Mediawikiwiki", + "MediaZilla", "Meta", "Metawikipedia", "Module", + "mw", "n", "nost", "oldwikisource", "outreach", "outreachwiki", "otrs", "OTRSwiki", + "Portal", "phab", "Phabricator", "Project", "q", 
"quality", "rev", + "s", "spcom", "Special", "species", "Strategy", "sulutil", "svn", + "Talk", "Template", "Template talk", "Testwiki", "ticket", "TimedText", "Toollabs", "tools", "tswiki", + "User", "User talk", "v", "voy", + "w", "Wikibooks", "Wikidata", "wikiHow", "Wikinvest", "wikilivres", "Wikimedia", "Wikinews", + "Wikipedia", "Wikipedia talk", "Wikiquote", "Wikisource", "Wikispecies", "Wikitech", + "Wikiversity", "Wikivoyage", "wikt", "wiktionary", "wmf", "wmania", "WP"] + +map_alias_to_link = dict() + def create_kb(vocab): kb = KnowledgeBase(vocab=vocab) @@ -38,7 +56,7 @@ def _read_wikidata(): with bz2.open('C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.json.bz2', mode='rb') as file: line = file.readline() cnt = 1 - while line and cnt < 10: + while line and cnt < 100000: clean_line = line.strip() if clean_line.endswith(b","): clean_line = clean_line[:-1] @@ -91,6 +109,78 @@ def _read_wikidata(): cnt += 1 +def _read_wikipedia_prior_probs(): + """ Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities """ + + # find the links + link_regex = re.compile(r'\[\[[^\[\]]*\]\]') + + # match on interwiki links, e.g. `en:` or `:fr:` + ns_regex = r":?" + "[a-z][a-z]" + ":" + + # match on Namespace: optionally preceded by a : + for ns in wiki_namespaces: + ns_regex += "|" + ":?" + ns + ":" + + ns_regex = re.compile(ns_regex, re.IGNORECASE) + + # TODO remove hardcoded path + with bz2.open('C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream.xml.bz2', mode='rb') as file: + line = file.readline() + cnt = 0 + while line: + if cnt % 5000000 == 0: + print(datetime.datetime.now(), "processed", cnt, "lines") + clean_line = line.strip().decode("utf-8") + + matches = link_regex.findall(clean_line) + for match in matches: + match = match[2:][:-2].replace("_", " ").strip() + + if ns_regex.match(match): + pass # ignore namespaces at the beginning of the string + + # this is a simple link, with the alias the same as the mention + elif "|" not in match: + _store_alias(match, match) + + # in wiki format, the link is written as [[entity|alias]] + else: + splits = match.split("|") + entity = splits[0].strip() + alias = splits[1].strip() + # specific wiki format [[alias (specification)|]] + if len(alias) == 0 and "(" in entity: + alias = entity.split("(")[0] + _store_alias(alias, entity) + else: + _store_alias(alias, entity) + + line = file.readline() + cnt += 1 + + # only print aliases with more than one potential entity + # TODO remove hardcoded path + with open('C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv', mode='w', encoding='utf8') as outputfile: + outputfile.write("alias" + "|" + "count" + "|" + "entity" + "\n") + for alias, alias_dict in sorted(map_alias_to_link.items(), key=lambda x: x[0]): + for entity, count in sorted(alias_dict.items(), key=lambda x: x[1], reverse=True): + outputfile.write(alias + "|" + str(count) + "|" + entity + "\n") + + +def _store_alias(alias, entity): + alias = alias.strip() + entity = entity.strip() + + # remove everything after # as this is not part of the title but refers to a specific paragraph + clean_entity = entity.split("#")[0].capitalize() + + if len(alias) > 0 and len(clean_entity) > 0: + alias_dict = map_alias_to_link.get(alias, dict()) + entity_count = alias_dict.get(clean_entity, 0) + alias_dict[clean_entity] = entity_count + 1 + map_alias_to_link[alias] = alias_dict + def _read_wikipedia(): """ Read the XML wikipedia data """ # TODO remove hardcoded path @@ -103,7 +193,7 @@ 
def _read_wikipedia(): article_title = None article_id = None reading_text = False - while line and cnt < 10000: + while line and cnt < 1000000: clean_line = line.strip().decode("utf-8") # Start reading new page @@ -143,28 +233,51 @@ def _read_wikipedia(): def _store_wp_article(article_id, article_title, article_text): + pass print("WP article", article_id, ":", article_title) print(article_text) print(_get_clean_wp_text(article_text)) print() + def _get_clean_wp_text(article_text): - # remove category statements - clean_text = re.sub('\[\[Category:.*\]\]', '', article_text) + # TODO: compile the regular expressions + + # remove Category and File statements + clean_text = re.sub(r'\[\[Category:[^\[]*]]', '', article_text) + print("1", clean_text) + clean_text = re.sub(r'\[\[File:[^\[]*]]', '', clean_text) # TODO: this doesn't work yet + print("2", clean_text) + + # remove bolding markup + clean_text = re.sub('\'\'\'', '', clean_text) + clean_text = re.sub('\'\'', '', clean_text) # remove nested {{info}} statements by removing the inner/smallest ones first and iterating try_again = True previous_length = len(clean_text) while try_again: - clean_text = re.sub('{[^{]*?}', '', clean_text) # non-greedy match - print(clean_text) + clean_text = re.sub('{[^{]*?}', '', clean_text) # non-greedy match excluding a nested { if len(clean_text) < previous_length: try_again = True else: try_again = False previous_length = len(clean_text) + # remove multiple spaces + while ' ' in clean_text: + clean_text = re.sub(' ', ' ', clean_text) + + # remove simple interwiki links (no alternative name) + clean_text = re.sub('\[\[([^|]*?)]]', r'\1', clean_text) + + # remove simple interwiki links by picking the alternative name + clean_text = re.sub(r'\[\[[^|]*?\|([^|]*?)]]', r'\1', clean_text) + + # remove HTML comments + clean_text = re.sub('<!--[^!]*-->', '', clean_text) + return clean_text @@ -187,6 +300,13 @@ def add_el(kb, nlp): if __name__ == "__main__": - nlp = spacy.load('en_core_web_sm') - my_kb = create_kb(nlp.vocab) + _read_wikipedia_prior_probs() + + # nlp = spacy.load('en_core_web_sm') + # my_kb = create_kb(nlp.vocab) # add_el(my_kb, nlp) + + # clean_text = "[[File:smomething]] jhk" + # clean_text = re.sub(r'\[\[Category:[^\[]*]]', '', clean_text) + # clean_text = re.sub(r'\[\[File:[^\[]*]]', '', clean_text) + # print(clean_text) From 10ee8dfea240ffe7e2b4d644df12b5179b6f01b6 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 18 Apr 2019 14:12:17 +0200 Subject: [PATCH 007/148] poc with few entities and collecting aliases from the WP links --- examples/pipeline/wikidata_entity_linking.py | 106 +++++++++++++++---- spacy/kb.pyx | 8 +- 2 files changed, 92 insertions(+), 22 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 5065648ef..cd6cc7c40 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -10,6 +10,13 @@ import datetime import bz2 from spacy.kb import KnowledgeBase +# TODO: remove hardcoded paths +WIKIDATA_JSON = 'C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.json.bz2' +ENWIKI_DUMP = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream.xml.bz2' +ENWIKI_INDEX = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream-index.txt.bz2' +PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv' + + # these will/should be matched ignoring case wiki_namespaces = ["b", "betawikiversity", "Book", 
"c", "Category", "Commons", "d", "dbdump", "download", "Draft", "Education", "Foundation", @@ -28,16 +35,14 @@ wiki_namespaces = ["b", "betawikiversity", "Book", "c", "Category", "Commons", map_alias_to_link = dict() -def create_kb(vocab): +def create_kb(vocab, max_entities_per_alias, min_occ): kb = KnowledgeBase(vocab=vocab) + + _add_entities(kb) + _add_aliases(kb, max_entities_per_alias, min_occ) + # _read_wikidata() - _read_wikipedia() - - # adding entities - # kb.add_entity(entity=entity, prob=prob) - - # adding aliases - # kb.add_alias(alias=alias, entities=[entity_0, entity_1, entity_2], probabilities=[0.6, 0.1, 0.2]) + # _read_wikipedia() print() print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases()) @@ -45,6 +50,66 @@ def create_kb(vocab): return kb +def _add_entities(kb): + + kb.add_entity(entity="Earthquake", prob=0.342) + kb.add_entity(entity="2010 haiti earthquake", prob=0.1) + kb.add_entity(entity="1906 san francisco earthquake", prob=0.1) + kb.add_entity(entity="2011 christchurch earthquak", prob=0.1) + + kb.add_entity(entity="Soft drink", prob=0.342) + + print("added", kb.get_size_entities(), "entities:", kb.get_entity_strings()) + + +def _add_aliases(kb, max_entities_per_alias, min_occ): + all_entities = kb.get_entity_strings() + # adding aliases with prior probabilities + with open(PRIOR_PROB, mode='r', encoding='utf8') as prior_file: + # skip header + prior_file.readline() + line = prior_file.readline() + # we can read this file sequentially, it's sorted by alias, and then by count + previous_alias = None + total_count = 0 + counts = list() + entities = list() + while line: + splits = line.replace('\n', "").split(sep='|') + new_alias = splits[0] + count = int(splits[1]) + entity = splits[2] + + if new_alias != previous_alias and previous_alias: + # done reading the previous alias --> output + if len(entities) > 0: + selected_entities = list() + prior_probs = list() + for ent_count, ent_string in zip(counts, entities): + if ent_string in all_entities: + p_entity_givenalias = ent_count / total_count + selected_entities.append(ent_string) + prior_probs.append(p_entity_givenalias) + + if selected_entities: + kb.add_alias(alias=previous_alias, entities=selected_entities, probabilities=prior_probs) + total_count = 0 + counts = list() + entities = list() + + total_count += count + + if len(entities) < max_entities_per_alias and count >= min_occ: + counts.append(count) + entities.append(entity) + previous_alias = new_alias + + line = prior_file.readline() + + print() + print("added", kb.get_size_aliases(), "aliases:", kb.get_alias_strings()) + + def _read_wikidata(): """ Read the JSON wiki data """ # TODO remove hardcoded path @@ -53,7 +118,7 @@ def _read_wikidata(): properties = {'P31'} sites = {'enwiki'} - with bz2.open('C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.json.bz2', mode='rb') as file: + with bz2.open(WIKIDATA_JSON, mode='rb') as file: line = file.readline() cnt = 1 while line and cnt < 100000: @@ -124,8 +189,7 @@ def _read_wikipedia_prior_probs(): ns_regex = re.compile(ns_regex, re.IGNORECASE) - # TODO remove hardcoded path - with bz2.open('C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream.xml.bz2', mode='rb') as file: + with bz2.open(ENWIKI_DUMP, mode='rb') as file: line = file.readline() cnt = 0 while line: @@ -159,9 +223,8 @@ def _read_wikipedia_prior_probs(): line = file.readline() cnt += 1 - # only print aliases with more than one potential entity - # TODO remove hardcoded path - with 
open('C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv', mode='w', encoding='utf8') as outputfile: + # write all aliases and their entities and occurrences to file + with open(PRIOR_PROB, mode='w', encoding='utf8') as outputfile: outputfile.write("alias" + "|" + "count" + "|" + "entity" + "\n") for alias, alias_dict in sorted(map_alias_to_link.items(), key=lambda x: x[0]): for entity, count in sorted(alias_dict.items(), key=lambda x: x[1], reverse=True): @@ -181,12 +244,11 @@ def _store_alias(alias, entity): alias_dict[clean_entity] = entity_count + 1 map_alias_to_link[alias] = alias_dict + def _read_wikipedia(): """ Read the XML wikipedia data """ - # TODO remove hardcoded path - # with bz2.open('C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream-index.txt.bz2', mode='rb') as file: - with bz2.open('C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream.xml.bz2', mode='rb') as file: + with bz2.open(ENWIKI_DUMP, mode='rb') as file: line = file.readline() cnt = 1 article_text = "" @@ -240,7 +302,6 @@ def _store_wp_article(article_id, article_title, article_text): print() - def _get_clean_wp_text(article_text): # TODO: compile the regular expressions @@ -300,10 +361,13 @@ def add_el(kb, nlp): if __name__ == "__main__": - _read_wikipedia_prior_probs() + # STEP 1 : create prior probabilities from WP + # run only once ! + # _read_wikipedia_prior_probs() - # nlp = spacy.load('en_core_web_sm') - # my_kb = create_kb(nlp.vocab) + # STEP 2 : create KB + nlp = spacy.load('en_core_web_sm') + my_kb = create_kb(nlp.vocab, max_entities_per_alias=10, min_occ=5) # add_el(my_kb, nlp) # clean_text = "[[File:smomething]] jhk" diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 97e86d01f..8a1710a9c 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -54,8 +54,14 @@ cdef class KnowledgeBase: def get_size_entities(self): return self._entries.size() - 1 # not counting dummy element on index 0 + def get_entity_strings(self): + return [self.vocab.strings[x] for x in self._entry_index][1:] # removing the dummy element on index 0 + def get_size_aliases(self): - return self._aliases_table.size() - 1 # not counting dummy element on index 0 + return self._aliases_table.size() - 1 # not counting dummy element on index + + def get_alias_strings(self): + return [self.vocab.strings[x] for x in self._alias_index][1:] # removing the dummy element on index 0 def add_entity(self, unicode entity, float prob=0.5, vectors=None, features=None): """ From 9f308eb5dc8fab4dc3a625480abf567f6841d144 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 18 Apr 2019 16:14:25 +0200 Subject: [PATCH 008/148] fixes for prior prob and linking wikidata IDs with wikipedia titles --- examples/pipeline/wikidata_entity_linking.py | 164 ++++++++++++------- 1 file changed, 102 insertions(+), 62 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index cd6cc7c40..b7dba1e0d 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -38,10 +38,13 @@ map_alias_to_link = dict() def create_kb(vocab, max_entities_per_alias, min_occ): kb = KnowledgeBase(vocab=vocab) - _add_entities(kb) - _add_aliases(kb, max_entities_per_alias, min_occ) + id_to_title = _read_wikidata(limit=100, to_print=False) + title_to_id = {v:k for k,v in id_to_title.items()} - # _read_wikidata() + _add_entities(kb, entities=id_to_title.keys(), probs=[0.4 for x in id_to_title.keys()]) + _add_aliases(kb, 
title_to_id=title_to_id, max_entities_per_alias=max_entities_per_alias, min_occ=min_occ) + + # TODO: read wikipedia texts for entity context # _read_wikipedia() print() @@ -50,20 +53,17 @@ def create_kb(vocab, max_entities_per_alias, min_occ): return kb -def _add_entities(kb): - - kb.add_entity(entity="Earthquake", prob=0.342) - kb.add_entity(entity="2010 haiti earthquake", prob=0.1) - kb.add_entity(entity="1906 san francisco earthquake", prob=0.1) - kb.add_entity(entity="2011 christchurch earthquak", prob=0.1) - - kb.add_entity(entity="Soft drink", prob=0.342) +def _add_entities(kb, entities, probs): + for entity, prob in zip(entities, probs): + kb.add_entity(entity=entity, prob=prob) print("added", kb.get_size_entities(), "entities:", kb.get_entity_strings()) -def _add_aliases(kb, max_entities_per_alias, min_occ): - all_entities = kb.get_entity_strings() +def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ): + wp_titles = title_to_id.keys() + print("wp titles", wp_titles) + # adding aliases with prior probabilities with open(PRIOR_PROB, mode='r', encoding='utf8') as prior_file: # skip header @@ -86,13 +86,17 @@ def _add_aliases(kb, max_entities_per_alias, min_occ): selected_entities = list() prior_probs = list() for ent_count, ent_string in zip(counts, entities): - if ent_string in all_entities: + if ent_string in wp_titles: + wd_id = title_to_id[ent_string] p_entity_givenalias = ent_count / total_count - selected_entities.append(ent_string) + selected_entities.append(wd_id) prior_probs.append(p_entity_givenalias) if selected_entities: kb.add_alias(alias=previous_alias, entities=selected_entities, probabilities=prior_probs) + print("analysed", previous_alias, "with entities", entities, "and counts", counts) + print("added", previous_alias, "with selected entities", selected_entities, "and probs", prior_probs) + print() total_count = 0 counts = list() entities = list() @@ -110,69 +114,94 @@ def _add_aliases(kb, max_entities_per_alias, min_occ): print("added", kb.get_size_aliases(), "aliases:", kb.get_alias_strings()) -def _read_wikidata(): +def _read_wikidata(limit=None, to_print=False): """ Read the JSON wiki data """ - # TODO remove hardcoded path languages = {'en', 'de'} - properties = {'P31'} + prop_filter = {'P31': {'Q5', 'Q15632617'}} # currently defined as OR: one property suffices to be selected sites = {'enwiki'} + entity_dict = dict() + with bz2.open(WIKIDATA_JSON, mode='rb') as file: line = file.readline() cnt = 1 - while line and cnt < 100000: + while line and (not limit or cnt < limit): clean_line = line.strip() if clean_line.endswith(b","): clean_line = clean_line[:-1] if len(clean_line) > 1: obj = json.loads(clean_line) + keep = False - unique_id = obj["id"] - print("ID:", unique_id) - - entry_type = obj["type"] - print("type:", entry_type) - + # filtering records on their properties # TODO: filter on rank: preferred, normal or deprecated claims = obj["claims"] - for prop in properties: + for prop, value_set in prop_filter.items(): claim_property = claims.get(prop, None) if claim_property: for cp in claim_property: - print(prop, cp['mainsnak']['datavalue']['value']['id']) + cp_id = cp['mainsnak']['datavalue']['value']['id'] + if cp_id in value_set: + keep = True - entry_sites = obj["sitelinks"] - for site in sites: - site_value = entry_sites.get(site, None) - print(site, ":", site_value['title']) + if keep: + unique_id = obj["id"] + entry_type = obj["type"] - labels = obj["labels"] - if labels: - for lang in languages: - lang_label = labels.get(lang, None) - 
if lang_label: - print("label (" + lang + "):", lang_label["value"]) + if to_print: + print("ID:", unique_id) + print("type:", entry_type) - descriptions = obj["descriptions"] - if descriptions: - for lang in languages: - lang_descr = descriptions.get(lang, None) - if lang_descr: - print("description (" + lang + "):", lang_descr["value"]) + # parsing all properties that refer to other entities + for prop, claim_property in claims.items(): + cp_dicts = [cp['mainsnak']['datavalue'].get('value') for cp in claim_property if cp['mainsnak'].get('datavalue')] + cp_values = [cp_dict.get('id') for cp_dict in cp_dicts if isinstance(cp_dict, dict) if cp_dict.get('id') is not None] + if cp_values: + if to_print: + print("prop:", prop, cp_values) - aliases = obj["aliases"] - if aliases: - for lang in languages: - lang_aliases = aliases.get(lang, None) - if lang_aliases: - for item in lang_aliases: - print("alias (" + lang + "):", item["value"]) + entry_sites = obj["sitelinks"] + for site in sites: + site_value = entry_sites.get(site, None) + if site_value: + if to_print: + print(site, ":", site_value['title']) + if site == "enwiki": + entity_dict[unique_id] = site_value['title'] - print() + labels = obj["labels"] + if labels: + for lang in languages: + lang_label = labels.get(lang, None) + if lang_label: + if to_print: + print("label (" + lang + "):", lang_label["value"]) + + descriptions = obj["descriptions"] + if descriptions: + for lang in languages: + lang_descr = descriptions.get(lang, None) + if lang_descr: + if to_print: + print("description (" + lang + "):", lang_descr["value"]) + + aliases = obj["aliases"] + if aliases: + for lang in languages: + lang_aliases = aliases.get(lang, None) + if lang_aliases: + for item in lang_aliases: + if to_print: + print("alias (" + lang + "):", item["value"]) + + if to_print: + print() line = file.readline() cnt += 1 + return entity_dict + def _read_wikipedia_prior_probs(): """ Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities """ @@ -206,7 +235,7 @@ def _read_wikipedia_prior_probs(): # this is a simple link, with the alias the same as the mention elif "|" not in match: - _store_alias(match, match) + _store_alias(match, match, normalize_alias=True, normalize_entity=True) # in wiki format, the link is written as [[entity|alias]] else: @@ -216,9 +245,9 @@ def _read_wikipedia_prior_probs(): # specific wiki format [[alias (specification)|]] if len(alias) == 0 and "(" in entity: alias = entity.split("(")[0] - _store_alias(alias, entity) + _store_alias(alias, entity, normalize_alias=False, normalize_entity=True) else: - _store_alias(alias, entity) + _store_alias(alias, entity, normalize_alias=False, normalize_entity=True) line = file.readline() cnt += 1 @@ -231,17 +260,20 @@ def _read_wikipedia_prior_probs(): outputfile.write(alias + "|" + str(count) + "|" + entity + "\n") -def _store_alias(alias, entity): +def _store_alias(alias, entity, normalize_alias=False, normalize_entity=True): alias = alias.strip() entity = entity.strip() # remove everything after # as this is not part of the title but refers to a specific paragraph - clean_entity = entity.split("#")[0].capitalize() + if normalize_entity: + entity = capitalize_first(entity.split("#")[0]) + if normalize_alias: + alias = capitalize_first(alias.split("#")[0]) - if len(alias) > 0 and len(clean_entity) > 0: + if alias and entity: alias_dict = map_alias_to_link.get(alias, dict()) - entity_count = alias_dict.get(clean_entity, 0) - alias_dict[clean_entity] = entity_count + 1 
+ entity_count = alias_dict.get(entity, 0) + alias_dict[entity] = entity_count + 1 map_alias_to_link[alias] = alias_dict @@ -360,14 +392,22 @@ def add_el(kb, nlp): print("ent", ent.text, ent.label_, ent.kb_id_) +def capitalize_first(text): + if not text: + return None + result = text[0].capitalize() + if len(result) > 0: + result += text[1:] + return result + if __name__ == "__main__": # STEP 1 : create prior probabilities from WP # run only once ! - # _read_wikipedia_prior_probs() + _read_wikipedia_prior_probs() # STEP 2 : create KB - nlp = spacy.load('en_core_web_sm') - my_kb = create_kb(nlp.vocab, max_entities_per_alias=10, min_occ=5) + # nlp = spacy.load('en_core_web_sm') + # my_kb = create_kb(nlp.vocab, max_entities_per_alias=10, min_occ=5) # add_el(my_kb, nlp) # clean_text = "[[File:smomething]] jhk" From 9a8197185b733e471fa672e544fa2c8de57b991c Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 18 Apr 2019 22:37:50 +0200 Subject: [PATCH 009/148] fix alias capitalization --- examples/pipeline/wikidata_entity_linking.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index b7dba1e0d..691be7990 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -204,7 +204,9 @@ def _read_wikidata(limit=None, to_print=False): def _read_wikipedia_prior_probs(): - """ Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities """ + """ Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities + The full file takes about 2h to parse 1100M lines (update printed every 5M lines) + """ # find the links link_regex = re.compile(r'\[\[[^\[\]]*\]\]') @@ -266,9 +268,10 @@ def _store_alias(alias, entity, normalize_alias=False, normalize_entity=True): # remove everything after # as this is not part of the title but refers to a specific paragraph if normalize_entity: + # wikipedia titles are always capitalized entity = capitalize_first(entity.split("#")[0]) if normalize_alias: - alias = capitalize_first(alias.split("#")[0]) + alias = alias.split("#")[0] if alias and entity: alias_dict = map_alias_to_link.get(alias, dict()) From 004e5e7d1c76bd507e83aab6321177ce5d27f39b Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 19 Apr 2019 14:24:02 +0200 Subject: [PATCH 010/148] little fixes --- examples/pipeline/wikidata_entity_linking.py | 62 ++++++++++++-------- 1 file changed, 37 insertions(+), 25 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 691be7990..a02226f9f 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -35,34 +35,46 @@ wiki_namespaces = ["b", "betawikiversity", "Book", "c", "Category", "Commons", map_alias_to_link = dict() -def create_kb(vocab, max_entities_per_alias, min_occ): +def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False): kb = KnowledgeBase(vocab=vocab) - id_to_title = _read_wikidata(limit=100, to_print=False) + id_to_title = _read_wikidata(limit=1000) title_to_id = {v:k for k,v in id_to_title.items()} - _add_entities(kb, entities=id_to_title.keys(), probs=[0.4 for x in id_to_title.keys()]) - _add_aliases(kb, title_to_id=title_to_id, max_entities_per_alias=max_entities_per_alias, min_occ=min_occ) + _add_entities(kb, + entities=id_to_title.keys(), + probs=[0.4 for x in id_to_title.keys()], + to_print=to_print) + 
+ _add_aliases(kb, + title_to_id=title_to_id, + max_entities_per_alias=max_entities_per_alias, + min_occ=min_occ, + to_print=to_print) # TODO: read wikipedia texts for entity context # _read_wikipedia() - print() - print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases()) + if to_print: + print() + print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases()) return kb -def _add_entities(kb, entities, probs): +def _add_entities(kb, entities, probs, to_print=False): for entity, prob in zip(entities, probs): kb.add_entity(entity=entity, prob=prob) - print("added", kb.get_size_entities(), "entities:", kb.get_entity_strings()) + if to_print: + print("added", kb.get_size_entities(), "entities:", kb.get_entity_strings()) -def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ): +def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, to_print=False): wp_titles = title_to_id.keys() - print("wp titles", wp_titles) + + if to_print: + print("wp titles", wp_titles) # adding aliases with prior probabilities with open(PRIOR_PROB, mode='r', encoding='utf8') as prior_file: @@ -94,9 +106,6 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ): if selected_entities: kb.add_alias(alias=previous_alias, entities=selected_entities, probabilities=prior_probs) - print("analysed", previous_alias, "with entities", entities, "and counts", counts) - print("added", previous_alias, "with selected entities", selected_entities, "and probs", prior_probs) - print() total_count = 0 counts = list() entities = list() @@ -110,8 +119,8 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ): line = prior_file.readline() - print() - print("added", kb.get_size_aliases(), "aliases:", kb.get_alias_strings()) + if to_print: + print("added", kb.get_size_aliases(), "aliases:", kb.get_alias_strings()) def _read_wikidata(limit=None, to_print=False): @@ -141,7 +150,7 @@ def _read_wikidata(limit=None, to_print=False): claim_property = claims.get(prop, None) if claim_property: for cp in claim_property: - cp_id = cp['mainsnak']['datavalue']['value']['id'] + cp_id = cp['mainsnak'].get('datavalue', {}).get('value', {}).get('id') if cp_id in value_set: keep = True @@ -383,7 +392,7 @@ def add_el(kb, nlp): text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \ "Douglas reminds us to always bring our towel. " \ - "The main character in Doug's novel is called Arthur Dent." + "The main character in Doug's novel is the man Arthur Dent, but Douglas doesn't write about George Washington." doc = nlp(text) print() @@ -406,14 +415,17 @@ def capitalize_first(text): if __name__ == "__main__": # STEP 1 : create prior probabilities from WP # run only once ! 
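    # STEP 1 writes the alias|count|entity frequencies to PRIOR_PROB; _add_aliases reads them back in STEP 2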
- _read_wikipedia_prior_probs() + # _read_wikipedia_prior_probs() # STEP 2 : create KB - # nlp = spacy.load('en_core_web_sm') - # my_kb = create_kb(nlp.vocab, max_entities_per_alias=10, min_occ=5) - # add_el(my_kb, nlp) + nlp = spacy.load('en_core_web_sm') + my_kb = create_kb(nlp.vocab, max_entities_per_alias=10, min_occ=5, to_print=True) - # clean_text = "[[File:smomething]] jhk" - # clean_text = re.sub(r'\[\[Category:[^\[]*]]', '', clean_text) - # clean_text = re.sub(r'\[\[File:[^\[]*]]', '', clean_text) - # print(clean_text) + # STEP 3 : write KB to file + # TODO + + # STEP 4 : read KB back in from file + # TODO + + # STEP 5 : actually use the EL functionality + add_el(my_kb, nlp) From 8e70a564f11f70b8e1d8acd7b2639562394d7455 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 23 Apr 2019 16:33:40 +0200 Subject: [PATCH 011/148] custom reader and writer for _EntryC fields (first stab at it - not complete) --- examples/pipeline/wikidata_entity_linking.py | 16 ++- spacy/kb.pxd | 14 +++ spacy/kb.pyx | 106 +++++++++++++++++++ 3 files changed, 133 insertions(+), 3 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index a02226f9f..84e8066e2 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -16,6 +16,8 @@ ENWIKI_DUMP = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-art ENWIKI_INDEX = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream-index.txt.bz2' PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv' +KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb' + # these will/should be matched ignoring case wiki_namespaces = ["b", "betawikiversity", "Book", "c", "Category", "Commons", @@ -418,14 +420,22 @@ if __name__ == "__main__": # _read_wikipedia_prior_probs() # STEP 2 : create KB - nlp = spacy.load('en_core_web_sm') - my_kb = create_kb(nlp.vocab, max_entities_per_alias=10, min_occ=5, to_print=True) + # nlp = spacy.load('en_core_web_sm') + # my_kb = create_kb(nlp.vocab, max_entities_per_alias=10, min_occ=5, to_print=True) # STEP 3 : write KB to file # TODO + nlp = spacy.load('en_core_web_sm') + kb = KnowledgeBase(vocab=nlp.vocab) + kb.dump(KB_FILE) + print("DUMPED") + kb.load(KB_FILE) + print("LOADED") + + # PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv' # STEP 4 : read KB back in from file # TODO # STEP 5 : actually use the EL functionality - add_el(my_kb, nlp) + # add_el(my_kb, nlp) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index 3cdf1e07e..eab947b66 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -7,6 +7,8 @@ from libc.stdint cimport int32_t, int64_t from spacy.vocab cimport Vocab from .typedefs cimport hash_t +from libc.stdio cimport FILE + # Internal struct, for storage and disambiguation. This isn't what we return # to the user as the answer to "here's your entity". 
It's the minimum number @@ -158,3 +160,15 @@ cdef class KnowledgeBase: self._aliases_table.push_back(alias) +cdef class Writer: + cdef FILE* _fp + + cdef int write(self, int64_t entry_id, hash_t entity_hash, float prob) except -1 + + +cdef class Reader: + cdef FILE* _fp + cdef public int32_t nr_feat + + cdef int read(self, Pool mem, int64_t* entry_id, hash_t* entity_hash, float* prob) except -1 + diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 8a1710a9c..207231c99 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -1,8 +1,23 @@ # cython: infer_types=True # cython: profile=True # coding: utf8 +from collections import OrderedDict +from cpython.exc cimport PyErr_CheckSignals + +from spacy import util from spacy.errors import Errors, Warnings, user_warning +from cpython.mem cimport PyMem_Malloc +from cpython.exc cimport PyErr_SetFromErrno + +from libc.stdio cimport FILE, fopen, fclose, fread, fwrite, feof, fseek +from libc.stdint cimport int32_t, int64_t +from libc.stdlib cimport qsort + +from .typedefs cimport hash_t + +from os import path + cdef class Candidate: @@ -139,3 +154,94 @@ cdef class KnowledgeBase: prior_prob=prob) for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs) if entry_index != 0] + + + def dump(self, loc): + # TODO: actually dump the data in this KB :-) + + cdef int64_t entry_id = 32 + self.vocab.strings.add("Q342") + cdef hash_t entity_hash = self.vocab.strings["Q342"] + cdef float prob = 0.333 + + cdef Writer writer = Writer(loc) + writer.write(entry_id, entity_hash, prob) + writer.close() + + def load(self, loc): + cdef int64_t entry_id + cdef hash_t entity_hash + cdef float prob + + cdef Reader reader = Reader(loc) + reader.read(self.mem, &entry_id, &entity_hash, &prob) + + cdef _EntryC entry + entry.entity_hash = entity_hash + entry.prob = prob + + # TODO + cdef int32_t dummy_value = 342 + entry.vector_rows = &dummy_value + entry.feats_row = dummy_value + +cdef class Writer: + def __init__(self, object loc): + if path.exists(loc): + assert not path.isdir(loc), "%s is directory." 
% loc + cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc + self._fp = fopen(bytes_loc, 'wb') + assert self._fp != NULL + fseek(self._fp, 0, 0) + + def close(self): + cdef size_t status = fclose(self._fp) + assert status == 0 + + cdef int write(self, int64_t entry_id, hash_t entry_hash, float entry_prob) except -1: + cdef int i = 0 + + # TODO: feats_rows and vector rows + + _write(&entry_id, sizeof(entry_id), self._fp) + _write(&entry_hash, sizeof(entry_hash), self._fp) + _write(&entry_prob, sizeof(entry_prob), self._fp) + + +cdef int _write(void* value, size_t size, FILE* fp) except -1: + status = fwrite(value, size, 1, fp) + assert status == 1, status + + +cdef class Reader: + def __init__(self, object loc): + assert path.exists(loc) + assert not path.isdir(loc) + cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc + self._fp = fopen(bytes_loc, 'rb') + if not self._fp: + PyErr_SetFromErrno(IOError) + status = fseek(self._fp, 0, 0) # this can be 0 if there is no header + + def __dealloc__(self): + fclose(self._fp) + + cdef int read(self, Pool mem, int64_t* entry_id, hash_t* entity_hash, float* prob) except -1: + status = fread(entry_id, sizeof(entry_id), 1, self._fp) + if status < 1: + if feof(self._fp): + return 0 # end of file + raise IOError("error reading entry ID from input file") + + #status = fread(&entity_hash, sizeof(entity_hash), 1, self._fp) + status = fread(entity_hash, sizeof(entity_hash), 1, self._fp) + if status < 1: + if feof(self._fp): + return 0 # end of file + raise IOError("error reading entity hash from input file") + + status = fread(prob, sizeof(prob), 1, self._fp) + if status < 1: + if feof(self._fp): + return 0 # end of file + raise IOError("error reading entity prob from input file") From 694fea597aedd75ea9e045fd12268aba3ffd171d Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 23 Apr 2019 18:36:50 +0200 Subject: [PATCH 012/148] dumping all entryC entries + (inefficient) reading back in --- examples/pipeline/wikidata_entity_linking.py | 18 ++++-- spacy/kb.pxd | 1 - spacy/kb.pyx | 64 +++++++++++++------- 3 files changed, 53 insertions(+), 30 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 84e8066e2..db8d4577c 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -426,16 +426,22 @@ if __name__ == "__main__": # STEP 3 : write KB to file # TODO nlp = spacy.load('en_core_web_sm') - kb = KnowledgeBase(vocab=nlp.vocab) - kb.dump(KB_FILE) - print("DUMPED") - kb.load(KB_FILE) - print("LOADED") + kb1 = KnowledgeBase(vocab=nlp.vocab) - # PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv' + kb1.add_entity(entity="Q53", prob=0.33) + kb1.add_entity(entity="Q17", prob=0.1) + kb1.add_entity(entity="Q007", prob=0.7) + kb1.add_entity(entity="Q44", prob=0.4) + print("kb1 size:", len(kb1), kb1.get_size_entities(), kb1.get_size_aliases()) + + kb1.dump(KB_FILE) # STEP 4 : read KB back in from file # TODO + kb2 = KnowledgeBase(vocab=nlp.vocab) + kb2.load(KB_FILE) + print("kb2 size:", len(kb2), kb2.get_size_entities(), kb2.get_size_aliases()) + # STEP 5 : actually use the EL functionality # add_el(my_kb, nlp) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index eab947b66..c655c6bff 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -168,7 +168,6 @@ cdef class Writer: cdef class Reader: cdef FILE* _fp - cdef public int32_t nr_feat cdef int read(self, Pool mem, int64_t* entry_id, hash_t* 
entity_hash, float* prob) except -1 diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 207231c99..4ec910b03 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -157,33 +157,45 @@ cdef class KnowledgeBase: def dump(self, loc): - # TODO: actually dump the data in this KB :-) - - cdef int64_t entry_id = 32 - self.vocab.strings.add("Q342") - cdef hash_t entity_hash = self.vocab.strings["Q342"] - cdef float prob = 0.333 - cdef Writer writer = Writer(loc) - writer.write(entry_id, entity_hash, prob) + + for key, entry_index in self._entry_index.items(): + entry = self._entries[entry_index] + print("dumping") + print("index", entry_index) + print("hash", entry.entity_hash) + print("prob", entry.prob) + print("") + writer.write(entry_index, entry.entity_hash, entry.prob) + writer.close() def load(self, loc): cdef int64_t entry_id cdef hash_t entity_hash cdef float prob + cdef _EntryC entry + cdef int32_t dummy_value = 342 cdef Reader reader = Reader(loc) - reader.read(self.mem, &entry_id, &entity_hash, &prob) + result = reader.read(self.mem, &entry_id, &entity_hash, &prob) # -1: error, 0: eof after this one + while result: + print("loading") + print("entryID", entry_id) + print("hash", entity_hash) + print("prob", prob) + print("result:", result) + print("") + entry.entity_hash = entity_hash + entry.prob = prob - cdef _EntryC entry - entry.entity_hash = entity_hash - entry.prob = prob + # TODO features and vectors + entry.vector_rows = &dummy_value + entry.feats_row = dummy_value - # TODO - cdef int32_t dummy_value = 342 - entry.vector_rows = &dummy_value - entry.feats_row = dummy_value + # TODO: use set instead of push_back to ensure the index remains the same? + self._entries.push_back(entry) + result = reader.read(self.mem, &entry_id, &entity_hash, &prob) cdef class Writer: def __init__(self, object loc): @@ -199,10 +211,7 @@ cdef class Writer: assert status == 0 cdef int write(self, int64_t entry_id, hash_t entry_hash, float entry_prob) except -1: - cdef int i = 0 - # TODO: feats_rows and vector rows - _write(&entry_id, sizeof(entry_id), self._fp) _write(&entry_hash, sizeof(entry_hash), self._fp) _write(&entry_prob, sizeof(entry_prob), self._fp) @@ -227,21 +236,30 @@ cdef class Reader: fclose(self._fp) cdef int read(self, Pool mem, int64_t* entry_id, hash_t* entity_hash, float* prob) except -1: - status = fread(entry_id, sizeof(entry_id), 1, self._fp) + """ + Return values: + -1: error during current read (EOF during call) + 0: means we read the last line succesfully (EOF after call) + 1: we can continue reading this file """ + status = fread(entry_id, sizeof(int64_t), 1, self._fp) if status < 1: if feof(self._fp): return 0 # end of file raise IOError("error reading entry ID from input file") - #status = fread(&entity_hash, sizeof(entity_hash), 1, self._fp) - status = fread(entity_hash, sizeof(entity_hash), 1, self._fp) + status = fread(entity_hash, sizeof(hash_t), 1, self._fp) if status < 1: if feof(self._fp): return 0 # end of file raise IOError("error reading entity hash from input file") - status = fread(prob, sizeof(prob), 1, self._fp) + status = fread(prob, sizeof(float), 1, self._fp) if status < 1: if feof(self._fp): return 0 # end of file raise IOError("error reading entity prob from input file") + + if feof(self._fp): + return 0 + else: + return 1 From 6e3223f23494a8c3361290a748de39f5768438d4 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 24 Apr 2019 11:26:38 +0200 Subject: [PATCH 013/148] bulk loading in proper order of entity indices --- examples/pipeline/wikidata_entity_linking.py | 13 
++-- spacy/kb.pxd | 57 +++++------------ spacy/kb.pyx | 65 +++++++++++++------- spacy/structs.pxd | 37 +++++++++++ 4 files changed, 100 insertions(+), 72 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index db8d4577c..674c6166c 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -424,9 +424,8 @@ if __name__ == "__main__": # my_kb = create_kb(nlp.vocab, max_entities_per_alias=10, min_occ=5, to_print=True) # STEP 3 : write KB to file - # TODO - nlp = spacy.load('en_core_web_sm') - kb1 = KnowledgeBase(vocab=nlp.vocab) + nlp1 = spacy.load('en_core_web_sm') + kb1 = KnowledgeBase(vocab=nlp1.vocab) kb1.add_entity(entity="Q53", prob=0.33) kb1.add_entity(entity="Q17", prob=0.1) @@ -437,11 +436,11 @@ if __name__ == "__main__": kb1.dump(KB_FILE) # STEP 4 : read KB back in from file - # TODO - kb2 = KnowledgeBase(vocab=nlp.vocab) - kb2.load(KB_FILE) - print("kb2 size:", len(kb2), kb2.get_size_entities(), kb2.get_size_aliases()) + nlp3 = spacy.load('en_core_web_sm') + kb3 = KnowledgeBase(vocab=nlp3.vocab) + kb3.load_bulk(7, KB_FILE) + print("kb3 size:", len(kb3), kb3.get_size_entities(), kb3.get_size_aliases()) # STEP 5 : actually use the EL functionality # add_el(my_kb, nlp) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index c655c6bff..817b7ff25 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -1,48 +1,17 @@ """Knowledge-base for entity or concept linking.""" from cymem.cymem cimport Pool from preshed.maps cimport PreshMap + from libcpp.vector cimport vector from libc.stdint cimport int32_t, int64_t +from libc.stdio cimport FILE from spacy.vocab cimport Vocab from .typedefs cimport hash_t -from libc.stdio cimport FILE - - -# Internal struct, for storage and disambiguation. This isn't what we return -# to the user as the answer to "here's your entity". It's the minimum number -# of bits we need to keep track of the answers. -cdef struct _EntryC: - - # The hash of this entry's unique ID/name in the kB - hash_t entity_hash - - # Allows retrieval of one or more vectors. - # Each element of vector_rows should be an index into a vectors table. - # Every entry should have the same number of vectors, so we can avoid storing - # the number of vectors in each knowledge-base struct - int32_t* vector_rows - - # Allows retrieval of a struct of non-vector features. We could make this a - # pointer, but we have 32 bits left over in the struct after prob, so we'd - # like this to only be 32 bits. We can also set this to -1, for the common - # case where there are no features. - int32_t feats_row - - # log probability of entity, based on corpus frequency - float prob - - -# Each alias struct stores a list of Entry pointers with their prior probabilities -# for this specific mention/alias. -cdef struct _AliasC: - - # All entry candidates for this alias - vector[int64_t] entry_indices - - # Prior probability P(entity|alias) - should sum up to (at most) 1. - vector[float] probs +from .structs cimport EntryC, AliasC +ctypedef vector[EntryC] entry_vec +ctypedef vector[AliasC] alias_vec # Object used by the Entity Linker that summarizes one entity-alias candidate combination. @@ -68,7 +37,7 @@ cdef class KnowledgeBase: # over allocation. # In total we end up with (N*128*1.3)+(N*128*1.3) bits for N entries. # Storing 1m entries would take 41.6mb under this scheme. 
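
# --- illustrative aside (plain Python, hypothetical names, not spaCy API) ---
# The hash -> index -> struct layout described above can be mimicked with a
# dict standing in for the PreshMap and a list standing in for the vector of
# entry structs; slot 0 stays reserved for a dummy entry because the map
# cannot store 0 as a value. A minimal sketch of that idea:

class ToyKB:
    def __init__(self):
        self._entry_index = {}           # entity_hash -> position in _entries
        self._entries = [None]           # position 0 is the reserved dummy slot

    def add_entity(self, entity_hash, prob):
        new_index = len(self._entries)   # where this entry will sit
        self._entries.append({"entity_hash": entity_hash, "prob": prob})
        self._entry_index[entity_hash] = new_index
        return new_index

    def get_entry(self, entity_hash):
        index = self._entry_index.get(entity_hash, 0)
        return self._entries[index]      # None if the hash is unknown
# ---------------------------------------------------------------------------
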
- cdef vector[_EntryC] _entries + cdef entry_vec _entries # This maps 64bit keys (hash of unique alias string) # to 64bit values (position of the _AliasC struct in the _aliases_table vector). @@ -78,7 +47,7 @@ cdef class KnowledgeBase: # should be P(entity | mention), which is pretty important to know. # We can pack both pieces of information into a 64-bit value, to keep things # efficient. - cdef vector[_AliasC] _aliases_table + cdef alias_vec _aliases_table # This is the part which might take more space: storing various # categorical features for the entries, and storing vectors for disambiguation @@ -98,6 +67,7 @@ cdef class KnowledgeBase: # optional data, we can let users configure a DB as the backend for this. cdef object _features_table + cdef inline int64_t c_add_entity(self, hash_t entity_hash, float prob, int32_t* vector_rows, int feats_row) nogil: """Add an entry to the vector of entries. @@ -107,7 +77,7 @@ cdef class KnowledgeBase: cdef int64_t new_index = self._entries.size() # Avoid struct initializer to enable nogil, cf https://github.com/cython/cython/issues/1642 - cdef _EntryC entry + cdef EntryC entry entry.entity_hash = entity_hash entry.vector_rows = vector_rows entry.feats_row = feats_row @@ -124,7 +94,7 @@ cdef class KnowledgeBase: cdef int64_t new_index = self._aliases_table.size() # Avoid struct initializer to enable nogil - cdef _AliasC alias + cdef AliasC alias alias.entry_indices = entry_indices alias.probs = probs @@ -140,7 +110,7 @@ cdef class KnowledgeBase: cdef int32_t dummy_value = 0 # Avoid struct initializer to enable nogil - cdef _EntryC entry + cdef EntryC entry entry.entity_hash = dummy_hash entry.vector_rows = &dummy_value entry.feats_row = dummy_value @@ -152,20 +122,21 @@ cdef class KnowledgeBase: cdef vector[float] dummy_probs dummy_probs.push_back(0) - cdef _AliasC alias + cdef AliasC alias alias.entry_indices = dummy_entry_indices alias.probs = dummy_probs self._entries.push_back(entry) self._aliases_table.push_back(alias) + cpdef load_bulk(self, int nr_entities, loc) + cdef class Writer: cdef FILE* _fp cdef int write(self, int64_t entry_id, hash_t entity_hash, float prob) except -1 - cdef class Reader: cdef FILE* _fp diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 4ec910b03..c967654d3 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -7,6 +7,9 @@ from cpython.exc cimport PyErr_CheckSignals from spacy import util from spacy.errors import Errors, Warnings, user_warning +from cymem.cymem cimport Pool +from preshed.maps cimport PreshMap + from cpython.mem cimport PyMem_Malloc from cpython.exc cimport PyErr_SetFromErrno @@ -17,6 +20,8 @@ from libc.stdlib cimport qsort from .typedefs cimport hash_t from os import path +from libcpp.vector cimport vector + cdef class Candidate: @@ -53,7 +58,6 @@ cdef class Candidate: cdef class KnowledgeBase: - def __init__(self, Vocab vocab): self.vocab = vocab self.mem = Pool() @@ -67,13 +71,13 @@ cdef class KnowledgeBase: return self.get_size_entities() def get_size_entities(self): - return self._entries.size() - 1 # not counting dummy element on index 0 + return len(self._entry_index) def get_entity_strings(self): return [self.vocab.strings[x] for x in self._entry_index][1:] # removing the dummy element on index 0 def get_size_aliases(self): - return self._aliases_table.size() - 1 # not counting dummy element on index + return len(self._alias_index) def get_alias_strings(self): return [self.vocab.strings[x] for x in self._alias_index][1:] # removing the dummy element on index 0 @@ -159,33 +163,44 @@ cdef class 
KnowledgeBase: def dump(self, loc): cdef Writer writer = Writer(loc) - for key, entry_index in self._entry_index.items(): + # dumping the entry records in the order in which they are in the _entries vector. + # index 0 is a dummy object not stored in the _entry_index and can be ignored. + i = 1 + for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]): entry = self._entries[entry_index] print("dumping") print("index", entry_index) print("hash", entry.entity_hash) + assert entry.entity_hash == entry_hash + assert entry_index == i print("prob", entry.prob) print("") writer.write(entry_index, entry.entity_hash, entry.prob) + i = i+1 writer.close() - def load(self, loc): + cpdef load_bulk(self, int nr_entities, loc): + # TODO: nr_entities from header in file (Reader constructor) cdef int64_t entry_id cdef hash_t entity_hash cdef float prob - cdef _EntryC entry + cdef EntryC entry cdef int32_t dummy_value = 342 cdef Reader reader = Reader(loc) - result = reader.read(self.mem, &entry_id, &entity_hash, &prob) # -1: error, 0: eof after this one - while result: - print("loading") - print("entryID", entry_id) - print("hash", entity_hash) - print("prob", prob) - print("result:", result) - print("") + to_read = self.get_size_entities() + + self._entry_index = PreshMap(nr_entities+1) + self._entries = entry_vec(nr_entities+1) + + # we assume the data was written in sequence + # index 0 is a dummy object not stored in the _entry_index and can be ignored. + # TODO: should we initialize the dummy objects ? + cdef int i = 1 + while reader.read(self.mem, &entry_id, &entity_hash, &prob) and i <= nr_entities: + assert i == entry_id + entry.entity_hash = entity_hash entry.prob = prob @@ -193,9 +208,18 @@ cdef class KnowledgeBase: entry.vector_rows = &dummy_value entry.feats_row = dummy_value - # TODO: use set instead of push_back to ensure the index remains the same? - self._entries.push_back(entry) - result = reader.read(self.mem, &entry_id, &entity_hash, &prob) + print("bulk loading") + print("i", i) + print("entryID", entry_id) + print("hash", entry.entity_hash) + print("prob", entry.prob) + print("") + + self._entries[i] = entry + self._entry_index[entity_hash] = i + + i += 1 + cdef class Writer: def __init__(self, object loc): @@ -236,11 +260,6 @@ cdef class Reader: fclose(self._fp) cdef int read(self, Pool mem, int64_t* entry_id, hash_t* entity_hash, float* prob) except -1: - """ - Return values: - -1: error during current read (EOF during call) - 0: means we read the last line succesfully (EOF after call) - 1: we can continue reading this file """ status = fread(entry_id, sizeof(int64_t), 1, self._fp) if status < 1: if feof(self._fp): @@ -263,3 +282,5 @@ cdef class Reader: return 0 else: return 1 + + diff --git a/spacy/structs.pxd b/spacy/structs.pxd index 154202c0d..69a1f4961 100644 --- a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -3,6 +3,10 @@ from libc.stdint cimport uint8_t, uint32_t, int32_t, uint64_t from .typedefs cimport flags_t, attr_t, hash_t from .parts_of_speech cimport univ_pos_t +from libcpp.vector cimport vector +from libc.stdint cimport int32_t, int64_t + + cdef struct LexemeC: flags_t flags @@ -72,3 +76,36 @@ cdef struct TokenC: attr_t ent_type # TODO: Is there a better way to do this? Multiple sources of truth.. attr_t ent_kb_id hash_t ent_id + + +# Internal struct, for storage and disambiguation of entities. +cdef struct EntryC: + + # The hash of this entry's unique ID/name in the kB + hash_t entity_hash + + # Allows retrieval of one or more vectors. 
+ # Each element of vector_rows should be an index into a vectors table. + # Every entry should have the same number of vectors, so we can avoid storing + # the number of vectors in each knowledge-base struct + int32_t* vector_rows + + # Allows retrieval of a struct of non-vector features. We could make this a + # pointer, but we have 32 bits left over in the struct after prob, so we'd + # like this to only be 32 bits. We can also set this to -1, for the common + # case where there are no features. + int32_t feats_row + + # log probability of entity, based on corpus frequency + float prob + + +# Each alias struct stores a list of Entry pointers with their prior probabilities +# for this specific mention/alias. +cdef struct AliasC: + + # All entry candidates for this alias + vector[int64_t] entry_indices + + # Prior probability P(entity|alias) - should sum up to (at most) 1. + vector[float] probs From ad6c5e581cd4a99300102e68cb6bdd463b51d380 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 24 Apr 2019 15:31:44 +0200 Subject: [PATCH 014/148] writing and reading number of entries to/from header --- examples/pipeline/wikidata_entity_linking.py | 6 +- spacy/kb.pxd | 10 ++- spacy/kb.pyx | 65 ++++++++++---------- 3 files changed, 46 insertions(+), 35 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 674c6166c..8628c54a9 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -432,6 +432,7 @@ if __name__ == "__main__": kb1.add_entity(entity="Q007", prob=0.7) kb1.add_entity(entity="Q44", prob=0.4) print("kb1 size:", len(kb1), kb1.get_size_entities(), kb1.get_size_aliases()) + print("dumping kb1") kb1.dump(KB_FILE) @@ -439,7 +440,10 @@ if __name__ == "__main__": nlp3 = spacy.load('en_core_web_sm') kb3 = KnowledgeBase(vocab=nlp3.vocab) - kb3.load_bulk(7, KB_FILE) + + kb3.load_bulk(KB_FILE) + + print("loading kb3") print("kb3 size:", len(kb3), kb3.get_size_entities(), kb3.get_size_aliases()) # STEP 5 : actually use the EL functionality diff --git a/spacy/kb.pxd b/spacy/kb.pxd index 817b7ff25..9c393e5f2 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -129,16 +129,20 @@ cdef class KnowledgeBase: self._entries.push_back(entry) self._aliases_table.push_back(alias) - cpdef load_bulk(self, int nr_entities, loc) + cpdef load_bulk(self, loc) cdef class Writer: cdef FILE* _fp - cdef int write(self, int64_t entry_id, hash_t entity_hash, float prob) except -1 + cdef int write_header(self, int64_t nr_entries) except -1 + cdef int write_entry(self, int64_t entry_id, hash_t entry_hash, float entry_prob) except -1 + cdef int _write(self, void* value, size_t size) except -1 cdef class Reader: cdef FILE* _fp - cdef int read(self, Pool mem, int64_t* entry_id, hash_t* entity_hash, float* prob) except -1 + cdef int read_header(self, int64_t* nr_entries) except -1 + cdef int read_entry(self, int64_t* entry_id, hash_t* entity_hash, float* prob) except -1 + cdef int _read(self, void* value, size_t size) except -1 diff --git a/spacy/kb.pyx b/spacy/kb.pyx index c967654d3..21c6d9049 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -64,6 +64,8 @@ cdef class KnowledgeBase: self._entry_index = PreshMap() self._alias_index = PreshMap() + # TODO initialize self._entries and self._aliases_table ? 
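
# --- illustrative aside (plain Python, made-up hash values) -----------------
# The dump()/load_bulk() changes below write the entries sorted by their
# position in the entries vector, so the loader can rebuild both the vector
# and the hash -> index map by simple enumeration, keeping index 0 free for
# the dummy element. A tiny sketch of why the index order round-trips:

entry_index = {"hash_a": 1, "hash_b": 2, "hash_c": 3}   # hypothetical hashes
dump_order = [h for h, i in sorted(entry_index.items(), key=lambda x: x[1])]
reloaded = {h: i for i, h in enumerate(dump_order, start=1)}
assert reloaded == entry_index
# ---------------------------------------------------------------------------
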
+ self.vocab.strings.add("") self._create_empty_vectors(dummy_hash=self.vocab.strings[""]) @@ -162,26 +164,21 @@ cdef class KnowledgeBase: def dump(self, loc): cdef Writer writer = Writer(loc) + writer.write_header(self.get_size_entities()) # dumping the entry records in the order in which they are in the _entries vector. # index 0 is a dummy object not stored in the _entry_index and can be ignored. i = 1 for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]): entry = self._entries[entry_index] - print("dumping") - print("index", entry_index) - print("hash", entry.entity_hash) assert entry.entity_hash == entry_hash assert entry_index == i - print("prob", entry.prob) - print("") - writer.write(entry_index, entry.entity_hash, entry.prob) + writer.write_entry(entry_index, entry.entity_hash, entry.prob) i = i+1 writer.close() - cpdef load_bulk(self, int nr_entities, loc): - # TODO: nr_entities from header in file (Reader constructor) + cpdef load_bulk(self, loc): cdef int64_t entry_id cdef hash_t entity_hash cdef float prob @@ -189,7 +186,8 @@ cdef class KnowledgeBase: cdef int32_t dummy_value = 342 cdef Reader reader = Reader(loc) - to_read = self.get_size_entities() + cdef int64_t nr_entities + reader.read_header(&nr_entities) self._entry_index = PreshMap(nr_entities+1) self._entries = entry_vec(nr_entities+1) @@ -198,23 +196,15 @@ cdef class KnowledgeBase: # index 0 is a dummy object not stored in the _entry_index and can be ignored. # TODO: should we initialize the dummy objects ? cdef int i = 1 - while reader.read(self.mem, &entry_id, &entity_hash, &prob) and i <= nr_entities: + while reader.read_entry(&entry_id, &entity_hash, &prob) and i <= nr_entities: assert i == entry_id + # TODO features and vectors entry.entity_hash = entity_hash entry.prob = prob - - # TODO features and vectors entry.vector_rows = &dummy_value entry.feats_row = dummy_value - print("bulk loading") - print("i", i) - print("entryID", entry_id) - print("hash", entry.entity_hash) - print("prob", entry.prob) - print("") - self._entries[i] = entry self._entry_index[entity_hash] = i @@ -234,16 +224,18 @@ cdef class Writer: cdef size_t status = fclose(self._fp) assert status == 0 - cdef int write(self, int64_t entry_id, hash_t entry_hash, float entry_prob) except -1: + cdef int write_header(self, int64_t nr_entries) except -1: + self._write(&nr_entries, sizeof(nr_entries)) + + cdef int write_entry(self, int64_t entry_id, hash_t entry_hash, float entry_prob) except -1: # TODO: feats_rows and vector rows - _write(&entry_id, sizeof(entry_id), self._fp) - _write(&entry_hash, sizeof(entry_hash), self._fp) - _write(&entry_prob, sizeof(entry_prob), self._fp) + self._write(&entry_id, sizeof(entry_id)) + self._write(&entry_hash, sizeof(entry_hash)) + self._write(&entry_prob, sizeof(entry_prob)) - -cdef int _write(void* value, size_t size, FILE* fp) except -1: - status = fwrite(value, size, 1, fp) - assert status == 1, status + cdef int _write(self, void* value, size_t size) except -1: + status = fwrite(value, size, 1, self._fp) + assert status == 1, status cdef class Reader: @@ -259,20 +251,27 @@ cdef class Reader: def __dealloc__(self): fclose(self._fp) - cdef int read(self, Pool mem, int64_t* entry_id, hash_t* entity_hash, float* prob) except -1: - status = fread(entry_id, sizeof(int64_t), 1, self._fp) + cdef int read_header(self, int64_t* nr_entries) except -1: + status = self._read(nr_entries, sizeof(int64_t)) + if status < 1: + if feof(self._fp): + return 0 # end of file + raise IOError("error 
reading header from input file") + + cdef int read_entry(self, int64_t* entry_id, hash_t* entity_hash, float* prob) except -1: + status = self._read(entry_id, sizeof(int64_t)) if status < 1: if feof(self._fp): return 0 # end of file raise IOError("error reading entry ID from input file") - status = fread(entity_hash, sizeof(hash_t), 1, self._fp) + status = self._read(entity_hash, sizeof(hash_t)) if status < 1: if feof(self._fp): return 0 # end of file raise IOError("error reading entity hash from input file") - status = fread(prob, sizeof(float), 1, self._fp) + status = self._read(prob, sizeof(float)) if status < 1: if feof(self._fp): return 0 # end of file @@ -283,4 +282,8 @@ cdef class Reader: else: return 1 + cdef int _read(self, void* value, size_t size) except -1: + status = fread(value, size, 1, self._fp) + return status + From 3e0cb690653fa5fa6ebdc094d4cb65a4084578d0 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 24 Apr 2019 20:24:24 +0200 Subject: [PATCH 015/148] KB aliases to and from file --- examples/pipeline/wikidata_entity_linking.py | 23 +++- spacy/kb.pxd | 14 +- spacy/kb.pyx | 129 ++++++++++++++++--- 3 files changed, 141 insertions(+), 25 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 8628c54a9..a8a3eec1e 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -425,26 +425,37 @@ if __name__ == "__main__": # STEP 3 : write KB to file nlp1 = spacy.load('en_core_web_sm') - kb1 = KnowledgeBase(vocab=nlp1.vocab) + my_vocab = nlp1.vocab + kb1 = KnowledgeBase(vocab=my_vocab) kb1.add_entity(entity="Q53", prob=0.33) kb1.add_entity(entity="Q17", prob=0.1) kb1.add_entity(entity="Q007", prob=0.7) kb1.add_entity(entity="Q44", prob=0.4) - print("kb1 size:", len(kb1), kb1.get_size_entities(), kb1.get_size_aliases()) - print("dumping kb1") + kb1.add_alias(alias="double07", entities=["Q007", "Q17"], probabilities=[0.9, 0.1]) + kb1.add_alias(alias="guy", entities=["Q53", "Q007", "Q17", "Q44"], probabilities=[0.3, 0.3, 0.2, 0.1]) + kb1.add_alias(alias="random", entities=["Q007"], probabilities=[1.0]) + print("kb1 size:", len(kb1), kb1.get_size_entities(), kb1.get_size_aliases()) + print("kb1 entities:", kb1.get_entity_strings()) + print("kb1 aliases:", kb1.get_alias_strings()) + + print() + print("dumping kb1") kb1.dump(KB_FILE) # STEP 4 : read KB back in from file nlp3 = spacy.load('en_core_web_sm') - kb3 = KnowledgeBase(vocab=nlp3.vocab) - - kb3.load_bulk(KB_FILE) + kb3 = KnowledgeBase(vocab=my_vocab) print("loading kb3") + kb3.load_bulk(KB_FILE) + + print() print("kb3 size:", len(kb3), kb3.get_size_entities(), kb3.get_size_aliases()) + print("kb3 entities:", kb3.get_entity_strings()) + print("kb3 aliases:", kb3.get_alias_strings()) # STEP 5 : actually use the EL functionality # add_el(my_kb, nlp) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index 9c393e5f2..5f7bfa46c 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -136,13 +136,23 @@ cdef class Writer: cdef FILE* _fp cdef int write_header(self, int64_t nr_entries) except -1 - cdef int write_entry(self, int64_t entry_id, hash_t entry_hash, float entry_prob) except -1 + cdef int write_entry(self, hash_t entry_hash, float entry_prob) except -1 + + cdef int write_alias_length(self, int64_t alias_length) except -1 + cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1 + cdef int write_alias(self, int64_t entry_index, float prob) except -1 + cdef int _write(self, void* value, 
size_t size) except -1 cdef class Reader: cdef FILE* _fp cdef int read_header(self, int64_t* nr_entries) except -1 - cdef int read_entry(self, int64_t* entry_id, hash_t* entity_hash, float* prob) except -1 + cdef int read_entry(self, hash_t* entity_hash, float* prob) except -1 + + cdef int read_alias_length(self, int64_t* alias_length) except -1 + cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1 + cdef int read_alias(self, int64_t* entry_index, float* prob) except -1 + cdef int _read(self, void* value, size_t size) except -1 diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 21c6d9049..f3d5ecaa9 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -76,13 +76,13 @@ cdef class KnowledgeBase: return len(self._entry_index) def get_entity_strings(self): - return [self.vocab.strings[x] for x in self._entry_index][1:] # removing the dummy element on index 0 + return [self.vocab.strings[x] for x in self._entry_index] def get_size_aliases(self): return len(self._alias_index) def get_alias_strings(self): - return [self.vocab.strings[x] for x in self._alias_index][1:] # removing the dummy element on index 0 + return [self.vocab.strings[x] for x in self._alias_index] def add_entity(self, unicode entity, float prob=0.5, vectors=None, features=None): """ @@ -173,31 +173,52 @@ cdef class KnowledgeBase: entry = self._entries[entry_index] assert entry.entity_hash == entry_hash assert entry_index == i - writer.write_entry(entry_index, entry.entity_hash, entry.prob) + writer.write_entry(entry.entity_hash, entry.prob) + i = i+1 + + writer.write_alias_length(self.get_size_aliases()) + + # dumping the aliases in the order in which they are in the _alias_index vector. + # index 0 is a dummy object not stored in the _aliases_table and can be ignored. + i = 1 + for alias_hash, alias_index in sorted(self._alias_index.items(), key=lambda x: x[1]): + alias = self._aliases_table[alias_index] + assert alias_index == i + + candidate_length = len(alias.entry_indices) + writer.write_alias_header(alias_hash, candidate_length) + + for j in range(0, candidate_length): + writer.write_alias(alias.entry_indices[j], alias.probs[j]) + i = i+1 writer.close() cpdef load_bulk(self, loc): - cdef int64_t entry_id cdef hash_t entity_hash + cdef hash_t alias_hash + cdef int64_t entry_index cdef float prob cdef EntryC entry + cdef AliasC alias cdef int32_t dummy_value = 342 cdef Reader reader = Reader(loc) + + # Step 1: load entities + cdef int64_t nr_entities reader.read_header(&nr_entities) - self._entry_index = PreshMap(nr_entities+1) self._entries = entry_vec(nr_entities+1) - # we assume the data was written in sequence + # we assume that the entity data was written in sequence # index 0 is a dummy object not stored in the _entry_index and can be ignored. # TODO: should we initialize the dummy objects ? 
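
# --- illustrative aside: the on-disk layout, sketched with Python's struct --
# Assuming native byte order, int64 counts/indices, uint64 hashes and float32
# probabilities (as the sizeof() calls in Writer/Reader suggest), the file
# written by dump() and parsed here can be round-tripped in plain Python.
# toy_dump/toy_load are hypothetical helpers, not spaCy API.
import struct

def toy_dump(path, entries, aliases):
    # entries: [(entity_hash, prob)]; aliases: [(alias_hash, [(entry_index, prior), ...])]
    with open(path, "wb") as out:
        out.write(struct.pack("=q", len(entries)))                 # header: nr of entries
        for entity_hash, prob in entries:
            out.write(struct.pack("=Qf", entity_hash, prob))
        out.write(struct.pack("=q", len(aliases)))                 # nr of aliases
        for alias_hash, candidates in aliases:
            out.write(struct.pack("=Qq", alias_hash, len(candidates)))
            for entry_idx, prior in candidates:
                out.write(struct.pack("=qf", entry_idx, prior))

def toy_load(path):
    with open(path, "rb") as infile:
        def read(fmt):
            return struct.unpack(fmt, infile.read(struct.calcsize(fmt)))
        entries = [read("=Qf") for _ in range(read("=q")[0])]
        aliases = []
        for _ in range(read("=q")[0]):
            alias_hash, nr_candidates = read("=Qq")
            aliases.append((alias_hash, [read("=qf") for _ in range(nr_candidates)]))
    return entries, aliases
# ---------------------------------------------------------------------------
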
cdef int i = 1 - while reader.read_entry(&entry_id, &entity_hash, &prob) and i <= nr_entities: - assert i == entry_id + while i <= nr_entities: + reader.read_entry(&entity_hash, &prob) # TODO features and vectors entry.entity_hash = entity_hash @@ -210,6 +231,43 @@ cdef class KnowledgeBase: i += 1 + # check that all entities were read in properly + assert nr_entities == self.get_size_entities() + + # Step 2: load aliases + cdef int64_t nr_aliases + reader.read_alias_length(&nr_aliases) + self._alias_index = PreshMap(nr_aliases+1) + self._aliases_table = alias_vec(nr_aliases+1) + + cdef int64_t nr_candidates + cdef vector[int64_t] entry_indices + cdef vector[float] probs + + i = 1 + # we assume the alias data was written in sequence + # index 0 is a dummy object not stored in the _entry_index and can be ignored. + while i <= nr_aliases: + reader.read_alias_header(&alias_hash, &nr_candidates) + entry_indices = vector[int64_t](nr_candidates) + probs = vector[float](nr_candidates) + + for j in range(0, nr_candidates): + reader.read_alias(&entry_index, &prob) + entry_indices[j] = entry_index + probs[j] = prob + + alias.entry_indices = entry_indices + alias.probs = probs + + self._aliases_table[i] = alias + self._alias_index[alias_hash] = i + + i += 1 + + # check that all aliases were read in properly + assert nr_aliases == self.get_size_aliases() + cdef class Writer: def __init__(self, object loc): @@ -227,12 +285,22 @@ cdef class Writer: cdef int write_header(self, int64_t nr_entries) except -1: self._write(&nr_entries, sizeof(nr_entries)) - cdef int write_entry(self, int64_t entry_id, hash_t entry_hash, float entry_prob) except -1: + cdef int write_entry(self, hash_t entry_hash, float entry_prob) except -1: # TODO: feats_rows and vector rows - self._write(&entry_id, sizeof(entry_id)) self._write(&entry_hash, sizeof(entry_hash)) self._write(&entry_prob, sizeof(entry_prob)) + cdef int write_alias_length(self, int64_t alias_length) except -1: + self._write(&alias_length, sizeof(alias_length)) + + cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1: + self._write(&alias_hash, sizeof(alias_hash)) + self._write(&candidate_length, sizeof(candidate_length)) + + cdef int write_alias(self, int64_t entry_index, float prob) except -1: + self._write(&entry_index, sizeof(entry_index)) + self._write(&prob, sizeof(prob)) + cdef int _write(self, void* value, size_t size) except -1: status = fwrite(value, size, 1, self._fp) assert status == 1, status @@ -258,13 +326,7 @@ cdef class Reader: return 0 # end of file raise IOError("error reading header from input file") - cdef int read_entry(self, int64_t* entry_id, hash_t* entity_hash, float* prob) except -1: - status = self._read(entry_id, sizeof(int64_t)) - if status < 1: - if feof(self._fp): - return 0 # end of file - raise IOError("error reading entry ID from input file") - + cdef int read_entry(self, hash_t* entity_hash, float* prob) except -1: status = self._read(entity_hash, sizeof(hash_t)) if status < 1: if feof(self._fp): @@ -282,6 +344,39 @@ cdef class Reader: else: return 1 + cdef int read_alias_length(self, int64_t* alias_length) except -1: + status = self._read(alias_length, sizeof(int64_t)) + if status < 1: + if feof(self._fp): + return 0 # end of file + raise IOError("error reading alias length from input file") + + cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1: + status = self._read(alias_hash, sizeof(hash_t)) + if status < 1: + if feof(self._fp): + return 0 # end of 
file + raise IOError("error reading alias hash from input file") + + status = self._read(candidate_length, sizeof(int64_t)) + if status < 1: + if feof(self._fp): + return 0 # end of file + raise IOError("error reading candidate length from input file") + + cdef int read_alias(self, int64_t* entry_index, float* prob) except -1: + status = self._read(entry_index, sizeof(int64_t)) + if status < 1: + if feof(self._fp): + return 0 # end of file + raise IOError("error reading entry index for alias from input file") + + status = self._read(prob, sizeof(float)) + if status < 1: + if feof(self._fp): + return 0 # end of file + raise IOError("error reading prob for entity/alias from input file") + cdef int _read(self, void* value, size_t size) except -1: status = fread(value, size, 1, self._fp) return status From 54d0cea0626fd9977b15c87284e16ccb063e076f Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 24 Apr 2019 23:52:34 +0200 Subject: [PATCH 016/148] unit test for KB serialization --- examples/pipeline/wikidata_entity_linking.py | 2 +- spacy/kb.pxd | 1 + spacy/kb.pyx | 8 ++- spacy/tests/serialize/test_serialize_kb.py | 64 ++++++++++++++++++++ 4 files changed, 73 insertions(+), 2 deletions(-) create mode 100644 spacy/tests/serialize/test_serialize_kb.py diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index a8a3eec1e..3b0943167 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -442,11 +442,11 @@ if __name__ == "__main__": print() print("dumping kb1") + print(KB_FILE, type(KB_FILE)) kb1.dump(KB_FILE) # STEP 4 : read KB back in from file - nlp3 = spacy.load('en_core_web_sm') kb3 = KnowledgeBase(vocab=my_vocab) print("loading kb3") diff --git a/spacy/kb.pxd b/spacy/kb.pxd index 5f7bfa46c..82b06d192 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -19,6 +19,7 @@ cdef class Candidate: cdef readonly KnowledgeBase kb cdef hash_t entity_hash + cdef float entity_freq cdef hash_t alias_hash cdef float prior_prob diff --git a/spacy/kb.pyx b/spacy/kb.pyx index f3d5ecaa9..ad2e13b5e 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -26,9 +26,10 @@ from libcpp.vector cimport vector cdef class Candidate: - def __init__(self, KnowledgeBase kb, entity_hash, alias_hash, prior_prob): + def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, alias_hash, prior_prob): self.kb = kb self.entity_hash = entity_hash + self.entity_freq = entity_freq self.alias_hash = alias_hash self.prior_prob = prior_prob @@ -52,6 +53,10 @@ cdef class Candidate: """RETURNS (unicode): ID of the original alias""" return self.kb.vocab.strings[self.alias_hash] + @property + def entity_freq(self): + return self.entity_freq + @property def prior_prob(self): return self.prior_prob @@ -156,6 +161,7 @@ cdef class KnowledgeBase: return [Candidate(kb=self, entity_hash=self._entries[entry_index].entity_hash, + entity_freq=self._entries[entry_index].prob, alias_hash=alias_hash, prior_prob=prob) for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs) diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py new file mode 100644 index 000000000..ae0eedeeb --- /dev/null +++ b/spacy/tests/serialize/test_serialize_kb.py @@ -0,0 +1,64 @@ +from ..util import make_tempdir +from ...util import ensure_path + +from spacy.kb import KnowledgeBase + + +def test_serialize_kb_disk(en_vocab): + kb1 = KnowledgeBase(vocab=en_vocab) + + kb1.add_entity(entity="Q53", prob=0.33) + 
kb1.add_entity(entity="Q17", prob=0.2) + kb1.add_entity(entity="Q007", prob=0.7) + kb1.add_entity(entity="Q44", prob=0.4) + kb1.add_alias(alias="double07", entities=["Q17", "Q007"], probabilities=[0.1, 0.9]) + kb1.add_alias(alias="guy", entities=["Q53", "Q007", "Q17", "Q44"], probabilities=[0.3, 0.3, 0.2, 0.1]) + kb1.add_alias(alias="random", entities=["Q007"], probabilities=[1.0]) + + # baseline assertions + _check_kb(kb1) + + # dumping to file & loading back in + with make_tempdir() as d: + dir_path = ensure_path(d) + if not dir_path.exists(): + dir_path.mkdir() + file_path = dir_path / "kb" + print(file_path, type(file_path)) + kb1.dump(str(file_path)) + + kb2 = KnowledgeBase(vocab=en_vocab) + kb2.load_bulk(str(file_path)) + + # final assertions + _check_kb(kb2) + + +def _check_kb(kb): + # check entities + assert kb.get_size_entities() == 4 + for entity_string in ["Q53", "Q17", "Q007", "Q44"]: + assert entity_string in kb.get_entity_strings() + for entity_string in ["", "Q0"]: + assert entity_string not in kb.get_entity_strings() + + # check aliases + assert kb.get_size_aliases() == 3 + for alias_string in ["double07", "guy", "random"]: + assert alias_string in kb.get_alias_strings() + for alias_string in ["nothingness", "", "randomnoise"]: + assert alias_string not in kb.get_alias_strings() + + # check candidates & probabilities + candidates = sorted(kb.get_candidates("double07"), key=lambda x: x.entity_) + assert len(candidates) == 2 + + assert candidates[0].entity_ == "Q007" + assert candidates[0].entity_freq < 0.701 and candidates[0].entity_freq > 0.699 + assert candidates[0].alias_ == "double07" + assert candidates[0].prior_prob < 0.901 and candidates[0].prior_prob > 0.899 + + assert candidates[1].entity_ == "Q17" + assert candidates[1].entity_freq < 0.201 and candidates[1].entity_freq > 0.199 + assert candidates[1].alias_ == "double07" + assert candidates[1].prior_prob < 0.101 and candidates[1].prior_prob > 0.099 From 387263d618369aaaffaa9561791c4dc3ce988dd7 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 29 Apr 2019 13:58:07 +0200 Subject: [PATCH 017/148] simplify chains --- spacy/tests/serialize/test_serialize_kb.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py index ae0eedeeb..3ff6eaef6 100644 --- a/spacy/tests/serialize/test_serialize_kb.py +++ b/spacy/tests/serialize/test_serialize_kb.py @@ -54,11 +54,11 @@ def _check_kb(kb): assert len(candidates) == 2 assert candidates[0].entity_ == "Q007" - assert candidates[0].entity_freq < 0.701 and candidates[0].entity_freq > 0.699 + assert 0.6999 < candidates[0].entity_freq < 0.701 assert candidates[0].alias_ == "double07" - assert candidates[0].prior_prob < 0.901 and candidates[0].prior_prob > 0.899 + assert 0.899 < candidates[0].prior_prob < 0.901 assert candidates[1].entity_ == "Q17" - assert candidates[1].entity_freq < 0.201 and candidates[1].entity_freq > 0.199 + assert 0.199 < candidates[1].entity_freq < 0.201 assert candidates[1].alias_ == "double07" - assert candidates[1].prior_prob < 0.101 and candidates[1].prior_prob > 0.099 + assert 0.099 < candidates[1].prior_prob < 0.101 From 19e8f339cb3a125bbd7e5ae387e27dd417054dd7 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 29 Apr 2019 17:37:29 +0200 Subject: [PATCH 018/148] deduce entity freq from WP corpus and serialize vocab in WP test --- examples/pipeline/wikidata_entity_linking.py | 258 +++++++++++-------- spacy/tests/serialize/test_serialize_kb.py | 27 +- 2 
files changed, 171 insertions(+), 114 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 3b0943167..2a544674f 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -1,7 +1,10 @@ # coding: utf-8 from __future__ import unicode_literals -"""Demonstrate how to build a knowledge base from WikiData and run an Entity Linking algorithm. +from spacy.vocab import Vocab + +""" +Demonstrate how to build a knowledge base from WikiData and run an Entity Linking algorithm. """ import re import json @@ -17,6 +20,7 @@ ENWIKI_INDEX = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-ar PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv' KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb' +VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab' # these will/should be matched ignoring case @@ -40,12 +44,16 @@ map_alias_to_link = dict() def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False): kb = KnowledgeBase(vocab=vocab) - id_to_title = _read_wikidata(limit=1000) - title_to_id = {v:k for k,v in id_to_title.items()} + id_to_title = _read_wikidata_entities(limit=None) + title_to_id = {v: k for k, v in id_to_title.items()} + + entity_list = list(id_to_title.keys()) + title_list = [id_to_title[x] for x in entity_list] + entity_frequencies = _get_entity_frequencies(entities=title_list, to_print=False) _add_entities(kb, - entities=id_to_title.keys(), - probs=[0.4 for x in id_to_title.keys()], + entities=entity_list, + probs=entity_frequencies, to_print=to_print) _add_aliases(kb, @@ -64,6 +72,38 @@ def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False): return kb +def _get_entity_frequencies(entities, to_print=False): + count_entities = [0 for _ in entities] + total_count = 0 + + with open(PRIOR_PROB, mode='r', encoding='utf8') as prior_file: + # skip header + prior_file.readline() + line = prior_file.readline() + # we can read this file sequentially, it's sorted by alias, and then by count + + while line: + splits = line.replace('\n', "").split(sep='|') + # alias = splits[0] + count = int(splits[1]) + entity = splits[2] + + if entity in entities: + index = entities.index(entity) + count_entities[index] = count_entities[index] + count + + total_count += count + + line = prior_file.readline() + + if to_print: + for entity, count in zip(entities, count_entities): + print("Entity count:", entity, count) + print("Total count:", total_count) + + return [x*100 / total_count for x in count_entities] + + def _add_entities(kb, entities, probs, to_print=False): for entity, prob in zip(entities, probs): kb.add_entity(entity=entity, prob=prob) @@ -76,7 +116,7 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, to_print=Fals wp_titles = title_to_id.keys() if to_print: - print("wp titles", wp_titles) + print("wp titles:", wp_titles) # adding aliases with prior probabilities with open(PRIOR_PROB, mode='r', encoding='utf8') as prior_file: @@ -125,89 +165,100 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, to_print=Fals print("added", kb.get_size_aliases(), "aliases:", kb.get_alias_strings()) -def _read_wikidata(limit=None, to_print=False): - """ Read the JSON wiki data """ +def _read_wikidata_entities(limit=None, to_print=False): + """ Read the JSON wiki data and parse out the entities""" languages = {'en', 'de'} prop_filter = {'P31': {'Q5', 'Q15632617'}} # currently defined as OR: one property 
suffices to be selected - sites = {'enwiki'} + site_filter = 'enwiki' entity_dict = dict() + # parse appropriate fields - depending on what we need in the KB + parse_properties = False + parse_sitelinks = True + parse_labels = False + parse_descriptions = False + parse_aliases = False + with bz2.open(WIKIDATA_JSON, mode='rb') as file: line = file.readline() - cnt = 1 + cnt = 0 while line and (not limit or cnt < limit): + if cnt % 100000 == 0: + print(datetime.datetime.now(), "processed", cnt, "lines of WikiData dump") clean_line = line.strip() if clean_line.endswith(b","): clean_line = clean_line[:-1] if len(clean_line) > 1: obj = json.loads(clean_line) - keep = False + unique_id = obj["id"] + entry_type = obj["type"] - # filtering records on their properties - # TODO: filter on rank: preferred, normal or deprecated - claims = obj["claims"] - for prop, value_set in prop_filter.items(): - claim_property = claims.get(prop, None) - if claim_property: - for cp in claim_property: - cp_id = cp['mainsnak'].get('datavalue', {}).get('value', {}).get('id') - if cp_id in value_set: - keep = True + if unique_id[0] == 'Q' and entry_type == "item": + # filtering records on their properties + keep = False + claims = obj["claims"] + for prop, value_set in prop_filter.items(): + claim_property = claims.get(prop, None) + if claim_property: + for cp in claim_property: + cp_id = cp['mainsnak'].get('datavalue', {}).get('value', {}).get('id') + cp_rank = cp['rank'] + if cp_rank != "deprecated" and cp_id in value_set: + keep = True - if keep: - unique_id = obj["id"] - entry_type = obj["type"] + if keep: + if to_print: + print("ID:", unique_id) + print("type:", entry_type) - if to_print: - print("ID:", unique_id) - print("type:", entry_type) + # parsing all properties that refer to other entities + if parse_properties: + for prop, claim_property in claims.items(): + cp_dicts = [cp['mainsnak']['datavalue'].get('value') for cp in claim_property if cp['mainsnak'].get('datavalue')] + cp_values = [cp_dict.get('id') for cp_dict in cp_dicts if isinstance(cp_dict, dict) if cp_dict.get('id') is not None] + if cp_values: + if to_print: + print("prop:", prop, cp_values) - # parsing all properties that refer to other entities - for prop, claim_property in claims.items(): - cp_dicts = [cp['mainsnak']['datavalue'].get('value') for cp in claim_property if cp['mainsnak'].get('datavalue')] - cp_values = [cp_dict.get('id') for cp_dict in cp_dicts if isinstance(cp_dict, dict) if cp_dict.get('id') is not None] - if cp_values: - if to_print: - print("prop:", prop, cp_values) - - entry_sites = obj["sitelinks"] - for site in sites: - site_value = entry_sites.get(site, None) - if site_value: - if to_print: - print(site, ":", site_value['title']) - if site == "enwiki": + if parse_sitelinks: + site_value = obj["sitelinks"].get(site_filter, None) + if site_value: + if to_print: + print(site_filter, ":", site_value['title']) entity_dict[unique_id] = site_value['title'] - labels = obj["labels"] - if labels: - for lang in languages: - lang_label = labels.get(lang, None) - if lang_label: - if to_print: - print("label (" + lang + "):", lang_label["value"]) + if parse_labels: + labels = obj["labels"] + if labels: + for lang in languages: + lang_label = labels.get(lang, None) + if lang_label: + if to_print: + print("label (" + lang + "):", lang_label["value"]) - descriptions = obj["descriptions"] - if descriptions: - for lang in languages: - lang_descr = descriptions.get(lang, None) - if lang_descr: - if to_print: - print("description (" + lang + 
"):", lang_descr["value"]) + if parse_descriptions: + descriptions = obj["descriptions"] + if descriptions: + for lang in languages: + lang_descr = descriptions.get(lang, None) + if lang_descr: + if to_print: + print("description (" + lang + "):", lang_descr["value"]) - aliases = obj["aliases"] - if aliases: - for lang in languages: - lang_aliases = aliases.get(lang, None) - if lang_aliases: - for item in lang_aliases: - if to_print: - print("alias (" + lang + "):", item["value"]) + if parse_aliases: + aliases = obj["aliases"] + if aliases: + for lang in languages: + lang_aliases = aliases.get(lang, None) + if lang_aliases: + for item in lang_aliases: + if to_print: + print("alias (" + lang + "):", item["value"]) - if to_print: - print() + if to_print: + print() line = file.readline() cnt += 1 @@ -236,7 +287,7 @@ def _read_wikipedia_prior_probs(): cnt = 0 while line: if cnt % 5000000 == 0: - print(datetime.datetime.now(), "processed", cnt, "lines") + print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump") clean_line = line.strip().decode("utf-8") matches = link_regex.findall(clean_line) @@ -394,7 +445,8 @@ def add_el(kb, nlp): text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \ "Douglas reminds us to always bring our towel. " \ - "The main character in Doug's novel is the man Arthur Dent, but Douglas doesn't write about George Washington." + "The main character in Doug's novel is the man Arthur Dent, " \ + "but Douglas doesn't write about George Washington or Homer Simpson." doc = nlp(text) print() @@ -414,48 +466,46 @@ def capitalize_first(text): result += text[1:] return result + if __name__ == "__main__": + to_create_prior_probs = False + to_create_kb = True + to_read_kb = False + # STEP 1 : create prior probabilities from WP # run only once ! 
- # _read_wikipedia_prior_probs() + if to_create_prior_probs: + _read_wikipedia_prior_probs() - # STEP 2 : create KB - # nlp = spacy.load('en_core_web_sm') - # my_kb = create_kb(nlp.vocab, max_entities_per_alias=10, min_occ=5, to_print=True) + if to_create_kb: + # STEP 2 : create KB + my_nlp = spacy.load('en_core_web_sm') + my_vocab = my_nlp.vocab + my_kb = create_kb(my_vocab, max_entities_per_alias=10, min_occ=5, to_print=False) + print("kb entities:", my_kb.get_size_entities()) + print("kb aliases:", my_kb.get_size_aliases()) - # STEP 3 : write KB to file - nlp1 = spacy.load('en_core_web_sm') - my_vocab = nlp1.vocab - kb1 = KnowledgeBase(vocab=my_vocab) + # STEP 3 : write KB to file + my_kb.dump(KB_FILE) + my_vocab.to_disk(VOCAB_DIR) - kb1.add_entity(entity="Q53", prob=0.33) - kb1.add_entity(entity="Q17", prob=0.1) - kb1.add_entity(entity="Q007", prob=0.7) - kb1.add_entity(entity="Q44", prob=0.4) - kb1.add_alias(alias="double07", entities=["Q007", "Q17"], probabilities=[0.9, 0.1]) - kb1.add_alias(alias="guy", entities=["Q53", "Q007", "Q17", "Q44"], probabilities=[0.3, 0.3, 0.2, 0.1]) - kb1.add_alias(alias="random", entities=["Q007"], probabilities=[1.0]) + if to_read_kb: + # STEP 4 : read KB back in from file + my_vocab = Vocab() + my_vocab.from_disk(VOCAB_DIR) + my_kb = KnowledgeBase(vocab=my_vocab) + my_kb.load_bulk(KB_FILE) + print("kb entities:", my_kb.get_size_entities()) + print("kb aliases:", my_kb.get_size_aliases()) - print("kb1 size:", len(kb1), kb1.get_size_entities(), kb1.get_size_aliases()) - print("kb1 entities:", kb1.get_entity_strings()) - print("kb1 aliases:", kb1.get_alias_strings()) + # test KB + candidates = my_kb.get_candidates("Bush") + for c in candidates: + print() + print("entity:", c.entity_) + print("entity freq:", c.entity_freq) + print("alias:", c.alias_) + print("prior prob:", c.prior_prob) - print() - print("dumping kb1") - print(KB_FILE, type(KB_FILE)) - kb1.dump(KB_FILE) - - # STEP 4 : read KB back in from file - - kb3 = KnowledgeBase(vocab=my_vocab) - - print("loading kb3") - kb3.load_bulk(KB_FILE) - - print() - print("kb3 size:", len(kb3), kb3.get_size_entities(), kb3.get_size_aliases()) - print("kb3 entities:", kb3.get_entity_strings()) - print("kb3 aliases:", kb3.get_alias_strings()) - - # STEP 5 : actually use the EL functionality + # STEP 5: add KB to NLP pipeline # add_el(my_kb, nlp) diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py index 3ff6eaef6..7b1380623 100644 --- a/spacy/tests/serialize/test_serialize_kb.py +++ b/spacy/tests/serialize/test_serialize_kb.py @@ -1,3 +1,5 @@ +import spacy +from spacy.lang.en import English from ..util import make_tempdir from ...util import ensure_path @@ -5,17 +7,8 @@ from spacy.kb import KnowledgeBase def test_serialize_kb_disk(en_vocab): - kb1 = KnowledgeBase(vocab=en_vocab) - - kb1.add_entity(entity="Q53", prob=0.33) - kb1.add_entity(entity="Q17", prob=0.2) - kb1.add_entity(entity="Q007", prob=0.7) - kb1.add_entity(entity="Q44", prob=0.4) - kb1.add_alias(alias="double07", entities=["Q17", "Q007"], probabilities=[0.1, 0.9]) - kb1.add_alias(alias="guy", entities=["Q53", "Q007", "Q17", "Q44"], probabilities=[0.3, 0.3, 0.2, 0.1]) - kb1.add_alias(alias="random", entities=["Q007"], probabilities=[1.0]) - # baseline assertions + kb1 = _get_dummy_kb(en_vocab) _check_kb(kb1) # dumping to file & loading back in @@ -34,6 +27,20 @@ def test_serialize_kb_disk(en_vocab): _check_kb(kb2) +def _get_dummy_kb(vocab): + kb = KnowledgeBase(vocab=vocab) + + 
kb.add_entity(entity="Q53", prob=0.33) + kb.add_entity(entity="Q17", prob=0.2) + kb.add_entity(entity="Q007", prob=0.7) + kb.add_entity(entity="Q44", prob=0.4) + kb.add_alias(alias="double07", entities=["Q17", "Q007"], probabilities=[0.1, 0.9]) + kb.add_alias(alias="guy", entities=["Q53", "Q007", "Q17", "Q44"], probabilities=[0.3, 0.3, 0.2, 0.1]) + kb.add_alias(alias="random", entities=["Q007"], probabilities=[1.0]) + + return kb + + def _check_kb(kb): # check entities assert kb.get_size_entities() == 4 From 653b7d9c87e62c8e37c96f0bac76f5c18ca4889c Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 30 Apr 2019 11:39:42 +0200 Subject: [PATCH 019/148] calculate entity raw counts offline to speed up KB construction --- examples/pipeline/wikidata_entity_linking.py | 91 ++++++++++++++------ 1 file changed, 64 insertions(+), 27 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 2a544674f..43ba7d8d3 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -1,23 +1,25 @@ # coding: utf-8 from __future__ import unicode_literals -from spacy.vocab import Vocab - """ Demonstrate how to build a knowledge base from WikiData and run an Entity Linking algorithm. """ import re +import csv import json import spacy import datetime import bz2 from spacy.kb import KnowledgeBase +from spacy.vocab import Vocab # TODO: remove hardcoded paths WIKIDATA_JSON = 'C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.json.bz2' ENWIKI_DUMP = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream.xml.bz2' ENWIKI_INDEX = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream-index.txt.bz2' + PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv' +ENTITY_COUNTS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_freq.csv' KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb' VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab' @@ -44,18 +46,30 @@ map_alias_to_link = dict() def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False): kb = KnowledgeBase(vocab=vocab) - id_to_title = _read_wikidata_entities(limit=None) - title_to_id = {v: k for k, v in id_to_title.items()} + print() + print("1. _read_wikidata_entities", datetime.datetime.now()) + print() + title_to_id = _read_wikidata_entities(limit=100000) - entity_list = list(id_to_title.keys()) - title_list = [id_to_title[x] for x in entity_list] - entity_frequencies = _get_entity_frequencies(entities=title_list, to_print=False) + title_list = list(title_to_id.keys()) + entity_list = [title_to_id[x] for x in title_list] + print() + print("2. _get_entity_frequencies", datetime.datetime.now()) + print() + entity_frequencies = _get_entity_frequencies(entities=title_list) + + print() + print("3. _add_entities", datetime.datetime.now()) + print() _add_entities(kb, entities=entity_list, probs=entity_frequencies, to_print=to_print) + print() + print("4. 
_add_aliases", datetime.datetime.now()) + print() _add_aliases(kb, title_to_id=title_to_id, max_entities_per_alias=max_entities_per_alias, @@ -72,15 +86,26 @@ def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False): return kb -def _get_entity_frequencies(entities, to_print=False): - count_entities = [0 for _ in entities] +def _get_entity_frequencies(entities): + entity_to_count = dict() + with open(ENTITY_COUNTS, 'r', encoding='utf8') as csvfile: + csvreader = csv.reader(csvfile, delimiter='|') + # skip header + next(csvreader) + for row in csvreader: + entity_to_count[row[0]] = int(row[1]) + + return [entity_to_count.get(e, 0) for e in entities] + + +def _write_entity_counts(to_print=False): + entity_to_count = dict() total_count = 0 with open(PRIOR_PROB, mode='r', encoding='utf8') as prior_file: # skip header prior_file.readline() line = prior_file.readline() - # we can read this file sequentially, it's sorted by alias, and then by count while line: splits = line.replace('\n', "").split(sep='|') @@ -88,23 +113,26 @@ def _get_entity_frequencies(entities, to_print=False): count = int(splits[1]) entity = splits[2] - if entity in entities: - index = entities.index(entity) - count_entities[index] = count_entities[index] + count + current_count = entity_to_count.get(entity, 0) + entity_to_count[entity] = current_count + count total_count += count line = prior_file.readline() + with open(ENTITY_COUNTS, mode='w', encoding='utf8') as entity_file: + entity_file.write("entity" + "|" + "count" + "\n") + for entity, count in entity_to_count.items(): + entity_file.write(entity + "|" + str(count) + "\n") + if to_print: - for entity, count in zip(entities, count_entities): + for entity, count in entity_to_count.items(): print("Entity count:", entity, count) print("Total count:", total_count) - return [x*100 / total_count for x in count_entities] - def _add_entities(kb, entities, probs, to_print=False): + # TODO: this should be a bulk method for entity, prob in zip(entities, probs): kb.add_entity(entity=entity, prob=prob) @@ -166,13 +194,13 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, to_print=Fals def _read_wikidata_entities(limit=None, to_print=False): - """ Read the JSON wiki data and parse out the entities""" + """ Read the JSON wiki data and parse out the entities. Takes about 7u30 to parse 55M lines. 
""" languages = {'en', 'de'} prop_filter = {'P31': {'Q5', 'Q15632617'}} # currently defined as OR: one property suffices to be selected site_filter = 'enwiki' - entity_dict = dict() + title_to_id = dict() # parse appropriate fields - depending on what we need in the KB parse_properties = False @@ -192,12 +220,12 @@ def _read_wikidata_entities(limit=None, to_print=False): clean_line = clean_line[:-1] if len(clean_line) > 1: obj = json.loads(clean_line) - unique_id = obj["id"] entry_type = obj["type"] - if unique_id[0] == 'Q' and entry_type == "item": + if entry_type == "item": # filtering records on their properties keep = False + claims = obj["claims"] for prop, value_set in prop_filter.items(): claim_property = claims.get(prop, None) @@ -209,6 +237,8 @@ def _read_wikidata_entities(limit=None, to_print=False): keep = True if keep: + unique_id = obj["id"] + if to_print: print("ID:", unique_id) print("type:", entry_type) @@ -225,9 +255,10 @@ def _read_wikidata_entities(limit=None, to_print=False): if parse_sitelinks: site_value = obj["sitelinks"].get(site_filter, None) if site_value: + site = site_value['title'] if to_print: - print(site_filter, ":", site_value['title']) - entity_dict[unique_id] = site_value['title'] + print(site_filter, ":", site) + title_to_id[site] = unique_id if parse_labels: labels = obj["labels"] @@ -262,7 +293,7 @@ def _read_wikidata_entities(limit=None, to_print=False): line = file.readline() cnt += 1 - return entity_dict + return title_to_id def _read_wikipedia_prior_probs(): @@ -469,6 +500,7 @@ def capitalize_first(text): if __name__ == "__main__": to_create_prior_probs = False + to_create_entity_counts = False to_create_kb = True to_read_kb = False @@ -477,20 +509,25 @@ if __name__ == "__main__": if to_create_prior_probs: _read_wikipedia_prior_probs() + # STEP 2 : deduce entity frequencies from WP + # run only once ! + if to_create_entity_counts: + _write_entity_counts() + if to_create_kb: - # STEP 2 : create KB + # STEP 3 : create KB my_nlp = spacy.load('en_core_web_sm') my_vocab = my_nlp.vocab my_kb = create_kb(my_vocab, max_entities_per_alias=10, min_occ=5, to_print=False) print("kb entities:", my_kb.get_size_entities()) print("kb aliases:", my_kb.get_size_aliases()) - # STEP 3 : write KB to file + # STEP 4 : write KB to file my_kb.dump(KB_FILE) my_vocab.to_disk(VOCAB_DIR) if to_read_kb: - # STEP 4 : read KB back in from file + # STEP 5 : read KB back in from file my_vocab = Vocab() my_vocab.from_disk(VOCAB_DIR) my_kb = KnowledgeBase(vocab=my_vocab) @@ -507,5 +544,5 @@ if __name__ == "__main__": print("alias:", c.alias_) print("prior prob:", c.prior_prob) - # STEP 5: add KB to NLP pipeline + # STEP 6: add KB to NLP pipeline # add_el(my_kb, nlp) From 60b54ae8ce4ca5ad2bbb59153af283032a8905fc Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 1 May 2019 00:00:38 +0200 Subject: [PATCH 020/148] bulk entity writing and experiment with regex wikidata reader to speed up processing --- examples/pipeline/wikidata_entity_linking.py | 92 ++++++++++++++++---- spacy/kb.pxd | 2 + spacy/kb.pyx | 56 ++++++++++++ 3 files changed, 135 insertions(+), 15 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 43ba7d8d3..0a373e5fa 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -49,7 +49,8 @@ def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False): print() print("1. 
_read_wikidata_entities", datetime.datetime.now()) print() - title_to_id = _read_wikidata_entities(limit=100000) + # title_to_id = _read_wikidata_entities_regex(limit=1000) + title_to_id = _read_wikidata_entities_json(limit=1000) title_list = list(title_to_id.keys()) entity_list = [title_to_id[x] for x in title_list] @@ -62,19 +63,13 @@ def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False): print() print("3. _add_entities", datetime.datetime.now()) print() - _add_entities(kb, - entities=entity_list, - probs=entity_frequencies, - to_print=to_print) + kb.set_entities(entity_list=entity_list, prob_list=entity_frequencies, vector_list=None, feature_list=None) + # _add_entities(kb, entities=entity_list, probs=entity_frequencies, to_print=to_print) print() print("4. _add_aliases", datetime.datetime.now()) print() - _add_aliases(kb, - title_to_id=title_to_id, - max_entities_per_alias=max_entities_per_alias, - min_occ=min_occ, - to_print=to_print) + _add_aliases(kb, title_to_id=title_to_id, max_entities_per_alias=max_entities_per_alias, min_occ=min_occ,) # TODO: read wikipedia texts for entity context # _read_wikipedia() @@ -83,6 +78,8 @@ def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False): print() print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases()) + print("done with kb", datetime.datetime.now()) + return kb @@ -131,8 +128,7 @@ def _write_entity_counts(to_print=False): print("Total count:", total_count) -def _add_entities(kb, entities, probs, to_print=False): - # TODO: this should be a bulk method +def _add_entities_depr(kb, entities, probs, to_print=False): for entity, prob in zip(entities, probs): kb.add_entity(entity=entity, prob=prob) @@ -193,7 +189,7 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, to_print=Fals print("added", kb.get_size_aliases(), "aliases:", kb.get_alias_strings()) -def _read_wikidata_entities(limit=None, to_print=False): +def _read_wikidata_entities_json(limit=None, to_print=False): """ Read the JSON wiki data and parse out the entities. Takes about 7u30 to parse 55M lines. """ languages = {'en', 'de'} @@ -259,6 +255,7 @@ def _read_wikidata_entities(limit=None, to_print=False): if to_print: print(site_filter, ":", site) title_to_id[site] = unique_id + # print(site, "for", unique_id) if parse_labels: labels = obj["labels"] @@ -296,6 +293,56 @@ def _read_wikidata_entities(limit=None, to_print=False): return title_to_id +def _read_wikidata_entities_regex_depr(limit=None, to_print=False): + """ Read the JSON wiki data and parse out the entities with regular expressions. Takes XXX to parse 55M lines. 
""" + + regex_p31 = re.compile(r'mainsnak[^}]*\"P31\"[^}]*}', re.UNICODE) + regex_id = re.compile(r'\"id\":"Q[0-9]*"', re.UNICODE) + regex_enwiki = re.compile(r'\"enwiki\":[^}]*}', re.UNICODE) + regex_title = re.compile(r'\"title\":"[^"]*"', re.UNICODE) + + title_to_id = dict() + + with bz2.open(WIKIDATA_JSON, mode='rb') as file: + line = file.readline() + cnt = 0 + while line and (not limit or cnt < limit): + if cnt % 100000 == 0: + print(datetime.datetime.now(), "processed", cnt, "lines of WikiData dump") + clean_line = line.strip() + if clean_line.endswith(b","): + clean_line = clean_line[:-1] + if len(clean_line) > 1: + clean_line = line.strip().decode("utf-8") + keep = False + + p31_matches = regex_p31.findall(clean_line) + if p31_matches: + for p31_match in p31_matches: + id_matches = regex_id.findall(p31_match) + for id_match in id_matches: + id_match = id_match[6:][:-1] + if id_match == "Q5" or id_match == "Q15632617": + keep = True + + if keep: + id_match = regex_id.search(clean_line).group(0) + id_match = id_match[6:][:-1] + + enwiki_matches = regex_enwiki.findall(clean_line) + if enwiki_matches: + for enwiki_match in enwiki_matches: + title_match = regex_title.search(enwiki_match).group(0) + title = title_match[9:][:-1] + title_to_id[title] = id_match + # print(title, "for", id_match) + + line = file.readline() + cnt += 1 + + return title_to_id + + def _read_wikipedia_prior_probs(): """ Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities The full file takes about 2h to parse 1100M lines (update printed every 5M lines) @@ -499,50 +546,65 @@ def capitalize_first(text): if __name__ == "__main__": + print("START", datetime.datetime.now()) + to_create_prior_probs = False to_create_entity_counts = False to_create_kb = True - to_read_kb = False + to_read_kb = True # STEP 1 : create prior probabilities from WP # run only once ! if to_create_prior_probs: + print("STEP 1: to_create_prior_probs", datetime.datetime.now()) _read_wikipedia_prior_probs() + print() # STEP 2 : deduce entity frequencies from WP # run only once ! 
if to_create_entity_counts: + print("STEP 2: to_create_entity_counts", datetime.datetime.now()) _write_entity_counts() + print() if to_create_kb: # STEP 3 : create KB + print("STEP 3: to_create_kb", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_sm') my_vocab = my_nlp.vocab my_kb = create_kb(my_vocab, max_entities_per_alias=10, min_occ=5, to_print=False) print("kb entities:", my_kb.get_size_entities()) print("kb aliases:", my_kb.get_size_aliases()) + print() # STEP 4 : write KB to file + print("STEP 4: write KB", datetime.datetime.now()) my_kb.dump(KB_FILE) my_vocab.to_disk(VOCAB_DIR) + print() if to_read_kb: # STEP 5 : read KB back in from file + print("STEP 5: to_read_kb", datetime.datetime.now()) my_vocab = Vocab() my_vocab.from_disk(VOCAB_DIR) my_kb = KnowledgeBase(vocab=my_vocab) my_kb.load_bulk(KB_FILE) print("kb entities:", my_kb.get_size_entities()) print("kb aliases:", my_kb.get_size_aliases()) + print() # test KB candidates = my_kb.get_candidates("Bush") for c in candidates: - print() print("entity:", c.entity_) print("entity freq:", c.entity_freq) print("alias:", c.alias_) print("prior prob:", c.prior_prob) + print() # STEP 6: add KB to NLP pipeline + # print("STEP 6: use KB", datetime.datetime.now()) # add_el(my_kb, nlp) + + print("STOP", datetime.datetime.now()) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index 82b06d192..494848e5e 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -131,6 +131,8 @@ cdef class KnowledgeBase: self._aliases_table.push_back(alias) cpdef load_bulk(self, loc) + cpdef set_entities(self, entity_list, prob_list, vector_list, feature_list) + cpdef set_aliases(self, alias_list, entities_list, probabilities_list) cdef class Writer: diff --git a/spacy/kb.pyx b/spacy/kb.pyx index ad2e13b5e..ba870661d 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -111,6 +111,62 @@ cdef class KnowledgeBase: return entity_hash + cpdef set_entities(self, entity_list, prob_list, vector_list, feature_list): + nr_entities = len(entity_list) + self._entry_index = PreshMap(nr_entities+1) + self._entries = entry_vec(nr_entities+1) + + i = 0 + cdef EntryC entry + cdef int32_t dummy_value = 342 + while i < nr_entities: + # TODO features and vectors + entity_hash = self.vocab.strings.add(entity_list[i]) + entry.entity_hash = entity_hash + entry.prob = prob_list[i] + entry.vector_rows = &dummy_value + entry.feats_row = dummy_value + + self._entries[i+1] = entry + self._entry_index[entity_hash] = i+1 + + i += 1 + + # TODO: this method is untested + cpdef set_aliases(self, alias_list, entities_list, probabilities_list): + nr_aliases = len(alias_list) + self._alias_index = PreshMap(nr_aliases+1) + self._aliases_table = alias_vec(nr_aliases+1) + + i = 0 + cdef AliasC alias + cdef int32_t dummy_value = 342 + while i <= nr_aliases: + alias_hash = self.vocab.strings.add(alias_list[i]) + entities = entities_list[i] + probabilities = probabilities_list[i] + + nr_candidates = len(entities) + entry_indices = vector[int64_t](nr_candidates) + probs = vector[float](nr_candidates) + + for j in range(0, nr_candidates): + entity = entities[j] + entity_hash = self.vocab.strings[entity] + if not entity_hash in self._entry_index: + raise ValueError(Errors.E134.format(alias=alias, entity=entity)) + + entry_index = self._entry_index.get(entity_hash) + entry_indices[j] = entry_index + + alias.entry_indices = entry_indices + alias.probs = probs + + self._aliases_table[i] = alias + self._alias_index[alias_hash] = i + + i += 1 + def add_alias(self, unicode alias, entities, probabilities): """ For a given 
alias, add its potential entities and prior probabilies to the KB. From 3629a52ede3479cbf494e5e9472ceefff78ea74b Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 1 May 2019 01:00:59 +0200 Subject: [PATCH 021/148] reading all persons in wikidata --- examples/pipeline/wikidata_entity_linking.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 0a373e5fa..287e4a50b 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -50,7 +50,7 @@ def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False): print("1. _read_wikidata_entities", datetime.datetime.now()) print() # title_to_id = _read_wikidata_entities_regex(limit=1000) - title_to_id = _read_wikidata_entities_json(limit=1000) + title_to_id = _read_wikidata_entities_json(limit=None) title_list = list(title_to_id.keys()) entity_list = [title_to_id[x] for x in title_list] @@ -209,7 +209,7 @@ def _read_wikidata_entities_json(limit=None, to_print=False): line = file.readline() cnt = 0 while line and (not limit or cnt < limit): - if cnt % 100000 == 0: + if cnt % 500000 == 0: print(datetime.datetime.now(), "processed", cnt, "lines of WikiData dump") clean_line = line.strip() if clean_line.endswith(b","): @@ -307,7 +307,7 @@ def _read_wikidata_entities_regex_depr(limit=None, to_print=False): line = file.readline() cnt = 0 while line and (not limit or cnt < limit): - if cnt % 100000 == 0: + if cnt % 500000 == 0: print(datetime.datetime.now(), "processed", cnt, "lines of WikiData dump") clean_line = line.strip() if clean_line.endswith(b","): From 1ae41daaa92e4099d4e15c7b5a9801ad7994ad68 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 1 May 2019 23:05:40 +0200 Subject: [PATCH 022/148] allow small rounding errors --- examples/pipeline/wikidata_entity_linking.py | 9 ++++++--- spacy/kb.pyx | 4 ++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 287e4a50b..02a766d0f 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -61,13 +61,13 @@ def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False): entity_frequencies = _get_entity_frequencies(entities=title_list) print() - print("3. _add_entities", datetime.datetime.now()) + print("3. adding", len(entity_list), "entities", datetime.datetime.now()) print() kb.set_entities(entity_list=entity_list, prob_list=entity_frequencies, vector_list=None, feature_list=None) # _add_entities(kb, entities=entity_list, probs=entity_frequencies, to_print=to_print) print() - print("4. _add_aliases", datetime.datetime.now()) + print("4. 
adding aliases", datetime.datetime.now()) print() _add_aliases(kb, title_to_id=title_to_id, max_entities_per_alias=max_entities_per_alias, min_occ=min_occ,) @@ -171,7 +171,10 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, to_print=Fals prior_probs.append(p_entity_givenalias) if selected_entities: - kb.add_alias(alias=previous_alias, entities=selected_entities, probabilities=prior_probs) + try: + kb.add_alias(alias=previous_alias, entities=selected_entities, probabilities=prior_probs) + except ValueError as e: + print(e) total_count = 0 counts = list() entities = list() diff --git a/spacy/kb.pyx b/spacy/kb.pyx index ba870661d..d471130d0 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -179,9 +179,9 @@ cdef class KnowledgeBase: entities_length=len(entities), probabilities_length=len(probabilities))) - # Throw an error if the probabilities sum up to more than 1 + # Throw an error if the probabilities sum up to more than 1 (allow for some rounding errors) prob_sum = sum(probabilities) - if prob_sum > 1: + if prob_sum > 1.00001: raise ValueError(Errors.E133.format(alias=alias, sum=prob_sum)) cdef hash_t alias_hash = self.vocab.strings.add(alias) From 835355219123d4502eb3157a3700b6a7d3ae06d2 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 1 May 2019 23:26:16 +0200 Subject: [PATCH 023/148] cleanup --- examples/pipeline/wikidata_entity_linking.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 02a766d0f..e293be90f 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -49,7 +49,7 @@ def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False): print() print("1. _read_wikidata_entities", datetime.datetime.now()) print() - # title_to_id = _read_wikidata_entities_regex(limit=1000) + # title_to_id = _read_wikidata_entities_regex_depr(limit=1000) title_to_id = _read_wikidata_entities_json(limit=None) title_list = list(title_to_id.keys()) @@ -64,7 +64,6 @@ def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False): print("3. adding", len(entity_list), "entities", datetime.datetime.now()) print() kb.set_entities(entity_list=entity_list, prob_list=entity_frequencies, vector_list=None, feature_list=None) - # _add_entities(kb, entities=entity_list, probs=entity_frequencies, to_print=to_print) print() print("4. 
adding aliases", datetime.datetime.now()) @@ -128,14 +127,6 @@ def _write_entity_counts(to_print=False): print("Total count:", total_count) -def _add_entities_depr(kb, entities, probs, to_print=False): - for entity, prob in zip(entities, probs): - kb.add_entity(entity=entity, prob=prob) - - if to_print: - print("added", kb.get_size_entities(), "entities:", kb.get_entity_strings()) - - def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, to_print=False): wp_titles = title_to_id.keys() @@ -553,7 +544,7 @@ if __name__ == "__main__": to_create_prior_probs = False to_create_entity_counts = False - to_create_kb = True + to_create_kb = False to_read_kb = True # STEP 1 : create prior probabilities from WP From 581dc9742d2a7dc790bab7fe59993de8b1279b3b Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 2 May 2019 17:09:56 +0200 Subject: [PATCH 024/148] parsing clean text from WP articles to use as input data for NER and NEL --- examples/pipeline/wikidata_entity_linking.py | 492 ++++++++++++------- 1 file changed, 320 insertions(+), 172 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index e293be90f..e6df39631 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -10,9 +10,13 @@ import json import spacy import datetime import bz2 + from spacy.kb import KnowledgeBase from spacy.vocab import Vocab +# requires: pip install neuralcoref --no-binary neuralcoref +# import neuralcoref + # TODO: remove hardcoded paths WIKIDATA_JSON = 'C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.json.bz2' ENWIKI_DUMP = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream.xml.bz2' @@ -20,6 +24,7 @@ ENWIKI_INDEX = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-ar PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv' ENTITY_COUNTS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_freq.csv' +ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv' KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb' VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab' @@ -43,7 +48,151 @@ wiki_namespaces = ["b", "betawikiversity", "Book", "c", "Category", "Commons", map_alias_to_link = dict() -def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False): +def read_wikipedia_prior_probs(): + """ + STEP 1: Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities + The full file takes about 2h to parse 1100M lines (update printed every 5M lines). + It works relatively fast because we don't care about which article we parsed the interwiki from, + we just process line by line. 
+ """ + + with bz2.open(ENWIKI_DUMP, mode='rb') as file: + line = file.readline() + cnt = 0 + while line: + if cnt % 5000000 == 0: + print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump") + clean_line = line.strip().decode("utf-8") + + aliases, entities, normalizations = _get_wp_links(clean_line) + for alias, entity, norm in zip(aliases, entities, normalizations): + _store_alias(alias, entity, normalize_alias=norm, normalize_entity=True) + _store_alias(alias, entity, normalize_alias=norm, normalize_entity=True) + + line = file.readline() + cnt += 1 + + # write all aliases and their entities and occurrences to file + with open(PRIOR_PROB, mode='w', encoding='utf8') as outputfile: + outputfile.write("alias" + "|" + "count" + "|" + "entity" + "\n") + for alias, alias_dict in sorted(map_alias_to_link.items(), key=lambda x: x[0]): + for entity, count in sorted(alias_dict.items(), key=lambda x: x[1], reverse=True): + outputfile.write(alias + "|" + str(count) + "|" + entity + "\n") + + +# find the links +link_regex = re.compile(r'\[\[[^\[\]]*\]\]') + +# match on interwiki links, e.g. `en:` or `:fr:` +ns_regex = r":?" + "[a-z][a-z]" + ":" + +# match on Namespace: optionally preceded by a : +for ns in wiki_namespaces: + ns_regex += "|" + ":?" + ns + ":" + +ns_regex = re.compile(ns_regex, re.IGNORECASE) + + +def _get_wp_links(text): + aliases = [] + entities = [] + normalizations = [] + + matches = link_regex.findall(text) + for match in matches: + match = match[2:][:-2].replace("_", " ").strip() + + if ns_regex.match(match): + pass # ignore namespaces at the beginning of the string + + # this is a simple link, with the alias the same as the mention + elif "|" not in match: + aliases.append(match) + entities.append(match) + normalizations.append(True) + + # in wiki format, the link is written as [[entity|alias]] + else: + splits = match.split("|") + entity = splits[0].strip() + alias = splits[1].strip() + # specific wiki format [[alias (specification)|]] + if len(alias) == 0 and "(" in entity: + alias = entity.split("(")[0] + aliases.append(alias) + entities.append(entity) + normalizations.append(False) + else: + aliases.append(alias) + entities.append(entity) + normalizations.append(False) + + return aliases, entities, normalizations + + +def _store_alias(alias, entity, normalize_alias=False, normalize_entity=True): + alias = alias.strip() + entity = entity.strip() + + # remove everything after # as this is not part of the title but refers to a specific paragraph + if normalize_entity: + # wikipedia titles are always capitalized + entity = _capitalize_first(entity.split("#")[0]) + if normalize_alias: + alias = alias.split("#")[0] + + if alias and entity: + alias_dict = map_alias_to_link.get(alias, dict()) + entity_count = alias_dict.get(entity, 0) + alias_dict[entity] = entity_count + 1 + map_alias_to_link[alias] = alias_dict + + +def _capitalize_first(text): + if not text: + return None + result = text[0].capitalize() + if len(result) > 0: + result += text[1:] + return result + + +def write_entity_counts(to_print=False): + """ STEP 2: write entity counts """ + entity_to_count = dict() + total_count = 0 + + with open(PRIOR_PROB, mode='r', encoding='utf8') as prior_file: + # skip header + prior_file.readline() + line = prior_file.readline() + + while line: + splits = line.replace('\n', "").split(sep='|') + # alias = splits[0] + count = int(splits[1]) + entity = splits[2] + + current_count = entity_to_count.get(entity, 0) + entity_to_count[entity] = current_count + count + + 
total_count += count + + line = prior_file.readline() + + with open(ENTITY_COUNTS, mode='w', encoding='utf8') as entity_file: + entity_file.write("entity" + "|" + "count" + "\n") + for entity, count in entity_to_count.items(): + entity_file.write(entity + "|" + str(count) + "\n") + + if to_print: + for entity, count in entity_to_count.items(): + print("Entity count:", entity, count) + print("Total count:", total_count) + + +def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False, write_entity_defs=True): + """ STEP 3: create the knowledge base """ kb = KnowledgeBase(vocab=vocab) print() @@ -52,6 +201,13 @@ def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False): # title_to_id = _read_wikidata_entities_regex_depr(limit=1000) title_to_id = _read_wikidata_entities_json(limit=None) + # write the title-ID mapping to file + if write_entity_defs: + with open(ENTITY_DEFS, mode='w', encoding='utf8') as entity_file: + entity_file.write("WP_title" + "|" + "WD_id" + "\n") + for title, qid in title_to_id.items(): + entity_file.write(title + "|" + str(qid) + "\n") + title_list = list(title_to_id.keys()) entity_list = [title_to_id[x] for x in title_list] @@ -94,37 +250,16 @@ def _get_entity_frequencies(entities): return [entity_to_count.get(e, 0) for e in entities] -def _write_entity_counts(to_print=False): - entity_to_count = dict() - total_count = 0 - - with open(PRIOR_PROB, mode='r', encoding='utf8') as prior_file: +def _get_entity_to_id(): + entity_to_id = dict() + with open(ENTITY_DEFS, 'r', encoding='utf8') as csvfile: + csvreader = csv.reader(csvfile, delimiter='|') # skip header - prior_file.readline() - line = prior_file.readline() + next(csvreader) + for row in csvreader: + entity_to_id[row[0]] = row[1] - while line: - splits = line.replace('\n', "").split(sep='|') - # alias = splits[0] - count = int(splits[1]) - entity = splits[2] - - current_count = entity_to_count.get(entity, 0) - entity_to_count[entity] = current_count + count - - total_count += count - - line = prior_file.readline() - - with open(ENTITY_COUNTS, mode='w', encoding='utf8') as entity_file: - entity_file.write("entity" + "|" + "count" + "\n") - for entity, count in entity_to_count.items(): - entity_file.write(entity + "|" + str(count) + "\n") - - if to_print: - for entity, count in entity_to_count.items(): - print("Entity count:", entity, count) - print("Total count:", total_count) + return entity_to_id def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, to_print=False): @@ -337,85 +472,60 @@ def _read_wikidata_entities_regex_depr(limit=None, to_print=False): return title_to_id -def _read_wikipedia_prior_probs(): - """ Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities - The full file takes about 2h to parse 1100M lines (update printed every 5M lines) - """ +def test_kb(kb): + # TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO + nlp = spacy.load('en_core_web_sm') - # find the links - link_regex = re.compile(r'\[\[[^\[\]]*\]\]') + el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb}) + nlp.add_pipe(el_pipe, last=True) - # match on interwiki links, e.g. `en:` or `:fr:` - ns_regex = r":?" + "[a-z][a-z]" + ":" + candidates = my_kb.get_candidates("Bush") - # match on Namespace: optionally preceded by a : - for ns in wiki_namespaces: - ns_regex += "|" + ":?" 
+ ns + ":" + print("generating candidates for 'Bush' :") + for c in candidates: + print(" ", c.prior_prob, c.alias_, "-->", c.entity_ + " (freq=" + str(c.entity_freq) + ")") + print() - ns_regex = re.compile(ns_regex, re.IGNORECASE) + text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \ + "Douglas reminds us to always bring our towel. " \ + "The main character in Doug's novel is the man Arthur Dent, " \ + "but Douglas doesn't write about George Washington or Homer Simpson." + doc = nlp(text) - with bz2.open(ENWIKI_DUMP, mode='rb') as file: - line = file.readline() - cnt = 0 - while line: - if cnt % 5000000 == 0: - print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump") - clean_line = line.strip().decode("utf-8") - - matches = link_regex.findall(clean_line) - for match in matches: - match = match[2:][:-2].replace("_", " ").strip() - - if ns_regex.match(match): - pass # ignore namespaces at the beginning of the string - - # this is a simple link, with the alias the same as the mention - elif "|" not in match: - _store_alias(match, match, normalize_alias=True, normalize_entity=True) - - # in wiki format, the link is written as [[entity|alias]] - else: - splits = match.split("|") - entity = splits[0].strip() - alias = splits[1].strip() - # specific wiki format [[alias (specification)|]] - if len(alias) == 0 and "(" in entity: - alias = entity.split("(")[0] - _store_alias(alias, entity, normalize_alias=False, normalize_entity=True) - else: - _store_alias(alias, entity, normalize_alias=False, normalize_entity=True) - - line = file.readline() - cnt += 1 - - # write all aliases and their entities and occurrences to file - with open(PRIOR_PROB, mode='w', encoding='utf8') as outputfile: - outputfile.write("alias" + "|" + "count" + "|" + "entity" + "\n") - for alias, alias_dict in sorted(map_alias_to_link.items(), key=lambda x: x[0]): - for entity, count in sorted(alias_dict.items(), key=lambda x: x[1], reverse=True): - outputfile.write(alias + "|" + str(count) + "|" + entity + "\n") + for ent in doc.ents: + print("ent", ent.text, ent.label_, ent.kb_id_) -def _store_alias(alias, entity, normalize_alias=False, normalize_entity=True): - alias = alias.strip() - entity = entity.strip() +def add_coref(): + """ STEP 5: add coreference resolution to our model """ + nlp = spacy.load('en_core_web_sm') + # nlp = spacy.load('en') - # remove everything after # as this is not part of the title but refers to a specific paragraph - if normalize_entity: - # wikipedia titles are always capitalized - entity = capitalize_first(entity.split("#")[0]) - if normalize_alias: - alias = alias.split("#")[0] + # TODO: this doesn't work yet + # neuralcoref.add_to_pipe(nlp) + print("done adding to pipe") - if alias and entity: - alias_dict = map_alias_to_link.get(alias, dict()) - entity_count = alias_dict.get(entity, 0) - alias_dict[entity] = entity_count + 1 - map_alias_to_link[alias] = alias_dict + doc = nlp(u'My sister has a dog. 
She loves him.') + print("done doc") + + print(doc._.has_coref) + print(doc._.coref_clusters) -def _read_wikipedia(): - """ Read the XML wikipedia data """ +def create_training(): + nlp = spacy.load('en_core_web_sm') + wp_to_id = _get_entity_to_id() + _read_wikipedia(nlp, wp_to_id, limit=10000) + + +def _read_wikipedia(nlp, wp_to_id, limit=None): + """ Read the XML wikipedia data to parse out training data """ + + # regex_id = re.compile(r'\"id\":"Q[0-9]*"', re.UNICODE) + # regex_title = re.compile(r'\"title\":"[^"]*"', re.UNICODE) + + title_regex = re.compile(r'(?<=).*(?=)') + id_regex = re.compile(r'(?<=)\d*(?=)') with bz2.open(ENWIKI_DUMP, mode='rb') as file: line = file.readline() @@ -424,19 +534,19 @@ def _read_wikipedia(): article_title = None article_id = None reading_text = False - while line and cnt < 1000000: + while line and (not limit or cnt < limit): clean_line = line.strip().decode("utf-8") # Start reading new page if clean_line == "": article_text = "" article_title = None - article_id = 342 + article_id = None # finished reading this page elif clean_line == "": if article_id: - _store_wp_article(article_id, article_title, article_text.strip()) + _process_wp_text(nlp, wp_to_id, article_id, article_title, article_text.strip()) # start reading text within a page if ")\d*(?=)", clean_line) + ids = id_regex.search(clean_line) if ids: article_id = ids[0] # read the title of this article - titles = re.findall(r"(?<=).*(?=)", clean_line) + titles = title_regex.search(clean_line) if titles: article_title = titles[0].strip() @@ -463,107 +573,145 @@ def _read_wikipedia(): cnt += 1 -def _store_wp_article(article_id, article_title, article_text): - pass +def _process_wp_text(nlp, wp_to_id, article_id, article_title, article_text): + # remove the text tags + text_regex = re.compile(r'(?<=).*(?=)') + text = text_regex.search(article_text).group(0) + + # stop processing if this is a redirect page + if text.startswith("#REDIRECT"): + return + print("WP article", article_id, ":", article_title) - print(article_text) - print(_get_clean_wp_text(article_text)) + + article_dict = dict() + aliases, entities, normalizations = _get_wp_links(text) + for alias, entity, norm in zip(aliases, entities, normalizations): + entity_id = wp_to_id.get(entity) + if entity_id: + # print(" ", alias, '-->', entity, '-->', entity_id) + article_dict[alias] = entity_id + article_dict[entity] = entity_id + + # get the raw text without markup etc + clean_text = _get_clean_wp_text(text) + + #print(text) + print(clean_text) print() + _run_ner(nlp, article_id, article_title, clean_text, article_dict) + + +info_regex = re.compile(r'{[^{]*?}') +interwiki_regex = re.compile(r'\[\[([^|]*?)]]') +interwiki_2_regex = re.compile(r'\[\[[^|]*?\|([^|]*?)]]') +htlm_regex = re.compile(r'<!--[^!]*-->') +category_regex = re.compile(r'\[\[Category:[^\[]*]]') +file_regex = re.compile(r'\[\[File:[^[\]]+]]') +ref_regex = re.compile(r'<ref.*?>') # non-greedy +ref_2_regex = re.compile(r'</ref.*?>') # non-greedy + def _get_clean_wp_text(article_text): - # TODO: compile the regular expressions + clean_text = article_text.strip() - # remove Category and File statements - clean_text = re.sub(r'\[\[Category:[^\[]*]]', '', article_text) - print("1", clean_text) - clean_text = re.sub(r'\[\[File:[^\[]*]]', '', clean_text) # TODO: this doesn't work yet - print("2", clean_text) - - # remove bolding markup - clean_text = re.sub('\'\'\'', '', clean_text) - clean_text = re.sub('\'\'', '', clean_text) + # remove bolding & italic markup + clean_text = 
clean_text.replace('\'\'\'', '') + clean_text = clean_text.replace('\'\'', '') # remove nested {{info}} statements by removing the inner/smallest ones first and iterating try_again = True previous_length = len(clean_text) while try_again: - clean_text = re.sub('{[^{]*?}', '', clean_text) # non-greedy match excluding a nested { + clean_text = info_regex.sub('', clean_text) # non-greedy match excluding a nested { if len(clean_text) < previous_length: try_again = True else: try_again = False previous_length = len(clean_text) - # remove multiple spaces - while ' ' in clean_text: - clean_text = re.sub(' ', ' ', clean_text) - # remove simple interwiki links (no alternative name) - clean_text = re.sub('\[\[([^|]*?)]]', r'\1', clean_text) + clean_text = interwiki_regex.sub(r'\1', clean_text) # remove simple interwiki links by picking the alternative name - clean_text = re.sub(r'\[\[[^|]*?\|([^|]*?)]]', r'\1', clean_text) + clean_text = interwiki_2_regex.sub(r'\1', clean_text) # remove HTML comments - clean_text = re.sub('<!--[^!]*-->', '', clean_text) + clean_text = htlm_regex.sub('', clean_text) - return clean_text + # remove Category and File statements + clean_text = category_regex.sub('', clean_text) + clean_text = file_regex.sub('', clean_text) + + # remove multiple = + while '==' in clean_text: + clean_text = clean_text.replace("==", "=") + + clean_text = clean_text.replace(". =", ".") + clean_text = clean_text.replace(" = ", ". ") + clean_text = clean_text.replace("= ", ".") + clean_text = clean_text.replace(" =", "") + + # remove refs (non-greedy match) + clean_text = ref_regex.sub('', clean_text) + clean_text = ref_2_regex.sub('', clean_text) + + # remove additional wikiformatting + clean_text = re.sub(r'<blockquote>', '', clean_text) + clean_text = re.sub(r'</blockquote>', '', clean_text) + + # change special characters back to normal ones + clean_text = clean_text.replace(r'<', '<') + clean_text = clean_text.replace(r'>', '>') + clean_text = clean_text.replace(r'"', '"') + clean_text = clean_text.replace(r'&nbsp;', ' ') + clean_text = clean_text.replace(r'&', '&') + + # remove multiple spaces + while ' ' in clean_text: + clean_text = clean_text.replace(' ', ' ') + + return clean_text.strip() -def add_el(kb, nlp): - el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb}) - nlp.add_pipe(el_pipe, last=True) - - text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \ - "Douglas reminds us to always bring our towel. " \ - "The main character in Doug's novel is the man Arthur Dent, " \ - "but Douglas doesn't write about George Washington or Homer Simpson." - doc = nlp(text) - - print() - for token in doc: - print("token", token.text, token.ent_type_, token.ent_kb_id_) - - print() - for ent in doc.ents: - print("ent", ent.text, ent.label_, ent.kb_id_) - - -def capitalize_first(text): - if not text: - return None - result = text[0].capitalize() - if len(result) > 0: - result += text[1:] - return result - +def _run_ner(nlp, article_id, article_title, clean_text, article_dict): + pass # TODO if __name__ == "__main__": print("START", datetime.datetime.now()) + print() + my_kb = None + # one-time methods to create KB and write to file to_create_prior_probs = False to_create_entity_counts = False to_create_kb = False - to_read_kb = True + + # read KB back in from file + to_read_kb = False + to_test_kb = False + + create_wp_training = True # STEP 1 : create prior probabilities from WP # run only once ! 
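For reference, the prior probabilities that the alias counts from STEP 1 feed into kb.add_alias() are nothing more than normalised link counts: P(entity | alias) = count(alias -> entity) / total count of that alias. A tiny worked example with invented counts and placeholder QIDs:

# hypothetical link counts collected for one alias in STEP 1
raw_counts = {"Q1": 80, "Q2": 15, "Q3": 5}
total = sum(raw_counts.values())
prior_probs = {qid: count / total for qid, count in raw_counts.items()}
print(prior_probs)   # {'Q1': 0.8, 'Q2': 0.15, 'Q3': 0.05}
# the values sum to (roughly) 1, which is what the rounding-tolerant
# check in kb.add_alias expects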
if to_create_prior_probs: print("STEP 1: to_create_prior_probs", datetime.datetime.now()) - _read_wikipedia_prior_probs() + read_wikipedia_prior_probs() print() # STEP 2 : deduce entity frequencies from WP # run only once ! if to_create_entity_counts: print("STEP 2: to_create_entity_counts", datetime.datetime.now()) - _write_entity_counts() + write_entity_counts() print() + # STEP 3 : create KB and write to file + # run only once ! if to_create_kb: - # STEP 3 : create KB - print("STEP 3: to_create_kb", datetime.datetime.now()) + print("STEP 3a: to_create_kb", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_sm') my_vocab = my_nlp.vocab my_kb = create_kb(my_vocab, max_entities_per_alias=10, min_occ=5, to_print=False) @@ -571,15 +719,14 @@ if __name__ == "__main__": print("kb aliases:", my_kb.get_size_aliases()) print() - # STEP 4 : write KB to file - print("STEP 4: write KB", datetime.datetime.now()) + print("STEP 3b: write KB", datetime.datetime.now()) my_kb.dump(KB_FILE) my_vocab.to_disk(VOCAB_DIR) print() + # STEP 4 : read KB back in from file if to_read_kb: - # STEP 5 : read KB back in from file - print("STEP 5: to_read_kb", datetime.datetime.now()) + print("STEP 4: to_read_kb", datetime.datetime.now()) my_vocab = Vocab() my_vocab.from_disk(VOCAB_DIR) my_kb = KnowledgeBase(vocab=my_vocab) @@ -589,16 +736,17 @@ if __name__ == "__main__": print() # test KB - candidates = my_kb.get_candidates("Bush") - for c in candidates: - print("entity:", c.entity_) - print("entity freq:", c.entity_freq) - print("alias:", c.alias_) - print("prior prob:", c.prior_prob) + if to_test_kb: + test_kb(my_kb) print() - # STEP 6: add KB to NLP pipeline - # print("STEP 6: use KB", datetime.datetime.now()) - # add_el(my_kb, nlp) + # STEP 5: create a training dataset from WP + if create_wp_training: + print("STEP 5: create training dataset", datetime.datetime.now()) + create_training() + # TODO coreference resolution + # add_coref() + + print() print("STOP", datetime.datetime.now()) From cba9680d13cd2cc1b5a2af9d82acf378dce8fede Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 2 May 2019 17:24:52 +0200 Subject: [PATCH 025/148] run NER on clean WP text and link to gold-standard entity IDs --- examples/pipeline/wikidata_entity_linking.py | 22 +++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index e6df39631..a0ffc3618 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -515,15 +515,12 @@ def add_coref(): def create_training(): nlp = spacy.load('en_core_web_sm') wp_to_id = _get_entity_to_id() - _read_wikipedia(nlp, wp_to_id, limit=10000) + _read_wikipedia_texts(nlp, wp_to_id, limit=10000) -def _read_wikipedia(nlp, wp_to_id, limit=None): +def _read_wikipedia_texts(nlp, wp_to_id, limit=None): """ Read the XML wikipedia data to parse out training data """ - # regex_id = re.compile(r'\"id\":"Q[0-9]*"', re.UNICODE) - # regex_title = re.compile(r'\"title\":"[^"]*"', re.UNICODE) - title_regex = re.compile(r'(?<=).*(?=)') id_regex = re.compile(r'(?<=)\d*(?=)') @@ -589,18 +586,15 @@ def _process_wp_text(nlp, wp_to_id, article_id, article_title, article_text): for alias, entity, norm in zip(aliases, entities, normalizations): entity_id = wp_to_id.get(entity) if entity_id: - # print(" ", alias, '-->', entity, '-->', entity_id) article_dict[alias] = entity_id article_dict[entity] = entity_id # get the raw text without markup etc 
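    # Roughly, per the regexes defined further down, _get_clean_wp_text() strips
    # the leftover wiki markup so the remaining text can serve as raw training
    # input: nested {{...}} templates (removed inner-first, iteratively),
    # [[...]] links (keeping only their surface text), HTML comments,
    # Category:/File: statements, <ref> tags, heading '=' markers, escaped HTML
    # entities and redundant whitespace.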
clean_text = _get_clean_wp_text(text) - - #print(text) print(clean_text) - print() _run_ner(nlp, article_id, article_title, clean_text, article_dict) + print() info_regex = re.compile(r'{[^{]*?}') @@ -676,7 +670,15 @@ def _get_clean_wp_text(article_text): def _run_ner(nlp, article_id, article_title, clean_text, article_dict): - pass # TODO + doc = nlp(clean_text) + for ent in doc.ents: + if ent.label_ == "PERSON": # TODO: expand to non-persons + ent_id = article_dict.get(ent.text) + if ent_id: + print(" -", ent.text, ent.label_, ent_id) + else: + print(" -", ent.text, ent.label_, '???') # TODO: investigate these cases + if __name__ == "__main__": print("START", datetime.datetime.now()) From bbcb9da466d33c7ac118d8aa6cce67961a39ec9f Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 3 May 2019 10:44:29 +0200 Subject: [PATCH 026/148] creating training data with clean WP texts and QID entities true/false --- examples/pipeline/wikidata_entity_linking.py | 92 ++++++++++++++++---- 1 file changed, 76 insertions(+), 16 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index a0ffc3618..cf388773a 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -29,6 +29,8 @@ ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv' KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb' VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab' +TRAINING_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel/' + # these will/should be matched ignoring case wiki_namespaces = ["b", "betawikiversity", "Book", "c", "Category", "Commons", @@ -224,7 +226,7 @@ def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False, write_enti print() print("4. 
adding aliases", datetime.datetime.now()) print() - _add_aliases(kb, title_to_id=title_to_id, max_entities_per_alias=max_entities_per_alias, min_occ=min_occ,) + _add_aliases(kb, title_to_id=title_to_id, max_entities_per_alias=max_entities_per_alias, min_occ=min_occ) # TODO: read wikipedia texts for entity context # _read_wikipedia() @@ -512,18 +514,27 @@ def add_coref(): print(doc._.coref_clusters) -def create_training(): - nlp = spacy.load('en_core_web_sm') +def create_training(kb): + if not kb: + raise ValueError("kb should be defined") + # nlp = spacy.load('en_core_web_sm') wp_to_id = _get_entity_to_id() - _read_wikipedia_texts(nlp, wp_to_id, limit=10000) + _read_wikipedia_texts(kb, wp_to_id, limit=None) -def _read_wikipedia_texts(nlp, wp_to_id, limit=None): +def _read_wikipedia_texts(kb, wp_to_id, limit=None): """ Read the XML wikipedia data to parse out training data """ title_regex = re.compile(r'(?<=).*(?=)') id_regex = re.compile(r'(?<=)\d*(?=)') + # read entity training header file + _write_training_entity(article_id="article_id", + alias="alias", + entity="entity", + correct="correct", + append=False) + with bz2.open(ENWIKI_DUMP, mode='rb') as file: line = file.readline() cnt = 1 @@ -532,6 +543,8 @@ def _read_wikipedia_texts(nlp, wp_to_id, limit=None): article_id = None reading_text = False while line and (not limit or cnt < limit): + if cnt % 500000 == 0: + print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump") clean_line = line.strip().decode("utf-8") # Start reading new page @@ -543,7 +556,7 @@ def _read_wikipedia_texts(nlp, wp_to_id, limit=None): # finished reading this page elif clean_line == "": if article_id: - _process_wp_text(nlp, wp_to_id, article_id, article_title, article_text.strip()) + _process_wp_text(kb, wp_to_id, article_id, article_title, article_text.strip()) # start reading text within a page if ").*(?=)') text = text_regex.search(article_text).group(0) @@ -579,7 +592,14 @@ def _process_wp_text(nlp, wp_to_id, article_id, article_title, article_text): if text.startswith("#REDIRECT"): return - print("WP article", article_id, ":", article_title) + # print("WP article", article_id, ":", article_title) + # print() + # print(text) + + # get the raw text without markup etc + clean_text = _get_clean_wp_text(text) + # print() + # print(clean_text) article_dict = dict() aliases, entities, normalizations = _get_wp_links(text) @@ -589,12 +609,37 @@ def _process_wp_text(nlp, wp_to_id, article_id, article_title, article_text): article_dict[alias] = entity_id article_dict[entity] = entity_id - # get the raw text without markup etc - clean_text = _get_clean_wp_text(text) - print(clean_text) + # print("found entities:") + for alias, entity in article_dict.items(): + # print(alias, "-->", entity) + candidates = kb.get_candidates(alias) - _run_ner(nlp, article_id, article_title, clean_text, article_dict) - print() + # as training data, we only store entities that are sufficiently ambiguous + if len(candidates) > 1: + _write_training_article(article_id=article_id, clean_text=clean_text) + # print("alias", alias) + + # print all incorrect candidates + for c in candidates: + if entity != c.entity_: + _write_training_entity(article_id=article_id, + alias=alias, + entity=c.entity_, + correct="0", + append=True) + + # print the one correct candidate + _write_training_entity(article_id=article_id, + alias=alias, + entity=entity, + correct="1", + append=True) + + # print("gold entity", entity) + # print() + + # _run_ner_depr(nlp, article_id, article_title, 
clean_text, article_dict) + # print() info_regex = re.compile(r'{[^{]*?}') @@ -669,7 +714,22 @@ def _get_clean_wp_text(article_text): return clean_text.strip() -def _run_ner(nlp, article_id, article_title, clean_text, article_dict): +def _write_training_article(article_id, clean_text): + file_loc = TRAINING_SET_DIR + "/" + str(article_id) + ".txt" + with open(file_loc, mode='w', encoding='utf8') as outputfile: + outputfile.write(clean_text) + + +def _write_training_entity(article_id, alias, entity, correct, append=True): + mode = "w" + if append: + mode = "a" + file_loc = TRAINING_SET_DIR + "/" + "gold_entities.csv" + with open(file_loc, mode=mode, encoding='utf8') as outputfile: + outputfile.write(article_id + "|" + alias + "|" + entity + "|" + correct + "\n") + + +def _run_ner_depr(nlp, article_id, article_title, clean_text, article_dict): doc = nlp(clean_text) for ent in doc.ents: if ent.label_ == "PERSON": # TODO: expand to non-persons @@ -691,7 +751,7 @@ if __name__ == "__main__": to_create_kb = False # read KB back in from file - to_read_kb = False + to_read_kb = True to_test_kb = False create_wp_training = True @@ -745,7 +805,7 @@ if __name__ == "__main__": # STEP 5: create a training dataset from WP if create_wp_training: print("STEP 5: create training dataset", datetime.datetime.now()) - create_training() + create_training(my_kb) # TODO coreference resolution # add_coref() From 34600c92bd5be2948debf465b9de9c2f3f2f16ee Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 3 May 2019 15:10:09 +0200 Subject: [PATCH 027/148] try catch per article to ensure the pipeline goes on --- examples/pipeline/wikidata_entity_linking.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index cf388773a..a9be49742 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -537,7 +537,7 @@ def _read_wikipedia_texts(kb, wp_to_id, limit=None): with bz2.open(ENWIKI_DUMP, mode='rb') as file: line = file.readline() - cnt = 1 + cnt = 0 article_text = "" article_title = None article_id = None @@ -556,7 +556,12 @@ def _read_wikipedia_texts(kb, wp_to_id, limit=None): # finished reading this page elif clean_line == "": if article_id: - _process_wp_text(kb, wp_to_id, article_id, article_title, article_text.strip()) + try: + _process_wp_text(kb, wp_to_id, article_id, article_title, article_text.strip()) + # on a previous run, an error occurred after 46M lines and 2h + except Exception as e: + print("Error processing article", article_id, article_title) + print(e) # start reading text within a page if ").*(?=)') + text_regex = re.compile(r'(?<=).*(?= Date: Fri, 3 May 2019 17:37:47 +0200 Subject: [PATCH 028/148] fix WP id parsing, speed up processing and remove ambiguous strings in one doc (for now) --- examples/pipeline/wikidata_entity_linking.py | 187 +++++++++++-------- 1 file changed, 110 insertions(+), 77 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index a9be49742..0db7f4665 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -29,7 +29,8 @@ ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv' KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb' VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab' -TRAINING_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel/' 
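A possible reader for the training files produced above (a sketch, not something these patches include), assuming the layout written by _write_training_entity() (pipe-separated article_id|alias|entity|correct rows under a header) and _write_training_article() (one <article_id>.txt file per annotated article):

import csv
from pathlib import Path

def read_training_set(training_dir):
    # collect the gold (alias, entity, correct) annotations per article
    gold_by_article = dict()
    with open(Path(training_dir) / "gold_entities.csv", encoding="utf8") as f:
        reader = csv.reader(f, delimiter="|")
        next(reader)   # skip the article_id|alias|entity|correct header
        for article_id, alias, entity, correct in reader:
            gold_by_article.setdefault(article_id, []).append((alias, entity, correct == "1"))

    # pair each annotated article with its cleaned text, if it was written out
    for article_id, mentions in gold_by_article.items():
        text_path = Path(training_dir) / (str(article_id) + ".txt")
        if text_path.exists():
            yield article_id, text_path.read_text(encoding="utf8"), mentions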
+TRAINING_OUTPUT_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel/' +TRAINING_INPUT_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel_sample_3may2019/' # these will/should be matched ignoring case @@ -523,74 +524,104 @@ def create_training(kb): def _read_wikipedia_texts(kb, wp_to_id, limit=None): - """ Read the XML wikipedia data to parse out training data """ + """ + Read the XML wikipedia data to parse out training data: + raw text data + positive and negative instances + """ title_regex = re.compile(r'(?<=).*(?=)') id_regex = re.compile(r'(?<=)\d*(?=)') - # read entity training header file - _write_training_entity(article_id="article_id", - alias="alias", - entity="entity", - correct="correct", - append=False) + read_ids = set() - with bz2.open(ENWIKI_DUMP, mode='rb') as file: - line = file.readline() - cnt = 0 - article_text = "" - article_title = None - article_id = None - reading_text = False - while line and (not limit or cnt < limit): - if cnt % 500000 == 0: - print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump") - clean_line = line.strip().decode("utf-8") - - # Start reading new page - if clean_line == "": - article_text = "" - article_title = None - article_id = None - - # finished reading this page - elif clean_line == "": - if article_id: - try: - _process_wp_text(kb, wp_to_id, article_id, article_title, article_text.strip()) - # on a previous run, an error occurred after 46M lines and 2h - except Exception as e: - print("Error processing article", article_id, article_title) - print(e) - - # start reading text within a page - if "": + reading_revision = True + elif clean_line == "": + reading_revision = False + + # Start reading new page + if clean_line == "": + article_text = "" + article_title = None + article_id = None + + # finished reading this page + elif clean_line == "": + if article_id: + try: + _process_wp_text(kb, wp_to_id, entityfile, article_id, article_title, article_text.strip()) + # on a previous run, an error occurred after 46M lines and 2h + except Exception as e: + print("Error processing article", article_id, article_title) + print(e) + else: + print("Done processing a page, but couldn't find an article_id ?") + print(article_title) + print(article_text) + article_text = "" + article_title = None + article_id = None + reading_text = False + reading_revision = False + + # start reading text within a page + if ").*(?=).*(?= Date: Fri, 3 May 2019 18:09:09 +0200 Subject: [PATCH 029/148] run only 100M of WP data as training dataset (9%) --- examples/pipeline/wikidata_entity_linking.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 0db7f4665..4fe97e874 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -520,7 +520,7 @@ def create_training(kb): raise ValueError("kb should be defined") # nlp = spacy.load('en_core_web_sm') wp_to_id = _get_entity_to_id() - _read_wikipedia_texts(kb, wp_to_id, limit=None) + _read_wikipedia_texts(kb, wp_to_id, limit=100000000) # TODO: full dataset def _read_wikipedia_texts(kb, wp_to_id, limit=None): @@ -552,7 +552,7 @@ def _read_wikipedia_texts(kb, wp_to_id, limit=None): reading_text = False reading_revision = False while line and (not limit or cnt < limit): - if cnt % 500000 == 0: + if cnt % 1000000 == 0: print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump") clean_line = 
line.strip().decode("utf-8") # print(clean_line) From 69612155782d586c26532dce0f3816d8befcf41a Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 6 May 2019 10:56:56 +0200 Subject: [PATCH 030/148] refactor code to separate functionality into different files --- examples/pipeline/dummy_entity_linking.py | 6 +- .../pipeline/wiki_entity_linking/__init__.py | 0 .../wiki_entity_linking/kb_creator.py | 137 +++ .../pipeline/wiki_entity_linking/run_el.py | 36 + .../training_set_creator.py | 276 ++++++ .../wiki_entity_linking/wiki_nel_pipeline.py | 103 +++ .../wiki_entity_linking/wikidata_processor.py | 166 ++++ .../wikipedia_processor.py | 187 ++++ examples/pipeline/wikidata_entity_linking.py | 852 ------------------ 9 files changed, 908 insertions(+), 855 deletions(-) create mode 100644 examples/pipeline/wiki_entity_linking/__init__.py create mode 100644 examples/pipeline/wiki_entity_linking/kb_creator.py create mode 100644 examples/pipeline/wiki_entity_linking/run_el.py create mode 100644 examples/pipeline/wiki_entity_linking/training_set_creator.py create mode 100644 examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py create mode 100644 examples/pipeline/wiki_entity_linking/wikidata_processor.py create mode 100644 examples/pipeline/wiki_entity_linking/wikipedia_processor.py delete mode 100644 examples/pipeline/wikidata_entity_linking.py diff --git a/examples/pipeline/dummy_entity_linking.py b/examples/pipeline/dummy_entity_linking.py index e93e3e20b..ae36a57b3 100644 --- a/examples/pipeline/dummy_entity_linking.py +++ b/examples/pipeline/dummy_entity_linking.py @@ -66,6 +66,6 @@ def add_el(kb, nlp): if __name__ == "__main__": - nlp = spacy.load('en_core_web_sm') - my_kb = create_kb(nlp.vocab) - add_el(my_kb, nlp) + my_nlp = spacy.load('en_core_web_sm') + my_kb = create_kb(my_nlp.vocab) + add_el(my_kb, my_nlp) diff --git a/examples/pipeline/wiki_entity_linking/__init__.py b/examples/pipeline/wiki_entity_linking/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/pipeline/wiki_entity_linking/kb_creator.py b/examples/pipeline/wiki_entity_linking/kb_creator.py new file mode 100644 index 000000000..7ca7cfad1 --- /dev/null +++ b/examples/pipeline/wiki_entity_linking/kb_creator.py @@ -0,0 +1,137 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import spacy +from spacy.kb import KnowledgeBase + +import datetime + +from . import wikipedia_processor as wp +from . import wikidata_processor as wd + + +def create_kb(vocab, max_entities_per_alias, min_occ, entity_output, count_input, prior_prob_input, + to_print=False, write_entity_defs=True): + """ Create the knowledge base from Wikidata entries """ + kb = KnowledgeBase(vocab=vocab) + + print() + print("1. _read_wikidata_entities", datetime.datetime.now()) + print() + # title_to_id = _read_wikidata_entities_regex_depr(limit=1000) + title_to_id = wd.read_wikidata_entities_json(limit=None) + + # write the title-ID mapping to file + if write_entity_defs: + with open(entity_output, mode='w', encoding='utf8') as entity_file: + entity_file.write("WP_title" + "|" + "WD_id" + "\n") + for title, qid in title_to_id.items(): + entity_file.write(title + "|" + str(qid) + "\n") + + title_list = list(title_to_id.keys()) + entity_list = [title_to_id[x] for x in title_list] + + print() + print("2. _get_entity_frequencies", datetime.datetime.now()) + print() + entity_frequencies = wp.get_entity_frequencies(count_input=count_input, entities=title_list) + + print() + print("3. 
adding", len(entity_list), "entities", datetime.datetime.now()) + print() + kb.set_entities(entity_list=entity_list, prob_list=entity_frequencies, vector_list=None, feature_list=None) + + print() + print("4. adding aliases", datetime.datetime.now()) + print() + _add_aliases(kb, title_to_id=title_to_id, + max_entities_per_alias=max_entities_per_alias, min_occ=min_occ, + prior_prob_input=prior_prob_input) + + if to_print: + print() + print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases()) + + print("done with kb", datetime.datetime.now()) + + return kb + + +def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_input, to_print=False): + wp_titles = title_to_id.keys() + + if to_print: + print("wp titles:", wp_titles) + + # adding aliases with prior probabilities + with open(prior_prob_input, mode='r', encoding='utf8') as prior_file: + # skip header + prior_file.readline() + line = prior_file.readline() + # we can read this file sequentially, it's sorted by alias, and then by count + previous_alias = None + total_count = 0 + counts = list() + entities = list() + while line: + splits = line.replace('\n', "").split(sep='|') + new_alias = splits[0] + count = int(splits[1]) + entity = splits[2] + + if new_alias != previous_alias and previous_alias: + # done reading the previous alias --> output + if len(entities) > 0: + selected_entities = list() + prior_probs = list() + for ent_count, ent_string in zip(counts, entities): + if ent_string in wp_titles: + wd_id = title_to_id[ent_string] + p_entity_givenalias = ent_count / total_count + selected_entities.append(wd_id) + prior_probs.append(p_entity_givenalias) + + if selected_entities: + try: + kb.add_alias(alias=previous_alias, entities=selected_entities, probabilities=prior_probs) + except ValueError as e: + print(e) + total_count = 0 + counts = list() + entities = list() + + total_count += count + + if len(entities) < max_entities_per_alias and count >= min_occ: + counts.append(count) + entities.append(entity) + previous_alias = new_alias + + line = prior_file.readline() + + if to_print: + print("added", kb.get_size_aliases(), "aliases:", kb.get_alias_strings()) + + +def test_kb(kb): + # TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO + nlp = spacy.load('en_core_web_sm') + + el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb}) + nlp.add_pipe(el_pipe, last=True) + + candidates = kb.get_candidates("Bush") + + print("generating candidates for 'Bush' :") + for c in candidates: + print(" ", c.prior_prob, c.alias_, "-->", c.entity_ + " (freq=" + str(c.entity_freq) + ")") + print() + + text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \ + "Douglas reminds us to always bring our towel. " \ + "The main character in Doug's novel is the man Arthur Dent, " \ + "but Douglas doesn't write about George Washington or Homer Simpson." 
+ doc = nlp(text) + + for ent in doc.ents: + print("ent", ent.text, ent.label_, ent.kb_id_) diff --git a/examples/pipeline/wiki_entity_linking/run_el.py b/examples/pipeline/wiki_entity_linking/run_el.py new file mode 100644 index 000000000..eb8343722 --- /dev/null +++ b/examples/pipeline/wiki_entity_linking/run_el.py @@ -0,0 +1,36 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import spacy + +# requires: pip install neuralcoref --no-binary neuralcoref +# import neuralcoref + + +# TODO +def add_coref(): + """ Add coreference resolution to our model """ + nlp = spacy.load('en_core_web_sm') + # nlp = spacy.load('en') + + # TODO: this doesn't work yet + # neuralcoref.add_to_pipe(nlp) + print("done adding to pipe") + + doc = nlp(u'My sister has a dog. She loves him.') + print("done doc") + + print(doc._.has_coref) + print(doc._.coref_clusters) + + +# TODO +def _run_ner_depr(nlp, clean_text, article_dict): + doc = nlp(clean_text) + for ent in doc.ents: + if ent.label_ == "PERSON": # TODO: expand to non-persons + ent_id = article_dict.get(ent.text) + if ent_id: + print(" -", ent.text, ent.label_, ent_id) + else: + print(" -", ent.text, ent.label_, '???') # TODO: investigate these cases diff --git a/examples/pipeline/wiki_entity_linking/training_set_creator.py b/examples/pipeline/wiki_entity_linking/training_set_creator.py new file mode 100644 index 000000000..e46aeec5b --- /dev/null +++ b/examples/pipeline/wiki_entity_linking/training_set_creator.py @@ -0,0 +1,276 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import csv +import bz2 +import datetime + +from . import wikipedia_processor as wp + +""" +Process Wikipedia interlinks to generate a training dataset for the EL algorithm +""" + + +def create_training(kb, entity_input, training_output): + if not kb: + raise ValueError("kb should be defined") + # nlp = spacy.load('en_core_web_sm') + wp_to_id = _get_entity_to_id(entity_input) + _process_wikipedia_texts(kb, wp_to_id, training_output, limit=100000000) # TODO: full dataset + + +def _get_entity_to_id(entity_input): + entity_to_id = dict() + with open(entity_input, 'r', encoding='utf8') as csvfile: + csvreader = csv.reader(csvfile, delimiter='|') + # skip header + next(csvreader) + for row in csvreader: + entity_to_id[row[0]] = row[1] + + return entity_to_id + + +def _process_wikipedia_texts(kb, wp_to_id, training_output, limit=None): + """ + Read the XML wikipedia data to parse out training data: + raw text data + positive and negative instances + """ + + title_regex = re.compile(r'(?<=).*(?=)') + id_regex = re.compile(r'(?<=)\d*(?=)') + + read_ids = set() + + entityfile_loc = training_output + "/" + "gold_entities.csv" + with open(entityfile_loc, mode="w", encoding='utf8') as entityfile: + # write entity training header file + _write_training_entity(outputfile=entityfile, + article_id="article_id", + alias="alias", + entity="entity", + correct="correct") + + with bz2.open(wp.ENWIKI_DUMP, mode='rb') as file: + line = file.readline() + cnt = 0 + article_text = "" + article_title = None + article_id = None + reading_text = False + reading_revision = False + while line and (not limit or cnt < limit): + if cnt % 1000000 == 0: + print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump") + clean_line = line.strip().decode("utf-8") + # print(clean_line) + + if clean_line == "": + reading_revision = True + elif clean_line == "": + reading_revision = False + + # Start reading new page + if clean_line == "": + article_text = "" + article_title 
= None + article_id = None + + # finished reading this page + elif clean_line == "": + if article_id: + try: + _process_wp_text(kb, wp_to_id, entityfile, article_id, article_text.strip(), training_output) + # on a previous run, an error occurred after 46M lines and 2h + except Exception as e: + print("Error processing article", article_id, article_title, e) + else: + print("Done processing a page, but couldn't find an article_id ?") + print(article_title) + print(article_text) + article_text = "" + article_title = None + article_id = None + reading_text = False + reading_revision = False + + # start reading text within a page + if ").*(?=", entity) + candidates = kb.get_candidates(alias) + + # as training data, we only store entities that are sufficiently ambiguous + if len(candidates) > 1: + _write_training_article(article_id=article_id, clean_text=clean_text, training_output=training_output) + # print("alias", alias) + + # print all incorrect candidates + for c in candidates: + if entity != c.entity_: + _write_training_entity(outputfile=entityfile, + article_id=article_id, + alias=alias, + entity=c.entity_, + correct="0") + + # print the one correct candidate + _write_training_entity(outputfile=entityfile, + article_id=article_id, + alias=alias, + entity=entity, + correct="1") + + # print("gold entity", entity) + # print() + + # _run_ner_depr(nlp, clean_text, article_dict) + # print() + + +info_regex = re.compile(r'{[^{]*?}') +interwiki_regex = re.compile(r'\[\[([^|]*?)]]') +interwiki_2_regex = re.compile(r'\[\[[^|]*?\|([^|]*?)]]') +htlm_regex = re.compile(r'<!--[^!]*-->') +category_regex = re.compile(r'\[\[Category:[^\[]*]]') +file_regex = re.compile(r'\[\[File:[^[\]]+]]') +ref_regex = re.compile(r'<ref.*?>') # non-greedy +ref_2_regex = re.compile(r'</ref.*?>') # non-greedy + + +def _get_clean_wp_text(article_text): + clean_text = article_text.strip() + + # remove bolding & italic markup + clean_text = clean_text.replace('\'\'\'', '') + clean_text = clean_text.replace('\'\'', '') + + # remove nested {{info}} statements by removing the inner/smallest ones first and iterating + try_again = True + previous_length = len(clean_text) + while try_again: + clean_text = info_regex.sub('', clean_text) # non-greedy match excluding a nested { + if len(clean_text) < previous_length: + try_again = True + else: + try_again = False + previous_length = len(clean_text) + + # remove simple interwiki links (no alternative name) + clean_text = interwiki_regex.sub(r'\1', clean_text) + + # remove simple interwiki links by picking the alternative name + clean_text = interwiki_2_regex.sub(r'\1', clean_text) + + # remove HTML comments + clean_text = htlm_regex.sub('', clean_text) + + # remove Category and File statements + clean_text = category_regex.sub('', clean_text) + clean_text = file_regex.sub('', clean_text) + + # remove multiple = + while '==' in clean_text: + clean_text = clean_text.replace("==", "=") + + clean_text = clean_text.replace(". =", ".") + clean_text = clean_text.replace(" = ", ". 
") + clean_text = clean_text.replace("= ", ".") + clean_text = clean_text.replace(" =", "") + + # remove refs (non-greedy match) + clean_text = ref_regex.sub('', clean_text) + clean_text = ref_2_regex.sub('', clean_text) + + # remove additional wikiformatting + clean_text = re.sub(r'<blockquote>', '', clean_text) + clean_text = re.sub(r'</blockquote>', '', clean_text) + + # change special characters back to normal ones + clean_text = clean_text.replace(r'<', '<') + clean_text = clean_text.replace(r'>', '>') + clean_text = clean_text.replace(r'"', '"') + clean_text = clean_text.replace(r'&nbsp;', ' ') + clean_text = clean_text.replace(r'&', '&') + + # remove multiple spaces + while ' ' in clean_text: + clean_text = clean_text.replace(' ', ' ') + + return clean_text.strip() + + +def _write_training_article(article_id, clean_text, training_output): + file_loc = training_output + "/" + str(article_id) + ".txt" + with open(file_loc, mode='w', encoding='utf8') as outputfile: + outputfile.write(clean_text) + + +def _write_training_entity(outputfile, article_id, alias, entity, correct): + outputfile.write(article_id + "|" + alias + "|" + entity + "|" + correct + "\n") diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py new file mode 100644 index 000000000..20d4f5953 --- /dev/null +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -0,0 +1,103 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from . import wikipedia_processor as wp, kb_creator, training_set_creator + +import spacy +from spacy.vocab import Vocab +from spacy.kb import KnowledgeBase +import datetime + +""" +Demonstrate how to build a knowledge base from WikiData and run an Entity Linking algorithm. +""" + +PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv' +ENTITY_COUNTS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_freq.csv' +ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv' + +KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb' +VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab' + +TRAINING_OUTPUT_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel/' +TRAINING_INPUT_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel_sample_3may2019/' + + +if __name__ == "__main__": + print("START", datetime.datetime.now()) + print() + my_kb = None + + # one-time methods to create KB and write to file + to_create_prior_probs = False + to_create_entity_counts = False + to_create_kb = False + + # read KB back in from file + to_read_kb = True + to_test_kb = False + + create_wp_training = False + + # STEP 1 : create prior probabilities from WP + # run only once ! + if to_create_prior_probs: + print("STEP 1: to_create_prior_probs", datetime.datetime.now()) + wp.read_wikipedia_prior_probs(prior_prob_output=PRIOR_PROB) + print() + + # STEP 2 : deduce entity frequencies from WP + # run only once ! + if to_create_entity_counts: + print("STEP 2: to_create_entity_counts", datetime.datetime.now()) + wp.write_entity_counts(prior_prob_input=PRIOR_PROB, count_output=ENTITY_COUNTS, to_print=False) + print() + + # STEP 3 : create KB and write to file + # run only once ! 
+ if to_create_kb: + print("STEP 3a: to_create_kb", datetime.datetime.now()) + my_nlp = spacy.load('en_core_web_sm') + my_vocab = my_nlp.vocab + my_kb = kb_creator.create_kb(my_vocab, + max_entities_per_alias=10, + min_occ=5, + entity_output=ENTITY_DEFS, + count_input=ENTITY_COUNTS, + prior_prob_input=PRIOR_PROB, + to_print=False) + print("kb entities:", my_kb.get_size_entities()) + print("kb aliases:", my_kb.get_size_aliases()) + print() + + print("STEP 3b: write KB", datetime.datetime.now()) + my_kb.dump(KB_FILE) + my_vocab.to_disk(VOCAB_DIR) + print() + + # STEP 4 : read KB back in from file + if to_read_kb: + print("STEP 4: to_read_kb", datetime.datetime.now()) + my_vocab = Vocab() + my_vocab.from_disk(VOCAB_DIR) + my_kb = KnowledgeBase(vocab=my_vocab) + my_kb.load_bulk(KB_FILE) + print("kb entities:", my_kb.get_size_entities()) + print("kb aliases:", my_kb.get_size_aliases()) + print() + + # test KB + if to_test_kb: + kb_creator.test_kb(my_kb) + print() + + # STEP 5: create a training dataset from WP + if create_wp_training: + print("STEP 5: create training dataset", datetime.datetime.now()) + training_set_creator.create_training(kb=my_kb, entity_input=ENTITY_DEFS, training_output=TRAINING_OUTPUT_SET_DIR) + + # TODO coreference resolution + # add_coref() + + print() + print("STOP", datetime.datetime.now()) diff --git a/examples/pipeline/wiki_entity_linking/wikidata_processor.py b/examples/pipeline/wiki_entity_linking/wikidata_processor.py new file mode 100644 index 000000000..03db05414 --- /dev/null +++ b/examples/pipeline/wiki_entity_linking/wikidata_processor.py @@ -0,0 +1,166 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import bz2 +import json +import datetime + +# TODO: remove hardcoded paths +WIKIDATA_JSON = 'C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.json.bz2' + + +def read_wikidata_entities_json(limit=None, to_print=False): + """ Read the JSON wiki data and parse out the entities. Takes about 7u30 to parse 55M lines. 
""" + + languages = {'en', 'de'} + prop_filter = {'P31': {'Q5', 'Q15632617'}} # currently defined as OR: one property suffices to be selected + site_filter = 'enwiki' + + title_to_id = dict() + + # parse appropriate fields - depending on what we need in the KB + parse_properties = False + parse_sitelinks = True + parse_labels = False + parse_descriptions = False + parse_aliases = False + + with bz2.open(WIKIDATA_JSON, mode='rb') as file: + line = file.readline() + cnt = 0 + while line and (not limit or cnt < limit): + if cnt % 500000 == 0: + print(datetime.datetime.now(), "processed", cnt, "lines of WikiData dump") + clean_line = line.strip() + if clean_line.endswith(b","): + clean_line = clean_line[:-1] + if len(clean_line) > 1: + obj = json.loads(clean_line) + entry_type = obj["type"] + + if entry_type == "item": + # filtering records on their properties + keep = False + + claims = obj["claims"] + for prop, value_set in prop_filter.items(): + claim_property = claims.get(prop, None) + if claim_property: + for cp in claim_property: + cp_id = cp['mainsnak'].get('datavalue', {}).get('value', {}).get('id') + cp_rank = cp['rank'] + if cp_rank != "deprecated" and cp_id in value_set: + keep = True + + if keep: + unique_id = obj["id"] + + if to_print: + print("ID:", unique_id) + print("type:", entry_type) + + # parsing all properties that refer to other entities + if parse_properties: + for prop, claim_property in claims.items(): + cp_dicts = [cp['mainsnak']['datavalue'].get('value') for cp in claim_property if cp['mainsnak'].get('datavalue')] + cp_values = [cp_dict.get('id') for cp_dict in cp_dicts if isinstance(cp_dict, dict) if cp_dict.get('id') is not None] + if cp_values: + if to_print: + print("prop:", prop, cp_values) + + if parse_sitelinks: + site_value = obj["sitelinks"].get(site_filter, None) + if site_value: + site = site_value['title'] + if to_print: + print(site_filter, ":", site) + title_to_id[site] = unique_id + # print(site, "for", unique_id) + + if parse_labels: + labels = obj["labels"] + if labels: + for lang in languages: + lang_label = labels.get(lang, None) + if lang_label: + if to_print: + print("label (" + lang + "):", lang_label["value"]) + + if parse_descriptions: + descriptions = obj["descriptions"] + if descriptions: + for lang in languages: + lang_descr = descriptions.get(lang, None) + if lang_descr: + if to_print: + print("description (" + lang + "):", lang_descr["value"]) + + if parse_aliases: + aliases = obj["aliases"] + if aliases: + for lang in languages: + lang_aliases = aliases.get(lang, None) + if lang_aliases: + for item in lang_aliases: + if to_print: + print("alias (" + lang + "):", item["value"]) + + if to_print: + print() + line = file.readline() + cnt += 1 + + return title_to_id + + +def _read_wikidata_entities_regex_depr(limit=None): + """ + Read the JSON wiki data and parse out the entities with regular expressions. Takes XXX to parse 55M lines. + TODO: doesn't work yet. may be deleted ? 
+ """ + + regex_p31 = re.compile(r'mainsnak[^}]*\"P31\"[^}]*}', re.UNICODE) + regex_id = re.compile(r'\"id\":"Q[0-9]*"', re.UNICODE) + regex_enwiki = re.compile(r'\"enwiki\":[^}]*}', re.UNICODE) + regex_title = re.compile(r'\"title\":"[^"]*"', re.UNICODE) + + title_to_id = dict() + + with bz2.open(WIKIDATA_JSON, mode='rb') as file: + line = file.readline() + cnt = 0 + while line and (not limit or cnt < limit): + if cnt % 500000 == 0: + print(datetime.datetime.now(), "processed", cnt, "lines of WikiData dump") + clean_line = line.strip() + if clean_line.endswith(b","): + clean_line = clean_line[:-1] + if len(clean_line) > 1: + clean_line = line.strip().decode("utf-8") + keep = False + + p31_matches = regex_p31.findall(clean_line) + if p31_matches: + for p31_match in p31_matches: + id_matches = regex_id.findall(p31_match) + for id_match in id_matches: + id_match = id_match[6:][:-1] + if id_match == "Q5" or id_match == "Q15632617": + keep = True + + if keep: + id_match = regex_id.search(clean_line).group(0) + id_match = id_match[6:][:-1] + + enwiki_matches = regex_enwiki.findall(clean_line) + if enwiki_matches: + for enwiki_match in enwiki_matches: + title_match = regex_title.search(enwiki_match).group(0) + title = title_match[9:][:-1] + title_to_id[title] = id_match + + line = file.readline() + cnt += 1 + + return title_to_id diff --git a/examples/pipeline/wiki_entity_linking/wikipedia_processor.py b/examples/pipeline/wiki_entity_linking/wikipedia_processor.py new file mode 100644 index 000000000..0461cb19f --- /dev/null +++ b/examples/pipeline/wiki_entity_linking/wikipedia_processor.py @@ -0,0 +1,187 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re +import bz2 +import csv +import datetime + +""" +Process a Wikipedia dump to calculate entity frequencies and prior probabilities in combination with certain mentions. +""" + + +# TODO: remove hardcoded paths +ENWIKI_DUMP = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream.xml.bz2' +ENWIKI_INDEX = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream-index.txt.bz2' + +map_alias_to_link = dict() + +# these will/should be matched ignoring case +wiki_namespaces = ["b", "betawikiversity", "Book", "c", "Category", "Commons", + "d", "dbdump", "download", "Draft", "Education", "Foundation", + "Gadget", "Gadget definition", "gerrit", "File", "Help", "Image", "Incubator", + "m", "mail", "mailarchive", "media", "MediaWiki", "MediaWiki talk", "Mediawikiwiki", + "MediaZilla", "Meta", "Metawikipedia", "Module", + "mw", "n", "nost", "oldwikisource", "outreach", "outreachwiki", "otrs", "OTRSwiki", + "Portal", "phab", "Phabricator", "Project", "q", "quality", "rev", + "s", "spcom", "Special", "species", "Strategy", "sulutil", "svn", + "Talk", "Template", "Template talk", "Testwiki", "ticket", "TimedText", "Toollabs", "tools", "tswiki", + "User", "User talk", "v", "voy", + "w", "Wikibooks", "Wikidata", "wikiHow", "Wikinvest", "wikilivres", "Wikimedia", "Wikinews", + "Wikipedia", "Wikipedia talk", "Wikiquote", "Wikisource", "Wikispecies", "Wikitech", + "Wikiversity", "Wikivoyage", "wikt", "wiktionary", "wmf", "wmania", "WP"] + +# find the links +link_regex = re.compile(r'\[\[[^\[\]]*\]\]') + +# match on interwiki links, e.g. `en:` or `:fr:` +ns_regex = r":?" + "[a-z][a-z]" + ":" + +# match on Namespace: optionally preceded by a : +for ns in wiki_namespaces: + ns_regex += "|" + ":?" 
+ ns + ":" + +ns_regex = re.compile(ns_regex, re.IGNORECASE) + + +def read_wikipedia_prior_probs(prior_prob_output): + """ + Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities + The full file takes about 2h to parse 1100M lines (update printed every 5M lines). + It works relatively fast because we don't care about which article we parsed the interwiki from, + we just process line by line. + """ + + with bz2.open(ENWIKI_DUMP, mode='rb') as file: + line = file.readline() + cnt = 0 + while line: + if cnt % 5000000 == 0: + print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump") + clean_line = line.strip().decode("utf-8") + + aliases, entities, normalizations = get_wp_links(clean_line) + for alias, entity, norm in zip(aliases, entities, normalizations): + _store_alias(alias, entity, normalize_alias=norm, normalize_entity=True) + _store_alias(alias, entity, normalize_alias=norm, normalize_entity=True) + + line = file.readline() + cnt += 1 + + # write all aliases and their entities and occurrences to file + with open(prior_prob_output, mode='w', encoding='utf8') as outputfile: + outputfile.write("alias" + "|" + "count" + "|" + "entity" + "\n") + for alias, alias_dict in sorted(map_alias_to_link.items(), key=lambda x: x[0]): + for entity, count in sorted(alias_dict.items(), key=lambda x: x[1], reverse=True): + outputfile.write(alias + "|" + str(count) + "|" + entity + "\n") + + +def _store_alias(alias, entity, normalize_alias=False, normalize_entity=True): + alias = alias.strip() + entity = entity.strip() + + # remove everything after # as this is not part of the title but refers to a specific paragraph + if normalize_entity: + # wikipedia titles are always capitalized + entity = _capitalize_first(entity.split("#")[0]) + if normalize_alias: + alias = alias.split("#")[0] + + if alias and entity: + alias_dict = map_alias_to_link.get(alias, dict()) + entity_count = alias_dict.get(entity, 0) + alias_dict[entity] = entity_count + 1 + map_alias_to_link[alias] = alias_dict + + +def get_wp_links(text): + aliases = [] + entities = [] + normalizations = [] + + matches = link_regex.findall(text) + for match in matches: + match = match[2:][:-2].replace("_", " ").strip() + + if ns_regex.match(match): + pass # ignore namespaces at the beginning of the string + + # this is a simple link, with the alias the same as the mention + elif "|" not in match: + aliases.append(match) + entities.append(match) + normalizations.append(True) + + # in wiki format, the link is written as [[entity|alias]] + else: + splits = match.split("|") + entity = splits[0].strip() + alias = splits[1].strip() + # specific wiki format [[alias (specification)|]] + if len(alias) == 0 and "(" in entity: + alias = entity.split("(")[0] + aliases.append(alias) + entities.append(entity) + normalizations.append(False) + else: + aliases.append(alias) + entities.append(entity) + normalizations.append(False) + + return aliases, entities, normalizations + + +def _capitalize_first(text): + if not text: + return None + result = text[0].capitalize() + if len(result) > 0: + result += text[1:] + return result + + +def write_entity_counts(prior_prob_input, count_output, to_print=False): + """ Write entity counts for quick access later """ + entity_to_count = dict() + total_count = 0 + + with open(prior_prob_input, mode='r', encoding='utf8') as prior_file: + # skip header + prior_file.readline() + line = prior_file.readline() + + while line: + splits = line.replace('\n', "").split(sep='|') + # alias = 
splits[0] + count = int(splits[1]) + entity = splits[2] + + current_count = entity_to_count.get(entity, 0) + entity_to_count[entity] = current_count + count + + total_count += count + + line = prior_file.readline() + + with open(count_output, mode='w', encoding='utf8') as entity_file: + entity_file.write("entity" + "|" + "count" + "\n") + for entity, count in entity_to_count.items(): + entity_file.write(entity + "|" + str(count) + "\n") + + if to_print: + for entity, count in entity_to_count.items(): + print("Entity count:", entity, count) + print("Total count:", total_count) + + +def get_entity_frequencies(count_input, entities): + entity_to_count = dict() + with open(count_input, 'r', encoding='utf8') as csvfile: + csvreader = csv.reader(csvfile, delimiter='|') + # skip header + next(csvreader) + for row in csvreader: + entity_to_count[row[0]] = int(row[1]) + + return [entity_to_count.get(e, 0) for e in entities] diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py deleted file mode 100644 index 4fe97e874..000000000 --- a/examples/pipeline/wikidata_entity_linking.py +++ /dev/null @@ -1,852 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -""" -Demonstrate how to build a knowledge base from WikiData and run an Entity Linking algorithm. -""" -import re -import csv -import json -import spacy -import datetime -import bz2 - -from spacy.kb import KnowledgeBase -from spacy.vocab import Vocab - -# requires: pip install neuralcoref --no-binary neuralcoref -# import neuralcoref - -# TODO: remove hardcoded paths -WIKIDATA_JSON = 'C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.json.bz2' -ENWIKI_DUMP = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream.xml.bz2' -ENWIKI_INDEX = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream-index.txt.bz2' - -PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv' -ENTITY_COUNTS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_freq.csv' -ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv' - -KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb' -VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab' - -TRAINING_OUTPUT_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel/' -TRAINING_INPUT_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel_sample_3may2019/' - - -# these will/should be matched ignoring case -wiki_namespaces = ["b", "betawikiversity", "Book", "c", "Category", "Commons", - "d", "dbdump", "download", "Draft", "Education", "Foundation", - "Gadget", "Gadget definition", "gerrit", "File", "Help", "Image", "Incubator", - "m", "mail", "mailarchive", "media", "MediaWiki", "MediaWiki talk", "Mediawikiwiki", - "MediaZilla", "Meta", "Metawikipedia", "Module", - "mw", "n", "nost", "oldwikisource", "outreach", "outreachwiki", "otrs", "OTRSwiki", - "Portal", "phab", "Phabricator", "Project", "q", "quality", "rev", - "s", "spcom", "Special", "species", "Strategy", "sulutil", "svn", - "Talk", "Template", "Template talk", "Testwiki", "ticket", "TimedText", "Toollabs", "tools", "tswiki", - "User", "User talk", "v", "voy", - "w", "Wikibooks", "Wikidata", "wikiHow", "Wikinvest", "wikilivres", "Wikimedia", "Wikinews", - "Wikipedia", "Wikipedia talk", "Wikiquote", "Wikisource", "Wikispecies", "Wikitech", - "Wikiversity", "Wikivoyage", "wikt", "wiktionary", "wmf", "wmania", "WP"] - -map_alias_to_link = dict() - - -def read_wikipedia_prior_probs(): - """ - STEP 1: 
Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities - The full file takes about 2h to parse 1100M lines (update printed every 5M lines). - It works relatively fast because we don't care about which article we parsed the interwiki from, - we just process line by line. - """ - - with bz2.open(ENWIKI_DUMP, mode='rb') as file: - line = file.readline() - cnt = 0 - while line: - if cnt % 5000000 == 0: - print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump") - clean_line = line.strip().decode("utf-8") - - aliases, entities, normalizations = _get_wp_links(clean_line) - for alias, entity, norm in zip(aliases, entities, normalizations): - _store_alias(alias, entity, normalize_alias=norm, normalize_entity=True) - _store_alias(alias, entity, normalize_alias=norm, normalize_entity=True) - - line = file.readline() - cnt += 1 - - # write all aliases and their entities and occurrences to file - with open(PRIOR_PROB, mode='w', encoding='utf8') as outputfile: - outputfile.write("alias" + "|" + "count" + "|" + "entity" + "\n") - for alias, alias_dict in sorted(map_alias_to_link.items(), key=lambda x: x[0]): - for entity, count in sorted(alias_dict.items(), key=lambda x: x[1], reverse=True): - outputfile.write(alias + "|" + str(count) + "|" + entity + "\n") - - -# find the links -link_regex = re.compile(r'\[\[[^\[\]]*\]\]') - -# match on interwiki links, e.g. `en:` or `:fr:` -ns_regex = r":?" + "[a-z][a-z]" + ":" - -# match on Namespace: optionally preceded by a : -for ns in wiki_namespaces: - ns_regex += "|" + ":?" + ns + ":" - -ns_regex = re.compile(ns_regex, re.IGNORECASE) - - -def _get_wp_links(text): - aliases = [] - entities = [] - normalizations = [] - - matches = link_regex.findall(text) - for match in matches: - match = match[2:][:-2].replace("_", " ").strip() - - if ns_regex.match(match): - pass # ignore namespaces at the beginning of the string - - # this is a simple link, with the alias the same as the mention - elif "|" not in match: - aliases.append(match) - entities.append(match) - normalizations.append(True) - - # in wiki format, the link is written as [[entity|alias]] - else: - splits = match.split("|") - entity = splits[0].strip() - alias = splits[1].strip() - # specific wiki format [[alias (specification)|]] - if len(alias) == 0 and "(" in entity: - alias = entity.split("(")[0] - aliases.append(alias) - entities.append(entity) - normalizations.append(False) - else: - aliases.append(alias) - entities.append(entity) - normalizations.append(False) - - return aliases, entities, normalizations - - -def _store_alias(alias, entity, normalize_alias=False, normalize_entity=True): - alias = alias.strip() - entity = entity.strip() - - # remove everything after # as this is not part of the title but refers to a specific paragraph - if normalize_entity: - # wikipedia titles are always capitalized - entity = _capitalize_first(entity.split("#")[0]) - if normalize_alias: - alias = alias.split("#")[0] - - if alias and entity: - alias_dict = map_alias_to_link.get(alias, dict()) - entity_count = alias_dict.get(entity, 0) - alias_dict[entity] = entity_count + 1 - map_alias_to_link[alias] = alias_dict - - -def _capitalize_first(text): - if not text: - return None - result = text[0].capitalize() - if len(result) > 0: - result += text[1:] - return result - - -def write_entity_counts(to_print=False): - """ STEP 2: write entity counts """ - entity_to_count = dict() - total_count = 0 - - with open(PRIOR_PROB, mode='r', encoding='utf8') as prior_file: - # 
skip header - prior_file.readline() - line = prior_file.readline() - - while line: - splits = line.replace('\n', "").split(sep='|') - # alias = splits[0] - count = int(splits[1]) - entity = splits[2] - - current_count = entity_to_count.get(entity, 0) - entity_to_count[entity] = current_count + count - - total_count += count - - line = prior_file.readline() - - with open(ENTITY_COUNTS, mode='w', encoding='utf8') as entity_file: - entity_file.write("entity" + "|" + "count" + "\n") - for entity, count in entity_to_count.items(): - entity_file.write(entity + "|" + str(count) + "\n") - - if to_print: - for entity, count in entity_to_count.items(): - print("Entity count:", entity, count) - print("Total count:", total_count) - - -def create_kb(vocab, max_entities_per_alias, min_occ, to_print=False, write_entity_defs=True): - """ STEP 3: create the knowledge base """ - kb = KnowledgeBase(vocab=vocab) - - print() - print("1. _read_wikidata_entities", datetime.datetime.now()) - print() - # title_to_id = _read_wikidata_entities_regex_depr(limit=1000) - title_to_id = _read_wikidata_entities_json(limit=None) - - # write the title-ID mapping to file - if write_entity_defs: - with open(ENTITY_DEFS, mode='w', encoding='utf8') as entity_file: - entity_file.write("WP_title" + "|" + "WD_id" + "\n") - for title, qid in title_to_id.items(): - entity_file.write(title + "|" + str(qid) + "\n") - - title_list = list(title_to_id.keys()) - entity_list = [title_to_id[x] for x in title_list] - - print() - print("2. _get_entity_frequencies", datetime.datetime.now()) - print() - entity_frequencies = _get_entity_frequencies(entities=title_list) - - print() - print("3. adding", len(entity_list), "entities", datetime.datetime.now()) - print() - kb.set_entities(entity_list=entity_list, prob_list=entity_frequencies, vector_list=None, feature_list=None) - - print() - print("4. 
adding aliases", datetime.datetime.now()) - print() - _add_aliases(kb, title_to_id=title_to_id, max_entities_per_alias=max_entities_per_alias, min_occ=min_occ) - - # TODO: read wikipedia texts for entity context - # _read_wikipedia() - - if to_print: - print() - print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases()) - - print("done with kb", datetime.datetime.now()) - - return kb - - -def _get_entity_frequencies(entities): - entity_to_count = dict() - with open(ENTITY_COUNTS, 'r', encoding='utf8') as csvfile: - csvreader = csv.reader(csvfile, delimiter='|') - # skip header - next(csvreader) - for row in csvreader: - entity_to_count[row[0]] = int(row[1]) - - return [entity_to_count.get(e, 0) for e in entities] - - -def _get_entity_to_id(): - entity_to_id = dict() - with open(ENTITY_DEFS, 'r', encoding='utf8') as csvfile: - csvreader = csv.reader(csvfile, delimiter='|') - # skip header - next(csvreader) - for row in csvreader: - entity_to_id[row[0]] = row[1] - - return entity_to_id - - -def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, to_print=False): - wp_titles = title_to_id.keys() - - if to_print: - print("wp titles:", wp_titles) - - # adding aliases with prior probabilities - with open(PRIOR_PROB, mode='r', encoding='utf8') as prior_file: - # skip header - prior_file.readline() - line = prior_file.readline() - # we can read this file sequentially, it's sorted by alias, and then by count - previous_alias = None - total_count = 0 - counts = list() - entities = list() - while line: - splits = line.replace('\n', "").split(sep='|') - new_alias = splits[0] - count = int(splits[1]) - entity = splits[2] - - if new_alias != previous_alias and previous_alias: - # done reading the previous alias --> output - if len(entities) > 0: - selected_entities = list() - prior_probs = list() - for ent_count, ent_string in zip(counts, entities): - if ent_string in wp_titles: - wd_id = title_to_id[ent_string] - p_entity_givenalias = ent_count / total_count - selected_entities.append(wd_id) - prior_probs.append(p_entity_givenalias) - - if selected_entities: - try: - kb.add_alias(alias=previous_alias, entities=selected_entities, probabilities=prior_probs) - except ValueError as e: - print(e) - total_count = 0 - counts = list() - entities = list() - - total_count += count - - if len(entities) < max_entities_per_alias and count >= min_occ: - counts.append(count) - entities.append(entity) - previous_alias = new_alias - - line = prior_file.readline() - - if to_print: - print("added", kb.get_size_aliases(), "aliases:", kb.get_alias_strings()) - - -def _read_wikidata_entities_json(limit=None, to_print=False): - """ Read the JSON wiki data and parse out the entities. Takes about 7u30 to parse 55M lines. 
""" - - languages = {'en', 'de'} - prop_filter = {'P31': {'Q5', 'Q15632617'}} # currently defined as OR: one property suffices to be selected - site_filter = 'enwiki' - - title_to_id = dict() - - # parse appropriate fields - depending on what we need in the KB - parse_properties = False - parse_sitelinks = True - parse_labels = False - parse_descriptions = False - parse_aliases = False - - with bz2.open(WIKIDATA_JSON, mode='rb') as file: - line = file.readline() - cnt = 0 - while line and (not limit or cnt < limit): - if cnt % 500000 == 0: - print(datetime.datetime.now(), "processed", cnt, "lines of WikiData dump") - clean_line = line.strip() - if clean_line.endswith(b","): - clean_line = clean_line[:-1] - if len(clean_line) > 1: - obj = json.loads(clean_line) - entry_type = obj["type"] - - if entry_type == "item": - # filtering records on their properties - keep = False - - claims = obj["claims"] - for prop, value_set in prop_filter.items(): - claim_property = claims.get(prop, None) - if claim_property: - for cp in claim_property: - cp_id = cp['mainsnak'].get('datavalue', {}).get('value', {}).get('id') - cp_rank = cp['rank'] - if cp_rank != "deprecated" and cp_id in value_set: - keep = True - - if keep: - unique_id = obj["id"] - - if to_print: - print("ID:", unique_id) - print("type:", entry_type) - - # parsing all properties that refer to other entities - if parse_properties: - for prop, claim_property in claims.items(): - cp_dicts = [cp['mainsnak']['datavalue'].get('value') for cp in claim_property if cp['mainsnak'].get('datavalue')] - cp_values = [cp_dict.get('id') for cp_dict in cp_dicts if isinstance(cp_dict, dict) if cp_dict.get('id') is not None] - if cp_values: - if to_print: - print("prop:", prop, cp_values) - - if parse_sitelinks: - site_value = obj["sitelinks"].get(site_filter, None) - if site_value: - site = site_value['title'] - if to_print: - print(site_filter, ":", site) - title_to_id[site] = unique_id - # print(site, "for", unique_id) - - if parse_labels: - labels = obj["labels"] - if labels: - for lang in languages: - lang_label = labels.get(lang, None) - if lang_label: - if to_print: - print("label (" + lang + "):", lang_label["value"]) - - if parse_descriptions: - descriptions = obj["descriptions"] - if descriptions: - for lang in languages: - lang_descr = descriptions.get(lang, None) - if lang_descr: - if to_print: - print("description (" + lang + "):", lang_descr["value"]) - - if parse_aliases: - aliases = obj["aliases"] - if aliases: - for lang in languages: - lang_aliases = aliases.get(lang, None) - if lang_aliases: - for item in lang_aliases: - if to_print: - print("alias (" + lang + "):", item["value"]) - - if to_print: - print() - line = file.readline() - cnt += 1 - - return title_to_id - - -def _read_wikidata_entities_regex_depr(limit=None, to_print=False): - """ Read the JSON wiki data and parse out the entities with regular expressions. Takes XXX to parse 55M lines. 
""" - - regex_p31 = re.compile(r'mainsnak[^}]*\"P31\"[^}]*}', re.UNICODE) - regex_id = re.compile(r'\"id\":"Q[0-9]*"', re.UNICODE) - regex_enwiki = re.compile(r'\"enwiki\":[^}]*}', re.UNICODE) - regex_title = re.compile(r'\"title\":"[^"]*"', re.UNICODE) - - title_to_id = dict() - - with bz2.open(WIKIDATA_JSON, mode='rb') as file: - line = file.readline() - cnt = 0 - while line and (not limit or cnt < limit): - if cnt % 500000 == 0: - print(datetime.datetime.now(), "processed", cnt, "lines of WikiData dump") - clean_line = line.strip() - if clean_line.endswith(b","): - clean_line = clean_line[:-1] - if len(clean_line) > 1: - clean_line = line.strip().decode("utf-8") - keep = False - - p31_matches = regex_p31.findall(clean_line) - if p31_matches: - for p31_match in p31_matches: - id_matches = regex_id.findall(p31_match) - for id_match in id_matches: - id_match = id_match[6:][:-1] - if id_match == "Q5" or id_match == "Q15632617": - keep = True - - if keep: - id_match = regex_id.search(clean_line).group(0) - id_match = id_match[6:][:-1] - - enwiki_matches = regex_enwiki.findall(clean_line) - if enwiki_matches: - for enwiki_match in enwiki_matches: - title_match = regex_title.search(enwiki_match).group(0) - title = title_match[9:][:-1] - title_to_id[title] = id_match - # print(title, "for", id_match) - - line = file.readline() - cnt += 1 - - return title_to_id - - -def test_kb(kb): - # TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO - nlp = spacy.load('en_core_web_sm') - - el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb}) - nlp.add_pipe(el_pipe, last=True) - - candidates = my_kb.get_candidates("Bush") - - print("generating candidates for 'Bush' :") - for c in candidates: - print(" ", c.prior_prob, c.alias_, "-->", c.entity_ + " (freq=" + str(c.entity_freq) + ")") - print() - - text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \ - "Douglas reminds us to always bring our towel. " \ - "The main character in Doug's novel is the man Arthur Dent, " \ - "but Douglas doesn't write about George Washington or Homer Simpson." - doc = nlp(text) - - for ent in doc.ents: - print("ent", ent.text, ent.label_, ent.kb_id_) - - -def add_coref(): - """ STEP 5: add coreference resolution to our model """ - nlp = spacy.load('en_core_web_sm') - # nlp = spacy.load('en') - - # TODO: this doesn't work yet - # neuralcoref.add_to_pipe(nlp) - print("done adding to pipe") - - doc = nlp(u'My sister has a dog. 
She loves him.') - print("done doc") - - print(doc._.has_coref) - print(doc._.coref_clusters) - - -def create_training(kb): - if not kb: - raise ValueError("kb should be defined") - # nlp = spacy.load('en_core_web_sm') - wp_to_id = _get_entity_to_id() - _read_wikipedia_texts(kb, wp_to_id, limit=100000000) # TODO: full dataset - - -def _read_wikipedia_texts(kb, wp_to_id, limit=None): - """ - Read the XML wikipedia data to parse out training data: - raw text data + positive and negative instances - """ - - title_regex = re.compile(r'(?<=).*(?=)') - id_regex = re.compile(r'(?<=)\d*(?=)') - - read_ids = set() - - entityfile_loc = TRAINING_OUTPUT_SET_DIR + "/" + "gold_entities.csv" - with open(entityfile_loc, mode="w", encoding='utf8') as entityfile: - # write entity training header file - _write_training_entity(outputfile=entityfile, - article_id="article_id", - alias="alias", - entity="entity", - correct="correct") - - with bz2.open(ENWIKI_DUMP, mode='rb') as file: - line = file.readline() - cnt = 0 - article_text = "" - article_title = None - article_id = None - reading_text = False - reading_revision = False - while line and (not limit or cnt < limit): - if cnt % 1000000 == 0: - print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump") - clean_line = line.strip().decode("utf-8") - # print(clean_line) - - if clean_line == "": - reading_revision = True - elif clean_line == "": - reading_revision = False - - # Start reading new page - if clean_line == "": - article_text = "" - article_title = None - article_id = None - - # finished reading this page - elif clean_line == "": - if article_id: - try: - _process_wp_text(kb, wp_to_id, entityfile, article_id, article_title, article_text.strip()) - # on a previous run, an error occurred after 46M lines and 2h - except Exception as e: - print("Error processing article", article_id, article_title) - print(e) - else: - print("Done processing a page, but couldn't find an article_id ?") - print(article_title) - print(article_text) - article_text = "" - article_title = None - article_id = None - reading_text = False - reading_revision = False - - # start reading text within a page - if ").*(?=", entity) - candidates = kb.get_candidates(alias) - - # as training data, we only store entities that are sufficiently ambiguous - if len(candidates) > 1: - _write_training_article(article_id=article_id, clean_text=clean_text) - # print("alias", alias) - - # print all incorrect candidates - for c in candidates: - if entity != c.entity_: - _write_training_entity(outputfile=entityfile, - article_id=article_id, - alias=alias, - entity=c.entity_, - correct="0") - - # print the one correct candidate - _write_training_entity(outputfile=entityfile, - article_id=article_id, - alias=alias, - entity=entity, - correct="1") - - # print("gold entity", entity) - # print() - - # _run_ner_depr(nlp, article_id, article_title, clean_text, article_dict) - # print() - - -info_regex = re.compile(r'{[^{]*?}') -interwiki_regex = re.compile(r'\[\[([^|]*?)]]') -interwiki_2_regex = re.compile(r'\[\[[^|]*?\|([^|]*?)]]') -htlm_regex = re.compile(r'<!--[^!]*-->') -category_regex = re.compile(r'\[\[Category:[^\[]*]]') -file_regex = re.compile(r'\[\[File:[^[\]]+]]') -ref_regex = re.compile(r'<ref.*?>') # non-greedy -ref_2_regex = re.compile(r'</ref.*?>') # non-greedy - - -def _get_clean_wp_text(article_text): - clean_text = article_text.strip() - - # remove bolding & italic markup - clean_text = clean_text.replace('\'\'\'', '') - clean_text = clean_text.replace('\'\'', '') - - # 
remove nested {{info}} statements by removing the inner/smallest ones first and iterating - try_again = True - previous_length = len(clean_text) - while try_again: - clean_text = info_regex.sub('', clean_text) # non-greedy match excluding a nested { - if len(clean_text) < previous_length: - try_again = True - else: - try_again = False - previous_length = len(clean_text) - - # remove simple interwiki links (no alternative name) - clean_text = interwiki_regex.sub(r'\1', clean_text) - - # remove simple interwiki links by picking the alternative name - clean_text = interwiki_2_regex.sub(r'\1', clean_text) - - # remove HTML comments - clean_text = htlm_regex.sub('', clean_text) - - # remove Category and File statements - clean_text = category_regex.sub('', clean_text) - clean_text = file_regex.sub('', clean_text) - - # remove multiple = - while '==' in clean_text: - clean_text = clean_text.replace("==", "=") - - clean_text = clean_text.replace(". =", ".") - clean_text = clean_text.replace(" = ", ". ") - clean_text = clean_text.replace("= ", ".") - clean_text = clean_text.replace(" =", "") - - # remove refs (non-greedy match) - clean_text = ref_regex.sub('', clean_text) - clean_text = ref_2_regex.sub('', clean_text) - - # remove additional wikiformatting - clean_text = re.sub(r'<blockquote>', '', clean_text) - clean_text = re.sub(r'</blockquote>', '', clean_text) - - # change special characters back to normal ones - clean_text = clean_text.replace(r'<', '<') - clean_text = clean_text.replace(r'>', '>') - clean_text = clean_text.replace(r'"', '"') - clean_text = clean_text.replace(r'&nbsp;', ' ') - clean_text = clean_text.replace(r'&', '&') - - # remove multiple spaces - while ' ' in clean_text: - clean_text = clean_text.replace(' ', ' ') - - return clean_text.strip() - - -def _write_training_article(article_id, clean_text): - file_loc = TRAINING_OUTPUT_SET_DIR + "/" + str(article_id) + ".txt" - with open(file_loc, mode='w', encoding='utf8') as outputfile: - outputfile.write(clean_text) - - -def _write_training_entity(outputfile, article_id, alias, entity, correct): - outputfile.write(article_id + "|" + alias + "|" + entity + "|" + correct + "\n") - - -def _run_ner_depr(nlp, article_id, article_title, clean_text, article_dict): - doc = nlp(clean_text) - for ent in doc.ents: - if ent.label_ == "PERSON": # TODO: expand to non-persons - ent_id = article_dict.get(ent.text) - if ent_id: - print(" -", ent.text, ent.label_, ent_id) - else: - print(" -", ent.text, ent.label_, '???') # TODO: investigate these cases - - -if __name__ == "__main__": - print("START", datetime.datetime.now()) - print() - my_kb = None - - # one-time methods to create KB and write to file - to_create_prior_probs = False - to_create_entity_counts = False - to_create_kb = False - - # read KB back in from file - to_read_kb = True - to_test_kb = False - - create_wp_training = True - - # STEP 1 : create prior probabilities from WP - # run only once ! - if to_create_prior_probs: - print("STEP 1: to_create_prior_probs", datetime.datetime.now()) - read_wikipedia_prior_probs() - print() - - # STEP 2 : deduce entity frequencies from WP - # run only once ! - if to_create_entity_counts: - print("STEP 2: to_create_entity_counts", datetime.datetime.now()) - write_entity_counts() - print() - - # STEP 3 : create KB and write to file - # run only once ! 
- if to_create_kb: - print("STEP 3a: to_create_kb", datetime.datetime.now()) - my_nlp = spacy.load('en_core_web_sm') - my_vocab = my_nlp.vocab - my_kb = create_kb(my_vocab, max_entities_per_alias=10, min_occ=5, to_print=False) - print("kb entities:", my_kb.get_size_entities()) - print("kb aliases:", my_kb.get_size_aliases()) - print() - - print("STEP 3b: write KB", datetime.datetime.now()) - my_kb.dump(KB_FILE) - my_vocab.to_disk(VOCAB_DIR) - print() - - # STEP 4 : read KB back in from file - if to_read_kb: - print("STEP 4: to_read_kb", datetime.datetime.now()) - my_vocab = Vocab() - my_vocab.from_disk(VOCAB_DIR) - my_kb = KnowledgeBase(vocab=my_vocab) - my_kb.load_bulk(KB_FILE) - print("kb entities:", my_kb.get_size_entities()) - print("kb aliases:", my_kb.get_size_aliases()) - print() - - # test KB - if to_test_kb: - test_kb(my_kb) - print() - - # STEP 5: create a training dataset from WP - if create_wp_training: - print("STEP 5: create training dataset", datetime.datetime.now()) - create_training(my_kb) - - # TODO coreference resolution - # add_coref() - - print() - print("STOP", datetime.datetime.now()) From 7e348d7f7ff2d79beec90f8f9862fc52cad8b654 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 6 May 2019 15:13:50 +0200 Subject: [PATCH 031/148] baseline evaluation using highest-freq candidate --- .../wiki_entity_linking/kb_creator.py | 23 ---- .../pipeline/wiki_entity_linking/run_el.py | 101 ++++++++++++++++++ .../training_set_creator.py | 36 ++++++- .../wiki_entity_linking/wiki_nel_pipeline.py | 21 +++- 4 files changed, 152 insertions(+), 29 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/kb_creator.py b/examples/pipeline/wiki_entity_linking/kb_creator.py index 7ca7cfad1..b9e663bb9 100644 --- a/examples/pipeline/wiki_entity_linking/kb_creator.py +++ b/examples/pipeline/wiki_entity_linking/kb_creator.py @@ -112,26 +112,3 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_in if to_print: print("added", kb.get_size_aliases(), "aliases:", kb.get_alias_strings()) - -def test_kb(kb): - # TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO - nlp = spacy.load('en_core_web_sm') - - el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb}) - nlp.add_pipe(el_pipe, last=True) - - candidates = kb.get_candidates("Bush") - - print("generating candidates for 'Bush' :") - for c in candidates: - print(" ", c.prior_prob, c.alias_, "-->", c.entity_ + " (freq=" + str(c.entity_freq) + ")") - print() - - text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \ - "Douglas reminds us to always bring our towel. " \ - "The main character in Doug's novel is the man Arthur Dent, " \ - "but Douglas doesn't write about George Washington or Homer Simpson." 
- doc = nlp(text) - - for ent in doc.ents: - print("ent", ent.text, ent.label_, ent.kb_id_) diff --git a/examples/pipeline/wiki_entity_linking/run_el.py b/examples/pipeline/wiki_entity_linking/run_el.py index eb8343722..c2156e31b 100644 --- a/examples/pipeline/wiki_entity_linking/run_el.py +++ b/examples/pipeline/wiki_entity_linking/run_el.py @@ -1,12 +1,113 @@ # coding: utf-8 from __future__ import unicode_literals +import os import spacy +import datetime +from os import listdir + +from examples.pipeline.wiki_entity_linking import training_set_creator # requires: pip install neuralcoref --no-binary neuralcoref # import neuralcoref +def run_el_toy_example(nlp, kb): + _prepare_pipeline(nlp, kb) + + candidates = kb.get_candidates("Bush") + + print("generating candidates for 'Bush' :") + for c in candidates: + print(" ", c.prior_prob, c.alias_, "-->", c.entity_ + " (freq=" + str(c.entity_freq) + ")") + print() + + text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \ + "Douglas reminds us to always bring our towel. " \ + "The main character in Doug's novel is the man Arthur Dent, " \ + "but Douglas doesn't write about George Washington or Homer Simpson." + doc = nlp(text) + + for ent in doc.ents: + print("ent", ent.text, ent.label_, ent.kb_id_) + + +def run_el_training(nlp, kb, training_dir, limit=None): + _prepare_pipeline(nlp, kb) + + correct_entries_per_article, _ = training_set_creator.read_training_entities(training_output=training_dir, + collect_correct=True, + collect_incorrect=False) + + predictions = list() + golds = list() + + cnt = 0 + for f in listdir(training_dir): + if not limit or cnt < limit: + if is_dev(f): + article_id = f.replace(".txt", "") + if cnt % 500 == 0: + print(datetime.datetime.now(), "processed", cnt, "files in the training dataset") + cnt += 1 + with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file: + text = file.read() + doc = nlp(text) + for ent in doc.ents: + if ent.label_ == "PERSON": # TODO: expand to other types + gold_entity = correct_entries_per_article[article_id].get(ent.text, None) + # only evaluating gold entities we know, because the training data is not complete + if gold_entity: + predictions.append(ent.kb_id_) + golds.append(gold_entity) + + print("Processed", cnt, "dev articles") + print() + evaluate(predictions, golds) + + +def is_dev(file_name): + return file_name.endswith("3.txt") + + +def evaluate(predictions, golds): + if len(predictions) != len(golds): + raise ValueError("predictions and gold entities should have the same length") + + print("Evaluating", len(golds), "entities") + + tp = 0 + fp = 0 + fn = 0 + + for pred, gold in zip(predictions, golds): + is_correct = pred == gold + if not pred: + fn += 1 + elif is_correct: + tp += 1 + else: + fp += 1 + + print("tp", tp) + print("fp", fp) + print("fn", fn) + + precision = tp / (tp + fp + 0.0000001) + recall = tp / (tp + fn + 0.0000001) + fscore = 2 * recall * precision / (recall + precision + 0.0000001) + + print("precision", round(100 * precision, 1), "%") + print("recall", round(100 * recall, 1), "%") + print("Fscore", round(100 * fscore, 1), "%") + + +def _prepare_pipeline(nlp, kb): + # TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO + el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb}) + nlp.add_pipe(el_pipe, last=True) + + # TODO def add_coref(): """ Add coreference resolution to our model """ diff --git 
a/examples/pipeline/wiki_entity_linking/training_set_creator.py b/examples/pipeline/wiki_entity_linking/training_set_creator.py index e46aeec5b..47349d3dc 100644 --- a/examples/pipeline/wiki_entity_linking/training_set_creator.py +++ b/examples/pipeline/wiki_entity_linking/training_set_creator.py @@ -12,6 +12,7 @@ from . import wikipedia_processor as wp Process Wikipedia interlinks to generate a training dataset for the EL algorithm """ +ENTITY_FILE = "gold_entities.csv" def create_training(kb, entity_input, training_output): if not kb: @@ -44,7 +45,7 @@ def _process_wikipedia_texts(kb, wp_to_id, training_output, limit=None): read_ids = set() - entityfile_loc = training_output + "/" + "gold_entities.csv" + entityfile_loc = training_output + "/" + ENTITY_FILE with open(entityfile_loc, mode="w", encoding='utf8') as entityfile: # write entity training header file _write_training_entity(outputfile=entityfile, @@ -274,3 +275,36 @@ def _write_training_article(article_id, clean_text, training_output): def _write_training_entity(outputfile, article_id, alias, entity, correct): outputfile.write(article_id + "|" + alias + "|" + entity + "|" + correct + "\n") + + +def read_training_entities(training_output, collect_correct=True, collect_incorrect=False): + entityfile_loc = training_output + "/" + ENTITY_FILE + incorrect_entries_per_article = dict() + correct_entries_per_article = dict() + with open(entityfile_loc, mode='r', encoding='utf8') as file: + for line in file: + fields = line.replace('\n', "").split(sep='|') + article_id = fields[0] + alias = fields[1] + entity = fields[2] + correct = fields[3] + + if correct == "1" and collect_correct: + entry_dict = correct_entries_per_article.get(article_id, dict()) + if alias in entry_dict: + raise ValueError("Found alias", alias, "multiple times for article", article_id, "in", ENTITY_FILE) + entry_dict[alias] = entity + correct_entries_per_article[article_id] = entry_dict + + if correct == "0" and collect_incorrect: + entry_dict = incorrect_entries_per_article.get(article_id, dict()) + entities = entry_dict.get(alias, set()) + entities.add(entity) + entry_dict[alias] = entities + incorrect_entries_per_article[article_id] = entry_dict + + return correct_entries_per_article, incorrect_entries_per_article + + + + diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 20d4f5953..ebc1e7958 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -1,7 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals -from . 
import wikipedia_processor as wp, kb_creator, training_set_creator +from examples.pipeline.wiki_entity_linking import wikipedia_processor as wp, kb_creator, training_set_creator, run_el import spacy from spacy.vocab import Vocab @@ -19,8 +19,7 @@ ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv' KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb' VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab' -TRAINING_OUTPUT_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel/' -TRAINING_INPUT_SET_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel_sample_3may2019/' +TRAINING_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel/' if __name__ == "__main__": @@ -37,8 +36,12 @@ if __name__ == "__main__": to_read_kb = True to_test_kb = False + # create training dataset create_wp_training = False + # apply named entity linking to the training dataset + apply_to_training = True + # STEP 1 : create prior probabilities from WP # run only once ! if to_create_prior_probs: @@ -88,13 +91,21 @@ if __name__ == "__main__": # test KB if to_test_kb: - kb_creator.test_kb(my_kb) + my_nlp = spacy.load('en_core_web_sm') + run_el.run_el_toy_example(kb=my_kb, nlp=my_nlp) print() # STEP 5: create a training dataset from WP if create_wp_training: print("STEP 5: create training dataset", datetime.datetime.now()) - training_set_creator.create_training(kb=my_kb, entity_input=ENTITY_DEFS, training_output=TRAINING_OUTPUT_SET_DIR) + training_set_creator.create_training(kb=my_kb, entity_input=ENTITY_DEFS, training_output=TRAINING_DIR) + + # STEP 6: apply the EL algorithm on the training dataset + if apply_to_training: + my_nlp = spacy.load('en_core_web_sm') + run_el.run_el_training(kb=my_kb, nlp=my_nlp, training_dir=TRAINING_DIR, limit=1000) + print() + # TODO coreference resolution # add_coref() From 9f33732b96310dc482097e1a6661415a08acc57a Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 7 May 2019 16:03:42 +0200 Subject: [PATCH 032/148] using entity descriptions and article texts as input embedding vectors for training --- .../wiki_entity_linking/kb_creator.py | 50 +++++++++-- .../pipeline/wiki_entity_linking/run_el.py | 4 +- .../pipeline/wiki_entity_linking/train_el.py | 58 ++++++++++++ .../training_set_creator.py | 19 +--- .../wiki_entity_linking/wiki_nel_pipeline.py | 34 ++++--- .../wiki_entity_linking/wikidata_processor.py | 90 ++++--------------- 6 files changed, 147 insertions(+), 108 deletions(-) create mode 100644 examples/pipeline/wiki_entity_linking/train_el.py diff --git a/examples/pipeline/wiki_entity_linking/kb_creator.py b/examples/pipeline/wiki_entity_linking/kb_creator.py index b9e663bb9..bb00f918d 100644 --- a/examples/pipeline/wiki_entity_linking/kb_creator.py +++ b/examples/pipeline/wiki_entity_linking/kb_creator.py @@ -4,13 +4,16 @@ from __future__ import unicode_literals import spacy from spacy.kb import KnowledgeBase +import csv import datetime from . import wikipedia_processor as wp from . import wikidata_processor as wd -def create_kb(vocab, max_entities_per_alias, min_occ, entity_output, count_input, prior_prob_input, +def create_kb(vocab, max_entities_per_alias, min_occ, + entity_def_output, entity_descr_output, + count_input, prior_prob_input, to_print=False, write_entity_defs=True): """ Create the knowledge base from Wikidata entries """ kb = KnowledgeBase(vocab=vocab) @@ -18,15 +21,11 @@ def create_kb(vocab, max_entities_per_alias, min_occ, entity_output, count_input print() print("1. 
_read_wikidata_entities", datetime.datetime.now()) print() - # title_to_id = _read_wikidata_entities_regex_depr(limit=1000) - title_to_id = wd.read_wikidata_entities_json(limit=None) + title_to_id, id_to_descr = wd.read_wikidata_entities_json(limit=None) - # write the title-ID mapping to file + # write the title-ID and ID-description mappings to file if write_entity_defs: - with open(entity_output, mode='w', encoding='utf8') as entity_file: - entity_file.write("WP_title" + "|" + "WD_id" + "\n") - for title, qid in title_to_id.items(): - entity_file.write(title + "|" + str(qid) + "\n") + _write_entity_files(entity_def_output, entity_descr_output, title_to_id, id_to_descr) title_list = list(title_to_id.keys()) entity_list = [title_to_id[x] for x in title_list] @@ -57,6 +56,41 @@ def create_kb(vocab, max_entities_per_alias, min_occ, entity_output, count_input return kb +def _write_entity_files(entity_def_output, entity_descr_output, title_to_id, id_to_descr): + with open(entity_def_output, mode='w', encoding='utf8') as id_file: + id_file.write("WP_title" + "|" + "WD_id" + "\n") + for title, qid in title_to_id.items(): + id_file.write(title + "|" + str(qid) + "\n") + with open(entity_descr_output, mode='w', encoding='utf8') as descr_file: + descr_file.write("WD_id" + "|" + "description" + "\n") + for qid, descr in id_to_descr.items(): + descr_file.write(str(qid) + "|" + descr + "\n") + + +def _get_entity_to_id(entity_def_output): + entity_to_id = dict() + with open(entity_def_output, 'r', encoding='utf8') as csvfile: + csvreader = csv.reader(csvfile, delimiter='|') + # skip header + next(csvreader) + for row in csvreader: + entity_to_id[row[0]] = row[1] + + return entity_to_id + + +def _get_id_to_description(entity_descr_output): + id_to_desc = dict() + with open(entity_descr_output, 'r', encoding='utf8') as csvfile: + csvreader = csv.reader(csvfile, delimiter='|') + # skip header + next(csvreader) + for row in csvreader: + id_to_desc[row[0]] = row[1] + + return id_to_desc + + def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_input, to_print=False): wp_titles = title_to_id.keys() diff --git a/examples/pipeline/wiki_entity_linking/run_el.py b/examples/pipeline/wiki_entity_linking/run_el.py index c2156e31b..96fe58740 100644 --- a/examples/pipeline/wiki_entity_linking/run_el.py +++ b/examples/pipeline/wiki_entity_linking/run_el.py @@ -32,7 +32,7 @@ def run_el_toy_example(nlp, kb): print("ent", ent.text, ent.label_, ent.kb_id_) -def run_el_training(nlp, kb, training_dir, limit=None): +def run_el_dev(nlp, kb, training_dir, limit=None): _prepare_pipeline(nlp, kb) correct_entries_per_article, _ = training_set_creator.read_training_entities(training_output=training_dir, @@ -48,7 +48,7 @@ def run_el_training(nlp, kb, training_dir, limit=None): if is_dev(f): article_id = f.replace(".txt", "") if cnt % 500 == 0: - print(datetime.datetime.now(), "processed", cnt, "files in the training dataset") + print(datetime.datetime.now(), "processed", cnt, "files in the dev dataset") cnt += 1 with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file: text = file.read() diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py new file mode 100644 index 000000000..b3ebb658f --- /dev/null +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -0,0 +1,58 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import os +import datetime +from os import listdir + +from examples.pipeline.wiki_entity_linking 
import run_el, training_set_creator, kb_creator +from examples.pipeline.wiki_entity_linking import wikidata_processor as wd + +""" TODO: this code needs to be implemented in pipes.pyx""" + + +def train_model(kb, nlp, training_dir, entity_descr_output, limit=None): + run_el._prepare_pipeline(nlp, kb) + + correct_entries, incorrect_entries = training_set_creator.read_training_entities(training_output=training_dir, + collect_correct=True, + collect_incorrect=True) + + entities = kb.get_entity_strings() + + id_to_descr = kb_creator._get_id_to_description(entity_descr_output) + + cnt = 0 + for f in listdir(training_dir): + if not limit or cnt < limit: + if not run_el.is_dev(f): + article_id = f.replace(".txt", "") + if cnt % 500 == 0: + print(datetime.datetime.now(), "processed", cnt, "files in the dev dataset") + cnt += 1 + with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file: + text = file.read() + print() + doc = nlp(text) + doc_vector = doc.vector + print("FILE", f, len(doc_vector), "D vector") + + for mention_pos, entity_pos in correct_entries[article_id].items(): + descr = id_to_descr.get(entity_pos) + if descr: + doc_descr = nlp(descr) + descr_vector = doc_descr.vector + print("GOLD POS", mention_pos, entity_pos, len(descr_vector), "D vector") + + for mention_neg, entity_negs in incorrect_entries[article_id].items(): + for entity_neg in entity_negs: + descr = id_to_descr.get(entity_neg) + if descr: + doc_descr = nlp(descr) + descr_vector = doc_descr.vector + print("GOLD NEG", mention_neg, entity_neg, len(descr_vector), "D vector") + + print() + print("Processed", cnt, "dev articles") + print() + diff --git a/examples/pipeline/wiki_entity_linking/training_set_creator.py b/examples/pipeline/wiki_entity_linking/training_set_creator.py index 47349d3dc..b1c63c55c 100644 --- a/examples/pipeline/wiki_entity_linking/training_set_creator.py +++ b/examples/pipeline/wiki_entity_linking/training_set_creator.py @@ -6,7 +6,7 @@ import csv import bz2 import datetime -from . import wikipedia_processor as wp +from . 
import wikipedia_processor as wp, kb_creator """ Process Wikipedia interlinks to generate a training dataset for the EL algorithm @@ -14,26 +14,15 @@ Process Wikipedia interlinks to generate a training dataset for the EL algorithm ENTITY_FILE = "gold_entities.csv" -def create_training(kb, entity_input, training_output): + +def create_training(kb, entity_def_input, training_output): if not kb: raise ValueError("kb should be defined") # nlp = spacy.load('en_core_web_sm') - wp_to_id = _get_entity_to_id(entity_input) + wp_to_id = kb_creator._get_entity_to_id(entity_def_input) _process_wikipedia_texts(kb, wp_to_id, training_output, limit=100000000) # TODO: full dataset -def _get_entity_to_id(entity_input): - entity_to_id = dict() - with open(entity_input, 'r', encoding='utf8') as csvfile: - csvreader = csv.reader(csvfile, delimiter='|') - # skip header - next(csvreader) - for row in csvreader: - entity_to_id[row[0]] = row[1] - - return entity_to_id - - def _process_wikipedia_texts(kb, wp_to_id, training_output, limit=None): """ Read the XML wikipedia data to parse out training data: diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index ebc1e7958..26e2a7ae2 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -1,7 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals -from examples.pipeline.wiki_entity_linking import wikipedia_processor as wp, kb_creator, training_set_creator, run_el +from examples.pipeline.wiki_entity_linking import wikipedia_processor as wp, kb_creator, training_set_creator, run_el, train_el import spacy from spacy.vocab import Vocab @@ -15,11 +15,12 @@ Demonstrate how to build a knowledge base from WikiData and run an Entity Linkin PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv' ENTITY_COUNTS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_freq.csv' ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv' +ENTITY_DESCR = 'C:/Users/Sofie/Documents/data/wikipedia/entity_descriptions.csv' KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb' VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab' -TRAINING_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_nel/' +TRAINING_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_data_nel/' if __name__ == "__main__": @@ -30,17 +31,20 @@ if __name__ == "__main__": # one-time methods to create KB and write to file to_create_prior_probs = False to_create_entity_counts = False - to_create_kb = False + to_create_kb = True # read KB back in from file to_read_kb = True - to_test_kb = False + to_test_kb = True # create training dataset create_wp_training = False - # apply named entity linking to the training dataset - apply_to_training = True + # run training + run_training = False + + # apply named entity linking to the dev dataset + apply_to_dev = False # STEP 1 : create prior probabilities from WP # run only once ! 
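A note on the entity mapping files handled above: _write_entity_files and the matching _get_entity_to_id / _get_id_to_description readers in kb_creator round-trip the "WP_title|WD_id" and "WD_id|description" tables as plain pipe-delimited text with one header row. A minimal, self-contained sketch of that round trip; the file name and the sample rows are invented for illustration only:

    import csv

    def write_entity_defs(path, title_to_id):
        # one "WP_title|WD_id" line per entity, mirroring _write_entity_files
        with open(path, mode="w", encoding="utf8") as out_file:
            out_file.write("WP_title" + "|" + "WD_id" + "\n")
            for title, qid in title_to_id.items():
                out_file.write(title + "|" + str(qid) + "\n")

    def read_entity_defs(path):
        # mirrors _get_entity_to_id: skip the header, keep the first two columns
        entity_to_id = dict()
        with open(path, "r", encoding="utf8") as csvfile:
            csvreader = csv.reader(csvfile, delimiter="|")
            next(csvreader)  # header
            for row in csvreader:
                entity_to_id[row[0]] = row[1]
        return entity_to_id

    if __name__ == "__main__":
        sample = {"Berlin": "Q64", "London": "Q84"}   # made-up sample rows
        write_entity_defs("entity_defs_sample.csv", sample)
        assert read_entity_defs("entity_defs_sample.csv") == sample

Because the readers keep only row[0] and row[1], a description that itself contains "|" would be silently truncated when read back; quoting or escaping the delimiter would make the format more robust.
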
@@ -65,7 +69,8 @@ if __name__ == "__main__": my_kb = kb_creator.create_kb(my_vocab, max_entities_per_alias=10, min_occ=5, - entity_output=ENTITY_DEFS, + entity_def_output=ENTITY_DEFS, + entity_descr_output=ENTITY_DESCR, count_input=ENTITY_COUNTS, prior_prob_input=PRIOR_PROB, to_print=False) @@ -98,12 +103,19 @@ if __name__ == "__main__": # STEP 5: create a training dataset from WP if create_wp_training: print("STEP 5: create training dataset", datetime.datetime.now()) - training_set_creator.create_training(kb=my_kb, entity_input=ENTITY_DEFS, training_output=TRAINING_DIR) + training_set_creator.create_training(kb=my_kb, entity_def_input=ENTITY_DEFS, training_output=TRAINING_DIR) - # STEP 6: apply the EL algorithm on the training dataset - if apply_to_training: + # STEP 7: apply the EL algorithm on the training dataset + if run_training: + print("STEP 6: training ", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_sm') - run_el.run_el_training(kb=my_kb, nlp=my_nlp, training_dir=TRAINING_DIR, limit=1000) + train_el.train_model(kb=my_kb, nlp=my_nlp, training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, limit=5) + print() + + # STEP 8: apply the EL algorithm on the dev dataset + if apply_to_dev: + my_nlp = spacy.load('en_core_web_sm') + run_el.run_el_dev(kb=my_kb, nlp=my_nlp, training_dir=TRAINING_DIR, limit=2000) print() diff --git a/examples/pipeline/wiki_entity_linking/wikidata_processor.py b/examples/pipeline/wiki_entity_linking/wikidata_processor.py index 03db05414..7d84b1a2a 100644 --- a/examples/pipeline/wiki_entity_linking/wikidata_processor.py +++ b/examples/pipeline/wiki_entity_linking/wikidata_processor.py @@ -13,17 +13,18 @@ WIKIDATA_JSON = 'C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.js def read_wikidata_entities_json(limit=None, to_print=False): """ Read the JSON wiki data and parse out the entities. Takes about 7u30 to parse 55M lines. 
""" - languages = {'en', 'de'} + lang = 'en' prop_filter = {'P31': {'Q5', 'Q15632617'}} # currently defined as OR: one property suffices to be selected site_filter = 'enwiki' title_to_id = dict() + id_to_descr = dict() # parse appropriate fields - depending on what we need in the KB parse_properties = False parse_sitelinks = True parse_labels = False - parse_descriptions = False + parse_descriptions = True parse_aliases = False with bz2.open(WIKIDATA_JSON, mode='rb') as file: @@ -76,91 +77,36 @@ def read_wikidata_entities_json(limit=None, to_print=False): if to_print: print(site_filter, ":", site) title_to_id[site] = unique_id - # print(site, "for", unique_id) if parse_labels: labels = obj["labels"] if labels: - for lang in languages: - lang_label = labels.get(lang, None) - if lang_label: - if to_print: - print("label (" + lang + "):", lang_label["value"]) + lang_label = labels.get(lang, None) + if lang_label: + if to_print: + print("label (" + lang + "):", lang_label["value"]) if parse_descriptions: descriptions = obj["descriptions"] if descriptions: - for lang in languages: - lang_descr = descriptions.get(lang, None) - if lang_descr: - if to_print: - print("description (" + lang + "):", lang_descr["value"]) + lang_descr = descriptions.get(lang, None) + if lang_descr: + if to_print: + print("description (" + lang + "):", lang_descr["value"]) + id_to_descr[unique_id] = lang_descr["value"] if parse_aliases: aliases = obj["aliases"] if aliases: - for lang in languages: - lang_aliases = aliases.get(lang, None) - if lang_aliases: - for item in lang_aliases: - if to_print: - print("alias (" + lang + "):", item["value"]) + lang_aliases = aliases.get(lang, None) + if lang_aliases: + for item in lang_aliases: + if to_print: + print("alias (" + lang + "):", item["value"]) if to_print: print() line = file.readline() cnt += 1 - return title_to_id - - -def _read_wikidata_entities_regex_depr(limit=None): - """ - Read the JSON wiki data and parse out the entities with regular expressions. Takes XXX to parse 55M lines. - TODO: doesn't work yet. may be deleted ? 
- """ - - regex_p31 = re.compile(r'mainsnak[^}]*\"P31\"[^}]*}', re.UNICODE) - regex_id = re.compile(r'\"id\":"Q[0-9]*"', re.UNICODE) - regex_enwiki = re.compile(r'\"enwiki\":[^}]*}', re.UNICODE) - regex_title = re.compile(r'\"title\":"[^"]*"', re.UNICODE) - - title_to_id = dict() - - with bz2.open(WIKIDATA_JSON, mode='rb') as file: - line = file.readline() - cnt = 0 - while line and (not limit or cnt < limit): - if cnt % 500000 == 0: - print(datetime.datetime.now(), "processed", cnt, "lines of WikiData dump") - clean_line = line.strip() - if clean_line.endswith(b","): - clean_line = clean_line[:-1] - if len(clean_line) > 1: - clean_line = line.strip().decode("utf-8") - keep = False - - p31_matches = regex_p31.findall(clean_line) - if p31_matches: - for p31_match in p31_matches: - id_matches = regex_id.findall(p31_match) - for id_match in id_matches: - id_match = id_match[6:][:-1] - if id_match == "Q5" or id_match == "Q15632617": - keep = True - - if keep: - id_match = regex_id.search(clean_line).group(0) - id_match = id_match[6:][:-1] - - enwiki_matches = regex_enwiki.findall(clean_line) - if enwiki_matches: - for enwiki_match in enwiki_matches: - title_match = regex_title.search(enwiki_match).group(0) - title = title_match[9:][:-1] - title_to_id[title] = id_match - - line = file.readline() - cnt += 1 - - return title_to_id + return title_to_id, id_to_descr From c6ca8649d7ab67af88af1682fa93a63fc635481c Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 9 May 2019 17:23:19 +0200 Subject: [PATCH 033/148] first stab at model - not functional yet --- .../pipeline/wiki_entity_linking/train_el.py | 179 ++++++++++++++---- .../wiki_entity_linking/wiki_nel_pipeline.py | 20 +- 2 files changed, 158 insertions(+), 41 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index b3ebb658f..8dcea9256 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -6,53 +6,168 @@ import datetime from os import listdir from examples.pipeline.wiki_entity_linking import run_el, training_set_creator, kb_creator -from examples.pipeline.wiki_entity_linking import wikidata_processor as wd + +from spacy._ml import SpacyVectors, create_default_optimizer, zero_init + +from thinc.api import chain +from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu +from thinc.api import flatten_add_lengths +from thinc.t2v import Pooling, sum_pool, mean_pool +from thinc.t2t import ExtractWindow, ParametricAttention +from thinc.misc import Residual """ TODO: this code needs to be implemented in pipes.pyx""" -def train_model(kb, nlp, training_dir, entity_descr_output, limit=None): - run_el._prepare_pipeline(nlp, kb) +class EL_Model(): - correct_entries, incorrect_entries = training_set_creator.read_training_entities(training_output=training_dir, - collect_correct=True, - collect_incorrect=True) + labels = ["MATCH", "NOMATCH"] + name = "entity_linker" - entities = kb.get_entity_strings() + def __init__(self, kb, nlp): + run_el._prepare_pipeline(nlp, kb) + self.nlp = nlp + self.kb = kb - id_to_descr = kb_creator._get_id_to_description(entity_descr_output) + self.entity_encoder = self._simple_encoder(width=300) + self.article_encoder = self._simple_encoder(width=300) - cnt = 0 - for f in listdir(training_dir): - if not limit or cnt < limit: - if not run_el.is_dev(f): - article_id = f.replace(".txt", "") - if cnt % 500 == 0: - print(datetime.datetime.now(), "processed", cnt, "files in the dev dataset") 
- cnt += 1 - with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file: - text = file.read() - print() - doc = nlp(text) - doc_vector = doc.vector - print("FILE", f, len(doc_vector), "D vector") + def train_model(self, training_dir, entity_descr_output, limit=None, to_print=True): + instances, gold_vectors, entity_descriptions, doc_by_article = self._get_training_data(training_dir, + entity_descr_output, + limit, to_print) + + if to_print: + print("Training on", len(gold_vectors), "instances") + print(" - pos:", len([x for x in gold_vectors if x]), "instances") + print(" - pos:", len([x for x in gold_vectors if not x]), "instances") + print() + + self.sgd_entity = self.begin_training(self.entity_encoder) + self.sgd_article = self.begin_training(self.article_encoder) + + losses = {} + + for inst, label, entity_descr in zip(instances, gold_vectors, entity_descriptions): + article = inst.split(sep="_")[0] + entity_id = inst.split(sep="_")[1] + article_doc = doc_by_article[article] + self.update(article_doc, entity_descr, label, losses=losses) + + def _simple_encoder(self, width): + with Model.define_operators({">>": chain}): + encoder = SpacyVectors \ + >> flatten_add_lengths \ + >> ParametricAttention(width)\ + >> Pooling(sum_pool) \ + >> Residual(zero_init(Maxout(width, width))) + + return encoder + + def begin_training(self, model): + # TODO ? link_vectors_to_models(self.vocab) + sgd = create_default_optimizer(model.ops) + return sgd + + def update(self, article_doc, entity_descr, label, drop=0., losses=None): + entity_encoding, entity_bp = self.entity_encoder.begin_update([entity_descr], drop=drop) + doc_encoding, article_bp = self.article_encoder.begin_update([article_doc], drop=drop) + + # print("entity/article output dim", len(entity_encoding[0]), len(doc_encoding[0])) + + mse, diffs = self._calculate_similarity(entity_encoding, doc_encoding) + + # print() + + # TODO: proper backpropagation taking ranking of elements into account ? + # TODO backpropagation also for negative examples + if label: + entity_bp(diffs, sgd=self.sgd_entity) + article_bp(diffs, sgd=self.sgd_article) + print(mse) + + + # TODO delete ? 
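The update() above scores an (article, description) pair with the mean squared error between the two encodings and then passes the raw difference vector to both backprop callbacks (proper backpropagation, including the negative examples, is still marked as a TODO there). A rough numpy sketch of that similarity and of the gradient it implies; the shapes assume a single example of width d, and this illustrates the arithmetic only, not the thinc machinery. Note that _calculate_similarity below divides the squared-error sum by the number of rows rather than by the width, which for a single example amounts to a plain sum of squares:

    import numpy as np

    def mse_similarity(ent_vec, doc_vec):
        # ent_vec, doc_vec: arrays of shape (1, d), one row per example
        diffs = doc_vec - ent_vec             # same orientation as above: doc minus entity
        mse = float((diffs ** 2).mean())      # standard mean squared error
        return mse, diffs

    if __name__ == "__main__":
        rng = np.random.default_rng(0)
        ent_vec = rng.normal(size=(1, 300)).astype("float32")
        doc_vec = rng.normal(size=(1, 300)).astype("float32")
        mse, diffs = mse_similarity(ent_vec, doc_vec)
        # gradient of the MSE w.r.t. doc_vec; w.r.t. ent_vec it is the same with the sign flipped
        grad_doc = 2 * diffs / diffs.shape[1]
        print(round(mse, 4), grad_doc.shape)
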
+ def _simple_cnn_model(self, internal_dim): + nr_class = len(self.labels) + with Model.define_operators({">>": chain}): + model_entity = SpacyVectors >> flatten_add_lengths >> Pooling(mean_pool) # entity encoding + model_doc = SpacyVectors >> flatten_add_lengths >> Pooling(mean_pool) # doc encoding + output_layer = Softmax(nr_class, internal_dim*2) + model = (model_entity | model_doc) >> output_layer + # model.tok2vec = chain(tok2vec, flatten) + model.nO = nr_class + return model + + def predict(self, entity_doc, article_doc): + entity_encoding = self.entity_encoder(entity_doc) + doc_encoding = self.article_encoder(article_doc) + + print("entity_encodings", len(entity_encoding), entity_encoding) + print("doc_encodings", len(doc_encoding), doc_encoding) + mse, diffs = self._calculate_similarity(entity_encoding, doc_encoding) + print("mse", mse) + + return mse + + def _calculate_similarity(self, vector1, vector2): + if len(vector1) != len(vector2): + raise ValueError("To calculate similarity, both vectors should be of equal length") + + diffs = (vector2 - vector1) + error_sum = (diffs ** 2).sum(axis=1) + mean_square_error = error_sum / len(vector1) + return float(mean_square_error), diffs + + def _get_labels(self): + return tuple(self.labels) + + def _get_training_data(self, training_dir, entity_descr_output, limit, to_print): + id_to_descr = kb_creator._get_id_to_description(entity_descr_output) + + correct_entries, incorrect_entries = training_set_creator.read_training_entities(training_output=training_dir, + collect_correct=True, + collect_incorrect=True) + + instances = list() + entity_descriptions = list() + local_vectors = list() # TODO: local vectors + gold_vectors = list() + doc_by_article = dict() + + cnt = 0 + for f in listdir(training_dir): + if not limit or cnt < limit: + if not run_el.is_dev(f): + article_id = f.replace(".txt", "") + if cnt % 500 == 0 and to_print: + print(datetime.datetime.now(), "processed", cnt, "files in the dev dataset") + cnt += 1 + if article_id not in doc_by_article: + with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file: + text = file.read() + doc = self.nlp(text) + doc_by_article[article_id] = doc for mention_pos, entity_pos in correct_entries[article_id].items(): descr = id_to_descr.get(entity_pos) if descr: - doc_descr = nlp(descr) - descr_vector = doc_descr.vector - print("GOLD POS", mention_pos, entity_pos, len(descr_vector), "D vector") + instances.append(article_id + "_" + entity_pos) + doc = self.nlp(descr) + entity_descriptions.append(doc) + gold_vectors.append(True) for mention_neg, entity_negs in incorrect_entries[article_id].items(): for entity_neg in entity_negs: descr = id_to_descr.get(entity_neg) if descr: - doc_descr = nlp(descr) - descr_vector = doc_descr.vector - print("GOLD NEG", mention_neg, entity_neg, len(descr_vector), "D vector") - - print() - print("Processed", cnt, "dev articles") - print() + instances.append(article_id + "_" + entity_neg) + doc = self.nlp(descr) + entity_descriptions.append(doc) + gold_vectors.append(False) + if to_print: + print() + print("Processed", cnt, "dev articles") + print() + return instances, gold_vectors, entity_descriptions, doc_by_article diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 26e2a7ae2..83650aa8d 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -1,7 +1,8 @@ # coding: utf-8 from 
__future__ import unicode_literals -from examples.pipeline.wiki_entity_linking import wikipedia_processor as wp, kb_creator, training_set_creator, run_el, train_el +from examples.pipeline.wiki_entity_linking import wikipedia_processor as wp, kb_creator, training_set_creator, run_el +from examples.pipeline.wiki_entity_linking.train_el import EL_Model import spacy from spacy.vocab import Vocab @@ -31,17 +32,17 @@ if __name__ == "__main__": # one-time methods to create KB and write to file to_create_prior_probs = False to_create_entity_counts = False - to_create_kb = True + to_create_kb = False # read KB back in from file to_read_kb = True - to_test_kb = True + to_test_kb = False # create training dataset create_wp_training = False # run training - run_training = False + run_training = True # apply named entity linking to the dev dataset apply_to_dev = False @@ -105,16 +106,17 @@ if __name__ == "__main__": print("STEP 5: create training dataset", datetime.datetime.now()) training_set_creator.create_training(kb=my_kb, entity_def_input=ENTITY_DEFS, training_output=TRAINING_DIR) - # STEP 7: apply the EL algorithm on the training dataset + # STEP 6: apply the EL algorithm on the training dataset if run_training: print("STEP 6: training ", datetime.datetime.now()) - my_nlp = spacy.load('en_core_web_sm') - train_el.train_model(kb=my_kb, nlp=my_nlp, training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, limit=5) + my_nlp = spacy.load('en_core_web_md') + trainer = EL_Model(kb=my_kb, nlp=my_nlp) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, limit=50) print() - # STEP 8: apply the EL algorithm on the dev dataset + # STEP 7: apply the EL algorithm on the dev dataset if apply_to_dev: - my_nlp = spacy.load('en_core_web_sm') + my_nlp = spacy.load('en_core_web_md') run_el.run_el_dev(kb=my_kb, nlp=my_nlp, training_dir=TRAINING_DIR, limit=2000) print() From 9d089c0410c8f71cdf80b0b5d613d8c2983fb454 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 9 May 2019 18:11:49 +0200 Subject: [PATCH 034/148] grouping clusters of instances per doc+mention --- .../pipeline/wiki_entity_linking/train_el.py | 75 +++++++++++-------- 1 file changed, 44 insertions(+), 31 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index 8dcea9256..c91058d5f 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -7,7 +7,7 @@ from os import listdir from examples.pipeline.wiki_entity_linking import run_el, training_set_creator, kb_creator -from spacy._ml import SpacyVectors, create_default_optimizer, zero_init +from spacy._ml import SpacyVectors, create_default_optimizer, zero_init, cosine from thinc.api import chain from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu @@ -33,14 +33,12 @@ class EL_Model(): self.article_encoder = self._simple_encoder(width=300) def train_model(self, training_dir, entity_descr_output, limit=None, to_print=True): - instances, gold_vectors, entity_descriptions, doc_by_article = self._get_training_data(training_dir, + instances, pos_entities, neg_entities, doc_by_article = self._get_training_data(training_dir, entity_descr_output, limit, to_print) if to_print: - print("Training on", len(gold_vectors), "instances") - print(" - pos:", len([x for x in gold_vectors if x]), "instances") - print(" - pos:", len([x for x in gold_vectors if not x]), "instances") + print("Training on", len(instances), "instance clusters") print() 
self.sgd_entity = self.begin_training(self.entity_encoder) @@ -48,11 +46,20 @@ class EL_Model(): losses = {} - for inst, label, entity_descr in zip(instances, gold_vectors, entity_descriptions): - article = inst.split(sep="_")[0] - entity_id = inst.split(sep="_")[1] - article_doc = doc_by_article[article] - self.update(article_doc, entity_descr, label, losses=losses) + for inst_cluster in instances: + pos_ex = pos_entities.get(inst_cluster) + neg_exs = neg_entities.get(inst_cluster, []) + + if pos_ex and neg_exs: + article = inst_cluster.split(sep="_")[0] + entity_id = inst_cluster.split(sep="_")[1] + article_doc = doc_by_article[article] + self.update(article_doc, pos_ex, neg_exs, losses=losses) + # TODO + # elif not pos_ex: + # print("Weird. Couldn't find pos example for", inst_cluster) + # elif not neg_exs: + # print("Weird. Couldn't find neg examples for", inst_cluster) def _simple_encoder(self, width): with Model.define_operators({">>": chain}): @@ -69,22 +76,29 @@ class EL_Model(): sgd = create_default_optimizer(model.ops) return sgd - def update(self, article_doc, entity_descr, label, drop=0., losses=None): - entity_encoding, entity_bp = self.entity_encoder.begin_update([entity_descr], drop=drop) + def update(self, article_doc, true_entity, false_entities, drop=0., losses=None): doc_encoding, article_bp = self.article_encoder.begin_update([article_doc], drop=drop) + true_entity_encoding, true_entity_bp = self.entity_encoder.begin_update([true_entity], drop=drop) + # true_similarity = cosine(true_entity_encoding, doc_encoding) + # print("true_similarity", true_similarity) + + # for false_entity in false_entities: + # false_entity_encoding, false_entity_bp = self.entity_encoder.begin_update([false_entity], drop=drop) + # false_similarity = cosine(false_entity_encoding, doc_encoding) + # print("false_similarity", false_similarity) + # print("entity/article output dim", len(entity_encoding[0]), len(doc_encoding[0])) - mse, diffs = self._calculate_similarity(entity_encoding, doc_encoding) + mse, diffs = self._calculate_similarity(true_entity_encoding, doc_encoding) # print() # TODO: proper backpropagation taking ranking of elements into account ? # TODO backpropagation also for negative examples - if label: - entity_bp(diffs, sgd=self.sgd_entity) - article_bp(diffs, sgd=self.sgd_article) - print(mse) + true_entity_bp(diffs, sgd=self.sgd_entity) + article_bp(diffs, sgd=self.sgd_article) + print(mse) # TODO delete ? 
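For intuition about the _simple_encoder shown above: SpacyVectors >> flatten_add_lengths >> ParametricAttention(width) >> Pooling(sum_pool) >> Residual(...) turns a variable-length sequence of word vectors into one fixed-width vector per text, roughly by letting a learned query score every token, softmax-normalising those scores, and summing the reweighted token vectors. A toy numpy sketch of that attention-and-sum step, with a random query standing in for the learned parameters (a simplification of the idea, not the actual thinc implementation):

    import numpy as np

    def attention_sum_pool(token_vectors, query):
        # token_vectors: (n_tokens, width) word vectors of one text
        # query: (width,) attention query; random here, learned in the real layer
        scores = token_vectors @ query                 # one relevance score per token
        scores = scores - scores.max()                 # numerical stability for the softmax
        weights = np.exp(scores) / np.exp(scores).sum()
        weighted = token_vectors * weights[:, None]    # rescale each token vector
        return weighted.sum(axis=0)                    # (width,) vector for the whole text

    if __name__ == "__main__":
        rng = np.random.default_rng(0)
        tokens = rng.normal(size=(7, 300)).astype("float32")   # 7 tokens, 300-dim vectors
        query = rng.normal(size=300).astype("float32")
        print(attention_sum_pool(tokens, query).shape)         # (300,)

The Residual(Maxout(...)) step after the pooling then transforms this fixed-width vector further without changing its width.
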
@@ -115,7 +129,7 @@ class EL_Model(): raise ValueError("To calculate similarity, both vectors should be of equal length") diffs = (vector2 - vector1) - error_sum = (diffs ** 2).sum(axis=1) + error_sum = (diffs ** 2).sum() mean_square_error = error_sum / len(vector1) return float(mean_square_error), diffs @@ -130,10 +144,10 @@ class EL_Model(): collect_incorrect=True) instances = list() - entity_descriptions = list() local_vectors = list() # TODO: local vectors - gold_vectors = list() doc_by_article = dict() + pos_entities = dict() + neg_entities = dict() cnt = 0 for f in listdir(training_dir): @@ -149,25 +163,24 @@ class EL_Model(): doc = self.nlp(text) doc_by_article[article_id] = doc - for mention_pos, entity_pos in correct_entries[article_id].items(): + for mention, entity_pos in correct_entries[article_id].items(): descr = id_to_descr.get(entity_pos) if descr: - instances.append(article_id + "_" + entity_pos) - doc = self.nlp(descr) - entity_descriptions.append(doc) - gold_vectors.append(True) + instances.append(article_id + "_" + mention) + doc_descr = self.nlp(descr) + pos_entities[article_id + "_" + mention] = doc_descr - for mention_neg, entity_negs in incorrect_entries[article_id].items(): + for mention, entity_negs in incorrect_entries[article_id].items(): for entity_neg in entity_negs: descr = id_to_descr.get(entity_neg) if descr: - instances.append(article_id + "_" + entity_neg) - doc = self.nlp(descr) - entity_descriptions.append(doc) - gold_vectors.append(False) + doc_descr = self.nlp(descr) + descr_list = neg_entities.get(article_id + "_" + mention, []) + descr_list.append(doc_descr) + neg_entities[article_id + "_" + mention] = descr_list if to_print: print() print("Processed", cnt, "dev articles") print() - return instances, gold_vectors, entity_descriptions, doc_by_article + return instances, pos_entities, neg_entities, doc_by_article From b6d788064afdd5871e3d15303d6f622b91a59cc0 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 10 May 2019 12:53:14 +0200 Subject: [PATCH 035/148] some first experiments with different architectures and metrics --- .../pipeline/wiki_entity_linking/train_el.py | 110 ++++++++++++++---- 1 file changed, 86 insertions(+), 24 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index c91058d5f..cfd17bd78 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -4,17 +4,17 @@ from __future__ import unicode_literals import os import datetime from os import listdir +import numpy as np from examples.pipeline.wiki_entity_linking import run_el, training_set_creator, kb_creator -from spacy._ml import SpacyVectors, create_default_optimizer, zero_init, cosine +from spacy._ml import SpacyVectors, create_default_optimizer, zero_init -from thinc.api import chain +from thinc.api import chain, flatten_add_lengths, with_getitem, clone, with_flatten from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu -from thinc.api import flatten_add_lengths from thinc.t2v import Pooling, sum_pool, mean_pool from thinc.t2t import ExtractWindow, ParametricAttention -from thinc.misc import Residual +from thinc.misc import Residual, LayerNorm as LN """ TODO: this code needs to be implemented in pipes.pyx""" @@ -29,8 +29,8 @@ class EL_Model(): self.nlp = nlp self.kb = kb - self.entity_encoder = self._simple_encoder(width=300) - self.article_encoder = self._simple_encoder(width=300) + self.entity_encoder = 
self._simple_encoder(in_width=300, out_width=96) + self.article_encoder = self._simple_encoder(in_width=300, out_width=96) def train_model(self, training_dir, entity_descr_output, limit=None, to_print=True): instances, pos_entities, neg_entities, doc_by_article = self._get_training_data(training_dir, @@ -61,13 +61,36 @@ class EL_Model(): # elif not neg_exs: # print("Weird. Couldn't find neg examples for", inst_cluster) - def _simple_encoder(self, width): - with Model.define_operators({">>": chain}): + def _simple_encoder(self, in_width, out_width): + conv_depth = 1 + cnn_maxout_pieces = 3 + with Model.define_operators({">>": chain, "**": clone}): + # encoder = SpacyVectors \ + # >> flatten_add_lengths \ + # >> ParametricAttention(in_width)\ + # >> Pooling(mean_pool) \ + # >> Residual(zero_init(Maxout(in_width, in_width))) \ + # >> zero_init(Affine(out_width, in_width, drop_factor=0.0)) encoder = SpacyVectors \ - >> flatten_add_lengths \ - >> ParametricAttention(width)\ - >> Pooling(sum_pool) \ - >> Residual(zero_init(Maxout(width, width))) + >> flatten_add_lengths \ + >> with_getitem(0, Affine(in_width, in_width)) \ + >> ParametricAttention(in_width) \ + >> Pooling(sum_pool) \ + >> Residual(ReLu(in_width, in_width)) ** conv_depth \ + >> zero_init(Affine(out_width, in_width, drop_factor=0.0)) + + # >> zero_init(Affine(nr_class, width, drop_factor=0.0)) + # >> logistic + + # convolution = Residual( + # ExtractWindow(nW=1) + # >> LN(Maxout(width, width * 3, pieces=cnn_maxout_pieces)) + # ) + + # embed = SpacyVectors >> LN(Maxout(width, width, pieces=3)) + + # encoder = SpacyVectors >> flatten_add_lengths >> convolution ** conv_depth + # encoder = with_flatten(embed >> convolution ** conv_depth, pad=conv_depth) return encoder @@ -80,25 +103,56 @@ class EL_Model(): doc_encoding, article_bp = self.article_encoder.begin_update([article_doc], drop=drop) true_entity_encoding, true_entity_bp = self.entity_encoder.begin_update([true_entity], drop=drop) - # true_similarity = cosine(true_entity_encoding, doc_encoding) - # print("true_similarity", true_similarity) + # print("encoding dim", len(true_entity_encoding[0])) - # for false_entity in false_entities: - # false_entity_encoding, false_entity_bp = self.entity_encoder.begin_update([false_entity], drop=drop) - # false_similarity = cosine(false_entity_encoding, doc_encoding) - # print("false_similarity", false_similarity) + consensus_encoding = self._calculate_consensus(doc_encoding, true_entity_encoding) + consensus_encoding_t = consensus_encoding.transpose() - # print("entity/article output dim", len(entity_encoding[0]), len(doc_encoding[0])) + doc_mse, doc_diffs = self._calculate_similarity(doc_encoding, consensus_encoding) - mse, diffs = self._calculate_similarity(true_entity_encoding, doc_encoding) + entity_mses = list() + + true_mse, true_diffs = self._calculate_similarity(true_entity_encoding, consensus_encoding) + # print("true_mse", true_mse) + # print("true_diffs", true_diffs) + entity_mses.append(true_mse) + # true_exp = np.exp(true_entity_encoding.dot(consensus_encoding_t)) + # print("true_exp", true_exp) + + # false_exp_sum = 0 + + for false_entity in false_entities: + false_entity_encoding, false_entity_bp = self.entity_encoder.begin_update([false_entity], drop=drop) + false_mse, false_diffs = self._calculate_similarity(false_entity_encoding, consensus_encoding) + # print("false_mse", false_mse) + # false_exp = np.exp(false_entity_encoding.dot(consensus_encoding_t)) + # print("false_exp", false_exp) + # print("false_diffs", false_diffs) 
+ entity_mses.append(false_mse) + # if false_mse > true_mse: + # true_diffs = true_diffs - false_diffs ??? + # false_exp_sum += false_exp + + # prob = true_exp / false_exp_sum + # print("prob", prob) + + entity_mses = sorted(entity_mses) + # mse_sum = sum(entity_mses) + # entity_probs = [1 - x/mse_sum for x in entity_mses] + # print("entity_mses", entity_mses) + # print("entity_probs", entity_probs) + true_index = entity_mses.index(true_mse) + # print("true index", true_index) + # print("true prob", entity_probs[true_index]) + + print(true_mse) # print() # TODO: proper backpropagation taking ranking of elements into account ? # TODO backpropagation also for negative examples - true_entity_bp(diffs, sgd=self.sgd_entity) - article_bp(diffs, sgd=self.sgd_article) - print(mse) + true_entity_bp(true_diffs, sgd=self.sgd_entity) + article_bp(doc_diffs, sgd=self.sgd_article) # TODO delete ? @@ -124,11 +178,19 @@ class EL_Model(): return mse + # TODO: expand to more than 2 vectors + def _calculate_consensus(self, vector1, vector2): + if len(vector1) != len(vector2): + raise ValueError("To calculate consenus, both vectors should be of equal length") + + avg = (vector2 + vector1) / 2 + return avg + def _calculate_similarity(self, vector1, vector2): if len(vector1) != len(vector2): raise ValueError("To calculate similarity, both vectors should be of equal length") - diffs = (vector2 - vector1) + diffs = (vector1 - vector2) error_sum = (diffs ** 2).sum() mean_square_error = error_sum / len(vector1) return float(mean_square_error), diffs From 3b81b009547b5c48dea7660e8081f050014f8609 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 13 May 2019 14:26:04 +0200 Subject: [PATCH 036/148] evaluating on dev set during training --- .../pipeline/wiki_entity_linking/run_el.py | 25 +++--- .../pipeline/wiki_entity_linking/train_el.py | 87 ++++++++++++++++--- 2 files changed, 90 insertions(+), 22 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/run_el.py b/examples/pipeline/wiki_entity_linking/run_el.py index 96fe58740..66ab0385e 100644 --- a/examples/pipeline/wiki_entity_linking/run_el.py +++ b/examples/pipeline/wiki_entity_linking/run_el.py @@ -70,12 +70,10 @@ def is_dev(file_name): return file_name.endswith("3.txt") -def evaluate(predictions, golds): +def evaluate(predictions, golds, to_print=True): if len(predictions) != len(golds): raise ValueError("predictions and gold entities should have the same length") - print("Evaluating", len(golds), "entities") - tp = 0 fp = 0 fn = 0 @@ -89,17 +87,22 @@ def evaluate(predictions, golds): else: fp += 1 - print("tp", tp) - print("fp", fp) - print("fn", fn) + if to_print: + print("Evaluating", len(golds), "entities") + print("tp", tp) + print("fp", fp) + print("fn", fn) - precision = tp / (tp + fp + 0.0000001) - recall = tp / (tp + fn + 0.0000001) + precision = 100 * tp / (tp + fp + 0.0000001) + recall = 100 * tp / (tp + fn + 0.0000001) fscore = 2 * recall * precision / (recall + precision + 0.0000001) - print("precision", round(100 * precision, 1), "%") - print("recall", round(100 * recall, 1), "%") - print("Fscore", round(100 * fscore, 1), "%") + if to_print: + print("precision", round(precision, 1), "%") + print("recall", round(recall, 1), "%") + print("Fscore", round(fscore, 1), "%") + + return precision, recall, fscore def _prepare_pipeline(nlp, kb): diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index cfd17bd78..7fd301e02 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py 
+++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -5,6 +5,7 @@ import os import datetime from os import listdir import numpy as np +from random import shuffle from examples.pipeline.wiki_entity_linking import run_el, training_set_creator, kb_creator @@ -16,6 +17,8 @@ from thinc.t2v import Pooling, sum_pool, mean_pool from thinc.t2t import ExtractWindow, ParametricAttention from thinc.misc import Residual, LayerNorm as LN +from spacy.tokens import Doc + """ TODO: this code needs to be implemented in pipes.pyx""" @@ -33,34 +36,93 @@ class EL_Model(): self.article_encoder = self._simple_encoder(in_width=300, out_width=96) def train_model(self, training_dir, entity_descr_output, limit=None, to_print=True): - instances, pos_entities, neg_entities, doc_by_article = self._get_training_data(training_dir, - entity_descr_output, - limit, to_print) + Doc.set_extension("entity_id", default=None) + + train_instances, train_pos, train_neg, train_doc = self._get_training_data(training_dir, + entity_descr_output, + False, + limit, to_print) + + dev_instances, dev_pos, dev_neg, dev_doc = self._get_training_data(training_dir, + entity_descr_output, + True, + limit, to_print) if to_print: - print("Training on", len(instances), "instance clusters") + print("Training on", len(train_instances), "instance clusters") + print("Dev test on", len(dev_instances), "instance clusters") print() self.sgd_entity = self.begin_training(self.entity_encoder) self.sgd_article = self.begin_training(self.article_encoder) + self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc) + losses = {} - for inst_cluster in instances: - pos_ex = pos_entities.get(inst_cluster) - neg_exs = neg_entities.get(inst_cluster, []) + for inst_cluster in train_instances: + pos_ex = train_pos.get(inst_cluster) + neg_exs = train_neg.get(inst_cluster, []) if pos_ex and neg_exs: article = inst_cluster.split(sep="_")[0] entity_id = inst_cluster.split(sep="_")[1] - article_doc = doc_by_article[article] + article_doc = train_doc[article] self.update(article_doc, pos_ex, neg_exs, losses=losses) + p, r, fscore = self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc) + print(round(fscore, 1)) # TODO # elif not pos_ex: # print("Weird. Couldn't find pos example for", inst_cluster) # elif not neg_exs: # print("Weird. 
Couldn't find neg examples for", inst_cluster) + def _test_dev(self, dev_instances, dev_pos, dev_neg, dev_doc): + predictions = list() + golds = list() + + for inst_cluster in dev_instances: + pos_ex = dev_pos.get(inst_cluster) + neg_exs = dev_neg.get(inst_cluster, []) + ex_to_id = dict() + + if pos_ex and neg_exs: + ex_to_id[pos_ex] = pos_ex._.entity_id + for neg_ex in neg_exs: + ex_to_id[neg_ex] = neg_ex._.entity_id + + article = inst_cluster.split(sep="_")[0] + entity_id = inst_cluster.split(sep="_")[1] + article_doc = dev_doc[article] + + examples = list(neg_exs) + examples.append(pos_ex) + shuffle(examples) + + best_entity, lowest_mse = self._predict(examples, article_doc) + predictions.append(ex_to_id[best_entity]) + golds.append(ex_to_id[pos_ex]) + + + # TODO: use lowest_mse and combine with prior probability + p, r, F = run_el.evaluate(predictions, golds, to_print=False) + return p, r, F + + def _predict(self, entities, article_doc): + doc_encoding = self.article_encoder([article_doc]) + + lowest_mse = None + best_entity = None + + for entity in entities: + entity_encoding = self.entity_encoder([entity]) + mse, _ = self._calculate_similarity(doc_encoding, entity_encoding) + if not best_entity or mse < lowest_mse: + lowest_mse = mse + best_entity = entity + + return best_entity, lowest_mse + def _simple_encoder(self, in_width, out_width): conv_depth = 1 cnn_maxout_pieces = 3 @@ -145,7 +207,7 @@ class EL_Model(): # print("true index", true_index) # print("true prob", entity_probs[true_index]) - print(true_mse) + # print("training loss", true_mse) # print() @@ -198,13 +260,14 @@ class EL_Model(): def _get_labels(self): return tuple(self.labels) - def _get_training_data(self, training_dir, entity_descr_output, limit, to_print): + def _get_training_data(self, training_dir, entity_descr_output, dev, limit, to_print): id_to_descr = kb_creator._get_id_to_description(entity_descr_output) correct_entries, incorrect_entries = training_set_creator.read_training_entities(training_output=training_dir, collect_correct=True, collect_incorrect=True) + instances = list() local_vectors = list() # TODO: local vectors doc_by_article = dict() @@ -214,7 +277,7 @@ class EL_Model(): cnt = 0 for f in listdir(training_dir): if not limit or cnt < limit: - if not run_el.is_dev(f): + if dev == run_el.is_dev(f): article_id = f.replace(".txt", "") if cnt % 500 == 0 and to_print: print(datetime.datetime.now(), "processed", cnt, "files in the dev dataset") @@ -230,6 +293,7 @@ class EL_Model(): if descr: instances.append(article_id + "_" + mention) doc_descr = self.nlp(descr) + doc_descr._.entity_id = entity_pos pos_entities[article_id + "_" + mention] = doc_descr for mention, entity_negs in incorrect_entries[article_id].items(): @@ -237,6 +301,7 @@ class EL_Model(): descr = id_to_descr.get(entity_neg) if descr: doc_descr = self.nlp(descr) + doc_descr._.entity_id = entity_neg descr_list = neg_entities.get(article_id + "_" + mention, []) descr_list.append(doc_descr) neg_entities[article_id + "_" + mention] = descr_list From 4142e8dd1b05e396c6e24efb7550a86837359118 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 13 May 2019 17:02:34 +0200 Subject: [PATCH 037/148] train and predict per article (saving time for doc encoding) --- .../pipeline/wiki_entity_linking/train_el.py | 182 ++++++++++-------- .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- 2 files changed, 103 insertions(+), 81 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py 
index 7fd301e02..1e2c25ffc 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -46,11 +46,11 @@ class EL_Model(): dev_instances, dev_pos, dev_neg, dev_doc = self._get_training_data(training_dir, entity_descr_output, True, - limit, to_print) + limit / 10, to_print) if to_print: - print("Training on", len(train_instances), "instance clusters") - print("Dev test on", len(dev_instances), "instance clusters") + print("Training on", len(train_instances.values()), "articles") + print("Dev test on", len(dev_instances.values()), "articles") print() self.sgd_entity = self.begin_training(self.entity_encoder) @@ -60,49 +60,51 @@ class EL_Model(): losses = {} - for inst_cluster in train_instances: - pos_ex = train_pos.get(inst_cluster) - neg_exs = train_neg.get(inst_cluster, []) + instance_count = 0 + + for article_id, inst_cluster_set in train_instances.items(): + article_doc = train_doc[article_id] + pos_ex_list = list() + neg_exs_list = list() + for inst_cluster in inst_cluster_set: + instance_count += 1 + pos_ex_list.append(train_pos.get(inst_cluster)) + neg_exs_list.append(train_neg.get(inst_cluster, [])) + + self.update(article_doc, pos_ex_list, neg_exs_list, losses=losses) + p, r, fscore = self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc) + print(round(fscore, 1)) + + if to_print: + print("Trained on", instance_count, "instance clusters") - if pos_ex and neg_exs: - article = inst_cluster.split(sep="_")[0] - entity_id = inst_cluster.split(sep="_")[1] - article_doc = train_doc[article] - self.update(article_doc, pos_ex, neg_exs, losses=losses) - p, r, fscore = self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc) - print(round(fscore, 1)) - # TODO - # elif not pos_ex: - # print("Weird. Couldn't find pos example for", inst_cluster) - # elif not neg_exs: - # print("Weird. 
Couldn't find neg examples for", inst_cluster) def _test_dev(self, dev_instances, dev_pos, dev_neg, dev_doc): predictions = list() golds = list() - for inst_cluster in dev_instances: - pos_ex = dev_pos.get(inst_cluster) - neg_exs = dev_neg.get(inst_cluster, []) - ex_to_id = dict() + for article_id, inst_cluster_set in dev_instances.items(): + for inst_cluster in inst_cluster_set: + pos_ex = dev_pos.get(inst_cluster) + neg_exs = dev_neg.get(inst_cluster, []) + ex_to_id = dict() - if pos_ex and neg_exs: - ex_to_id[pos_ex] = pos_ex._.entity_id - for neg_ex in neg_exs: - ex_to_id[neg_ex] = neg_ex._.entity_id + if pos_ex and neg_exs: + ex_to_id[pos_ex] = pos_ex._.entity_id + for neg_ex in neg_exs: + ex_to_id[neg_ex] = neg_ex._.entity_id - article = inst_cluster.split(sep="_")[0] - entity_id = inst_cluster.split(sep="_")[1] - article_doc = dev_doc[article] + article = inst_cluster.split(sep="_")[0] + entity_id = inst_cluster.split(sep="_")[1] + article_doc = dev_doc[article] - examples = list(neg_exs) - examples.append(pos_ex) - shuffle(examples) - - best_entity, lowest_mse = self._predict(examples, article_doc) - predictions.append(ex_to_id[best_entity]) - golds.append(ex_to_id[pos_ex]) + examples = list(neg_exs) + examples.append(pos_ex) + shuffle(examples) + best_entity, lowest_mse = self._predict(examples, article_doc) + predictions.append(ex_to_id[best_entity]) + golds.append(ex_to_id[pos_ex]) # TODO: use lowest_mse and combine with prior probability p, r, F = run_el.evaluate(predictions, golds, to_print=False) @@ -161,60 +163,79 @@ class EL_Model(): sgd = create_default_optimizer(model.ops) return sgd - def update(self, article_doc, true_entity, false_entities, drop=0., losses=None): + def update(self, article_doc, true_entity_list, false_entities_list, drop=0., losses=None): + # TODO: one call only to begin_update ? 
+ + entity_diffs = None + doc_diffs = None + doc_encoding, article_bp = self.article_encoder.begin_update([article_doc], drop=drop) - true_entity_encoding, true_entity_bp = self.entity_encoder.begin_update([true_entity], drop=drop) - # print("encoding dim", len(true_entity_encoding[0])) + for i, true_entity in enumerate(true_entity_list): + false_entities = false_entities_list[i] - consensus_encoding = self._calculate_consensus(doc_encoding, true_entity_encoding) - consensus_encoding_t = consensus_encoding.transpose() + true_entity_encoding, true_entity_bp = self.entity_encoder.begin_update([true_entity], drop=drop) + # print("encoding dim", len(true_entity_encoding[0])) - doc_mse, doc_diffs = self._calculate_similarity(doc_encoding, consensus_encoding) + consensus_encoding = self._calculate_consensus(doc_encoding, true_entity_encoding) + # consensus_encoding_t = consensus_encoding.transpose() - entity_mses = list() + doc_mse, doc_diff = self._calculate_similarity(doc_encoding, consensus_encoding) - true_mse, true_diffs = self._calculate_similarity(true_entity_encoding, consensus_encoding) - # print("true_mse", true_mse) - # print("true_diffs", true_diffs) - entity_mses.append(true_mse) - # true_exp = np.exp(true_entity_encoding.dot(consensus_encoding_t)) - # print("true_exp", true_exp) + entity_mses = list() - # false_exp_sum = 0 + true_mse, true_diffs = self._calculate_similarity(true_entity_encoding, consensus_encoding) + # print("true_mse", true_mse) + # print("true_diffs", true_diffs) + entity_mses.append(true_mse) + # true_exp = np.exp(true_entity_encoding.dot(consensus_encoding_t)) + # print("true_exp", true_exp) - for false_entity in false_entities: - false_entity_encoding, false_entity_bp = self.entity_encoder.begin_update([false_entity], drop=drop) - false_mse, false_diffs = self._calculate_similarity(false_entity_encoding, consensus_encoding) - # print("false_mse", false_mse) - # false_exp = np.exp(false_entity_encoding.dot(consensus_encoding_t)) - # print("false_exp", false_exp) - # print("false_diffs", false_diffs) - entity_mses.append(false_mse) - # if false_mse > true_mse: - # true_diffs = true_diffs - false_diffs ??? - # false_exp_sum += false_exp + # false_exp_sum = 0 - # prob = true_exp / false_exp_sum - # print("prob", prob) + if doc_diffs is not None: + doc_diffs += doc_diff + entity_diffs += true_diffs + else: + doc_diffs = doc_diff + entity_diffs = true_diffs - entity_mses = sorted(entity_mses) - # mse_sum = sum(entity_mses) - # entity_probs = [1 - x/mse_sum for x in entity_mses] - # print("entity_mses", entity_mses) - # print("entity_probs", entity_probs) - true_index = entity_mses.index(true_mse) - # print("true index", true_index) - # print("true prob", entity_probs[true_index]) + for false_entity in false_entities: + false_entity_encoding, false_entity_bp = self.entity_encoder.begin_update([false_entity], drop=drop) + false_mse, false_diffs = self._calculate_similarity(false_entity_encoding, consensus_encoding) + # print("false_mse", false_mse) + # false_exp = np.exp(false_entity_encoding.dot(consensus_encoding_t)) + # print("false_exp", false_exp) + # print("false_diffs", false_diffs) + entity_mses.append(false_mse) + # if false_mse > true_mse: + # true_diffs = true_diffs - false_diffs ??? 
+ # false_exp_sum += false_exp - # print("training loss", true_mse) + # prob = true_exp / false_exp_sum + # print("prob", prob) - # print() + entity_mses = sorted(entity_mses) + # mse_sum = sum(entity_mses) + # entity_probs = [1 - x/mse_sum for x in entity_mses] + # print("entity_mses", entity_mses) + # print("entity_probs", entity_probs) + true_index = entity_mses.index(true_mse) + # print("true index", true_index) + # print("true prob", entity_probs[true_index]) + + # print("training loss", true_mse) + + # print() # TODO: proper backpropagation taking ranking of elements into account ? # TODO backpropagation also for negative examples - true_entity_bp(true_diffs, sgd=self.sgd_entity) - article_bp(doc_diffs, sgd=self.sgd_article) + + if doc_diffs is not None: + doc_diffs = doc_diffs / len(true_entity_list) + + true_entity_bp(entity_diffs, sgd=self.sgd_entity) + article_bp(doc_diffs, sgd=self.sgd_article) # TODO delete ? @@ -268,7 +289,7 @@ class EL_Model(): collect_incorrect=True) - instances = list() + instance_by_doc = dict() local_vectors = list() # TODO: local vectors doc_by_article = dict() pos_entities = dict() @@ -280,18 +301,19 @@ class EL_Model(): if dev == run_el.is_dev(f): article_id = f.replace(".txt", "") if cnt % 500 == 0 and to_print: - print(datetime.datetime.now(), "processed", cnt, "files in the dev dataset") + print(datetime.datetime.now(), "processed", cnt, "files in the training dataset") cnt += 1 if article_id not in doc_by_article: with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file: text = file.read() doc = self.nlp(text) doc_by_article[article_id] = doc + instance_by_doc[article_id] = set() for mention, entity_pos in correct_entries[article_id].items(): descr = id_to_descr.get(entity_pos) if descr: - instances.append(article_id + "_" + mention) + instance_by_doc[article_id].add(article_id + "_" + mention) doc_descr = self.nlp(descr) doc_descr._.entity_id = entity_pos pos_entities[article_id + "_" + mention] = doc_descr @@ -308,6 +330,6 @@ class EL_Model(): if to_print: print() - print("Processed", cnt, "dev articles") + print("Processed", cnt, "training articles, dev=" + str(dev)) print() - return instances, pos_entities, neg_entities, doc_by_article + return instance_by_doc, pos_entities, neg_entities, doc_by_article diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 83650aa8d..581d38b1b 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: training ", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, limit=50) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, limit=500) print() # STEP 7: apply the EL algorithm on the dev dataset From 09ed446b20fbeac06f6c88869d0e9a20e6332b03 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 14 May 2019 08:37:52 +0200 Subject: [PATCH 038/148] different architecture / settings --- .../pipeline/wiki_entity_linking/train_el.py | 43 +++++++++---------- .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- 2 files changed, 22 insertions(+), 23 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index 1e2c25ffc..b3f42dcc4 100644 --- 
a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -4,18 +4,17 @@ from __future__ import unicode_literals import os import datetime from os import listdir -import numpy as np from random import shuffle from examples.pipeline.wiki_entity_linking import run_el, training_set_creator, kb_creator from spacy._ml import SpacyVectors, create_default_optimizer, zero_init -from thinc.api import chain, flatten_add_lengths, with_getitem, clone, with_flatten -from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu +from thinc.api import chain, flatten_add_lengths, with_getitem, clone +from thinc.v2v import Model, Softmax, Maxout, Affine, ReLu from thinc.t2v import Pooling, sum_pool, mean_pool -from thinc.t2t import ExtractWindow, ParametricAttention -from thinc.misc import Residual, LayerNorm as LN +from thinc.t2t import ParametricAttention +from thinc.misc import Residual from spacy.tokens import Doc @@ -35,18 +34,20 @@ class EL_Model(): self.entity_encoder = self._simple_encoder(in_width=300, out_width=96) self.article_encoder = self._simple_encoder(in_width=300, out_width=96) - def train_model(self, training_dir, entity_descr_output, limit=None, to_print=True): + def train_model(self, training_dir, entity_descr_output, trainlimit=None, devlimit=None, to_print=True): Doc.set_extension("entity_id", default=None) train_instances, train_pos, train_neg, train_doc = self._get_training_data(training_dir, entity_descr_output, False, - limit, to_print) + trainlimit, + to_print) dev_instances, dev_pos, dev_neg, dev_doc = self._get_training_data(training_dir, entity_descr_output, True, - limit / 10, to_print) + devlimit, + to_print) if to_print: print("Training on", len(train_instances.values()), "articles") @@ -78,7 +79,6 @@ class EL_Model(): if to_print: print("Trained on", instance_count, "instance clusters") - def _test_dev(self, dev_instances, dev_pos, dev_neg, dev_doc): predictions = list() golds = list() @@ -129,19 +129,19 @@ class EL_Model(): conv_depth = 1 cnn_maxout_pieces = 3 with Model.define_operators({">>": chain, "**": clone}): - # encoder = SpacyVectors \ - # >> flatten_add_lengths \ - # >> ParametricAttention(in_width)\ - # >> Pooling(mean_pool) \ - # >> Residual(zero_init(Maxout(in_width, in_width))) \ - # >> zero_init(Affine(out_width, in_width, drop_factor=0.0)) encoder = SpacyVectors \ - >> flatten_add_lengths \ - >> with_getitem(0, Affine(in_width, in_width)) \ - >> ParametricAttention(in_width) \ - >> Pooling(sum_pool) \ - >> Residual(ReLu(in_width, in_width)) ** conv_depth \ - >> zero_init(Affine(out_width, in_width, drop_factor=0.0)) + >> flatten_add_lengths \ + >> ParametricAttention(in_width)\ + >> Pooling(mean_pool) \ + >> Residual(zero_init(Maxout(in_width, in_width))) \ + >> zero_init(Affine(out_width, in_width, drop_factor=0.0)) + # encoder = SpacyVectors \ + # >> flatten_add_lengths \ + # >> with_getitem(0, Affine(in_width, in_width)) \ + # >> ParametricAttention(in_width) \ + # >> Pooling(sum_pool) \ + # >> Residual(ReLu(in_width, in_width)) ** conv_depth \ + # >> zero_init(Affine(out_width, in_width, drop_factor=0.0)) # >> zero_init(Affine(nr_class, width, drop_factor=0.0)) # >> logistic @@ -178,7 +178,6 @@ class EL_Model(): # print("encoding dim", len(true_entity_encoding[0])) consensus_encoding = self._calculate_consensus(doc_encoding, true_entity_encoding) - # consensus_encoding_t = consensus_encoding.transpose() doc_mse, doc_diff = self._calculate_similarity(doc_encoding, consensus_encoding) diff --git 
a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 581d38b1b..43cc41392 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: training ", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, limit=500) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=50, devlimit=50) print() # STEP 7: apply the EL algorithm on the dev dataset From 2713abc651dc9f601d98e5f9b402852798e22b79 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 14 May 2019 22:55:56 +0200 Subject: [PATCH 039/148] implement loss function using dot product and prob estimate per candidate cluster --- .../pipeline/wiki_entity_linking/train_el.py | 203 +++++++++--------- .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- 2 files changed, 103 insertions(+), 102 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index b3f42dcc4..06ac8d1d4 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -5,12 +5,14 @@ import os import datetime from os import listdir from random import shuffle +import numpy as np from examples.pipeline.wiki_entity_linking import run_el, training_set_creator, kb_creator from spacy._ml import SpacyVectors, create_default_optimizer, zero_init from thinc.api import chain, flatten_add_lengths, with_getitem, clone +from thinc.neural.util import get_array_module from thinc.v2v import Model, Softmax, Maxout, Affine, ReLu from thinc.t2v import Pooling, sum_pool, mean_pool from thinc.t2t import ParametricAttention @@ -23,6 +25,11 @@ from spacy.tokens import Doc class EL_Model(): + INPUT_DIM = 300 + OUTPUT_DIM = 5 # 96 + PRINT_LOSS = True + PRINT_F = True + labels = ["MATCH", "NOMATCH"] name = "entity_linker" @@ -31,8 +38,8 @@ class EL_Model(): self.nlp = nlp self.kb = kb - self.entity_encoder = self._simple_encoder(in_width=300, out_width=96) - self.article_encoder = self._simple_encoder(in_width=300, out_width=96) + self.entity_encoder = self._simple_encoder(in_width=self.INPUT_DIM, out_width=self.OUTPUT_DIM) + self.article_encoder = self._simple_encoder(in_width=self.INPUT_DIM, out_width=self.OUTPUT_DIM) def train_model(self, training_dir, entity_descr_output, trainlimit=None, devlimit=None, to_print=True): Doc.set_extension("entity_id", default=None) @@ -64,17 +71,20 @@ class EL_Model(): instance_count = 0 for article_id, inst_cluster_set in train_instances.items(): + print("article", article_id) article_doc = train_doc[article_id] pos_ex_list = list() neg_exs_list = list() for inst_cluster in inst_cluster_set: + print("inst_cluster", inst_cluster) instance_count += 1 pos_ex_list.append(train_pos.get(inst_cluster)) neg_exs_list.append(train_neg.get(inst_cluster, [])) self.update(article_doc, pos_ex_list, neg_exs_list, losses=losses) p, r, fscore = self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc) - print(round(fscore, 1)) + if self.PRINT_F: + print(round(fscore, 1)) if to_print: print("Trained on", instance_count, "instance clusters") @@ -102,7 +112,7 @@ class EL_Model(): examples.append(pos_ex) shuffle(examples) - best_entity, lowest_mse = self._predict(examples, article_doc) + 
best_entity, highest_prob = self._predict(examples, article_doc) predictions.append(ex_to_id[best_entity]) golds.append(ex_to_id[pos_ex]) @@ -113,17 +123,21 @@ class EL_Model(): def _predict(self, entities, article_doc): doc_encoding = self.article_encoder([article_doc]) - lowest_mse = None + highest_prob = None best_entity = None + entity_to_vector = dict() for entity in entities: - entity_encoding = self.entity_encoder([entity]) - mse, _ = self._calculate_similarity(doc_encoding, entity_encoding) - if not best_entity or mse < lowest_mse: - lowest_mse = mse + entity_to_vector[entity] = self.entity_encoder([entity]) + + for entity in entities: + entity_encoding = entity_to_vector[entity] + prob = self._calculate_probability(doc_encoding, entity_encoding, entity_to_vector.values()) + if not best_entity or prob > highest_prob: + highest_prob = prob best_entity = entity - return best_entity, lowest_mse + return best_entity, highest_prob def _simple_encoder(self, in_width, out_width): conv_depth = 1 @@ -164,103 +178,56 @@ class EL_Model(): return sgd def update(self, article_doc, true_entity_list, false_entities_list, drop=0., losses=None): - # TODO: one call only to begin_update ? - entity_diffs = None - doc_diffs = None - - doc_encoding, article_bp = self.article_encoder.begin_update([article_doc], drop=drop) for i, true_entity in enumerate(true_entity_list): - false_entities = false_entities_list[i] + for cnt in range(10): + #try: + false_vectors = list() + false_entities = false_entities_list[i] + if len(false_entities) > 0: + # TODO: batch per doc + doc_encoding, article_bp = self.article_encoder.begin_update([article_doc], drop=drop) + doc_encoding = doc_encoding[0] + print() + print(cnt) + print("doc", doc_encoding) - true_entity_encoding, true_entity_bp = self.entity_encoder.begin_update([true_entity], drop=drop) - # print("encoding dim", len(true_entity_encoding[0])) + for false_entity in false_entities: + # TODO: one call only to begin_update ? 
+ false_entity_encoding, false_entity_bp = self.entity_encoder.begin_update([false_entity], drop=drop) + false_entity_encoding = false_entity_encoding[0] + false_vectors.append(false_entity_encoding) - consensus_encoding = self._calculate_consensus(doc_encoding, true_entity_encoding) + true_entity_encoding, true_entity_bp = self.entity_encoder.begin_update([true_entity], drop=drop) + true_entity_encoding = true_entity_encoding[0] - doc_mse, doc_diff = self._calculate_similarity(doc_encoding, consensus_encoding) + all_vectors = [true_entity_encoding] + all_vectors.extend(false_vectors) - entity_mses = list() + # consensus_encoding = self._calculate_consensus(doc_encoding, true_entity_encoding) - true_mse, true_diffs = self._calculate_similarity(true_entity_encoding, consensus_encoding) - # print("true_mse", true_mse) - # print("true_diffs", true_diffs) - entity_mses.append(true_mse) - # true_exp = np.exp(true_entity_encoding.dot(consensus_encoding_t)) - # print("true_exp", true_exp) + true_prob = self._calculate_probability(doc_encoding, true_entity_encoding, all_vectors) + print("true", true_prob, true_entity_encoding) - # false_exp_sum = 0 + all_probs = [true_prob] + for false_vector in false_vectors: + false_prob = self._calculate_probability(doc_encoding, false_vector, all_vectors) + print("false", false_prob, false_vector) + all_probs.append(false_prob) - if doc_diffs is not None: - doc_diffs += doc_diff - entity_diffs += true_diffs - else: - doc_diffs = doc_diff - entity_diffs = true_diffs + loss = self._calculate_loss(true_prob, all_probs).astype(np.float32) + if self.PRINT_LOSS: + print("loss", round(loss, 5)) - for false_entity in false_entities: - false_entity_encoding, false_entity_bp = self.entity_encoder.begin_update([false_entity], drop=drop) - false_mse, false_diffs = self._calculate_similarity(false_entity_encoding, consensus_encoding) - # print("false_mse", false_mse) - # false_exp = np.exp(false_entity_encoding.dot(consensus_encoding_t)) - # print("false_exp", false_exp) - # print("false_diffs", false_diffs) - entity_mses.append(false_mse) - # if false_mse > true_mse: - # true_diffs = true_diffs - false_diffs ??? - # false_exp_sum += false_exp - - # prob = true_exp / false_exp_sum - # print("prob", prob) - - entity_mses = sorted(entity_mses) - # mse_sum = sum(entity_mses) - # entity_probs = [1 - x/mse_sum for x in entity_mses] - # print("entity_mses", entity_mses) - # print("entity_probs", entity_probs) - true_index = entity_mses.index(true_mse) - # print("true index", true_index) - # print("true prob", entity_probs[true_index]) - - # print("training loss", true_mse) - - # print() - - # TODO: proper backpropagation taking ranking of elements into account ? - # TODO backpropagation also for negative examples - - if doc_diffs is not None: - doc_diffs = doc_diffs / len(true_entity_list) - - true_entity_bp(entity_diffs, sgd=self.sgd_entity) - article_bp(doc_diffs, sgd=self.sgd_article) + doc_gradient = self._calculate_doc_gradient(loss, doc_encoding, true_entity_encoding, false_vectors) + print("doc_gradient", doc_gradient) + article_bp([doc_gradient.astype(np.float32)], sgd=self.sgd_article) + #except Exception as e: + #pass - # TODO delete ? 
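# Illustration only: the loss introduced in this commit, in plain NumPy with toy
# numbers. It is the negative log of the true candidate's share of the total
# probability mass, -log(p_true / sum(all_p)); when the probabilities already sum
# to one this reduces to the negative log-likelihood of the gold candidate.
import numpy as np

def candidate_loss(true_prob, all_probs):
    return -np.log(true_prob / np.sum(all_probs))

print(candidate_loss(0.7, [0.7, 0.2, 0.1]))   # ~0.357
print(candidate_loss(0.1, [0.7, 0.2, 0.1]))   # ~2.303, a wrong ranking costs more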
- def _simple_cnn_model(self, internal_dim): - nr_class = len(self.labels) - with Model.define_operators({">>": chain}): - model_entity = SpacyVectors >> flatten_add_lengths >> Pooling(mean_pool) # entity encoding - model_doc = SpacyVectors >> flatten_add_lengths >> Pooling(mean_pool) # doc encoding - output_layer = Softmax(nr_class, internal_dim*2) - model = (model_entity | model_doc) >> output_layer - # model.tok2vec = chain(tok2vec, flatten) - model.nO = nr_class - return model - - def predict(self, entity_doc, article_doc): - entity_encoding = self.entity_encoder(entity_doc) - doc_encoding = self.article_encoder(article_doc) - - print("entity_encodings", len(entity_encoding), entity_encoding) - print("doc_encodings", len(doc_encoding), doc_encoding) - mse, diffs = self._calculate_similarity(entity_encoding, doc_encoding) - print("mse", mse) - - return mse - - # TODO: expand to more than 2 vectors + # TODO: FIX def _calculate_consensus(self, vector1, vector2): if len(vector1) != len(vector2): raise ValueError("To calculate consenus, both vectors should be of equal length") @@ -268,17 +235,51 @@ class EL_Model(): avg = (vector2 + vector1) / 2 return avg - def _calculate_similarity(self, vector1, vector2): + def _calculate_probability(self, vector1, vector2, allvectors): + """ Make sure that vector2 is included in allvectors """ if len(vector1) != len(vector2): raise ValueError("To calculate similarity, both vectors should be of equal length") - diffs = (vector1 - vector2) - error_sum = (diffs ** 2).sum() - mean_square_error = error_sum / len(vector1) - return float(mean_square_error), diffs + vector1_t = vector1.transpose() + e = self._calculate_dot_exp(vector2, vector1_t) + e_sum = 0 + for v in allvectors: + e_sum += self._calculate_dot_exp(v, vector1_t) - def _get_labels(self): - return tuple(self.labels) + return float(e / e_sum) + + @staticmethod + def _calculate_loss(true_prob, all_probs): + """ all_probs should include true_prob ! 
""" + return -1 * np.log(true_prob / sum(all_probs)) + + @staticmethod + def _calculate_doc_gradient(loss, doc_vector, true_vector, false_vectors): + gradient = np.zeros(len(doc_vector)) + for i in range(len(doc_vector)): + min_false = min(x[i] for x in false_vectors) + max_false = max(x[i] for x in false_vectors) + + if true_vector[i] > max_false: + if doc_vector[i] > 0: + gradient[i] = 0 + else: + gradient[i] = -loss + elif true_vector[i] < min_false: + if doc_vector[i] > 0: + gradient[i] = loss + if doc_vector[i] < 0: + gradient[i] = 0 + else: + target = 0 # non-distinctive vector positions should convert to 0 + gradient[i] = doc_vector[i] - target + + return gradient + + @staticmethod + def _calculate_dot_exp(vector1, vector2_transposed): + e = np.exp(vector1.dot(vector2_transposed)) + return e def _get_training_data(self, training_dir, entity_descr_output, dev, limit, to_print): id_to_descr = kb_creator._get_id_to_description(entity_descr_output) diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 43cc41392..bc75ac09a 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: training ", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=50, devlimit=50) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=1, devlimit=5) print() # STEP 7: apply the EL algorithm on the dev dataset From 9ffe5437aee37c02db2d32a79bc4a2072448cce3 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 15 May 2019 02:23:08 +0200 Subject: [PATCH 040/148] calculate gradient for entity encoding --- .../pipeline/wiki_entity_linking/train_el.py | 125 ++++++++++++------ .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- 2 files changed, 88 insertions(+), 39 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index 06ac8d1d4..9f674d239 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -26,9 +26,10 @@ from spacy.tokens import Doc class EL_Model(): INPUT_DIM = 300 - OUTPUT_DIM = 5 # 96 - PRINT_LOSS = True + OUTPUT_DIM = 96 + PRINT_LOSS = False PRINT_F = True + EPS = 0.0000000005 labels = ["MATCH", "NOMATCH"] name = "entity_linker" @@ -71,12 +72,12 @@ class EL_Model(): instance_count = 0 for article_id, inst_cluster_set in train_instances.items(): - print("article", article_id) + # print("article", article_id) article_doc = train_doc[article_id] pos_ex_list = list() neg_exs_list = list() for inst_cluster in inst_cluster_set: - print("inst_cluster", inst_cluster) + # print("inst_cluster", inst_cluster) instance_count += 1 pos_ex_list.append(train_pos.get(inst_cluster)) neg_exs_list.append(train_neg.get(inst_cluster, [])) @@ -143,19 +144,19 @@ class EL_Model(): conv_depth = 1 cnn_maxout_pieces = 3 with Model.define_operators({">>": chain, "**": clone}): - encoder = SpacyVectors \ - >> flatten_add_lengths \ - >> ParametricAttention(in_width)\ - >> Pooling(mean_pool) \ - >> Residual(zero_init(Maxout(in_width, in_width))) \ - >> zero_init(Affine(out_width, in_width, drop_factor=0.0)) # encoder = SpacyVectors \ - # >> flatten_add_lengths \ - # >> with_getitem(0, 
Affine(in_width, in_width)) \ - # >> ParametricAttention(in_width) \ - # >> Pooling(sum_pool) \ - # >> Residual(ReLu(in_width, in_width)) ** conv_depth \ - # >> zero_init(Affine(out_width, in_width, drop_factor=0.0)) + # >> flatten_add_lengths \ + # >> ParametricAttention(in_width)\ + # >> Pooling(mean_pool) \ + # >> Residual(zero_init(Maxout(in_width, in_width))) \ + # >> zero_init(Affine(out_width, in_width, drop_factor=0.0)) + encoder = SpacyVectors \ + >> flatten_add_lengths \ + >> with_getitem(0, Affine(in_width, in_width)) \ + >> ParametricAttention(in_width) \ + >> Pooling(sum_pool) \ + >> Residual(ReLu(in_width, in_width)) ** conv_depth \ + >> zero_init(Affine(out_width, in_width, drop_factor=0.0)) # >> zero_init(Affine(nr_class, width, drop_factor=0.0)) # >> logistic @@ -178,20 +179,16 @@ class EL_Model(): return sgd def update(self, article_doc, true_entity_list, false_entities_list, drop=0., losses=None): - + doc_encoding, article_bp = self.article_encoder.begin_update([article_doc], drop=drop) + doc_encoding = doc_encoding[0] + # print("doc", doc_encoding) for i, true_entity in enumerate(true_entity_list): - for cnt in range(10): - #try: + try: false_vectors = list() false_entities = false_entities_list[i] if len(false_entities) > 0: # TODO: batch per doc - doc_encoding, article_bp = self.article_encoder.begin_update([article_doc], drop=drop) - doc_encoding = doc_encoding[0] - print() - print(cnt) - print("doc", doc_encoding) for false_entity in false_entities: # TODO: one call only to begin_update ? @@ -201,6 +198,7 @@ class EL_Model(): true_entity_encoding, true_entity_bp = self.entity_encoder.begin_update([true_entity], drop=drop) true_entity_encoding = true_entity_encoding[0] + # true_gradient = self._calculate_true_gradient(doc_encoding, true_entity_encoding) all_vectors = [true_entity_encoding] all_vectors.extend(false_vectors) @@ -208,29 +206,37 @@ class EL_Model(): # consensus_encoding = self._calculate_consensus(doc_encoding, true_entity_encoding) true_prob = self._calculate_probability(doc_encoding, true_entity_encoding, all_vectors) - print("true", true_prob, true_entity_encoding) + # print("true", true_prob, true_entity_encoding) + # print("true gradient", true_gradient) + # print() all_probs = [true_prob] for false_vector in false_vectors: false_prob = self._calculate_probability(doc_encoding, false_vector, all_vectors) - print("false", false_prob, false_vector) + # print("false", false_prob, false_vector) + # print("false gradient", false_gradient) + # print() all_probs.append(false_prob) loss = self._calculate_loss(true_prob, all_probs).astype(np.float32) if self.PRINT_LOSS: - print("loss", round(loss, 5)) + print(round(loss, 5)) - doc_gradient = self._calculate_doc_gradient(loss, doc_encoding, true_entity_encoding, false_vectors) - print("doc_gradient", doc_gradient) - article_bp([doc_gradient.astype(np.float32)], sgd=self.sgd_article) - #except Exception as e: - #pass + #doc_gradient = self._calculate_doc_gradient(loss, doc_encoding, true_entity_encoding, false_vectors) + entity_gradient = self._calculate_entity_gradient(doc_encoding, true_entity_encoding, false_vectors) + # print("entity_gradient", entity_gradient) + # print("doc_gradient", doc_gradient) + # article_bp([doc_gradient.astype(np.float32)], sgd=self.sgd_article) + true_entity_bp([entity_gradient.astype(np.float32)], sgd=self.sgd_entity) + #true_entity_bp([true_gradient.astype(np.float32)], sgd=self.sgd_entity) + except Exception as e: + pass # TODO: FIX def _calculate_consensus(self, vector1, 
vector2): if len(vector1) != len(vector2): - raise ValueError("To calculate consenus, both vectors should be of equal length") + raise ValueError("To calculate consensus, both vectors should be of equal length") avg = (vector2 + vector1) / 2 return avg @@ -246,12 +252,11 @@ class EL_Model(): for v in allvectors: e_sum += self._calculate_dot_exp(v, vector1_t) - return float(e / e_sum) + return float(e / (self.EPS + e_sum)) - @staticmethod - def _calculate_loss(true_prob, all_probs): + def _calculate_loss(self, true_prob, all_probs): """ all_probs should include true_prob ! """ - return -1 * np.log(true_prob / sum(all_probs)) + return -1 * np.log((self.EPS + true_prob) / (self.EPS + sum(all_probs))) @staticmethod def _calculate_doc_gradient(loss, doc_vector, true_vector, false_vectors): @@ -276,9 +281,53 @@ class EL_Model(): return gradient + def _calculate_true_gradient(self, doc_vector, entity_vector): + # sum_entity_vector = sum(entity_vector) + # gradient = [-sum_entity_vector/(self.EPS + np.exp(doc_vector[i] * entity_vector[i])) for i in range(len(doc_vector))] + gradient = [1 / (self.EPS + np.exp(doc_vector[i] * entity_vector[i])) for i in range(len(doc_vector))] + return np.asarray(gradient) + + def _calculate_entity_gradient(self, doc_vector, true_vector, false_vectors): + entity_gradient = list() + prob_true = list() + false_prob_list = list() + for i in range(len(true_vector)): + doc_i = np.asarray([doc_vector[i]]) + true_i = np.asarray([true_vector[i]]) + falses_i = np.asarray([[fv[i]] for fv in false_vectors]) + all_i = [true_i] + all_i.extend(falses_i) + + prob_true_i = self._calculate_probability(doc_i, true_i, all_i) + prob_true.append(prob_true_i) + + false_list = list() + all_probs_i = [prob_true_i] + for false_vector in falses_i: + false_prob_i = self._calculate_probability(doc_i, false_vector, all_i) + all_probs_i.append(false_prob_i) + false_list.append(false_prob_i) + false_prob_list.append(false_list) + + sign_loss_i = 1 + if doc_vector[i] * true_vector[i] < 0: + sign_loss_i = -1 + + loss_i = sign_loss_i * self._calculate_loss(prob_true_i, all_probs_i).astype(np.float32) + entity_gradient.append(loss_i) + # print("prob_true", prob_true) + # print("false_prob_list", false_prob_list) + return np.asarray(entity_gradient) + + @staticmethod def _calculate_dot_exp(vector1, vector2_transposed): - e = np.exp(vector1.dot(vector2_transposed)) + dot_product = vector1.dot(vector2_transposed) + dot_product = min(50, dot_product) + # dot_product = max(-10000, dot_product) + # print("DOT", dot_product) + e = np.exp(dot_product) + # print("E", e) return e def _get_training_data(self, training_dir, entity_descr_output, dev, limit, to_print): diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index bc75ac09a..cccc67650 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: training ", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=1, devlimit=5) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=1500, devlimit=50) print() # STEP 7: apply the EL algorithm on the dev dataset From b5470f3d753dd3bac3423121a44c0862a67b607c Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 16 May 
2019 18:25:34 +0200 Subject: [PATCH 041/148] various tests, architectures and experiments --- .../pipeline/wiki_entity_linking/train_el.py | 472 ++++++++++++++---- .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- 2 files changed, 363 insertions(+), 111 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index 9f674d239..5cb027d0e 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -6,32 +6,40 @@ import datetime from os import listdir from random import shuffle import numpy as np +import random +from thinc.neural._classes.convolution import ExtractWindow +from thinc.neural._classes.feature_extracter import FeatureExtracter from examples.pipeline.wiki_entity_linking import run_el, training_set_creator, kb_creator -from spacy._ml import SpacyVectors, create_default_optimizer, zero_init +from spacy._ml import SpacyVectors, create_default_optimizer, zero_init, logistic -from thinc.api import chain, flatten_add_lengths, with_getitem, clone +from thinc.api import chain, concatenate, flatten_add_lengths, with_getitem, clone, with_flatten from thinc.neural.util import get_array_module from thinc.v2v import Model, Softmax, Maxout, Affine, ReLu -from thinc.t2v import Pooling, sum_pool, mean_pool +from thinc.t2v import Pooling, sum_pool, mean_pool, max_pool from thinc.t2t import ParametricAttention from thinc.misc import Residual +from thinc.misc import LayerNorm as LN from spacy.tokens import Doc """ TODO: this code needs to be implemented in pipes.pyx""" -class EL_Model(): +class EL_Model: - INPUT_DIM = 300 - OUTPUT_DIM = 96 - PRINT_LOSS = False + PRINT_LOSS = True PRINT_F = True EPS = 0.0000000005 + CUTOFF = 0.5 + + INPUT_DIM = 300 + ENTITY_WIDTH = 64 + ARTICLE_WIDTH = 64 + HIDDEN_1_WIDTH = 256 + HIDDEN_2_WIDTH = 64 - labels = ["MATCH", "NOMATCH"] name = "entity_linker" def __init__(self, kb, nlp): @@ -39,58 +47,102 @@ class EL_Model(): self.nlp = nlp self.kb = kb - self.entity_encoder = self._simple_encoder(in_width=self.INPUT_DIM, out_width=self.OUTPUT_DIM) - self.article_encoder = self._simple_encoder(in_width=self.INPUT_DIM, out_width=self.OUTPUT_DIM) + self._build_cnn(hidden_entity_width=self.ENTITY_WIDTH, hidden_article_width=self.ARTICLE_WIDTH) + + # self.entity_encoder = self._simple_encoder(in_width=self.INPUT_DIM, out_width=self.OUTPUT_DIM) + # self.article_encoder = self._simple_encoder(in_width=self.INPUT_DIM, out_width=self.OUTPUT_DIM) def train_model(self, training_dir, entity_descr_output, trainlimit=None, devlimit=None, to_print=True): + # raise errors instead of runtime warnings in case of int/float overflow + np.seterr(all='raise') + Doc.set_extension("entity_id", default=None) train_instances, train_pos, train_neg, train_doc = self._get_training_data(training_dir, entity_descr_output, False, trainlimit, - to_print) + to_print=False) dev_instances, dev_pos, dev_neg, dev_doc = self._get_training_data(training_dir, entity_descr_output, True, devlimit, - to_print) + to_print=False) + + # self.sgd_entity = self.begin_training(self.entity_encoder) + # self.sgd_article = self.begin_training(self.article_encoder) + self._begin_training() + + if self.PRINT_F: + _, _, f_avg_train = -3.42, -3.42, -3.42 # self._test_dev(train_instances, train_pos, train_neg, train_doc, avg=True) + _, _, f_nonavg_train = self._test_dev(train_instances, train_pos, train_neg, train_doc, avg=False) + _, _, f_random_train = self._test_dev(train_instances, train_pos, 
train_neg, train_doc, calc_random=True) + _, _, f_avg_dev = -3.42, -3.42, -3.42 # self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc, avg=True) + _, _, f_nonavg_dev = self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc, avg=False) + _, _, f_random_dev = self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc, calc_random=True) + + print("random F train", round(f_random_train, 1)) + print("random F dev", round(f_random_dev, 1)) + print() + print("avg/nonavg F train", round(f_avg_train, 1), round(f_nonavg_train, 1)) + print("avg/nonavg F dev", round(f_avg_dev, 1), round(f_nonavg_dev, 1)) + print() + + instance_pos_count = 0 + instance_neg_count = 0 if to_print: print("Training on", len(train_instances.values()), "articles") print("Dev test on", len(dev_instances.values()), "articles") print() - self.sgd_entity = self.begin_training(self.entity_encoder) - self.sgd_article = self.begin_training(self.article_encoder) + # for article_id, inst_cluster_set in train_instances.items(): + # article_doc = train_doc[article_id] + # print("training on", article_id, inst_cluster_set) + # pos_ex_list = list() + # neg_exs_list = list() + # for inst_cluster in inst_cluster_set: + # instance_count += 1 + # pos_ex_list.append(train_pos.get(inst_cluster)) + # neg_exs_list.append(train_neg.get(inst_cluster, [])) - self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc) - - losses = {} - - instance_count = 0 + #self.update(article_doc, pos_ex_list, neg_exs_list) + article_docs = list() + entities = list() + golds = list() for article_id, inst_cluster_set in train_instances.items(): - # print("article", article_id) - article_doc = train_doc[article_id] - pos_ex_list = list() - neg_exs_list = list() for inst_cluster in inst_cluster_set: - # print("inst_cluster", inst_cluster) - instance_count += 1 - pos_ex_list.append(train_pos.get(inst_cluster)) - neg_exs_list.append(train_neg.get(inst_cluster, [])) + article_docs.append(train_doc[article_id]) + entities.append(train_pos.get(inst_cluster)) + golds.append(float(1.0)) + instance_pos_count += 1 + for neg_entity in train_neg.get(inst_cluster, []): + article_docs.append(train_doc[article_id]) + entities.append(neg_entity) + golds.append(float(0.0)) + instance_neg_count += 1 - self.update(article_doc, pos_ex_list, neg_exs_list, losses=losses) - p, r, fscore = self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc) + for x in range(10): + print("Updating", x) + self.update(article_docs=article_docs, entities=entities, golds=golds) + + # eval again if self.PRINT_F: - print(round(fscore, 1)) + _, _, f_avg_train = -3.42, -3.42, -3.42 # self._test_dev(train_instances, train_pos, train_neg, train_doc, avg=True) + _, _, f_nonavg_train = self._test_dev(train_instances, train_pos, train_neg, train_doc, avg=False) + _, _, f_avg_dev = -3.42, -3.42, -3.42 # self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc, avg=True) + _, _, f_nonavg_dev = self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc, avg=False) + + print("avg/nonavg F train", round(f_avg_train, 1), round(f_nonavg_train, 1)) + print("avg/nonavg F dev", round(f_avg_dev, 1), round(f_nonavg_dev, 1)) + print() if to_print: - print("Trained on", instance_count, "instance clusters") + print("Trained on", instance_pos_count, "/", instance_neg_count, "instances pos/neg") - def _test_dev(self, dev_instances, dev_pos, dev_neg, dev_doc): + def _test_dev_depr(self, dev_instances, dev_pos, dev_neg, dev_doc, avg=False, calc_random=False): predictions = list() golds = list() @@ -113,23 +165,65 @@ class EL_Model(): 
examples.append(pos_ex) shuffle(examples) - best_entity, highest_prob = self._predict(examples, article_doc) + best_entity, highest_prob = self._predict(examples, article_doc, avg) + if calc_random: + best_entity, highest_prob = self._predict_random(examples) predictions.append(ex_to_id[best_entity]) golds.append(ex_to_id[pos_ex]) # TODO: use lowest_mse and combine with prior probability - p, r, F = run_el.evaluate(predictions, golds, to_print=False) - return p, r, F + p, r, f = run_el.evaluate(predictions, golds, to_print=False) + return p, r, f - def _predict(self, entities, article_doc): - doc_encoding = self.article_encoder([article_doc]) + def _test_dev(self, dev_instances, dev_pos, dev_neg, dev_doc, avg=False, calc_random=False): + predictions = list() + golds = list() + + for article_id, inst_cluster_set in dev_instances.items(): + for inst_cluster in inst_cluster_set: + pos_ex = dev_pos.get(inst_cluster) + neg_exs = dev_neg.get(inst_cluster, []) + + article = inst_cluster.split(sep="_")[0] + entity_id = inst_cluster.split(sep="_")[1] + article_doc = dev_doc[article] + + if calc_random: + prediction = self._predict_random(entity=pos_ex) + else: + prediction = self._predict(article_doc=article_doc, entity=pos_ex, avg=avg) + predictions.append(prediction) + golds.append(float(1.0)) + + for neg_ex in neg_exs: + if calc_random: + prediction = self._predict_random(entity=neg_ex) + else: + prediction = self._predict(article_doc=article_doc, entity=neg_ex, avg=avg) + predictions.append(prediction) + golds.append(float(0.0)) + + # TODO: use lowest_mse and combine with prior probability + p, r, f = run_el.evaluate(predictions, golds, to_print=False) + return p, r, f + + def _predict_depr(self, entities, article_doc, avg=False): + if avg: + with self.article_encoder.use_params(self.sgd_article.averages): + doc_encoding = self.article_encoder([article_doc]) + else: + doc_encoding = self.article_encoder([article_doc]) highest_prob = None best_entity = None entity_to_vector = dict() for entity in entities: - entity_to_vector[entity] = self.entity_encoder([entity]) + if avg: + with self.entity_encoder.use_params(self.sgd_entity.averages): + entity_to_vector[entity] = self.entity_encoder([entity]) + else: + entity_to_vector[entity] = self.entity_encoder([entity]) for entity in entities: entity_encoding = entity_to_vector[entity] @@ -140,7 +234,97 @@ class EL_Model(): return best_entity, highest_prob - def _simple_encoder(self, in_width, out_width): + def _predict(self, article_doc, entity, avg=False, apply_threshold=True): + if avg: + with self.sgd.use_params(self.model.averages): + doc_encoding = self.article_encoder([article_doc]) + entity_encoding = self.entity_encoder([entity]) + return self.model(np.append(entity_encoding, doc_encoding)) # TODO list + + doc_encoding = self.article_encoder([article_doc])[0] + entity_encoding = self.entity_encoder([entity])[0] + concat_encoding = list(entity_encoding) + list(doc_encoding) + np_array = np.asarray([concat_encoding]) + prediction = self.model(np_array) + if not apply_threshold: + return float(prediction) + if prediction > self.CUTOFF: + return float(1.0) + return float(0.0) + + def _predict_random_depr(self, entities): + highest_prob = 1 + best_entity = random.choice(entities) + return best_entity, highest_prob + + def _predict_random(self, entity, apply_threshold=True): + r = random.uniform(0, 1) + if not apply_threshold: + return r + if r > self.CUTOFF: + return float(1.0) + return float(0.0) + + def _build_cnn(self, hidden_entity_width, 
hidden_article_width): + with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): + self.entity_encoder = self._encoder(in_width=self.INPUT_DIM, hidden_width=hidden_entity_width) # entity encoding + self.article_encoder = self._encoder(in_width=self.INPUT_DIM, hidden_width=hidden_article_width) # doc encoding + + hidden_input_with = hidden_entity_width + hidden_article_width + hidden_output_with = self.HIDDEN_1_WIDTH + + convolution_2 = Residual((ExtractWindow(nW=1) >> LN(Maxout(hidden_output_with, hidden_output_with * 3)))) + + # self.entity_encoder | self.article_encoder \ + # self.model = with_flatten(LN(Maxout(hidden_with, hidden_with)) >> convolution_2 ** 2, pad=2) \ + # >> flatten_add_lengths \ + # >> ParametricAttention(hidden_with) \ + # >> Pooling(sum_pool) \ + # >> Softmax(nr_class, nr_class) + + self.model = Affine(hidden_output_with, hidden_input_with) \ + >> LN(Maxout(hidden_output_with, hidden_output_with)) \ + >> convolution_2 \ + >> Affine(self.HIDDEN_2_WIDTH, hidden_output_with) \ + >> Affine(1, self.HIDDEN_2_WIDTH) \ + >> logistic + # >> with_flatten(LN(Maxout(hidden_output_with, hidden_output_with)) >> convolution_2 ** 2, pad=2) + + # >> convolution_2 \ + + # >> flatten_add_lengths + # >> ParametricAttention(hidden_output_with) \ + # >> Pooling(max_pool) \ + # >> Softmax(nr_class, nr_class) + + # self.model.nO = nr_class + + @staticmethod + def _encoder(in_width, hidden_width): + with Model.define_operators({">>": chain}): + encoder = SpacyVectors \ + >> flatten_add_lengths \ + >> ParametricAttention(in_width)\ + >> Pooling(mean_pool) \ + >> Residual(zero_init(Maxout(in_width, in_width))) \ + >> zero_init(Affine(hidden_width, in_width, drop_factor=0.0)) + + return encoder + + def begin_training_depr(self, model): + # TODO ? link_vectors_to_models(self.vocab) depr? + sgd = create_default_optimizer(model.ops) + return sgd + + def _begin_training(self): + # self.sgd_entity = self.begin_training(self.entity_encoder) + # self.sgd_article = self.begin_training(self.article_encoder) + self.sgd = create_default_optimizer(self.model.ops) + + # TODO: deprecated ? 
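# Illustration only: an assumed-shape NumPy sketch of the binary set-up above. The
# entity encoding and the article encoding are concatenated, pushed through a model
# ending in a logistic unit, and the score is thresholded at CUTOFF to a 1.0/0.0
# match decision (the real model is the Thinc pipeline built in _build_cnn).
import numpy as np

CUTOFF = 0.5

def predict_match(entity_vec, doc_vec, weights, bias=0.0):
    concat = np.concatenate([entity_vec, doc_vec])
    score = 1.0 / (1.0 + np.exp(-(concat @ weights + bias)))  # logistic output
    return float(score > CUTOFF)

entity_vec = np.random.rand(64).astype("float32")   # ENTITY_WIDTH
doc_vec = np.random.rand(64).astype("float32")      # ARTICLE_WIDTH in this commit
weights = (np.random.rand(128).astype("float32") - 0.5) * 0.1
print(predict_match(entity_vec, doc_vec, weights))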
+ def _simple_encoder_depr(self, in_width, out_width): + hidden_with = 128 + conv_depth = 1 cnn_maxout_pieces = 3 with Model.define_operators({">>": chain, "**": clone}): @@ -150,21 +334,56 @@ class EL_Model(): # >> Pooling(mean_pool) \ # >> Residual(zero_init(Maxout(in_width, in_width))) \ # >> zero_init(Affine(out_width, in_width, drop_factor=0.0)) - encoder = SpacyVectors \ - >> flatten_add_lengths \ - >> with_getitem(0, Affine(in_width, in_width)) \ - >> ParametricAttention(in_width) \ - >> Pooling(sum_pool) \ - >> Residual(ReLu(in_width, in_width)) ** conv_depth \ - >> zero_init(Affine(out_width, in_width, drop_factor=0.0)) + # encoder = SpacyVectors \ + # >> flatten_add_lengths \ + # >> with_getitem(0, Affine(in_width, in_width)) \ + # >> ParametricAttention(in_width) \ + # >> Pooling(sum_pool) \ + # >> Residual(ReLu(in_width, in_width)) ** conv_depth \ + # >> zero_init(Affine(out_width, in_width, drop_factor=0.0)) + # encoder = SpacyVectors \ + # >> flatten_add_lengths \ + # >> ParametricAttention(in_width)\ + # >> Pooling(sum_pool) \ + # >> Residual(zero_init(Maxout(in_width, in_width))) \ + # >> zero_init(Affine(out_width, in_width, drop_factor=0.0)) # >> zero_init(Affine(nr_class, width, drop_factor=0.0)) # >> logistic - # convolution = Residual( - # ExtractWindow(nW=1) - # >> LN(Maxout(width, width * 3, pieces=cnn_maxout_pieces)) - # ) + #convolution = Residual(ExtractWindow(nW=1) + # >> LN(Maxout(in_width, in_width * 3, pieces=cnn_maxout_pieces)) + #) + #encoder = SpacyVectors >> with_flatten( + # embed >> convolution ** conv_depth, pad=conv_depth + #) + + # static_vectors = SpacyVectors >> with_flatten( + # Affine(in_width, in_width) + #) + + convolution_2 = Residual((ExtractWindow(nW=1) >> LN(Maxout(hidden_with, hidden_with * 3)))) + + encoder = SpacyVectors >> with_flatten(LN(Maxout(hidden_with, in_width)) >> convolution_2 ** 2, pad = 2) \ + >> flatten_add_lengths \ + >> ParametricAttention(hidden_with) \ + >> Pooling(sum_pool) \ + >> Residual(zero_init(Maxout(hidden_with, hidden_with))) \ + >> zero_init(Affine(out_width, hidden_with, drop_factor=0.0)) \ + >> logistic + + # convolution = Residual(ExtractWindow(nW=1) >> ReLu(in_width, in_width*3)) + + # encoder = static_vectors # >> with_flatten( + # ReLu(in_width, in_width) + # >> convolution ** conv_depth, pad=conv_depth) \ + # >> Affine(out_width, in_width, drop_factor=0.0) + + # encoder = SpacyVectors >> with_flatten( + # LN(Maxout(in_width, in_width)) + # >> Residual((ExtractWindow(nW=1) >> LN(Maxout(in_width, in_width * 3, pieces=cnn_maxout_pieces)))) ** conv_depth, + # pad=conv_depth, + #) >> zero_init(Affine(out_width, in_width, drop_factor=0.0)) # embed = SpacyVectors >> LN(Maxout(width, width, pieces=3)) @@ -173,75 +392,91 @@ class EL_Model(): return encoder - def begin_training(self, model): - # TODO ? link_vectors_to_models(self.vocab) - sgd = create_default_optimizer(model.ops) - return sgd - - def update(self, article_doc, true_entity_list, false_entities_list, drop=0., losses=None): + def update_depr(self, article_doc, true_entity_list, false_entities_list, drop=0., losses=None): doc_encoding, article_bp = self.article_encoder.begin_update([article_doc], drop=drop) doc_encoding = doc_encoding[0] + # print() # print("doc", doc_encoding) for i, true_entity in enumerate(true_entity_list): try: - false_vectors = list() false_entities = false_entities_list[i] if len(false_entities) > 0: # TODO: batch per doc - for false_entity in false_entities: - # TODO: one call only to begin_update ? 
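# Illustration only: a NumPy sketch, with assumed dimensions, of what the encoders
# above boil down to. Token vectors are scored against a learned query (the
# ParametricAttention step), the scores are softmax-normalised, and the weighted sum
# gives a fixed-width encoding that the Affine layers then project down.
import numpy as np

def attention_pool(token_vectors, query):
    scores = token_vectors @ query                # one score per token
    scores = np.exp(scores - scores.max())
    alphas = scores / scores.sum()                # attention weights over tokens
    return alphas @ token_vectors                 # fixed-width pooled encoding

tokens = np.random.rand(15, 300).astype("float32")   # 15 tokens, INPUT_DIM = 300
query = np.random.rand(300).astype("float32")
pooled = attention_pool(tokens, query)
assert pooled.shape == (300,)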
- false_entity_encoding, false_entity_bp = self.entity_encoder.begin_update([false_entity], drop=drop) - false_entity_encoding = false_entity_encoding[0] - false_vectors.append(false_entity_encoding) + all_entities = [true_entity] + all_entities.extend(false_entities) - true_entity_encoding, true_entity_bp = self.entity_encoder.begin_update([true_entity], drop=drop) - true_entity_encoding = true_entity_encoding[0] - # true_gradient = self._calculate_true_gradient(doc_encoding, true_entity_encoding) + entity_encodings, entity_bp = self.entity_encoder.begin_update(all_entities, drop=drop) + true_entity_encoding = entity_encodings[0] + false_entity_encodings = entity_encodings[1:] all_vectors = [true_entity_encoding] - all_vectors.extend(false_vectors) + all_vectors.extend(false_entity_encodings) # consensus_encoding = self._calculate_consensus(doc_encoding, true_entity_encoding) true_prob = self._calculate_probability(doc_encoding, true_entity_encoding, all_vectors) # print("true", true_prob, true_entity_encoding) - # print("true gradient", true_gradient) - # print() all_probs = [true_prob] - for false_vector in false_vectors: + for false_vector in false_entity_encodings: false_prob = self._calculate_probability(doc_encoding, false_vector, all_vectors) # print("false", false_prob, false_vector) - # print("false gradient", false_gradient) - # print() all_probs.append(false_prob) loss = self._calculate_loss(true_prob, all_probs).astype(np.float32) if self.PRINT_LOSS: - print(round(loss, 5)) + print("loss train", round(loss, 5)) - #doc_gradient = self._calculate_doc_gradient(loss, doc_encoding, true_entity_encoding, false_vectors) - entity_gradient = self._calculate_entity_gradient(doc_encoding, true_entity_encoding, false_vectors) - # print("entity_gradient", entity_gradient) + # for false_vector in false_vectors: + # false_gradient = -1 * self._calculate_entity_gradient(loss, doc_encoding, false_vector, false_vectors) + # print("false gradient", false_gradient) + + # doc_gradient = self._calculate_doc_gradient(loss, doc_encoding, true_entity_encoding, false_entity_encodings) + true_gradient, doc_gradient = self._calculate_entity_gradient(loss, doc_encoding, true_entity_encoding, false_entity_encodings) + # print("true_gradient", true_gradient) # print("doc_gradient", doc_gradient) - # article_bp([doc_gradient.astype(np.float32)], sgd=self.sgd_article) - true_entity_bp([entity_gradient.astype(np.float32)], sgd=self.sgd_entity) + article_bp([doc_gradient.astype(np.float32)], sgd=self.sgd_article) + entity_bp([true_gradient.astype(np.float32)], sgd=self.sgd_entity) #true_entity_bp([true_gradient.astype(np.float32)], sgd=self.sgd_entity) except Exception as e: pass + def update(self, article_docs, entities, golds, drop=0.): + doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=drop) + entity_encodings, bp_encoding = self.entity_encoder.begin_update(entities, drop=drop) + concat_encodings = [list(entity_encodings[i]) + list(doc_encodings[i]) for i in range(len(entities))] - # TODO: FIX - def _calculate_consensus(self, vector1, vector2): - if len(vector1) != len(vector2): - raise ValueError("To calculate consensus, both vectors should be of equal length") + predictions, bp_model = self.model.begin_update(np.asarray(concat_encodings), drop=drop) - avg = (vector2 + vector1) / 2 - return avg + predictions = self.model.ops.flatten(predictions) + golds = self.model.ops.asarray(golds) - def _calculate_probability(self, vector1, vector2, allvectors): + # print("predictions", 
predictions) + # print("golds", golds) + + d_scores = (predictions - golds) # / predictions.shape[0] + # print("d_scores (1)", d_scores) + + loss = (d_scores ** 2).sum() + + if self.PRINT_LOSS: + print("loss train", round(loss, 5)) + + d_scores = d_scores.reshape((-1, 1)) + d_scores = d_scores.astype(np.float32) + # print("d_scores (2)", d_scores) + + model_gradient = bp_model(d_scores, sgd=self.sgd) + + doc_gradient = [x[0:self.ARTICLE_WIDTH] for x in model_gradient] + entity_gradient = [x[self.ARTICLE_WIDTH:] for x in model_gradient] + + bp_doc(doc_gradient) + bp_encoding(entity_gradient) + + def _calculate_probability_depr(self, vector1, vector2, allvectors): """ Make sure that vector2 is included in allvectors """ if len(vector1) != len(vector2): raise ValueError("To calculate similarity, both vectors should be of equal length") @@ -254,12 +489,12 @@ class EL_Model(): return float(e / (self.EPS + e_sum)) - def _calculate_loss(self, true_prob, all_probs): + def _calculate_loss_depr(self, true_prob, all_probs): """ all_probs should include true_prob ! """ return -1 * np.log((self.EPS + true_prob) / (self.EPS + sum(all_probs))) @staticmethod - def _calculate_doc_gradient(loss, doc_vector, true_vector, false_vectors): + def _calculate_doc_gradient_depr(loss, doc_vector, true_vector, false_vectors): gradient = np.zeros(len(doc_vector)) for i in range(len(doc_vector)): min_false = min(x[i] for x in false_vectors) @@ -276,21 +511,25 @@ class EL_Model(): if doc_vector[i] < 0: gradient[i] = 0 else: - target = 0 # non-distinctive vector positions should convert to 0 - gradient[i] = doc_vector[i] - target + # non-distinctive vector positions should converge to 0 + gradient[i] = doc_vector[i] return gradient - def _calculate_true_gradient(self, doc_vector, entity_vector): + # TODO: delete ? try again ? 
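# Illustration only: assumed-shape NumPy sketch of the batched update above. The
# output gradient is predictions - golds (a squared-error loss), it is pushed back
# through the joint model, and the gradient that falls out is sliced into an entity
# part and an article part so each encoder can be updated separately. The slice
# boundary simply mirrors the concatenation order of the two encodings.
import numpy as np

ENTITY_WIDTH, ARTICLE_WIDTH = 64, 64   # widths assumed for this sketch

def output_gradient_and_loss(predictions, golds):
    d_scores = predictions - golds
    loss = float((d_scores ** 2).sum())
    return d_scores.reshape((-1, 1)).astype("float32"), loss

def split_gradient(model_gradient):
    # concatenation order was [entity_encoding, article_encoding]
    return model_gradient[:, :ENTITY_WIDTH], model_gradient[:, ENTITY_WIDTH:]

preds = np.random.rand(8).astype("float32")
golds = np.array([1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0], dtype="float32")
d_scores, loss = output_gradient_and_loss(preds, golds)
entity_grad, doc_grad = split_gradient(np.random.rand(8, ENTITY_WIDTH + ARTICLE_WIDTH))
assert entity_grad.shape == (8, ENTITY_WIDTH) and doc_grad.shape == (8, ARTICLE_WIDTH)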
+ def depr__calculate_true_gradient(self, doc_vector, entity_vector): # sum_entity_vector = sum(entity_vector) # gradient = [-sum_entity_vector/(self.EPS + np.exp(doc_vector[i] * entity_vector[i])) for i in range(len(doc_vector))] gradient = [1 / (self.EPS + np.exp(doc_vector[i] * entity_vector[i])) for i in range(len(doc_vector))] return np.asarray(gradient) - def _calculate_entity_gradient(self, doc_vector, true_vector, false_vectors): - entity_gradient = list() - prob_true = list() - false_prob_list = list() + def _calculate_losses_vector_depr(self, doc_vector, true_vector, false_vectors): + # prob_true = list() + # prob_false_dict = dict() + + true_losses = list() + # false_losses_dict = dict() + for i in range(len(true_vector)): doc_i = np.asarray([doc_vector[i]]) true_i = np.asarray([true_vector[i]]) @@ -299,32 +538,45 @@ class EL_Model(): all_i.extend(falses_i) prob_true_i = self._calculate_probability(doc_i, true_i, all_i) - prob_true.append(prob_true_i) + # prob_true.append(prob_true_i) - false_list = list() + # false_list = list() all_probs_i = [prob_true_i] - for false_vector in falses_i: - false_prob_i = self._calculate_probability(doc_i, false_vector, all_i) - all_probs_i.append(false_prob_i) - false_list.append(false_prob_i) - false_prob_list.append(false_list) + for false_i in falses_i: + prob_false_i = self._calculate_probability(doc_i, false_i, all_i) + all_probs_i.append(prob_false_i) + # false_list.append(prob_false_i) + # prob_false_dict[i] = false_list - sign_loss_i = 1 - if doc_vector[i] * true_vector[i] < 0: - sign_loss_i = -1 + true_loss_i = self._calculate_loss(prob_true_i, all_probs_i).astype(np.float32) + if doc_vector[i] > 0: + true_loss_i = -1 * true_loss_i + true_losses.append(true_loss_i) - loss_i = sign_loss_i * self._calculate_loss(prob_true_i, all_probs_i).astype(np.float32) - entity_gradient.append(loss_i) - # print("prob_true", prob_true) - # print("false_prob_list", false_prob_list) - return np.asarray(entity_gradient) + # false_loss_list = list() + # for prob_false_i in false_list: + # false_loss_i = self._calculate_loss(prob_false_i, all_probs_i).astype(np.float32) + # false_loss_list.append(false_loss_i) + # false_losses_dict[i] = false_loss_list + + return true_losses # , false_losses_dict + + def _calculate_entity_gradient_depr(self, loss, doc_vector, true_vector, false_vectors): + true_losses = self._calculate_losses_vector(doc_vector, true_vector, false_vectors) + + # renormalize the gradient so that the total sum of abs values does not exceed the actual loss + loss_i = sum([abs(x) for x in true_losses]) # sum of absolute values + entity_gradient = [(x/2) * (loss/loss_i) for x in true_losses] + doc_gradient = [(x/2) * (loss/loss_i) for x in true_losses] + + return np.asarray(entity_gradient), np.asarray(doc_gradient) @staticmethod - def _calculate_dot_exp(vector1, vector2_transposed): + def _calculate_dot_exp_depr(vector1, vector2_transposed): dot_product = vector1.dot(vector2_transposed) dot_product = min(50, dot_product) - # dot_product = max(-10000, dot_product) + dot_product = max(-10000, dot_product) # print("DOT", dot_product) e = np.exp(dot_product) # print("E", e) diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index cccc67650..a5ebc99bb 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: training ", 
datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=1500, devlimit=50) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=1, devlimit=1) print() # STEP 7: apply the EL algorithm on the dev dataset From d51bffe63b9e92b3f6c2b4cfb09d2039e6e55a5f Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 16 May 2019 18:36:15 +0200 Subject: [PATCH 042/148] clean up code --- .../pipeline/wiki_entity_linking/train_el.py | 346 +----------------- 1 file changed, 4 insertions(+), 342 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index 5cb027d0e..369b0762c 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -4,11 +4,9 @@ from __future__ import unicode_literals import os import datetime from os import listdir -from random import shuffle import numpy as np import random from thinc.neural._classes.convolution import ExtractWindow -from thinc.neural._classes.feature_extracter import FeatureExtracter from examples.pipeline.wiki_entity_linking import run_el, training_set_creator, kb_creator @@ -49,9 +47,6 @@ class EL_Model: self._build_cnn(hidden_entity_width=self.ENTITY_WIDTH, hidden_article_width=self.ARTICLE_WIDTH) - # self.entity_encoder = self._simple_encoder(in_width=self.INPUT_DIM, out_width=self.OUTPUT_DIM) - # self.article_encoder = self._simple_encoder(in_width=self.INPUT_DIM, out_width=self.OUTPUT_DIM) - def train_model(self, training_dir, entity_descr_output, trainlimit=None, devlimit=None, to_print=True): # raise errors instead of runtime warnings in case of int/float overflow np.seterr(all='raise') @@ -69,16 +64,13 @@ class EL_Model: True, devlimit, to_print=False) - - # self.sgd_entity = self.begin_training(self.entity_encoder) - # self.sgd_article = self.begin_training(self.article_encoder) self._begin_training() if self.PRINT_F: - _, _, f_avg_train = -3.42, -3.42, -3.42 # self._test_dev(train_instances, train_pos, train_neg, train_doc, avg=True) + _, _, f_avg_train = -3.42, -3.42, -3.42 # self._test_dev(train_instances, train_pos, train_neg, train_doc, avg=True) _, _, f_nonavg_train = self._test_dev(train_instances, train_pos, train_neg, train_doc, avg=False) _, _, f_random_train = self._test_dev(train_instances, train_pos, train_neg, train_doc, calc_random=True) - _, _, f_avg_dev = -3.42, -3.42, -3.42 # self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc, avg=True) + _, _, f_avg_dev = -3.42, -3.42, -3.42 # self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc, avg=True) _, _, f_nonavg_dev = self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc, avg=False) _, _, f_random_dev = self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc, calc_random=True) @@ -97,18 +89,6 @@ class EL_Model: print("Dev test on", len(dev_instances.values()), "articles") print() - # for article_id, inst_cluster_set in train_instances.items(): - # article_doc = train_doc[article_id] - # print("training on", article_id, inst_cluster_set) - # pos_ex_list = list() - # neg_exs_list = list() - # for inst_cluster in inst_cluster_set: - # instance_count += 1 - # pos_ex_list.append(train_pos.get(inst_cluster)) - # neg_exs_list.append(train_neg.get(inst_cluster, [])) - - #self.update(article_doc, pos_ex_list, neg_exs_list) - article_docs = list() entities = list() golds = list() @@ -130,9 
+110,9 @@ class EL_Model: # eval again if self.PRINT_F: - _, _, f_avg_train = -3.42, -3.42, -3.42 # self._test_dev(train_instances, train_pos, train_neg, train_doc, avg=True) + _, _, f_avg_train = -3.42, -3.42, -3.42 # self._test_dev(train_instances, train_pos, train_neg, train_doc, avg=True) _, _, f_nonavg_train = self._test_dev(train_instances, train_pos, train_neg, train_doc, avg=False) - _, _, f_avg_dev = -3.42, -3.42, -3.42 # self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc, avg=True) + _, _, f_avg_dev = -3.42, -3.42, -3.42 # self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc, avg=True) _, _, f_nonavg_dev = self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc, avg=False) print("avg/nonavg F train", round(f_avg_train, 1), round(f_nonavg_train, 1)) @@ -142,39 +122,6 @@ class EL_Model: if to_print: print("Trained on", instance_pos_count, "/", instance_neg_count, "instances pos/neg") - def _test_dev_depr(self, dev_instances, dev_pos, dev_neg, dev_doc, avg=False, calc_random=False): - predictions = list() - golds = list() - - for article_id, inst_cluster_set in dev_instances.items(): - for inst_cluster in inst_cluster_set: - pos_ex = dev_pos.get(inst_cluster) - neg_exs = dev_neg.get(inst_cluster, []) - ex_to_id = dict() - - if pos_ex and neg_exs: - ex_to_id[pos_ex] = pos_ex._.entity_id - for neg_ex in neg_exs: - ex_to_id[neg_ex] = neg_ex._.entity_id - - article = inst_cluster.split(sep="_")[0] - entity_id = inst_cluster.split(sep="_")[1] - article_doc = dev_doc[article] - - examples = list(neg_exs) - examples.append(pos_ex) - shuffle(examples) - - best_entity, highest_prob = self._predict(examples, article_doc, avg) - if calc_random: - best_entity, highest_prob = self._predict_random(examples) - predictions.append(ex_to_id[best_entity]) - golds.append(ex_to_id[pos_ex]) - - # TODO: use lowest_mse and combine with prior probability - p, r, f = run_el.evaluate(predictions, golds, to_print=False) - return p, r, f - def _test_dev(self, dev_instances, dev_pos, dev_neg, dev_doc, avg=False, calc_random=False): predictions = list() golds = list() @@ -207,33 +154,6 @@ class EL_Model: p, r, f = run_el.evaluate(predictions, golds, to_print=False) return p, r, f - def _predict_depr(self, entities, article_doc, avg=False): - if avg: - with self.article_encoder.use_params(self.sgd_article.averages): - doc_encoding = self.article_encoder([article_doc]) - else: - doc_encoding = self.article_encoder([article_doc]) - - highest_prob = None - best_entity = None - - entity_to_vector = dict() - for entity in entities: - if avg: - with self.entity_encoder.use_params(self.sgd_entity.averages): - entity_to_vector[entity] = self.entity_encoder([entity]) - else: - entity_to_vector[entity] = self.entity_encoder([entity]) - - for entity in entities: - entity_encoding = entity_to_vector[entity] - prob = self._calculate_probability(doc_encoding, entity_encoding, entity_to_vector.values()) - if not best_entity or prob > highest_prob: - highest_prob = prob - best_entity = entity - - return best_entity, highest_prob - def _predict(self, article_doc, entity, avg=False, apply_threshold=True): if avg: with self.sgd.use_params(self.model.averages): @@ -252,11 +172,6 @@ class EL_Model: return float(1.0) return float(0.0) - def _predict_random_depr(self, entities): - highest_prob = 1 - best_entity = random.choice(entities) - return best_entity, highest_prob - def _predict_random(self, entity, apply_threshold=True): r = random.uniform(0, 1) if not apply_threshold: @@ -275,29 +190,12 @@ class EL_Model: convolution_2 
= Residual((ExtractWindow(nW=1) >> LN(Maxout(hidden_output_with, hidden_output_with * 3)))) - # self.entity_encoder | self.article_encoder \ - # self.model = with_flatten(LN(Maxout(hidden_with, hidden_with)) >> convolution_2 ** 2, pad=2) \ - # >> flatten_add_lengths \ - # >> ParametricAttention(hidden_with) \ - # >> Pooling(sum_pool) \ - # >> Softmax(nr_class, nr_class) - self.model = Affine(hidden_output_with, hidden_input_with) \ >> LN(Maxout(hidden_output_with, hidden_output_with)) \ >> convolution_2 \ >> Affine(self.HIDDEN_2_WIDTH, hidden_output_with) \ >> Affine(1, self.HIDDEN_2_WIDTH) \ >> logistic - # >> with_flatten(LN(Maxout(hidden_output_with, hidden_output_with)) >> convolution_2 ** 2, pad=2) - - # >> convolution_2 \ - - # >> flatten_add_lengths - # >> ParametricAttention(hidden_output_with) \ - # >> Pooling(max_pool) \ - # >> Softmax(nr_class, nr_class) - - # self.model.nO = nr_class @staticmethod def _encoder(in_width, hidden_width): @@ -311,138 +209,9 @@ class EL_Model: return encoder - def begin_training_depr(self, model): - # TODO ? link_vectors_to_models(self.vocab) depr? - sgd = create_default_optimizer(model.ops) - return sgd - def _begin_training(self): - # self.sgd_entity = self.begin_training(self.entity_encoder) - # self.sgd_article = self.begin_training(self.article_encoder) self.sgd = create_default_optimizer(self.model.ops) - # TODO: deprecated ? - def _simple_encoder_depr(self, in_width, out_width): - hidden_with = 128 - - conv_depth = 1 - cnn_maxout_pieces = 3 - with Model.define_operators({">>": chain, "**": clone}): - # encoder = SpacyVectors \ - # >> flatten_add_lengths \ - # >> ParametricAttention(in_width)\ - # >> Pooling(mean_pool) \ - # >> Residual(zero_init(Maxout(in_width, in_width))) \ - # >> zero_init(Affine(out_width, in_width, drop_factor=0.0)) - # encoder = SpacyVectors \ - # >> flatten_add_lengths \ - # >> with_getitem(0, Affine(in_width, in_width)) \ - # >> ParametricAttention(in_width) \ - # >> Pooling(sum_pool) \ - # >> Residual(ReLu(in_width, in_width)) ** conv_depth \ - # >> zero_init(Affine(out_width, in_width, drop_factor=0.0)) - # encoder = SpacyVectors \ - # >> flatten_add_lengths \ - # >> ParametricAttention(in_width)\ - # >> Pooling(sum_pool) \ - # >> Residual(zero_init(Maxout(in_width, in_width))) \ - # >> zero_init(Affine(out_width, in_width, drop_factor=0.0)) - - # >> zero_init(Affine(nr_class, width, drop_factor=0.0)) - # >> logistic - - #convolution = Residual(ExtractWindow(nW=1) - # >> LN(Maxout(in_width, in_width * 3, pieces=cnn_maxout_pieces)) - #) - #encoder = SpacyVectors >> with_flatten( - # embed >> convolution ** conv_depth, pad=conv_depth - #) - - # static_vectors = SpacyVectors >> with_flatten( - # Affine(in_width, in_width) - #) - - convolution_2 = Residual((ExtractWindow(nW=1) >> LN(Maxout(hidden_with, hidden_with * 3)))) - - encoder = SpacyVectors >> with_flatten(LN(Maxout(hidden_with, in_width)) >> convolution_2 ** 2, pad = 2) \ - >> flatten_add_lengths \ - >> ParametricAttention(hidden_with) \ - >> Pooling(sum_pool) \ - >> Residual(zero_init(Maxout(hidden_with, hidden_with))) \ - >> zero_init(Affine(out_width, hidden_with, drop_factor=0.0)) \ - >> logistic - - # convolution = Residual(ExtractWindow(nW=1) >> ReLu(in_width, in_width*3)) - - # encoder = static_vectors # >> with_flatten( - # ReLu(in_width, in_width) - # >> convolution ** conv_depth, pad=conv_depth) \ - # >> Affine(out_width, in_width, drop_factor=0.0) - - # encoder = SpacyVectors >> with_flatten( - # LN(Maxout(in_width, in_width)) - # >> 
Residual((ExtractWindow(nW=1) >> LN(Maxout(in_width, in_width * 3, pieces=cnn_maxout_pieces)))) ** conv_depth, - # pad=conv_depth, - #) >> zero_init(Affine(out_width, in_width, drop_factor=0.0)) - - # embed = SpacyVectors >> LN(Maxout(width, width, pieces=3)) - - # encoder = SpacyVectors >> flatten_add_lengths >> convolution ** conv_depth - # encoder = with_flatten(embed >> convolution ** conv_depth, pad=conv_depth) - - return encoder - - def update_depr(self, article_doc, true_entity_list, false_entities_list, drop=0., losses=None): - doc_encoding, article_bp = self.article_encoder.begin_update([article_doc], drop=drop) - doc_encoding = doc_encoding[0] - # print() - # print("doc", doc_encoding) - - for i, true_entity in enumerate(true_entity_list): - try: - false_entities = false_entities_list[i] - if len(false_entities) > 0: - # TODO: batch per doc - - all_entities = [true_entity] - all_entities.extend(false_entities) - - entity_encodings, entity_bp = self.entity_encoder.begin_update(all_entities, drop=drop) - true_entity_encoding = entity_encodings[0] - false_entity_encodings = entity_encodings[1:] - - all_vectors = [true_entity_encoding] - all_vectors.extend(false_entity_encodings) - - # consensus_encoding = self._calculate_consensus(doc_encoding, true_entity_encoding) - - true_prob = self._calculate_probability(doc_encoding, true_entity_encoding, all_vectors) - # print("true", true_prob, true_entity_encoding) - - all_probs = [true_prob] - for false_vector in false_entity_encodings: - false_prob = self._calculate_probability(doc_encoding, false_vector, all_vectors) - # print("false", false_prob, false_vector) - all_probs.append(false_prob) - - loss = self._calculate_loss(true_prob, all_probs).astype(np.float32) - if self.PRINT_LOSS: - print("loss train", round(loss, 5)) - - # for false_vector in false_vectors: - # false_gradient = -1 * self._calculate_entity_gradient(loss, doc_encoding, false_vector, false_vectors) - # print("false gradient", false_gradient) - - # doc_gradient = self._calculate_doc_gradient(loss, doc_encoding, true_entity_encoding, false_entity_encodings) - true_gradient, doc_gradient = self._calculate_entity_gradient(loss, doc_encoding, true_entity_encoding, false_entity_encodings) - # print("true_gradient", true_gradient) - # print("doc_gradient", doc_gradient) - article_bp([doc_gradient.astype(np.float32)], sgd=self.sgd_article) - entity_bp([true_gradient.astype(np.float32)], sgd=self.sgd_entity) - #true_entity_bp([true_gradient.astype(np.float32)], sgd=self.sgd_entity) - except Exception as e: - pass - def update(self, article_docs, entities, golds, drop=0.): doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=drop) entity_encodings, bp_encoding = self.entity_encoder.begin_update(entities, drop=drop) @@ -476,112 +245,6 @@ class EL_Model: bp_doc(doc_gradient) bp_encoding(entity_gradient) - def _calculate_probability_depr(self, vector1, vector2, allvectors): - """ Make sure that vector2 is included in allvectors """ - if len(vector1) != len(vector2): - raise ValueError("To calculate similarity, both vectors should be of equal length") - - vector1_t = vector1.transpose() - e = self._calculate_dot_exp(vector2, vector1_t) - e_sum = 0 - for v in allvectors: - e_sum += self._calculate_dot_exp(v, vector1_t) - - return float(e / (self.EPS + e_sum)) - - def _calculate_loss_depr(self, true_prob, all_probs): - """ all_probs should include true_prob ! 
""" - return -1 * np.log((self.EPS + true_prob) / (self.EPS + sum(all_probs))) - - @staticmethod - def _calculate_doc_gradient_depr(loss, doc_vector, true_vector, false_vectors): - gradient = np.zeros(len(doc_vector)) - for i in range(len(doc_vector)): - min_false = min(x[i] for x in false_vectors) - max_false = max(x[i] for x in false_vectors) - - if true_vector[i] > max_false: - if doc_vector[i] > 0: - gradient[i] = 0 - else: - gradient[i] = -loss - elif true_vector[i] < min_false: - if doc_vector[i] > 0: - gradient[i] = loss - if doc_vector[i] < 0: - gradient[i] = 0 - else: - # non-distinctive vector positions should converge to 0 - gradient[i] = doc_vector[i] - - return gradient - - # TODO: delete ? try again ? - def depr__calculate_true_gradient(self, doc_vector, entity_vector): - # sum_entity_vector = sum(entity_vector) - # gradient = [-sum_entity_vector/(self.EPS + np.exp(doc_vector[i] * entity_vector[i])) for i in range(len(doc_vector))] - gradient = [1 / (self.EPS + np.exp(doc_vector[i] * entity_vector[i])) for i in range(len(doc_vector))] - return np.asarray(gradient) - - def _calculate_losses_vector_depr(self, doc_vector, true_vector, false_vectors): - # prob_true = list() - # prob_false_dict = dict() - - true_losses = list() - # false_losses_dict = dict() - - for i in range(len(true_vector)): - doc_i = np.asarray([doc_vector[i]]) - true_i = np.asarray([true_vector[i]]) - falses_i = np.asarray([[fv[i]] for fv in false_vectors]) - all_i = [true_i] - all_i.extend(falses_i) - - prob_true_i = self._calculate_probability(doc_i, true_i, all_i) - # prob_true.append(prob_true_i) - - # false_list = list() - all_probs_i = [prob_true_i] - for false_i in falses_i: - prob_false_i = self._calculate_probability(doc_i, false_i, all_i) - all_probs_i.append(prob_false_i) - # false_list.append(prob_false_i) - # prob_false_dict[i] = false_list - - true_loss_i = self._calculate_loss(prob_true_i, all_probs_i).astype(np.float32) - if doc_vector[i] > 0: - true_loss_i = -1 * true_loss_i - true_losses.append(true_loss_i) - - # false_loss_list = list() - # for prob_false_i in false_list: - # false_loss_i = self._calculate_loss(prob_false_i, all_probs_i).astype(np.float32) - # false_loss_list.append(false_loss_i) - # false_losses_dict[i] = false_loss_list - - return true_losses # , false_losses_dict - - def _calculate_entity_gradient_depr(self, loss, doc_vector, true_vector, false_vectors): - true_losses = self._calculate_losses_vector(doc_vector, true_vector, false_vectors) - - # renormalize the gradient so that the total sum of abs values does not exceed the actual loss - loss_i = sum([abs(x) for x in true_losses]) # sum of absolute values - entity_gradient = [(x/2) * (loss/loss_i) for x in true_losses] - doc_gradient = [(x/2) * (loss/loss_i) for x in true_losses] - - return np.asarray(entity_gradient), np.asarray(doc_gradient) - - - @staticmethod - def _calculate_dot_exp_depr(vector1, vector2_transposed): - dot_product = vector1.dot(vector2_transposed) - dot_product = min(50, dot_product) - dot_product = max(-10000, dot_product) - # print("DOT", dot_product) - e = np.exp(dot_product) - # print("E", e) - return e - def _get_training_data(self, training_dir, entity_descr_output, dev, limit, to_print): id_to_descr = kb_creator._get_id_to_description(entity_descr_output) @@ -589,7 +252,6 @@ class EL_Model: collect_correct=True, collect_incorrect=True) - instance_by_doc = dict() local_vectors = list() # TODO: local vectors doc_by_article = dict() From 400b19353de9768805b6a4bcc7bcd72ba57bd001 Mon Sep 17 
00:00:00 2001 From: svlandeg Date: Fri, 17 May 2019 01:51:18 +0200 Subject: [PATCH 043/148] simplify architecture and larger-scale test runs --- .../pipeline/wiki_entity_linking/run_el.py | 3 +- .../pipeline/wiki_entity_linking/train_el.py | 165 +++++++++--------- .../wiki_entity_linking/wiki_nel_pipeline.py | 4 +- 3 files changed, 88 insertions(+), 84 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/run_el.py b/examples/pipeline/wiki_entity_linking/run_el.py index 66ab0385e..6ab7ea75f 100644 --- a/examples/pipeline/wiki_entity_linking/run_el.py +++ b/examples/pipeline/wiki_entity_linking/run_el.py @@ -81,7 +81,8 @@ def evaluate(predictions, golds, to_print=True): for pred, gold in zip(predictions, golds): is_correct = pred == gold if not pred: - fn += 1 + if not is_correct: # we don't care about tn + fn += 1 elif is_correct: tp += 1 else: diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index 369b0762c..21bc03282 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -12,10 +12,9 @@ from examples.pipeline.wiki_entity_linking import run_el, training_set_creator, from spacy._ml import SpacyVectors, create_default_optimizer, zero_init, logistic -from thinc.api import chain, concatenate, flatten_add_lengths, with_getitem, clone, with_flatten -from thinc.neural.util import get_array_module -from thinc.v2v import Model, Softmax, Maxout, Affine, ReLu -from thinc.t2v import Pooling, sum_pool, mean_pool, max_pool +from thinc.api import chain, concatenate, flatten_add_lengths, clone +from thinc.v2v import Model, Maxout, Affine +from thinc.t2v import Pooling, mean_pool from thinc.t2t import ParametricAttention from thinc.misc import Residual from thinc.misc import LayerNorm as LN @@ -27,16 +26,15 @@ from spacy.tokens import Doc class EL_Model: - PRINT_LOSS = True + PRINT_LOSS = False PRINT_F = True EPS = 0.0000000005 CUTOFF = 0.5 INPUT_DIM = 300 ENTITY_WIDTH = 64 - ARTICLE_WIDTH = 64 - HIDDEN_1_WIDTH = 256 - HIDDEN_2_WIDTH = 64 + ARTICLE_WIDTH = 128 + HIDDEN_WIDTH = 64 name = "entity_linker" @@ -53,46 +51,44 @@ class EL_Model: Doc.set_extension("entity_id", default=None) - train_instances, train_pos, train_neg, train_doc = self._get_training_data(training_dir, - entity_descr_output, - False, - trainlimit, - to_print=False) + train_inst, train_pos, train_neg, train_doc = self._get_training_data(training_dir, + entity_descr_output, + False, + trainlimit, + to_print=False) - dev_instances, dev_pos, dev_neg, dev_doc = self._get_training_data(training_dir, - entity_descr_output, - True, - devlimit, - to_print=False) + dev_inst, dev_pos, dev_neg, dev_doc = self._get_training_data(training_dir, + entity_descr_output, + True, + devlimit, + to_print=False) self._begin_training() - if self.PRINT_F: - _, _, f_avg_train = -3.42, -3.42, -3.42 # self._test_dev(train_instances, train_pos, train_neg, train_doc, avg=True) - _, _, f_nonavg_train = self._test_dev(train_instances, train_pos, train_neg, train_doc, avg=False) - _, _, f_random_train = self._test_dev(train_instances, train_pos, train_neg, train_doc, calc_random=True) - _, _, f_avg_dev = -3.42, -3.42, -3.42 # self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc, avg=True) - _, _, f_nonavg_dev = self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc, avg=False) - _, _, f_random_dev = self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc, calc_random=True) - - print("random F train", round(f_random_train, 
1)) - print("random F dev", round(f_random_dev, 1)) - print() - print("avg/nonavg F train", round(f_avg_train, 1), round(f_nonavg_train, 1)) - print("avg/nonavg F dev", round(f_avg_dev, 1), round(f_nonavg_dev, 1)) - print() + print() + self._test_dev(train_inst, train_pos, train_neg, train_doc, print_string="train_random", calc_random=True) + self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_random", calc_random=True) + print() + self._test_dev(train_inst, train_pos, train_neg, train_doc, print_string="train_pre", calc_random=False) + self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_pre", avg=False) instance_pos_count = 0 instance_neg_count = 0 if to_print: - print("Training on", len(train_instances.values()), "articles") - print("Dev test on", len(dev_instances.values()), "articles") print() + print("Training on", len(train_inst.values()), "articles") + print("Dev test on", len(dev_inst.values()), "articles") - article_docs = list() - entities = list() - golds = list() - for article_id, inst_cluster_set in train_instances.items(): + # TODO: proper batches. Currently 1 article at the time + article_count = 0 + for article_id, inst_cluster_set in train_inst.items(): + # if to_print: + # print() + # print(article_count, "Training on article", article_id) + article_count += 1 + article_docs = list() + entities = list() + golds = list() for inst_cluster in inst_cluster_set: article_docs.append(train_doc[article_id]) entities.append(train_pos.get(inst_cluster)) @@ -104,36 +100,31 @@ class EL_Model: golds.append(float(0.0)) instance_neg_count += 1 - for x in range(10): - print("Updating", x) self.update(article_docs=article_docs, entities=entities, golds=golds) - # eval again - if self.PRINT_F: - _, _, f_avg_train = -3.42, -3.42, -3.42 # self._test_dev(train_instances, train_pos, train_neg, train_doc, avg=True) - _, _, f_nonavg_train = self._test_dev(train_instances, train_pos, train_neg, train_doc, avg=False) - _, _, f_avg_dev = -3.42, -3.42, -3.42 # self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc, avg=True) - _, _, f_nonavg_dev = self._test_dev(dev_instances, dev_pos, dev_neg, dev_doc, avg=False) - - print("avg/nonavg F train", round(f_avg_train, 1), round(f_nonavg_train, 1)) - print("avg/nonavg F dev", round(f_avg_dev, 1), round(f_nonavg_dev, 1)) - print() + # dev eval + self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_inter", avg=False) if to_print: + print() print("Trained on", instance_pos_count, "/", instance_neg_count, "instances pos/neg") - def _test_dev(self, dev_instances, dev_pos, dev_neg, dev_doc, avg=False, calc_random=False): + print() + self._test_dev(train_inst, train_pos, train_neg, train_doc, print_string="train_post", calc_random=False) + self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_post", avg=False) + + def _test_dev(self, instances, pos, neg, doc, print_string, avg=False, calc_random=False): predictions = list() golds = list() - for article_id, inst_cluster_set in dev_instances.items(): + for article_id, inst_cluster_set in instances.items(): for inst_cluster in inst_cluster_set: - pos_ex = dev_pos.get(inst_cluster) - neg_exs = dev_neg.get(inst_cluster, []) + pos_ex = pos.get(inst_cluster) + neg_exs = neg.get(inst_cluster, []) article = inst_cluster.split(sep="_")[0] entity_id = inst_cluster.split(sep="_")[1] - article_doc = dev_doc[article] + article_doc = doc[article] if calc_random: prediction = self._predict_random(entity=pos_ex) @@ -150,9 +141,17 @@ class EL_Model: 
predictions.append(prediction) golds.append(float(0.0)) - # TODO: use lowest_mse and combine with prior probability + # TODO: combine with prior probability p, r, f = run_el.evaluate(predictions, golds, to_print=False) - return p, r, f + if self.PRINT_F: + # print("p/r/F", print_string, round(p, 1), round(r, 1), round(f, 1)) + print("F", print_string, round(f, 1)) + + loss, d_scores = self.get_loss(self.model.ops.asarray(predictions), self.model.ops.asarray(golds)) + if self.PRINT_LOSS: + print("loss", print_string, round(loss, 5)) + + return loss, p, r, f def _predict(self, article_doc, entity, avg=False, apply_threshold=True): if avg: @@ -182,20 +181,16 @@ class EL_Model: def _build_cnn(self, hidden_entity_width, hidden_article_width): with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): - self.entity_encoder = self._encoder(in_width=self.INPUT_DIM, hidden_width=hidden_entity_width) # entity encoding - self.article_encoder = self._encoder(in_width=self.INPUT_DIM, hidden_width=hidden_article_width) # doc encoding + self.entity_encoder = self._encoder(in_width=self.INPUT_DIM, hidden_width=hidden_entity_width) + self.article_encoder = self._encoder(in_width=self.INPUT_DIM, hidden_width=hidden_article_width) - hidden_input_with = hidden_entity_width + hidden_article_width - hidden_output_with = self.HIDDEN_1_WIDTH + nr_i = hidden_entity_width + hidden_article_width + nr_o = self.HIDDEN_WIDTH - convolution_2 = Residual((ExtractWindow(nW=1) >> LN(Maxout(hidden_output_with, hidden_output_with * 3)))) - - self.model = Affine(hidden_output_with, hidden_input_with) \ - >> LN(Maxout(hidden_output_with, hidden_output_with)) \ - >> convolution_2 \ - >> Affine(self.HIDDEN_2_WIDTH, hidden_output_with) \ - >> Affine(1, self.HIDDEN_2_WIDTH) \ - >> logistic + self.model = Affine(nr_o, nr_i) \ + >> LN(Maxout(nr_o, nr_o)) \ + >> Affine(1, nr_o) \ + >> logistic @staticmethod def _encoder(in_width, hidden_width): @@ -204,38 +199,46 @@ class EL_Model: >> flatten_add_lengths \ >> ParametricAttention(in_width)\ >> Pooling(mean_pool) \ - >> Residual(zero_init(Maxout(in_width, in_width))) \ + >> Residual((ExtractWindow(nW=1) >> LN(Maxout(in_width, in_width * 3)))) \ >> zero_init(Affine(hidden_width, in_width, drop_factor=0.0)) + # TODO: ReLu instead of LN(Maxout) ? 
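        # A hedged sketch of the ReLu variant that the TODO above refers to (illustrative
        # only, not applied in this patch; it assumes `ReLu` is imported from thinc.v2v):
        # the maxout block over the 3-token window would become
        #     Residual(ExtractWindow(nW=1) >> ReLu(in_width, in_width * 3))
        # i.e. ExtractWindow triples the width and the ReLu layer maps it back to in_width,
        # so the Residual wrapper still sees matching input/output widths.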
+ return encoder def _begin_training(self): self.sgd = create_default_optimizer(self.model.ops) - def update(self, article_docs, entities, golds, drop=0.): + @staticmethod + def get_loss(predictions, golds): + d_scores = (predictions - golds) + + loss = (d_scores ** 2).sum() + return loss, d_scores + + def update(self, article_docs, entities, golds, drop=0., apply_threshold=True): doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=drop) entity_encodings, bp_encoding = self.entity_encoder.begin_update(entities, drop=drop) concat_encodings = [list(entity_encodings[i]) + list(doc_encodings[i]) for i in range(len(entities))] predictions, bp_model = self.model.begin_update(np.asarray(concat_encodings), drop=drop) - predictions = self.model.ops.flatten(predictions) golds = self.model.ops.asarray(golds) - # print("predictions", predictions) - # print("golds", golds) + loss, d_scores = self.get_loss(predictions, golds) - d_scores = (predictions - golds) # / predictions.shape[0] - # print("d_scores (1)", d_scores) + # if self.PRINT_LOSS: + # print("loss train", round(loss, 5)) - loss = (d_scores ** 2).sum() - - if self.PRINT_LOSS: - print("loss train", round(loss, 5)) + # if self.PRINT_F: + # predictions_f = [x for x in predictions] + # if apply_threshold: + # predictions_f = [1.0 if x > self.CUTOFF else 0.0 for x in predictions_f] + # p, r, f = run_el.evaluate(predictions_f, golds, to_print=False) + # print("p/r/F train", round(p, 1), round(r, 1), round(f, 1)) d_scores = d_scores.reshape((-1, 1)) d_scores = d_scores.astype(np.float32) - # print("d_scores (2)", d_scores) model_gradient = bp_model(d_scores, sgd=self.sgd) diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index a5ebc99bb..2e4ab3c2e 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -108,10 +108,10 @@ if __name__ == "__main__": # STEP 6: apply the EL algorithm on the training dataset if run_training: - print("STEP 6: training ", datetime.datetime.now()) + print("STEP 6: training", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=1, devlimit=1) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=2000, devlimit=200) print() # STEP 7: apply the EL algorithm on the dev dataset From dd691d00530eed432d6cf60b39d99206e5830f69 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 17 May 2019 17:44:11 +0200 Subject: [PATCH 044/148] debugging --- .../pipeline/wiki_entity_linking/train_el.py | 140 ++++++++++++------ .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- spacy/pipeline/pipes.pyx | 2 +- 3 files changed, 98 insertions(+), 46 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index 21bc03282..312e50cad 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -28,13 +28,16 @@ class EL_Model: PRINT_LOSS = False PRINT_F = True + PRINT_TRAIN = True EPS = 0.0000000005 CUTOFF = 0.5 INPUT_DIM = 300 - ENTITY_WIDTH = 64 - ARTICLE_WIDTH = 128 - HIDDEN_WIDTH = 64 + ENTITY_WIDTH = 4 # 64 + ARTICLE_WIDTH = 8 # 128 + HIDDEN_WIDTH = 6 # 64 + + DROP = 0.00 name = "entity_linker" @@ -78,40 +81,63 @@ class EL_Model: print() 
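            # Illustrative note, not from the original patch: the per-article loop further
            # below builds one batch per article, pairing each mention's gold description
            # (label 1.0) with its candidate negative descriptions (label 0.0), e.g.
            #     entities = [gold_descr, neg_descr_1, neg_descr_2]
            #     golds    = [1.0, 0.0, 0.0]
            # (gold_descr / neg_descr_* are placeholder names for the description texts.)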
print("Training on", len(train_inst.values()), "articles") print("Dev test on", len(dev_inst.values()), "articles") + print() + print(" CUTOFF", self.CUTOFF) + print(" INPUT_DIM", self.INPUT_DIM) + print(" ENTITY_WIDTH", self.ENTITY_WIDTH) + print(" ARTICLE_WIDTH", self.ARTICLE_WIDTH) + print(" HIDDEN_WIDTH", self.ARTICLE_WIDTH) + print(" DROP", self.DROP) + print() # TODO: proper batches. Currently 1 article at the time article_count = 0 for article_id, inst_cluster_set in train_inst.items(): - # if to_print: - # print() - # print(article_count, "Training on article", article_id) - article_count += 1 - article_docs = list() - entities = list() - golds = list() - for inst_cluster in inst_cluster_set: - article_docs.append(train_doc[article_id]) - entities.append(train_pos.get(inst_cluster)) - golds.append(float(1.0)) - instance_pos_count += 1 - for neg_entity in train_neg.get(inst_cluster, []): - article_docs.append(train_doc[article_id]) - entities.append(neg_entity) - golds.append(float(0.0)) - instance_neg_count += 1 + try: + # if to_print: + # print() + # print(article_count, "Training on article", article_id) + article_count += 1 + article_docs = list() + entities = list() + golds = list() + for inst_cluster in inst_cluster_set: + if instance_pos_count < 2: # TODO remove + article_docs.append(train_doc[article_id]) + entities.append(train_pos.get(inst_cluster)) + golds.append(float(1.0)) + instance_pos_count += 1 + for neg_entity in train_neg.get(inst_cluster, []): + article_docs.append(train_doc[article_id]) + entities.append(neg_entity) + golds.append(float(0.0)) + instance_neg_count += 1 - self.update(article_docs=article_docs, entities=entities, golds=golds) + for k in range(5): + print() + print("update", k) + print() + # print("article docs", article_docs) + print("entities", entities) + print("golds", golds) + print() + self.update(article_docs=article_docs, entities=entities, golds=golds) - # dev eval - self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_inter", avg=False) + # dev eval + self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_inter", avg=False) + self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_inter_avg", avg=True) + except ValueError as e: + print("Error in article id", article_id) if to_print: print() print("Trained on", instance_pos_count, "/", instance_neg_count, "instances pos/neg") print() - self._test_dev(train_inst, train_pos, train_neg, train_doc, print_string="train_post", calc_random=False) + self._test_dev(train_inst, train_pos, train_neg, train_doc, print_string="train_post", avg=False) + self._test_dev(train_inst, train_pos, train_neg, train_doc, print_string="train_post_avg", avg=True) self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_post", avg=False) + self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_post_avg", avg=True) def _test_dev(self, instances, pos, neg, doc, print_string, avg=False, calc_random=False): predictions = list() @@ -155,16 +181,24 @@ class EL_Model: def _predict(self, article_doc, entity, avg=False, apply_threshold=True): if avg: - with self.sgd.use_params(self.model.averages): - doc_encoding = self.article_encoder([article_doc]) - entity_encoding = self.entity_encoder([entity]) - return self.model(np.append(entity_encoding, doc_encoding)) # TODO list + with self.article_encoder.use_params(self.sgd_article.averages) \ + and self.entity_encoder.use_params(self.sgd_article.averages): + doc_encoding = 
self.article_encoder([article_doc])[0] + entity_encoding = self.entity_encoder([entity])[0] + + else: + doc_encoding = self.article_encoder([article_doc])[0] + entity_encoding = self.entity_encoder([entity])[0] - doc_encoding = self.article_encoder([article_doc])[0] - entity_encoding = self.entity_encoder([entity])[0] concat_encoding = list(entity_encoding) + list(doc_encoding) np_array = np.asarray([concat_encoding]) - prediction = self.model(np_array) + + if avg: + with self.model.use_params(self.sgd.averages): + prediction = self.model(np_array) + else: + prediction = self.model(np_array) + if not apply_threshold: return float(prediction) if prediction > self.CUTOFF: @@ -199,14 +233,17 @@ class EL_Model: >> flatten_add_lengths \ >> ParametricAttention(in_width)\ >> Pooling(mean_pool) \ - >> Residual((ExtractWindow(nW=1) >> LN(Maxout(in_width, in_width * 3)))) \ + >> (ExtractWindow(nW=1) >> LN(Maxout(in_width, in_width * 3))) \ >> zero_init(Affine(hidden_width, in_width, drop_factor=0.0)) # TODO: ReLu instead of LN(Maxout) ? + # TODO: more convolutions ? return encoder def _begin_training(self): + self.sgd_article = create_default_optimizer(self.article_encoder.ops) + self.sgd_entity = create_default_optimizer(self.entity_encoder.ops) self.sgd = create_default_optimizer(self.model.ops) @staticmethod @@ -216,34 +253,49 @@ class EL_Model: loss = (d_scores ** 2).sum() return loss, d_scores - def update(self, article_docs, entities, golds, drop=0., apply_threshold=True): - doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=drop) - entity_encodings, bp_encoding = self.entity_encoder.begin_update(entities, drop=drop) + def update(self, article_docs, entities, golds, apply_threshold=True): + print("article_docs", len(article_docs)) + for a in article_docs: + print(a[0:10], a[-10:]) + doc_encoding, bp_doc = self.article_encoder.begin_update([a], drop=self.DROP) + print(doc_encoding) + + doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=self.DROP) + entity_encodings, bp_encoding = self.entity_encoder.begin_update(entities, drop=self.DROP) concat_encodings = [list(entity_encodings[i]) + list(doc_encodings[i]) for i in range(len(entities))] - predictions, bp_model = self.model.begin_update(np.asarray(concat_encodings), drop=drop) + print("doc_encodings", len(doc_encodings), doc_encodings) + print("entity_encodings", len(entity_encodings), entity_encodings) + print("concat_encodings", len(concat_encodings), concat_encodings) + + predictions, bp_model = self.model.begin_update(np.asarray(concat_encodings), drop=self.DROP) + print("predictions", predictions) predictions = self.model.ops.flatten(predictions) golds = self.model.ops.asarray(golds) loss, d_scores = self.get_loss(predictions, golds) - # if self.PRINT_LOSS: - # print("loss train", round(loss, 5)) + if self.PRINT_LOSS and self.PRINT_TRAIN: + print("loss train", round(loss, 5)) - # if self.PRINT_F: - # predictions_f = [x for x in predictions] - # if apply_threshold: - # predictions_f = [1.0 if x > self.CUTOFF else 0.0 for x in predictions_f] - # p, r, f = run_el.evaluate(predictions_f, golds, to_print=False) - # print("p/r/F train", round(p, 1), round(r, 1), round(f, 1)) + if self.PRINT_F and self.PRINT_TRAIN: + predictions_f = [x for x in predictions] + if apply_threshold: + predictions_f = [1.0 if x > self.CUTOFF else 0.0 for x in predictions_f] + p, r, f = run_el.evaluate(predictions_f, golds, to_print=False) + print("p/r/F train", round(p, 1), round(r, 1), round(f, 1)) d_scores = 
d_scores.reshape((-1, 1)) d_scores = d_scores.astype(np.float32) + print("d_scores", d_scores) model_gradient = bp_model(d_scores, sgd=self.sgd) + print("model_gradient", model_gradient) doc_gradient = [x[0:self.ARTICLE_WIDTH] for x in model_gradient] + print("doc_gradient", doc_gradient) entity_gradient = [x[self.ARTICLE_WIDTH:] for x in model_gradient] + print("entity_gradient", entity_gradient) bp_doc(doc_gradient) bp_encoding(entity_gradient) diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 2e4ab3c2e..ced905ac5 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: training", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=2000, devlimit=200) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=1, devlimit=10) print() # STEP 7: apply the EL algorithm on the dev dataset diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 7043c1647..69521c1b2 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -293,7 +293,7 @@ class Tensorizer(Pipe): docs (iterable): A batch of `Doc` objects. golds (iterable): A batch of `GoldParse` objects. - drop (float): The droput rate. + drop (float): The dropout rate. sgd (callable): An optimizer. RETURNS (dict): Results from the update. """ From 7edb2e171181f0f49fb4b1f54326fa9e2b97373b Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 20 May 2019 11:58:48 +0200 Subject: [PATCH 045/148] fix convolution layer --- .../pipeline/wiki_entity_linking/train_el.py | 44 +++++++++++-------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index 312e50cad..2d7ede48d 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -12,9 +12,9 @@ from examples.pipeline.wiki_entity_linking import run_el, training_set_creator, from spacy._ml import SpacyVectors, create_default_optimizer, zero_init, logistic -from thinc.api import chain, concatenate, flatten_add_lengths, clone +from thinc.api import chain, concatenate, flatten_add_lengths, clone, with_flatten from thinc.v2v import Model, Maxout, Affine -from thinc.t2v import Pooling, mean_pool +from thinc.t2v import Pooling, mean_pool, sum_pool from thinc.t2t import ParametricAttention from thinc.misc import Residual from thinc.misc import LayerNorm as LN @@ -96,13 +96,13 @@ class EL_Model: try: # if to_print: # print() - # print(article_count, "Training on article", article_id) + print(article_count, "Training on article", article_id) article_count += 1 article_docs = list() entities = list() golds = list() for inst_cluster in inst_cluster_set: - if instance_pos_count < 2: # TODO remove + if instance_pos_count < 2: # TODO del article_docs.append(train_doc[article_id]) entities.append(train_pos.get(inst_cluster)) golds.append(float(1.0)) @@ -228,16 +228,23 @@ class EL_Model: @staticmethod def _encoder(in_width, hidden_width): + conv_depth = 1 + cnn_maxout_pieces = 3 + with Model.define_operators({">>": chain}): + convolution = Residual((ExtractWindow(nW=1) >> LN(Maxout(in_width, in_width * 3, 
pieces=cnn_maxout_pieces)))) + encoder = SpacyVectors \ - >> flatten_add_lengths \ - >> ParametricAttention(in_width)\ - >> Pooling(mean_pool) \ - >> (ExtractWindow(nW=1) >> LN(Maxout(in_width, in_width * 3))) \ - >> zero_init(Affine(hidden_width, in_width, drop_factor=0.0)) + >> with_flatten(LN(Maxout(in_width, in_width)) >> convolution ** conv_depth, pad=conv_depth) \ + >> flatten_add_lengths \ + >> ParametricAttention(in_width)\ + >> Pooling(mean_pool) \ + >> Residual(zero_init(Maxout(in_width, in_width))) \ + >> zero_init(Affine(hidden_width, in_width, drop_factor=0.0)) # TODO: ReLu instead of LN(Maxout) ? # TODO: more convolutions ? + # sum_pool or mean_pool ? return encoder @@ -261,16 +268,17 @@ class EL_Model: print(doc_encoding) doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=self.DROP) - entity_encodings, bp_encoding = self.entity_encoder.begin_update(entities, drop=self.DROP) - concat_encodings = [list(entity_encodings[i]) + list(doc_encodings[i]) for i in range(len(entities))] - print("doc_encodings", len(doc_encodings), doc_encodings) + + entity_encodings, bp_encoding = self.entity_encoder.begin_update(entities, drop=self.DROP) print("entity_encodings", len(entity_encodings), entity_encodings) - print("concat_encodings", len(concat_encodings), concat_encodings) + + concat_encodings = [list(entity_encodings[i]) + list(doc_encodings[i]) for i in range(len(entities))] + # print("concat_encodings", len(concat_encodings), concat_encodings) predictions, bp_model = self.model.begin_update(np.asarray(concat_encodings), drop=self.DROP) - print("predictions", predictions) predictions = self.model.ops.flatten(predictions) + print("predictions", predictions) golds = self.model.ops.asarray(golds) loss, d_scores = self.get_loss(predictions, golds) @@ -287,15 +295,15 @@ class EL_Model: d_scores = d_scores.reshape((-1, 1)) d_scores = d_scores.astype(np.float32) - print("d_scores", d_scores) + # print("d_scores", d_scores) model_gradient = bp_model(d_scores, sgd=self.sgd) - print("model_gradient", model_gradient) + # print("model_gradient", model_gradient) doc_gradient = [x[0:self.ARTICLE_WIDTH] for x in model_gradient] - print("doc_gradient", doc_gradient) + # print("doc_gradient", doc_gradient) entity_gradient = [x[self.ARTICLE_WIDTH:] for x in model_gradient] - print("entity_gradient", entity_gradient) + # print("entity_gradient", entity_gradient) bp_doc(doc_gradient) bp_encoding(entity_gradient) From 89e322a637243d261b84ce01ae6d5595b7e82dd6 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 20 May 2019 17:20:39 +0200 Subject: [PATCH 046/148] small fixes --- .../pipeline/wiki_entity_linking/train_el.py | 38 +++++++++---------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index 2d7ede48d..3a7cd6186 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -33,9 +33,9 @@ class EL_Model: CUTOFF = 0.5 INPUT_DIM = 300 - ENTITY_WIDTH = 4 # 64 - ARTICLE_WIDTH = 8 # 128 - HIDDEN_WIDTH = 6 # 64 + ENTITY_WIDTH = 4 # 64 + ARTICLE_WIDTH = 8 # 128 + HIDDEN_WIDTH = 6 # 64 DROP = 0.00 @@ -71,7 +71,7 @@ class EL_Model: self._test_dev(train_inst, train_pos, train_neg, train_doc, print_string="train_random", calc_random=True) self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_random", calc_random=True) print() - self._test_dev(train_inst, train_pos, train_neg, train_doc, 
print_string="train_pre", calc_random=False) + self._test_dev(train_inst, train_pos, train_neg, train_doc, print_string="train_pre", avg=False) self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_pre", avg=False) instance_pos_count = 0 @@ -113,7 +113,7 @@ class EL_Model: golds.append(float(0.0)) instance_neg_count += 1 - for k in range(5): + for k in range(10): print() print("update", k) print() @@ -182,7 +182,7 @@ class EL_Model: def _predict(self, article_doc, entity, avg=False, apply_threshold=True): if avg: with self.article_encoder.use_params(self.sgd_article.averages) \ - and self.entity_encoder.use_params(self.sgd_article.averages): + and self.entity_encoder.use_params(self.sgd_entity.averages): doc_encoding = self.article_encoder([article_doc])[0] entity_encoding = self.entity_encoder([entity])[0] @@ -228,7 +228,7 @@ class EL_Model: @staticmethod def _encoder(in_width, hidden_width): - conv_depth = 1 + conv_depth = 2 cnn_maxout_pieces = 3 with Model.define_operators({">>": chain}): @@ -261,16 +261,10 @@ class EL_Model: return loss, d_scores def update(self, article_docs, entities, golds, apply_threshold=True): - print("article_docs", len(article_docs)) - for a in article_docs: - print(a[0:10], a[-10:]) - doc_encoding, bp_doc = self.article_encoder.begin_update([a], drop=self.DROP) - print(doc_encoding) - doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=self.DROP) print("doc_encodings", len(doc_encodings), doc_encodings) - entity_encodings, bp_encoding = self.entity_encoder.begin_update(entities, drop=self.DROP) + entity_encodings, bp_entity = self.entity_encoder.begin_update(entities, drop=self.DROP) print("entity_encodings", len(entity_encodings), entity_encodings) concat_encodings = [list(entity_encodings[i]) + list(doc_encodings[i]) for i in range(len(entities))] @@ -298,15 +292,19 @@ class EL_Model: # print("d_scores", d_scores) model_gradient = bp_model(d_scores, sgd=self.sgd) - # print("model_gradient", model_gradient) + print("model_gradient", model_gradient) - doc_gradient = [x[0:self.ARTICLE_WIDTH] for x in model_gradient] - # print("doc_gradient", doc_gradient) - entity_gradient = [x[self.ARTICLE_WIDTH:] for x in model_gradient] - # print("entity_gradient", entity_gradient) + doc_gradient = list() + entity_gradient = list() + for x in model_gradient: + doc_gradient.append(list(x[0:self.ARTICLE_WIDTH])) + entity_gradient.append(list(x[self.ARTICLE_WIDTH:])) + + print("doc_gradient", doc_gradient) + print("entity_gradient", entity_gradient) bp_doc(doc_gradient) - bp_encoding(entity_gradient) + bp_entity(entity_gradient) def _get_training_data(self, training_dir, entity_descr_output, dev, limit, to_print): id_to_descr = kb_creator._get_id_to_description(entity_descr_output) From 0a15ee4541b2b46db716990830eb0d67d71fa45a Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 20 May 2019 23:54:55 +0200 Subject: [PATCH 047/148] fix in bp call --- .../pipeline/wiki_entity_linking/train_el.py | 82 +++++++++---------- .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- 2 files changed, 38 insertions(+), 46 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index 3a7cd6186..e213f0955 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -13,7 +13,7 @@ from examples.pipeline.wiki_entity_linking import run_el, training_set_creator, from spacy._ml import SpacyVectors, create_default_optimizer, 
zero_init, logistic from thinc.api import chain, concatenate, flatten_add_lengths, clone, with_flatten -from thinc.v2v import Model, Maxout, Affine +from thinc.v2v import Model, Maxout, Affine, ReLu from thinc.t2v import Pooling, mean_pool, sum_pool from thinc.t2t import ParametricAttention from thinc.misc import Residual @@ -28,16 +28,16 @@ class EL_Model: PRINT_LOSS = False PRINT_F = True - PRINT_TRAIN = True + PRINT_TRAIN = False EPS = 0.0000000005 CUTOFF = 0.5 INPUT_DIM = 300 - ENTITY_WIDTH = 4 # 64 - ARTICLE_WIDTH = 8 # 128 - HIDDEN_WIDTH = 6 # 64 + ENTITY_WIDTH = 64 # 4 + ARTICLE_WIDTH = 128 # 8 + HIDDEN_WIDTH = 64 # 6 - DROP = 0.00 + DROP = 0.1 name = "entity_linker" @@ -91,41 +91,34 @@ class EL_Model: print() # TODO: proper batches. Currently 1 article at the time + # TODO shuffle data (currently positive is always followed by several negatives) article_count = 0 for article_id, inst_cluster_set in train_inst.items(): try: # if to_print: # print() - print(article_count, "Training on article", article_id) + # print(article_count, "Training on article", article_id) article_count += 1 article_docs = list() entities = list() golds = list() for inst_cluster in inst_cluster_set: - if instance_pos_count < 2: # TODO del + article_docs.append(train_doc[article_id]) + entities.append(train_pos.get(inst_cluster)) + golds.append(float(1.0)) + instance_pos_count += 1 + for neg_entity in train_neg.get(inst_cluster, []): article_docs.append(train_doc[article_id]) - entities.append(train_pos.get(inst_cluster)) - golds.append(float(1.0)) - instance_pos_count += 1 - for neg_entity in train_neg.get(inst_cluster, []): - article_docs.append(train_doc[article_id]) - entities.append(neg_entity) - golds.append(float(0.0)) - instance_neg_count += 1 + entities.append(neg_entity) + golds.append(float(0.0)) + instance_neg_count += 1 - for k in range(10): - print() - print("update", k) - print() - # print("article docs", article_docs) - print("entities", entities) - print("golds", golds) - print() - self.update(article_docs=article_docs, entities=entities, golds=golds) + self.update(article_docs=article_docs, entities=entities, golds=golds) - # dev eval - self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_inter", avg=False) - self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_inter_avg", avg=True) + # dev eval + # self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_inter", avg=False) + self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_inter_avg", avg=True) + print() except ValueError as e: print("Error in article id", article_id) @@ -133,11 +126,12 @@ class EL_Model: print() print("Trained on", instance_pos_count, "/", instance_neg_count, "instances pos/neg") - print() - self._test_dev(train_inst, train_pos, train_neg, train_doc, print_string="train_post", avg=False) - self._test_dev(train_inst, train_pos, train_neg, train_doc, print_string="train_post_avg", avg=True) - self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_post", avg=False) - self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_post_avg", avg=True) + if self.PRINT_TRAIN: + # print() + # self._test_dev(train_inst, train_pos, train_neg, train_doc, print_string="train_post", avg=False) + self._test_dev(train_inst, train_pos, train_neg, train_doc, print_string="train_post_avg", avg=True) + # self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_post", avg=False) + # self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, 
print_string="dev_post_avg", avg=True) def _test_dev(self, instances, pos, neg, doc, print_string, avg=False, calc_random=False): predictions = list() @@ -170,8 +164,7 @@ class EL_Model: # TODO: combine with prior probability p, r, f = run_el.evaluate(predictions, golds, to_print=False) if self.PRINT_F: - # print("p/r/F", print_string, round(p, 1), round(r, 1), round(f, 1)) - print("F", print_string, round(f, 1)) + print("p/r/F", print_string, round(p, 1), round(r, 1), round(f, 1)) loss, d_scores = self.get_loss(self.model.ops.asarray(predictions), self.model.ops.asarray(golds)) if self.PRINT_LOSS: @@ -242,8 +235,7 @@ class EL_Model: >> Residual(zero_init(Maxout(in_width, in_width))) \ >> zero_init(Affine(hidden_width, in_width, drop_factor=0.0)) - # TODO: ReLu instead of LN(Maxout) ? - # TODO: more convolutions ? + # TODO: ReLu or LN(Maxout) ? # sum_pool or mean_pool ? return encoder @@ -262,17 +254,17 @@ class EL_Model: def update(self, article_docs, entities, golds, apply_threshold=True): doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=self.DROP) - print("doc_encodings", len(doc_encodings), doc_encodings) + # print("doc_encodings", len(doc_encodings), doc_encodings) entity_encodings, bp_entity = self.entity_encoder.begin_update(entities, drop=self.DROP) - print("entity_encodings", len(entity_encodings), entity_encodings) + # print("entity_encodings", len(entity_encodings), entity_encodings) concat_encodings = [list(entity_encodings[i]) + list(doc_encodings[i]) for i in range(len(entities))] # print("concat_encodings", len(concat_encodings), concat_encodings) predictions, bp_model = self.model.begin_update(np.asarray(concat_encodings), drop=self.DROP) predictions = self.model.ops.flatten(predictions) - print("predictions", predictions) + # print("predictions", predictions) golds = self.model.ops.asarray(golds) loss, d_scores = self.get_loss(predictions, golds) @@ -292,7 +284,7 @@ class EL_Model: # print("d_scores", d_scores) model_gradient = bp_model(d_scores, sgd=self.sgd) - print("model_gradient", model_gradient) + # print("model_gradient", model_gradient) doc_gradient = list() entity_gradient = list() @@ -300,11 +292,11 @@ class EL_Model: doc_gradient.append(list(x[0:self.ARTICLE_WIDTH])) entity_gradient.append(list(x[self.ARTICLE_WIDTH:])) - print("doc_gradient", doc_gradient) - print("entity_gradient", entity_gradient) + # print("doc_gradient", doc_gradient) + # print("entity_gradient", entity_gradient) - bp_doc(doc_gradient) - bp_entity(entity_gradient) + bp_doc(doc_gradient, sgd=self.sgd_article) + bp_entity(entity_gradient, sgd=self.sgd_entity) def _get_training_data(self, training_dir, entity_descr_output, dev, limit, to_print): id_to_descr = kb_creator._get_id_to_description(entity_descr_output) diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index ced905ac5..6f021597f 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: training", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=1, devlimit=10) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=1000, devlimit=200) print() # STEP 7: apply the EL algorithm on the dev dataset From 
2fa3fac8512c1ed102a64017123246ca156cfef5 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 21 May 2019 13:43:59 +0200 Subject: [PATCH 048/148] fix concat bp and more efficient batch calls --- .../pipeline/wiki_entity_linking/train_el.py | 163 ++++++++---------- .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- 2 files changed, 76 insertions(+), 89 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index e213f0955..2d218ed60 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -52,27 +52,25 @@ class EL_Model: # raise errors instead of runtime warnings in case of int/float overflow np.seterr(all='raise') - Doc.set_extension("entity_id", default=None) + train_inst, train_pos, train_neg, train_texts = self._get_training_data(training_dir, + entity_descr_output, + False, + trainlimit, + to_print=False) - train_inst, train_pos, train_neg, train_doc = self._get_training_data(training_dir, - entity_descr_output, - False, - trainlimit, - to_print=False) - - dev_inst, dev_pos, dev_neg, dev_doc = self._get_training_data(training_dir, - entity_descr_output, - True, - devlimit, - to_print=False) + dev_inst, dev_pos, dev_neg, dev_texts = self._get_training_data(training_dir, + entity_descr_output, + True, + devlimit, + to_print=False) self._begin_training() print() - self._test_dev(train_inst, train_pos, train_neg, train_doc, print_string="train_random", calc_random=True) - self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_random", calc_random=True) + self._test_dev(train_inst, train_pos, train_neg, train_texts, print_string="train_random", calc_random=True) + self._test_dev(dev_inst, dev_pos, dev_neg, dev_texts, print_string="dev_random", calc_random=True) print() - self._test_dev(train_inst, train_pos, train_neg, train_doc, print_string="train_pre", avg=False) - self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_pre", avg=False) + self._test_dev(train_inst, train_pos, train_neg, train_texts, print_string="train_pre", avg=False) + self._test_dev(dev_inst, dev_pos, dev_neg, dev_texts, print_string="dev_pre", avg=False) instance_pos_count = 0 instance_neg_count = 0 @@ -99,26 +97,22 @@ class EL_Model: # print() # print(article_count, "Training on article", article_id) article_count += 1 - article_docs = list() + article_text = train_texts[article_id] entities = list() golds = list() for inst_cluster in inst_cluster_set: - article_docs.append(train_doc[article_id]) entities.append(train_pos.get(inst_cluster)) golds.append(float(1.0)) instance_pos_count += 1 for neg_entity in train_neg.get(inst_cluster, []): - article_docs.append(train_doc[article_id]) entities.append(neg_entity) golds.append(float(0.0)) instance_neg_count += 1 - self.update(article_docs=article_docs, entities=entities, golds=golds) + self.update(article_text=article_text, entities=entities, golds=golds) # dev eval - # self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_inter", avg=False) - self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_inter_avg", avg=True) - print() + self._test_dev(dev_inst, dev_pos, dev_neg, dev_texts, print_string="dev_inter_avg", avg=True) except ValueError as e: print("Error in article id", article_id) @@ -127,13 +121,9 @@ class EL_Model: print("Trained on", instance_pos_count, "/", instance_neg_count, "instances pos/neg") if self.PRINT_TRAIN: - # print() - # self._test_dev(train_inst, 
train_pos, train_neg, train_doc, print_string="train_post", avg=False) - self._test_dev(train_inst, train_pos, train_neg, train_doc, print_string="train_post_avg", avg=True) - # self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_post", avg=False) - # self._test_dev(dev_inst, dev_pos, dev_neg, dev_doc, print_string="dev_post_avg", avg=True) + self._test_dev(train_inst, train_pos, train_neg, train_texts, print_string="train_post_avg", avg=True) - def _test_dev(self, instances, pos, neg, doc, print_string, avg=False, calc_random=False): + def _test_dev(self, instances, pos, neg, texts_by_id, print_string, avg=False, calc_random=False): predictions = list() golds = list() @@ -144,22 +134,18 @@ class EL_Model: article = inst_cluster.split(sep="_")[0] entity_id = inst_cluster.split(sep="_")[1] - article_doc = doc[article] + article_doc = self.nlp(texts_by_id[article]) + entities = [self.nlp(pos_ex)] + golds.append(float(1.0)) + for neg_ex in neg_exs: + entities.append(self.nlp(neg_ex)) + golds.append(float(0.0)) if calc_random: - prediction = self._predict_random(entity=pos_ex) + preds = self._predict_random(entities=entities) else: - prediction = self._predict(article_doc=article_doc, entity=pos_ex, avg=avg) - predictions.append(prediction) - golds.append(float(1.0)) - - for neg_ex in neg_exs: - if calc_random: - prediction = self._predict_random(entity=neg_ex) - else: - prediction = self._predict(article_doc=article_doc, entity=neg_ex, avg=avg) - predictions.append(prediction) - golds.append(float(0.0)) + preds = self._predict(article_doc=article_doc, entities=entities, avg=avg) + predictions.extend(preds) # TODO: combine with prior probability p, r, f = run_el.evaluate(predictions, golds, to_print=False) @@ -172,39 +158,38 @@ class EL_Model: return loss, p, r, f - def _predict(self, article_doc, entity, avg=False, apply_threshold=True): + def _predict(self, article_doc, entities, avg=False, apply_threshold=True): if avg: with self.article_encoder.use_params(self.sgd_article.averages) \ and self.entity_encoder.use_params(self.sgd_entity.averages): doc_encoding = self.article_encoder([article_doc])[0] - entity_encoding = self.entity_encoder([entity])[0] + entity_encodings = self.entity_encoder(entities) else: doc_encoding = self.article_encoder([article_doc])[0] - entity_encoding = self.entity_encoder([entity])[0] + entity_encodings = self.entity_encoder(entities) - concat_encoding = list(entity_encoding) + list(doc_encoding) - np_array = np.asarray([concat_encoding]) + concat_encodings = [list(entity_encodings[i]) + list(doc_encoding) for i in range(len(entities))] + np_array_list = np.asarray(concat_encodings) if avg: - with self.model.use_params(self.sgd.averages): - prediction = self.model(np_array) + with self.model.use_params(self.sgd.averages): + predictions = self.model(np_array_list) else: - prediction = self.model(np_array) + predictions = self.model(np_array_list) - if not apply_threshold: - return float(prediction) - if prediction > self.CUTOFF: - return float(1.0) - return float(0.0) + predictions = self.model.ops.flatten(predictions) + predictions = [float(p) for p in predictions] + if apply_threshold: + predictions = [float(1.0) if p > self.CUTOFF else float(0.0) for p in predictions] - def _predict_random(self, entity, apply_threshold=True): - r = random.uniform(0, 1) + return predictions + + def _predict_random(self, entities, apply_threshold=True): if not apply_threshold: - return r - if r > self.CUTOFF: - return float(1.0) - return float(0.0) + return 
[float(random.uniform(0,1)) for e in entities] + else: + return [float(1.0) if random.uniform(0,1) > self.CUTOFF else float(0.0) for e in entities] def _build_cnn(self, hidden_entity_width, hidden_article_width): with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): @@ -252,20 +237,27 @@ class EL_Model: loss = (d_scores ** 2).sum() return loss, d_scores - def update(self, article_docs, entities, golds, apply_threshold=True): - doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=self.DROP) - # print("doc_encodings", len(doc_encodings), doc_encodings) + # TODO: multiple docs/articles + def update(self, article_text, entities, golds, apply_threshold=True): + article_doc = self.nlp(article_text) + doc_encodings, bp_doc = self.article_encoder.begin_update([article_doc], drop=self.DROP) + doc_encoding = doc_encodings[0] - entity_encodings, bp_entity = self.entity_encoder.begin_update(entities, drop=self.DROP) + entity_docs = list(self.nlp.pipe(entities)) + # print("entity_docs", type(entity_docs)) + + entity_encodings, bp_entity = self.entity_encoder.begin_update(entity_docs, drop=self.DROP) # print("entity_encodings", len(entity_encodings), entity_encodings) - concat_encodings = [list(entity_encodings[i]) + list(doc_encodings[i]) for i in range(len(entities))] + concat_encodings = [list(entity_encodings[i]) + list(doc_encoding) for i in range(len(entities))] # print("concat_encodings", len(concat_encodings), concat_encodings) predictions, bp_model = self.model.begin_update(np.asarray(concat_encodings), drop=self.DROP) predictions = self.model.ops.flatten(predictions) + # print("predictions", predictions) golds = self.model.ops.asarray(golds) + # print("golds", golds) loss, d_scores = self.get_loss(predictions, golds) @@ -275,7 +267,7 @@ class EL_Model: if self.PRINT_F and self.PRINT_TRAIN: predictions_f = [x for x in predictions] if apply_threshold: - predictions_f = [1.0 if x > self.CUTOFF else 0.0 for x in predictions_f] + predictions_f = [float(1.0) if x > self.CUTOFF else float(0.0) for x in predictions_f] p, r, f = run_el.evaluate(predictions_f, golds, to_print=False) print("p/r/F train", round(p, 1), round(r, 1), round(f, 1)) @@ -286,17 +278,17 @@ class EL_Model: model_gradient = bp_model(d_scores, sgd=self.sgd) # print("model_gradient", model_gradient) - doc_gradient = list() - entity_gradient = list() + # concat = entity + doc, but doc is the same within this function (TODO: multiple docs/articles) + doc_gradient = model_gradient[0][self.ENTITY_WIDTH:] + entity_gradients = list() for x in model_gradient: - doc_gradient.append(list(x[0:self.ARTICLE_WIDTH])) - entity_gradient.append(list(x[self.ARTICLE_WIDTH:])) + entity_gradients.append(list(x[0:self.ENTITY_WIDTH])) # print("doc_gradient", doc_gradient) - # print("entity_gradient", entity_gradient) + # print("entity_gradients", entity_gradients) - bp_doc(doc_gradient, sgd=self.sgd_article) - bp_entity(entity_gradient, sgd=self.sgd_entity) + bp_doc([doc_gradient], sgd=self.sgd_article) + bp_entity(entity_gradients, sgd=self.sgd_entity) def _get_training_data(self, training_dir, entity_descr_output, dev, limit, to_print): id_to_descr = kb_creator._get_id_to_description(entity_descr_output) @@ -305,9 +297,9 @@ class EL_Model: collect_correct=True, collect_incorrect=True) - instance_by_doc = dict() + instance_by_article = dict() local_vectors = list() # TODO: local vectors - doc_by_article = dict() + text_by_article = dict() pos_entities = dict() neg_entities = dict() @@ -319,33 +311,28 @@ class 
EL_Model: if cnt % 500 == 0 and to_print: print(datetime.datetime.now(), "processed", cnt, "files in the training dataset") cnt += 1 - if article_id not in doc_by_article: + if article_id not in text_by_article: with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file: text = file.read() - doc = self.nlp(text) - doc_by_article[article_id] = doc - instance_by_doc[article_id] = set() + text_by_article[article_id] = text + instance_by_article[article_id] = set() for mention, entity_pos in correct_entries[article_id].items(): descr = id_to_descr.get(entity_pos) if descr: - instance_by_doc[article_id].add(article_id + "_" + mention) - doc_descr = self.nlp(descr) - doc_descr._.entity_id = entity_pos - pos_entities[article_id + "_" + mention] = doc_descr + instance_by_article[article_id].add(article_id + "_" + mention) + pos_entities[article_id + "_" + mention] = descr for mention, entity_negs in incorrect_entries[article_id].items(): for entity_neg in entity_negs: descr = id_to_descr.get(entity_neg) if descr: - doc_descr = self.nlp(descr) - doc_descr._.entity_id = entity_neg descr_list = neg_entities.get(article_id + "_" + mention, []) - descr_list.append(doc_descr) + descr_list.append(descr) neg_entities[article_id + "_" + mention] = descr_list if to_print: print() print("Processed", cnt, "training articles, dev=" + str(dev)) print() - return instance_by_doc, pos_entities, neg_entities, doc_by_article + return instance_by_article, pos_entities, neg_entities, text_by_article diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 6f021597f..23c12bfe6 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: training", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=1000, devlimit=200) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=10, devlimit=10) print() # STEP 7: apply the EL algorithm on the dev dataset From 7b13e3d56fb2af2ba6f2ebdd9e26e1aa8f540dd5 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 21 May 2019 18:35:10 +0200 Subject: [PATCH 049/148] undersampling negatives --- .../pipeline/wiki_entity_linking/train_el.py | 20 +++++++++---------- .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index 2d218ed60..20a5e4428 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -56,20 +56,19 @@ class EL_Model: entity_descr_output, False, trainlimit, + balance=True, to_print=False) dev_inst, dev_pos, dev_neg, dev_texts = self._get_training_data(training_dir, entity_descr_output, True, devlimit, + balance=False, to_print=False) self._begin_training() print() - self._test_dev(train_inst, train_pos, train_neg, train_texts, print_string="train_random", calc_random=True) self._test_dev(dev_inst, dev_pos, dev_neg, dev_texts, print_string="dev_random", calc_random=True) - print() - self._test_dev(train_inst, train_pos, train_neg, train_texts, print_string="train_pre", avg=False) self._test_dev(dev_inst, dev_pos, dev_neg, dev_texts, 
print_string="dev_pre", avg=False) instance_pos_count = 0 @@ -120,9 +119,6 @@ class EL_Model: print() print("Trained on", instance_pos_count, "/", instance_neg_count, "instances pos/neg") - if self.PRINT_TRAIN: - self._test_dev(train_inst, train_pos, train_neg, train_texts, print_string="train_post_avg", avg=True) - def _test_dev(self, instances, pos, neg, texts_by_id, print_string, avg=False, calc_random=False): predictions = list() golds = list() @@ -290,7 +286,7 @@ class EL_Model: bp_doc([doc_gradient], sgd=self.sgd_article) bp_entity(entity_gradients, sgd=self.sgd_entity) - def _get_training_data(self, training_dir, entity_descr_output, dev, limit, to_print): + def _get_training_data(self, training_dir, entity_descr_output, dev, limit, balance, to_print): id_to_descr = kb_creator._get_id_to_description(entity_descr_output) correct_entries, incorrect_entries = training_set_creator.read_training_entities(training_output=training_dir, @@ -324,12 +320,16 @@ class EL_Model: pos_entities[article_id + "_" + mention] = descr for mention, entity_negs in incorrect_entries[article_id].items(): + neg_count = 0 for entity_neg in entity_negs: descr = id_to_descr.get(entity_neg) if descr: - descr_list = neg_entities.get(article_id + "_" + mention, []) - descr_list.append(descr) - neg_entities[article_id + "_" + mention] = descr_list + # if balance, keep only 1 negative instance for each positive instance + if neg_count < 1 or not balance: + descr_list = neg_entities.get(article_id + "_" + mention, []) + descr_list.append(descr) + neg_entities[article_id + "_" + mention] = descr_list + neg_count += 1 if to_print: print() diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 23c12bfe6..0927fb394 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: training", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=10, devlimit=10) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=500, devlimit=20) print() # STEP 7: apply the EL algorithm on the dev dataset From eb08bdb11feef7bd8ffaa31a7d30dab37e97d1d3 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 21 May 2019 23:42:46 +0200 Subject: [PATCH 050/148] hidden with for encoders --- .../pipeline/wiki_entity_linking/train_el.py | 44 +++++++++++-------- .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- 2 files changed, 26 insertions(+), 20 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index 20a5e4428..36fb9227a 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -33,9 +33,10 @@ class EL_Model: CUTOFF = 0.5 INPUT_DIM = 300 - ENTITY_WIDTH = 64 # 4 - ARTICLE_WIDTH = 128 # 8 - HIDDEN_WIDTH = 64 # 6 + HIDDEN_1_WIDTH = 256 # 10 + HIDDEN_2_WIDTH = 32 # 6 + ENTITY_WIDTH = 64 # 4 + ARTICLE_WIDTH = 128 # 8 DROP = 0.1 @@ -46,7 +47,11 @@ class EL_Model: self.nlp = nlp self.kb = kb - self._build_cnn(hidden_entity_width=self.ENTITY_WIDTH, hidden_article_width=self.ARTICLE_WIDTH) + self._build_cnn(in_width=self.INPUT_DIM, + entity_width=self.ENTITY_WIDTH, + article_width=self.ARTICLE_WIDTH, + 
hidden_1_width=self.HIDDEN_1_WIDTH, + hidden_2_width=self.HIDDEN_2_WIDTH) def train_model(self, training_dir, entity_descr_output, trainlimit=None, devlimit=None, to_print=True): # raise errors instead of runtime warnings in case of int/float overflow @@ -81,9 +86,10 @@ class EL_Model: print() print(" CUTOFF", self.CUTOFF) print(" INPUT_DIM", self.INPUT_DIM) + print(" HIDDEN_1_WIDTH", self.HIDDEN_1_WIDTH) print(" ENTITY_WIDTH", self.ENTITY_WIDTH) print(" ARTICLE_WIDTH", self.ARTICLE_WIDTH) - print(" HIDDEN_WIDTH", self.ARTICLE_WIDTH) + print(" HIDDEN_2_WIDTH", self.HIDDEN_2_WIDTH) print(" DROP", self.DROP) print() @@ -187,34 +193,34 @@ class EL_Model: else: return [float(1.0) if random.uniform(0,1) > self.CUTOFF else float(0.0) for e in entities] - def _build_cnn(self, hidden_entity_width, hidden_article_width): + def _build_cnn(self, in_width, entity_width, article_width, hidden_1_width, hidden_2_width): with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): - self.entity_encoder = self._encoder(in_width=self.INPUT_DIM, hidden_width=hidden_entity_width) - self.article_encoder = self._encoder(in_width=self.INPUT_DIM, hidden_width=hidden_article_width) + self.entity_encoder = self._encoder(in_width=in_width, hidden_with=hidden_1_width, end_width=entity_width) + self.article_encoder = self._encoder(in_width=in_width, hidden_with=hidden_1_width, end_width=article_width) - nr_i = hidden_entity_width + hidden_article_width - nr_o = self.HIDDEN_WIDTH + in_width = entity_width + article_width + out_width = hidden_2_width - self.model = Affine(nr_o, nr_i) \ - >> LN(Maxout(nr_o, nr_o)) \ - >> Affine(1, nr_o) \ + self.model = Affine(out_width, in_width) \ + >> LN(Maxout(out_width, out_width)) \ + >> Affine(1, out_width) \ >> logistic @staticmethod - def _encoder(in_width, hidden_width): + def _encoder(in_width, hidden_with, end_width): conv_depth = 2 cnn_maxout_pieces = 3 with Model.define_operators({">>": chain}): - convolution = Residual((ExtractWindow(nW=1) >> LN(Maxout(in_width, in_width * 3, pieces=cnn_maxout_pieces)))) + convolution = Residual((ExtractWindow(nW=1) >> LN(Maxout(hidden_with, hidden_with * 3, pieces=cnn_maxout_pieces)))) encoder = SpacyVectors \ - >> with_flatten(LN(Maxout(in_width, in_width)) >> convolution ** conv_depth, pad=conv_depth) \ + >> with_flatten(LN(Maxout(hidden_with, in_width)) >> convolution ** conv_depth, pad=conv_depth) \ >> flatten_add_lengths \ - >> ParametricAttention(in_width)\ + >> ParametricAttention(hidden_with)\ >> Pooling(mean_pool) \ - >> Residual(zero_init(Maxout(in_width, in_width))) \ - >> zero_init(Affine(hidden_width, in_width, drop_factor=0.0)) + >> Residual(zero_init(Maxout(hidden_with, hidden_with))) \ + >> zero_init(Affine(end_width, hidden_with, drop_factor=0.0)) # TODO: ReLu or LN(Maxout) ? # sum_pool or mean_pool ? 
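Aside, not part of the patch series: a minimal NumPy sketch of the gradient bookkeeping these
commits converge on, namely the squared-error signal returned by get_loss() and the split of the
gradient over the concatenated [entity | article] encoding back into per-encoder pieces, using the
entity-first concatenation order noted above. The widths and the toy batch are assumptions for
illustration only.

import numpy as np

ENTITY_WIDTH, ARTICLE_WIDTH = 64, 128          # widths used in these commits

# Toy batch: one gold candidate and two negatives for a single mention.
predictions = np.asarray([0.9, 0.2, 0.4], dtype=np.float32)
golds = np.asarray([1.0, 0.0, 0.0], dtype=np.float32)

d_scores = predictions - golds                 # backprop signal, as in get_loss()
loss = (d_scores ** 2).sum()                   # scalar squared-error loss (a later commit uses the mean)

# One gradient row per instance over the concatenated encoding; because the concatenation
# is entity + article, the first ENTITY_WIDTH columns belong to the entity encoder and the
# remaining ARTICLE_WIDTH columns to the article encoder.
model_gradient = np.zeros((len(golds), ENTITY_WIDTH + ARTICLE_WIDTH), dtype=np.float32)
entity_gradients = model_gradient[:, :ENTITY_WIDTH]
doc_gradient = model_gradient[0, ENTITY_WIDTH:]    # the article encoding is shared within the batch

Splitting by width like this is what lets bp_entity and bp_doc update the entity and article
encoders independently with their own optimizers (sgd_entity / sgd_article).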
diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 0927fb394..a3d6a69f9 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: training", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=500, devlimit=20) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=1000, devlimit=20) print() # STEP 7: apply the EL algorithm on the dev dataset From 1a16490d20185949d65831fc96064a4c1e1c97e8 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 22 May 2019 12:46:40 +0200 Subject: [PATCH 051/148] update per entity --- .../pipeline/wiki_entity_linking/train_el.py | 91 +++++++++---------- .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- 2 files changed, 45 insertions(+), 48 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index 36fb9227a..a383a3687 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -154,7 +154,7 @@ class EL_Model: if self.PRINT_F: print("p/r/F", print_string, round(p, 1), round(r, 1), round(f, 1)) - loss, d_scores = self.get_loss(self.model.ops.asarray(predictions), self.model.ops.asarray(golds)) + loss, gradient = self.get_loss(self.model.ops.asarray(predictions), self.model.ops.asarray(golds)) if self.PRINT_LOSS: print("loss", print_string, round(loss, 5)) @@ -235,62 +235,58 @@ class EL_Model: @staticmethod def get_loss(predictions, golds): d_scores = (predictions - golds) - - loss = (d_scores ** 2).sum() + loss = (d_scores ** 2).mean() return loss, d_scores # TODO: multiple docs/articles def update(self, article_text, entities, golds, apply_threshold=True): article_doc = self.nlp(article_text) - doc_encodings, bp_doc = self.article_encoder.begin_update([article_doc], drop=self.DROP) - doc_encoding = doc_encodings[0] + # entity_docs = list(self.nlp.pipe(entities)) - entity_docs = list(self.nlp.pipe(entities)) - # print("entity_docs", type(entity_docs)) + for entity, gold in zip(entities, golds): + doc_encodings, bp_doc = self.article_encoder.begin_update([article_doc], drop=self.DROP) + doc_encoding = doc_encodings[0] - entity_encodings, bp_entity = self.entity_encoder.begin_update(entity_docs, drop=self.DROP) - # print("entity_encodings", len(entity_encodings), entity_encodings) + entity_doc = self.nlp(entity) + # print("entity_docs", type(entity_doc)) - concat_encodings = [list(entity_encodings[i]) + list(doc_encoding) for i in range(len(entities))] - # print("concat_encodings", len(concat_encodings), concat_encodings) + entity_encodings, bp_entity = self.entity_encoder.begin_update([entity_doc], drop=self.DROP) + entity_encoding = entity_encodings[0] + # print("entity_encoding", len(entity_encoding), entity_encoding) - predictions, bp_model = self.model.begin_update(np.asarray(concat_encodings), drop=self.DROP) - predictions = self.model.ops.flatten(predictions) + concat_encodings = [list(entity_encoding) + list(doc_encoding)] # for i in range(len(entities)) + # print("concat_encodings", len(concat_encodings), concat_encodings) - # print("predictions", predictions) - golds = self.model.ops.asarray(golds) - # 
print("golds", golds) + prediction, bp_model = self.model.begin_update(np.asarray(concat_encodings), drop=self.DROP) + # predictions = self.model.ops.flatten(predictions) - loss, d_scores = self.get_loss(predictions, golds) + # print("prediction", prediction) + # golds = self.model.ops.asarray(golds) + # print("gold", gold) - if self.PRINT_LOSS and self.PRINT_TRAIN: - print("loss train", round(loss, 5)) + loss, gradient = self.get_loss(prediction, gold) - if self.PRINT_F and self.PRINT_TRAIN: - predictions_f = [x for x in predictions] - if apply_threshold: - predictions_f = [float(1.0) if x > self.CUTOFF else float(0.0) for x in predictions_f] - p, r, f = run_el.evaluate(predictions_f, golds, to_print=False) - print("p/r/F train", round(p, 1), round(r, 1), round(f, 1)) + if self.PRINT_LOSS and self.PRINT_TRAIN: + print("loss train", round(loss, 5)) - d_scores = d_scores.reshape((-1, 1)) - d_scores = d_scores.astype(np.float32) - # print("d_scores", d_scores) + gradient = float(gradient) + # print("gradient", gradient) + # print("loss", loss) - model_gradient = bp_model(d_scores, sgd=self.sgd) - # print("model_gradient", model_gradient) + model_gradient = bp_model(gradient, sgd=self.sgd) + # print("model_gradient", model_gradient) - # concat = entity + doc, but doc is the same within this function (TODO: multiple docs/articles) - doc_gradient = model_gradient[0][self.ENTITY_WIDTH:] - entity_gradients = list() - for x in model_gradient: - entity_gradients.append(list(x[0:self.ENTITY_WIDTH])) + # concat = entity + doc, but doc is the same within this function (TODO: multiple docs/articles) + doc_gradient = model_gradient[0][self.ENTITY_WIDTH:] + entity_gradients = list() + for x in model_gradient: + entity_gradients.append(list(x[0:self.ENTITY_WIDTH])) - # print("doc_gradient", doc_gradient) - # print("entity_gradients", entity_gradients) + # print("doc_gradient", doc_gradient) + # print("entity_gradients", entity_gradients) - bp_doc([doc_gradient], sgd=self.sgd_article) - bp_entity(entity_gradients, sgd=self.sgd_entity) + bp_doc([doc_gradient], sgd=self.sgd_article) + bp_entity(entity_gradients, sgd=self.sgd_entity) def _get_training_data(self, training_dir, entity_descr_output, dev, limit, balance, to_print): id_to_descr = kb_creator._get_id_to_description(entity_descr_output) @@ -326,16 +322,17 @@ class EL_Model: pos_entities[article_id + "_" + mention] = descr for mention, entity_negs in incorrect_entries[article_id].items(): - neg_count = 0 - for entity_neg in entity_negs: - descr = id_to_descr.get(entity_neg) - if descr: + if not balance or pos_entities.get(article_id + "_" + mention): + neg_count = 0 + for entity_neg in entity_negs: # if balance, keep only 1 negative instance for each positive instance if neg_count < 1 or not balance: - descr_list = neg_entities.get(article_id + "_" + mention, []) - descr_list.append(descr) - neg_entities[article_id + "_" + mention] = descr_list - neg_count += 1 + descr = id_to_descr.get(entity_neg) + if descr: + descr_list = neg_entities.get(article_id + "_" + mention, []) + descr_list.append(descr) + neg_entities[article_id + "_" + mention] = descr_list + neg_count += 1 if to_print: print() diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index a3d6a69f9..319b1e1c8 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: 
training", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=1000, devlimit=20) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=100, devlimit=20) print() # STEP 7: apply the EL algorithm on the dev dataset From 97241a3ed78d7fa41aaea3de30843ca49b0ae6d0 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 22 May 2019 23:40:10 +0200 Subject: [PATCH 052/148] upsampling and batch processing --- .../pipeline/wiki_entity_linking/run_el.py | 12 +- .../pipeline/wiki_entity_linking/train_el.py | 294 +++++++++--------- .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- 3 files changed, 157 insertions(+), 151 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/run_el.py b/examples/pipeline/wiki_entity_linking/run_el.py index 6ab7ea75f..273543306 100644 --- a/examples/pipeline/wiki_entity_linking/run_el.py +++ b/examples/pipeline/wiki_entity_linking/run_el.py @@ -78,8 +78,15 @@ def evaluate(predictions, golds, to_print=True): fp = 0 fn = 0 + corrects = 0 + incorrects = 0 + for pred, gold in zip(predictions, golds): is_correct = pred == gold + if is_correct: + corrects += 1 + else: + incorrects += 1 if not pred: if not is_correct: # we don't care about tn fn += 1 @@ -98,12 +105,15 @@ def evaluate(predictions, golds, to_print=True): recall = 100 * tp / (tp + fn + 0.0000001) fscore = 2 * recall * precision / (recall + precision + 0.0000001) + accuracy = corrects / (corrects + incorrects) + if to_print: print("precision", round(precision, 1), "%") print("recall", round(recall, 1), "%") print("Fscore", round(fscore, 1), "%") + print("Accuracy", round(accuracy, 1), "%") - return precision, recall, fscore + return precision, recall, fscore, accuracy def _prepare_pipeline(nlp, kb): diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index a383a3687..cd6e9de4d 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -6,6 +6,7 @@ import datetime from os import listdir import numpy as np import random +from random import shuffle from thinc.neural._classes.convolution import ExtractWindow from examples.pipeline.wiki_entity_linking import run_el, training_set_creator, kb_creator @@ -26,17 +27,17 @@ from spacy.tokens import Doc class EL_Model: - PRINT_LOSS = False - PRINT_F = True PRINT_TRAIN = False EPS = 0.0000000005 CUTOFF = 0.5 + BATCH_SIZE = 5 + INPUT_DIM = 300 - HIDDEN_1_WIDTH = 256 # 10 + HIDDEN_1_WIDTH = 32 # 10 HIDDEN_2_WIDTH = 32 # 6 - ENTITY_WIDTH = 64 # 4 - ARTICLE_WIDTH = 128 # 8 + DESC_WIDTH = 64 # 4 + ARTICLE_WIDTH = 64 # 8 DROP = 0.1 @@ -48,7 +49,7 @@ class EL_Model: self.kb = kb self._build_cnn(in_width=self.INPUT_DIM, - entity_width=self.ENTITY_WIDTH, + desc_width=self.DESC_WIDTH, article_width=self.ARTICLE_WIDTH, hidden_1_width=self.HIDDEN_1_WIDTH, hidden_2_width=self.HIDDEN_2_WIDTH) @@ -57,121 +58,118 @@ class EL_Model: # raise errors instead of runtime warnings in case of int/float overflow np.seterr(all='raise') - train_inst, train_pos, train_neg, train_texts = self._get_training_data(training_dir, - entity_descr_output, - False, - trainlimit, - balance=True, - to_print=False) + train_ent, train_gold, train_desc, train_article, train_texts = self._get_training_data(training_dir, + entity_descr_output, + False, + trainlimit, + to_print=False) + + train_pos_entities = 
[k for k,v in train_gold.items() if v] + train_neg_entities = [k for k,v in train_gold.items() if not v] + + train_pos_count = len(train_pos_entities) + train_neg_count = len(train_neg_entities) + + # upsample positives to 50-50 distribution + while train_pos_count < train_neg_count: + train_ent.append(random.choice(train_pos_entities)) + train_pos_count += 1 + + # upsample negatives to 50-50 distribution + while train_neg_count < train_pos_count: + train_ent.append(random.choice(train_neg_entities)) + train_neg_count += 1 + + shuffle(train_ent) + + dev_ent, dev_gold, dev_desc, dev_article, dev_texts = self._get_training_data(training_dir, + entity_descr_output, + True, + devlimit, + to_print=False) + shuffle(dev_ent) + + dev_pos_count = len([g for g in dev_gold.values() if g]) + dev_neg_count = len([g for g in dev_gold.values() if not g]) - dev_inst, dev_pos, dev_neg, dev_texts = self._get_training_data(training_dir, - entity_descr_output, - True, - devlimit, - balance=False, - to_print=False) self._begin_training() print() - self._test_dev(dev_inst, dev_pos, dev_neg, dev_texts, print_string="dev_random", calc_random=True) - self._test_dev(dev_inst, dev_pos, dev_neg, dev_texts, print_string="dev_pre", avg=False) - - instance_pos_count = 0 - instance_neg_count = 0 + self._test_dev(dev_ent, dev_gold, dev_desc, dev_article, dev_texts, print_string="dev_random", calc_random=True) + print() + self._test_dev(dev_ent, dev_gold, dev_desc, dev_article, dev_texts, print_string="dev_pre", avg=True) if to_print: print() - print("Training on", len(train_inst.values()), "articles") - print("Dev test on", len(dev_inst.values()), "articles") + print("Training on", len(train_ent), "entities in", len(train_texts), "articles") + print("Training instances pos/neg", train_pos_count, train_neg_count) + print() + print("Dev test on", len(dev_ent), "entities in", len(dev_texts), "articles") + print("Dev instances pos/neg", dev_pos_count, dev_neg_count) print() print(" CUTOFF", self.CUTOFF) print(" INPUT_DIM", self.INPUT_DIM) print(" HIDDEN_1_WIDTH", self.HIDDEN_1_WIDTH) - print(" ENTITY_WIDTH", self.ENTITY_WIDTH) + print(" DESC_WIDTH", self.DESC_WIDTH) print(" ARTICLE_WIDTH", self.ARTICLE_WIDTH) print(" HIDDEN_2_WIDTH", self.HIDDEN_2_WIDTH) print(" DROP", self.DROP) print() - # TODO: proper batches. 
Currently 1 article at the time - # TODO shuffle data (currently positive is always followed by several negatives) - article_count = 0 - for article_id, inst_cluster_set in train_inst.items(): - try: - # if to_print: - # print() - # print(article_count, "Training on article", article_id) - article_count += 1 - article_text = train_texts[article_id] - entities = list() - golds = list() - for inst_cluster in inst_cluster_set: - entities.append(train_pos.get(inst_cluster)) - golds.append(float(1.0)) - instance_pos_count += 1 - for neg_entity in train_neg.get(inst_cluster, []): - entities.append(neg_entity) - golds.append(float(0.0)) - instance_neg_count += 1 + start = 0 + stop = min(self.BATCH_SIZE, len(train_ent)) + processed = 0 - self.update(article_text=article_text, entities=entities, golds=golds) + while start < len(train_ent): + next_batch = train_ent[start:stop] - # dev eval - self._test_dev(dev_inst, dev_pos, dev_neg, dev_texts, print_string="dev_inter_avg", avg=True) - except ValueError as e: - print("Error in article id", article_id) + golds = [train_gold[e] for e in next_batch] + descs = [train_desc[e] for e in next_batch] + articles = [train_texts[train_article[e]] for e in next_batch] + + self.update(entities=next_batch, golds=golds, descs=descs, texts=articles) + self._test_dev(dev_ent, dev_gold, dev_desc, dev_article, dev_texts, print_string="dev_inter", avg=True) + + processed += len(next_batch) + + start = start + self.BATCH_SIZE + stop = min(stop + self.BATCH_SIZE, len(train_ent)) if to_print: print() - print("Trained on", instance_pos_count, "/", instance_neg_count, "instances pos/neg") + print("Trained on", processed, "entities in total") - def _test_dev(self, instances, pos, neg, texts_by_id, print_string, avg=False, calc_random=False): - predictions = list() - golds = list() + def _test_dev(self, entities, gold_by_entity, desc_by_entity, article_by_entity, texts_by_id, print_string, avg=True, calc_random=False): + golds = [gold_by_entity[e] for e in entities] - for article_id, inst_cluster_set in instances.items(): - for inst_cluster in inst_cluster_set: - pos_ex = pos.get(inst_cluster) - neg_exs = neg.get(inst_cluster, []) + if calc_random: + predictions = self._predict_random(entities=entities) - article = inst_cluster.split(sep="_")[0] - entity_id = inst_cluster.split(sep="_")[1] - article_doc = self.nlp(texts_by_id[article]) - entities = [self.nlp(pos_ex)] - golds.append(float(1.0)) - for neg_ex in neg_exs: - entities.append(self.nlp(neg_ex)) - golds.append(float(0.0)) - - if calc_random: - preds = self._predict_random(entities=entities) - else: - preds = self._predict(article_doc=article_doc, entities=entities, avg=avg) - predictions.extend(preds) + else: + desc_docs = self.nlp.pipe([desc_by_entity[e] for e in entities]) + article_docs = self.nlp.pipe([texts_by_id[article_by_entity[e]] for e in entities]) + predictions = self._predict(entities=entities, article_docs=article_docs, desc_docs=desc_docs, avg=avg) # TODO: combine with prior probability - p, r, f = run_el.evaluate(predictions, golds, to_print=False) - if self.PRINT_F: - print("p/r/F", print_string, round(p, 1), round(r, 1), round(f, 1)) - + p, r, f, acc = run_el.evaluate(predictions, golds, to_print=False) loss, gradient = self.get_loss(self.model.ops.asarray(predictions), self.model.ops.asarray(golds)) - if self.PRINT_LOSS: - print("loss", print_string, round(loss, 5)) + + print("p/r/F/acc/loss", print_string, round(p, 1), round(r, 1), round(f, 1), round(acc, 2), round(loss, 5)) return loss, p, r, f - 
def _predict(self, article_doc, entities, avg=False, apply_threshold=True): + def _predict(self, entities, article_docs, desc_docs, avg=True, apply_threshold=True): if avg: with self.article_encoder.use_params(self.sgd_article.averages) \ - and self.entity_encoder.use_params(self.sgd_entity.averages): - doc_encoding = self.article_encoder([article_doc])[0] - entity_encodings = self.entity_encoder(entities) + and self.desc_encoder.use_params(self.sgd_entity.averages): + doc_encodings = self.article_encoder(article_docs) + desc_encodings = self.desc_encoder(desc_docs) else: - doc_encoding = self.article_encoder([article_doc])[0] - entity_encodings = self.entity_encoder(entities) + doc_encodings = self.article_encoder(article_docs) + desc_encodings = self.desc_encoder(desc_docs) - concat_encodings = [list(entity_encodings[i]) + list(doc_encoding) for i in range(len(entities))] + concat_encodings = [list(desc_encodings[i]) + list(doc_encodings[i]) for i in range(len(entities))] np_array_list = np.asarray(concat_encodings) if avg: @@ -189,16 +187,16 @@ class EL_Model: def _predict_random(self, entities, apply_threshold=True): if not apply_threshold: - return [float(random.uniform(0,1)) for e in entities] + return [float(random.uniform(0, 1)) for e in entities] else: - return [float(1.0) if random.uniform(0,1) > self.CUTOFF else float(0.0) for e in entities] + return [float(1.0) if random.uniform(0, 1) > self.CUTOFF else float(0.0) for e in entities] - def _build_cnn(self, in_width, entity_width, article_width, hidden_1_width, hidden_2_width): + def _build_cnn(self, in_width, desc_width, article_width, hidden_1_width, hidden_2_width): with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): - self.entity_encoder = self._encoder(in_width=in_width, hidden_with=hidden_1_width, end_width=entity_width) + self.desc_encoder = self._encoder(in_width=in_width, hidden_with=hidden_1_width, end_width=desc_width) self.article_encoder = self._encoder(in_width=in_width, hidden_with=hidden_1_width, end_width=article_width) - in_width = entity_width + article_width + in_width = desc_width + article_width out_width = hidden_2_width self.model = Affine(out_width, in_width) \ @@ -229,80 +227,78 @@ class EL_Model: def _begin_training(self): self.sgd_article = create_default_optimizer(self.article_encoder.ops) - self.sgd_entity = create_default_optimizer(self.entity_encoder.ops) + self.sgd_entity = create_default_optimizer(self.desc_encoder.ops) self.sgd = create_default_optimizer(self.model.ops) @staticmethod def get_loss(predictions, golds): d_scores = (predictions - golds) + gradient = d_scores.mean() loss = (d_scores ** 2).mean() - return loss, d_scores + return loss, gradient - # TODO: multiple docs/articles - def update(self, article_text, entities, golds, apply_threshold=True): - article_doc = self.nlp(article_text) - # entity_docs = list(self.nlp.pipe(entities)) + def update(self, entities, golds, descs, texts): + golds = self.model.ops.asarray(golds) - for entity, gold in zip(entities, golds): - doc_encodings, bp_doc = self.article_encoder.begin_update([article_doc], drop=self.DROP) - doc_encoding = doc_encodings[0] + desc_docs = self.nlp.pipe(descs) + article_docs = self.nlp.pipe(texts) - entity_doc = self.nlp(entity) - # print("entity_docs", type(entity_doc)) + doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=self.DROP) - entity_encodings, bp_entity = self.entity_encoder.begin_update([entity_doc], drop=self.DROP) - entity_encoding = entity_encodings[0] - # 
print("entity_encoding", len(entity_encoding), entity_encoding) + desc_encodings, bp_entity = self.desc_encoder.begin_update(desc_docs, drop=self.DROP) - concat_encodings = [list(entity_encoding) + list(doc_encoding)] # for i in range(len(entities)) - # print("concat_encodings", len(concat_encodings), concat_encodings) + concat_encodings = [list(desc_encodings[i]) + list(doc_encodings[i]) for i in range(len(entities))] - prediction, bp_model = self.model.begin_update(np.asarray(concat_encodings), drop=self.DROP) - # predictions = self.model.ops.flatten(predictions) + predictions, bp_model = self.model.begin_update(np.asarray(concat_encodings), drop=self.DROP) + predictions = self.model.ops.flatten(predictions) - # print("prediction", prediction) - # golds = self.model.ops.asarray(golds) - # print("gold", gold) + # print("entities", entities) + # print("predictions", predictions) + # print("golds", golds) - loss, gradient = self.get_loss(prediction, gold) + loss, gradient = self.get_loss(predictions, golds) - if self.PRINT_LOSS and self.PRINT_TRAIN: - print("loss train", round(loss, 5)) + if self.PRINT_TRAIN: + print("loss train", round(loss, 5)) - gradient = float(gradient) - # print("gradient", gradient) - # print("loss", loss) + gradient = float(gradient) + # print("gradient", gradient) + # print("loss", loss) - model_gradient = bp_model(gradient, sgd=self.sgd) - # print("model_gradient", model_gradient) + model_gradient = bp_model(gradient, sgd=self.sgd) + # print("model_gradient", model_gradient) - # concat = entity + doc, but doc is the same within this function (TODO: multiple docs/articles) - doc_gradient = model_gradient[0][self.ENTITY_WIDTH:] - entity_gradients = list() - for x in model_gradient: - entity_gradients.append(list(x[0:self.ENTITY_WIDTH])) + # concat = desc + doc, but doc is the same within this function (TODO: multiple docs/articles) + doc_gradient = model_gradient[0][self.DESC_WIDTH:] + entity_gradients = list() + for x in model_gradient: + entity_gradients.append(list(x[0:self.DESC_WIDTH])) - # print("doc_gradient", doc_gradient) - # print("entity_gradients", entity_gradients) + # print("doc_gradient", doc_gradient) + # print("entity_gradients", entity_gradients) - bp_doc([doc_gradient], sgd=self.sgd_article) - bp_entity(entity_gradients, sgd=self.sgd_entity) + bp_doc([doc_gradient], sgd=self.sgd_article) + bp_entity(entity_gradients, sgd=self.sgd_entity) - def _get_training_data(self, training_dir, entity_descr_output, dev, limit, balance, to_print): + def _get_training_data(self, training_dir, entity_descr_output, dev, limit, to_print): id_to_descr = kb_creator._get_id_to_description(entity_descr_output) correct_entries, incorrect_entries = training_set_creator.read_training_entities(training_output=training_dir, collect_correct=True, collect_incorrect=True) - instance_by_article = dict() local_vectors = list() # TODO: local vectors text_by_article = dict() - pos_entities = dict() - neg_entities = dict() + gold_by_entity = dict() + desc_by_entity = dict() + article_by_entity = dict() + entities = list() cnt = 0 - for f in listdir(training_dir): + next_entity_nr = 0 + files = listdir(training_dir) + shuffle(files) + for f in files: if not limit or cnt < limit: if dev == run_el.is_dev(f): article_id = f.replace(".txt", "") @@ -313,29 +309,29 @@ class EL_Model: with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file: text = file.read() text_by_article[article_id] = text - instance_by_article[article_id] = set() for mention, entity_pos in 
correct_entries[article_id].items(): descr = id_to_descr.get(entity_pos) if descr: - instance_by_article[article_id].add(article_id + "_" + mention) - pos_entities[article_id + "_" + mention] = descr + entities.append(next_entity_nr) + gold_by_entity[next_entity_nr] = 1 + desc_by_entity[next_entity_nr] = descr + article_by_entity[next_entity_nr] = article_id + next_entity_nr += 1 for mention, entity_negs in incorrect_entries[article_id].items(): - if not balance or pos_entities.get(article_id + "_" + mention): - neg_count = 0 - for entity_neg in entity_negs: - # if balance, keep only 1 negative instance for each positive instance - if neg_count < 1 or not balance: - descr = id_to_descr.get(entity_neg) - if descr: - descr_list = neg_entities.get(article_id + "_" + mention, []) - descr_list.append(descr) - neg_entities[article_id + "_" + mention] = descr_list - neg_count += 1 + for entity_neg in entity_negs: + descr = id_to_descr.get(entity_neg) + if descr: + entities.append(next_entity_nr) + gold_by_entity[next_entity_nr] = 0 + desc_by_entity[next_entity_nr] = descr + article_by_entity[next_entity_nr] = article_id + next_entity_nr += 1 if to_print: print() print("Processed", cnt, "training articles, dev=" + str(dev)) print() - return instance_by_article, pos_entities, neg_entities, text_by_article + return entities, gold_by_entity, desc_by_entity, article_by_entity, text_by_article + diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 319b1e1c8..715282642 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: training", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=100, devlimit=20) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=400, devlimit=50) print() # STEP 7: apply the EL algorithm on the dev dataset From 4392c01b7bfb22e435249128ac15c196c5b50bd1 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 23 May 2019 15:37:05 +0200 Subject: [PATCH 053/148] obtain sentence for each mention --- .../pipeline/wiki_entity_linking/run_el.py | 9 +- .../pipeline/wiki_entity_linking/train_el.py | 144 +++++++++++++----- .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- 3 files changed, 112 insertions(+), 43 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/run_el.py b/examples/pipeline/wiki_entity_linking/run_el.py index 273543306..c0c219829 100644 --- a/examples/pipeline/wiki_entity_linking/run_el.py +++ b/examples/pipeline/wiki_entity_linking/run_el.py @@ -70,7 +70,7 @@ def is_dev(file_name): return file_name.endswith("3.txt") -def evaluate(predictions, golds, to_print=True): +def evaluate(predictions, golds, to_print=True, times_hundred=True): if len(predictions) != len(golds): raise ValueError("predictions and gold entities should have the same length") @@ -101,8 +101,11 @@ def evaluate(predictions, golds, to_print=True): print("fp", fp) print("fn", fn) - precision = 100 * tp / (tp + fp + 0.0000001) - recall = 100 * tp / (tp + fn + 0.0000001) + precision = tp / (tp + fp + 0.0000001) + recall = tp / (tp + fn + 0.0000001) + if times_hundred: + precision = precision*100 + recall = recall*100 fscore = 2 * recall * precision / (recall + precision + 0.0000001) accuracy 
= corrects / (corrects + incorrects) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index cd6e9de4d..d8082635a 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -20,6 +20,7 @@ from thinc.t2t import ParametricAttention from thinc.misc import Residual from thinc.misc import LayerNorm as LN +from spacy.matcher import PhraseMatcher from spacy.tokens import Doc """ TODO: this code needs to be implemented in pipes.pyx""" @@ -27,13 +28,16 @@ from spacy.tokens import Doc class EL_Model: + PRINT_INSPECT = False PRINT_TRAIN = False EPS = 0.0000000005 CUTOFF = 0.5 BATCH_SIZE = 5 - INPUT_DIM = 300 + DOC_CUTOFF = 300 # number of characters from the doc context + INPUT_DIM = 300 # dimension of pre-trained vectors + HIDDEN_1_WIDTH = 32 # 10 HIDDEN_2_WIDTH = 32 # 6 DESC_WIDTH = 64 # 4 @@ -58,11 +62,20 @@ class EL_Model: # raise errors instead of runtime warnings in case of int/float overflow np.seterr(all='raise') - train_ent, train_gold, train_desc, train_article, train_texts = self._get_training_data(training_dir, - entity_descr_output, - False, - trainlimit, - to_print=False) + train_ent, train_gold, train_desc, train_art, train_art_texts, train_sent, train_sent_texts = \ + self._get_training_data(training_dir, entity_descr_output, False, trainlimit, to_print=False) + + # inspect data + if self.PRINT_INSPECT: + for entity in train_ent: + print("entity", entity) + print("gold", train_gold[entity]) + print("desc", train_desc[entity]) + print("sentence ID", train_sent[entity]) + print("sentence text", train_sent_texts[train_sent[entity]]) + print("article ID", train_art[entity]) + print("article text", train_art_texts[train_art[entity]]) + print() train_pos_entities = [k for k,v in train_gold.items() if v] train_neg_entities = [k for k,v in train_gold.items() if not v] @@ -70,6 +83,10 @@ class EL_Model: train_pos_count = len(train_pos_entities) train_neg_count = len(train_neg_entities) + if to_print: + print() + print("Upsampling, original training instances pos/neg:", train_pos_count, train_neg_count) + # upsample positives to 50-50 distribution while train_pos_count < train_neg_count: train_ent.append(random.choice(train_pos_entities)) @@ -82,11 +99,8 @@ class EL_Model: shuffle(train_ent) - dev_ent, dev_gold, dev_desc, dev_article, dev_texts = self._get_training_data(training_dir, - entity_descr_output, - True, - devlimit, - to_print=False) + dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts = \ + self._get_training_data(training_dir, entity_descr_output, True, devlimit, to_print=False) shuffle(dev_ent) dev_pos_count = len([g for g in dev_gold.values() if g]) @@ -94,20 +108,16 @@ class EL_Model: self._begin_training() - print() - self._test_dev(dev_ent, dev_gold, dev_desc, dev_article, dev_texts, print_string="dev_random", calc_random=True) - print() - self._test_dev(dev_ent, dev_gold, dev_desc, dev_article, dev_texts, print_string="dev_pre", avg=True) - if to_print: print() - print("Training on", len(train_ent), "entities in", len(train_texts), "articles") - print("Training instances pos/neg", train_pos_count, train_neg_count) + print("Training on", len(train_ent), "entities in", len(train_art_texts), "articles") + print("Training instances pos/neg:", train_pos_count, train_neg_count) print() - print("Dev test on", len(dev_ent), "entities in", len(dev_texts), "articles") - print("Dev instances pos/neg", dev_pos_count, dev_neg_count) + 
print("Dev test on", len(dev_ent), "entities in", len(dev_art_texts), "articles") + print("Dev instances pos/neg:", dev_pos_count, dev_neg_count) print() print(" CUTOFF", self.CUTOFF) + print(" DOC_CUTOFF", self.DOC_CUTOFF) print(" INPUT_DIM", self.INPUT_DIM) print(" HIDDEN_1_WIDTH", self.HIDDEN_1_WIDTH) print(" DESC_WIDTH", self.DESC_WIDTH) @@ -116,6 +126,10 @@ class EL_Model: print(" DROP", self.DROP) print() + self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, print_string="dev_random", calc_random=True) + self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, print_string="dev_pre", avg=True) + print() + start = 0 stop = min(self.BATCH_SIZE, len(train_ent)) processed = 0 @@ -125,10 +139,10 @@ class EL_Model: golds = [train_gold[e] for e in next_batch] descs = [train_desc[e] for e in next_batch] - articles = [train_texts[train_article[e]] for e in next_batch] + articles = [train_art_texts[train_art[e]] for e in next_batch] self.update(entities=next_batch, golds=golds, descs=descs, texts=articles) - self._test_dev(dev_ent, dev_gold, dev_desc, dev_article, dev_texts, print_string="dev_inter", avg=True) + self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, print_string="dev_inter", avg=True) processed += len(next_batch) @@ -151,7 +165,7 @@ class EL_Model: predictions = self._predict(entities=entities, article_docs=article_docs, desc_docs=desc_docs, avg=avg) # TODO: combine with prior probability - p, r, f, acc = run_el.evaluate(predictions, golds, to_print=False) + p, r, f, acc = run_el.evaluate(predictions, golds, to_print=False, times_hundred=False) loss, gradient = self.get_loss(self.model.ops.asarray(predictions), self.model.ops.asarray(golds)) print("p/r/F/acc/loss", print_string, round(p, 1), round(r, 1), round(f, 1), round(acc, 2), round(loss, 5)) @@ -288,14 +302,18 @@ class EL_Model: collect_incorrect=True) local_vectors = list() # TODO: local vectors - text_by_article = dict() + + entities = set() gold_by_entity = dict() desc_by_entity = dict() article_by_entity = dict() - entities = list() + text_by_article = dict() + sentence_by_entity = dict() + text_by_sentence = dict() cnt = 0 - next_entity_nr = 0 + next_entity_nr = 1 + next_sent_nr = 1 files = listdir(training_dir) shuffle(files) for f in files: @@ -305,33 +323,81 @@ class EL_Model: if cnt % 500 == 0 and to_print: print(datetime.datetime.now(), "processed", cnt, "files in the training dataset") cnt += 1 - if article_id not in text_by_article: - with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file: - text = file.read() - text_by_article[article_id] = text + + # parse the article text + with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file: + text = file.read() + article_doc = self.nlp(text) + truncated_text = text[0:min(self.DOC_CUTOFF, len(text))] + text_by_article[article_id] = truncated_text + + # process all positive and negative entities, collect all relevant mentions in this article + article_terms = set() + entities_by_mention = dict() for mention, entity_pos in correct_entries[article_id].items(): descr = id_to_descr.get(entity_pos) if descr: - entities.append(next_entity_nr) - gold_by_entity[next_entity_nr] = 1 - desc_by_entity[next_entity_nr] = descr - article_by_entity[next_entity_nr] = article_id + entity = "E_" + str(next_entity_nr) + "_" + article_id + "_" + mention next_entity_nr += 1 + gold_by_entity[entity] = 1 + desc_by_entity[entity] = descr + article_terms.add(mention) + mention_entities = 
entities_by_mention.get(mention, set()) + mention_entities.add(entity) + entities_by_mention[mention] = mention_entities for mention, entity_negs in incorrect_entries[article_id].items(): for entity_neg in entity_negs: descr = id_to_descr.get(entity_neg) if descr: - entities.append(next_entity_nr) - gold_by_entity[next_entity_nr] = 0 - desc_by_entity[next_entity_nr] = descr - article_by_entity[next_entity_nr] = article_id + entity = "E_" + str(next_entity_nr) + "_" + article_id + "_" + mention next_entity_nr += 1 + gold_by_entity[entity] = 0 + desc_by_entity[entity] = descr + article_terms.add(mention) + mention_entities = entities_by_mention.get(mention, set()) + mention_entities.add(entity) + entities_by_mention[mention] = mention_entities + + # find all matches in the doc for the mentions + # TODO: fix this - doesn't look like all entities are found + matcher = PhraseMatcher(self.nlp.vocab) + patterns = list(self.nlp.tokenizer.pipe(article_terms)) + + matcher.add("TerminologyList", None, *patterns) + matches = matcher(article_doc) + + # store sentences + sentence_to_id = dict() + for match_id, start, end in matches: + span = article_doc[start:end] + sent_text = span.sent + sent_nr = sentence_to_id.get(sent_text, None) + if sent_nr is None: + sent_nr = "S_" + str(next_sent_nr) + article_id + next_sent_nr += 1 + text_by_sentence[sent_nr] = sent_text + sentence_to_id[sent_text] = sent_nr + mention_entities = entities_by_mention[span.text] + for entity in mention_entities: + entities.add(entity) + sentence_by_entity[entity] = sent_nr + article_by_entity[entity] = article_id + + # remove entities that didn't have all data + gold_by_entity = {k: v for k, v in gold_by_entity.items() if k in entities} + desc_by_entity = {k: v for k, v in desc_by_entity.items() if k in entities} + + article_by_entity = {k: v for k, v in article_by_entity.items() if k in entities} + text_by_article = {k: v for k, v in text_by_article.items() if k in article_by_entity.values()} + + sentence_by_entity = {k: v for k, v in sentence_by_entity.items() if k in entities} + text_by_sentence = {k: v for k, v in text_by_sentence.items() if k in sentence_by_entity.values()} if to_print: print() print("Processed", cnt, "training articles, dev=" + str(dev)) print() - return entities, gold_by_entity, desc_by_entity, article_by_entity, text_by_article + return list(entities), gold_by_entity, desc_by_entity, article_by_entity, text_by_article, sentence_by_entity, text_by_sentence diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 715282642..319b1e1c8 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: training", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=400, devlimit=50) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=100, devlimit=20) print() # STEP 7: apply the EL algorithm on the dev dataset From 86ed771e0ba83cea12be3f241d911bccd8a9afa1 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 23 May 2019 16:59:11 +0200 Subject: [PATCH 054/148] adding local sentence encoder --- .../pipeline/wiki_entity_linking/train_el.py | 99 ++++++++++++------- .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- 2 files 
changed, 63 insertions(+), 38 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index d8082635a..63f8885cc 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -42,6 +42,7 @@ class EL_Model: HIDDEN_2_WIDTH = 32 # 6 DESC_WIDTH = 64 # 4 ARTICLE_WIDTH = 64 # 8 + SENT_WIDTH = 64 DROP = 0.1 @@ -55,6 +56,7 @@ class EL_Model: self._build_cnn(in_width=self.INPUT_DIM, desc_width=self.DESC_WIDTH, article_width=self.ARTICLE_WIDTH, + sent_width=self.SENT_WIDTH, hidden_1_width=self.HIDDEN_1_WIDTH, hidden_2_width=self.HIDDEN_2_WIDTH) @@ -77,8 +79,8 @@ class EL_Model: print("article text", train_art_texts[train_art[entity]]) print() - train_pos_entities = [k for k,v in train_gold.items() if v] - train_neg_entities = [k for k,v in train_gold.items() if not v] + train_pos_entities = [k for k, v in train_gold.items() if v] + train_neg_entities = [k for k, v in train_gold.items() if not v] train_pos_count = len(train_pos_entities) train_neg_count = len(train_neg_entities) @@ -122,12 +124,15 @@ class EL_Model: print(" HIDDEN_1_WIDTH", self.HIDDEN_1_WIDTH) print(" DESC_WIDTH", self.DESC_WIDTH) print(" ARTICLE_WIDTH", self.ARTICLE_WIDTH) + print(" SENT_WIDTH", self.SENT_WIDTH) print(" HIDDEN_2_WIDTH", self.HIDDEN_2_WIDTH) print(" DROP", self.DROP) print() - self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, print_string="dev_random", calc_random=True) - self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, print_string="dev_pre", avg=True) + self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts, + print_string="dev_random", calc_random=True) + self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts, + print_string="dev_pre", avg=True) print() start = 0 @@ -139,10 +144,12 @@ class EL_Model: golds = [train_gold[e] for e in next_batch] descs = [train_desc[e] for e in next_batch] - articles = [train_art_texts[train_art[e]] for e in next_batch] + article_texts = [train_art_texts[train_art[e]] for e in next_batch] + sent_texts = [train_sent_texts[train_sent[e]] for e in next_batch] - self.update(entities=next_batch, golds=golds, descs=descs, texts=articles) - self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, print_string="dev_inter", avg=True) + self.update(entities=next_batch, golds=golds, descs=descs, art_texts=article_texts, sent_texts=sent_texts) + self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts, + print_string="dev_inter", avg=True) processed += len(next_batch) @@ -153,7 +160,8 @@ class EL_Model: print() print("Trained on", processed, "entities in total") - def _test_dev(self, entities, gold_by_entity, desc_by_entity, article_by_entity, texts_by_id, print_string, avg=True, calc_random=False): + def _test_dev(self, entities, gold_by_entity, desc_by_entity, art_by_entity, art_texts, sent_by_entity, sent_texts, + print_string, avg=True, calc_random=False): golds = [gold_by_entity[e] for e in entities] if calc_random: @@ -161,29 +169,35 @@ class EL_Model: else: desc_docs = self.nlp.pipe([desc_by_entity[e] for e in entities]) - article_docs = self.nlp.pipe([texts_by_id[article_by_entity[e]] for e in entities]) - predictions = self._predict(entities=entities, article_docs=article_docs, desc_docs=desc_docs, avg=avg) + article_docs = self.nlp.pipe([art_texts[art_by_entity[e]] for e in entities]) + sent_docs = 
self.nlp.pipe([sent_texts[sent_by_entity[e]] for e in entities]) + predictions = self._predict(entities=entities, article_docs=article_docs, sent_docs=sent_docs, + desc_docs=desc_docs, avg=avg) # TODO: combine with prior probability p, r, f, acc = run_el.evaluate(predictions, golds, to_print=False, times_hundred=False) loss, gradient = self.get_loss(self.model.ops.asarray(predictions), self.model.ops.asarray(golds)) - print("p/r/F/acc/loss", print_string, round(p, 1), round(r, 1), round(f, 1), round(acc, 2), round(loss, 5)) + print("p/r/F/acc/loss", print_string, round(p, 2), round(r, 2), round(f, 2), round(acc, 2), round(loss, 2)) return loss, p, r, f - def _predict(self, entities, article_docs, desc_docs, avg=True, apply_threshold=True): + def _predict(self, entities, article_docs, sent_docs, desc_docs, avg=True, apply_threshold=True): if avg: with self.article_encoder.use_params(self.sgd_article.averages) \ - and self.desc_encoder.use_params(self.sgd_entity.averages): + and self.desc_encoder.use_params(self.sgd_desc.averages): doc_encodings = self.article_encoder(article_docs) desc_encodings = self.desc_encoder(desc_docs) + sent_encodings = self.sent_encoder(sent_docs) else: doc_encodings = self.article_encoder(article_docs) desc_encodings = self.desc_encoder(desc_docs) + sent_encodings = self.sent_encoder(sent_docs) + + concat_encodings = [list(doc_encodings[i]) + list(sent_encodings[i]) + list(desc_encodings[i]) for i in + range(len(entities))] - concat_encodings = [list(desc_encodings[i]) + list(doc_encodings[i]) for i in range(len(entities))] np_array_list = np.asarray(concat_encodings) if avg: @@ -201,16 +215,17 @@ class EL_Model: def _predict_random(self, entities, apply_threshold=True): if not apply_threshold: - return [float(random.uniform(0, 1)) for e in entities] + return [float(random.uniform(0, 1)) for _ in entities] else: - return [float(1.0) if random.uniform(0, 1) > self.CUTOFF else float(0.0) for e in entities] + return [float(1.0) if random.uniform(0, 1) > self.CUTOFF else float(0.0) for _ in entities] - def _build_cnn(self, in_width, desc_width, article_width, hidden_1_width, hidden_2_width): + def _build_cnn(self, in_width, desc_width, article_width, sent_width, hidden_1_width, hidden_2_width): with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): self.desc_encoder = self._encoder(in_width=in_width, hidden_with=hidden_1_width, end_width=desc_width) self.article_encoder = self._encoder(in_width=in_width, hidden_with=hidden_1_width, end_width=article_width) + self.sent_encoder = self._encoder(in_width=in_width, hidden_with=hidden_1_width, end_width=sent_width) - in_width = desc_width + article_width + in_width = article_width + sent_width + desc_width out_width = hidden_2_width self.model = Affine(out_width, in_width) \ @@ -224,7 +239,8 @@ class EL_Model: cnn_maxout_pieces = 3 with Model.define_operators({">>": chain}): - convolution = Residual((ExtractWindow(nW=1) >> LN(Maxout(hidden_with, hidden_with * 3, pieces=cnn_maxout_pieces)))) + convolution = Residual((ExtractWindow(nW=1) >> + LN(Maxout(hidden_with, hidden_with * 3, pieces=cnn_maxout_pieces)))) encoder = SpacyVectors \ >> with_flatten(LN(Maxout(hidden_with, in_width)) >> convolution ** conv_depth, pad=conv_depth) \ @@ -241,7 +257,8 @@ class EL_Model: def _begin_training(self): self.sgd_article = create_default_optimizer(self.article_encoder.ops) - self.sgd_entity = create_default_optimizer(self.desc_encoder.ops) + self.sgd_sent = create_default_optimizer(self.sent_encoder.ops) + 
self.sgd_desc = create_default_optimizer(self.desc_encoder.ops) self.sgd = create_default_optimizer(self.model.ops) @staticmethod @@ -251,17 +268,19 @@ class EL_Model: loss = (d_scores ** 2).mean() return loss, gradient - def update(self, entities, golds, descs, texts): + def update(self, entities, golds, descs, art_texts, sent_texts): golds = self.model.ops.asarray(golds) + art_docs = self.nlp.pipe(art_texts) + sent_docs = self.nlp.pipe(sent_texts) desc_docs = self.nlp.pipe(descs) - article_docs = self.nlp.pipe(texts) - doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=self.DROP) + doc_encodings, bp_doc = self.article_encoder.begin_update(art_docs, drop=self.DROP) + sent_encodings, bp_sent = self.sent_encoder.begin_update(sent_docs, drop=self.DROP) + desc_encodings, bp_desc = self.desc_encoder.begin_update(desc_docs, drop=self.DROP) - desc_encodings, bp_entity = self.desc_encoder.begin_update(desc_docs, drop=self.DROP) - - concat_encodings = [list(desc_encodings[i]) + list(doc_encodings[i]) for i in range(len(entities))] + concat_encodings = [list(doc_encodings[i]) + list(sent_encodings[i]) + list(desc_encodings[i]) + for i in range(len(entities))] predictions, bp_model = self.model.begin_update(np.asarray(concat_encodings), drop=self.DROP) predictions = self.model.ops.flatten(predictions) @@ -282,17 +301,23 @@ class EL_Model: model_gradient = bp_model(gradient, sgd=self.sgd) # print("model_gradient", model_gradient) - # concat = desc + doc, but doc is the same within this function (TODO: multiple docs/articles) - doc_gradient = model_gradient[0][self.DESC_WIDTH:] - entity_gradients = list() + # concat = doc + sent + desc, but doc is the same within this function + sent_start = self.ARTICLE_WIDTH + desc_start = self.ARTICLE_WIDTH + self.SENT_WIDTH + doc_gradient = model_gradient[0][0:sent_start] + sent_gradients = list() + desc_gradients = list() for x in model_gradient: - entity_gradients.append(list(x[0:self.DESC_WIDTH])) + sent_gradients.append(list(x[sent_start:desc_start])) + desc_gradients.append(list(x[desc_start:])) # print("doc_gradient", doc_gradient) - # print("entity_gradients", entity_gradients) + # print("sent_gradients", sent_gradients) + # print("desc_gradients", desc_gradients) bp_doc([doc_gradient], sgd=self.sgd_article) - bp_entity(entity_gradients, sgd=self.sgd_entity) + bp_sent(sent_gradients, sgd=self.sgd_sent) + bp_desc(desc_gradients, sgd=self.sgd_desc) def _get_training_data(self, training_dir, entity_descr_output, dev, limit, to_print): id_to_descr = kb_creator._get_id_to_description(entity_descr_output) @@ -301,8 +326,6 @@ class EL_Model: collect_correct=True, collect_incorrect=True) - local_vectors = list() # TODO: local vectors - entities = set() gold_by_entity = dict() desc_by_entity = dict() @@ -372,14 +395,15 @@ class EL_Model: sentence_to_id = dict() for match_id, start, end in matches: span = article_doc[start:end] - sent_text = span.sent + sent_text = span.sent.text sent_nr = sentence_to_id.get(sent_text, None) + mention = span.text if sent_nr is None: sent_nr = "S_" + str(next_sent_nr) + article_id next_sent_nr += 1 text_by_sentence[sent_nr] = sent_text sentence_to_id[sent_text] = sent_nr - mention_entities = entities_by_mention[span.text] + mention_entities = entities_by_mention[mention] for entity in mention_entities: entities.add(entity) sentence_by_entity[entity] = sent_nr @@ -399,5 +423,6 @@ class EL_Model: print() print("Processed", cnt, "training articles, dev=" + str(dev)) print() - return list(entities), 
gold_by_entity, desc_by_entity, article_by_entity, text_by_article, sentence_by_entity, text_by_sentence + return list(entities), gold_by_entity, desc_by_entity, article_by_entity, text_by_article, \ + sentence_by_entity, text_by_sentence diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 319b1e1c8..ec1f66d81 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: training", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=100, devlimit=20) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=1000, devlimit=50) print() # STEP 7: apply the EL algorithm on the dev dataset From abf9af81c93e94ed1bbcc4f295d1184e57312fbe Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 24 May 2019 22:04:25 +0200 Subject: [PATCH 055/148] learn rate en epochs --- .../pipeline/wiki_entity_linking/train_el.py | 85 ++++++++++--------- .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- 2 files changed, 48 insertions(+), 39 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index 63f8885cc..efad36362 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -34,6 +34,7 @@ class EL_Model: CUTOFF = 0.5 BATCH_SIZE = 5 + UPSAMPLE = True DOC_CUTOFF = 300 # number of characters from the doc context INPUT_DIM = 300 # dimension of pre-trained vectors @@ -45,6 +46,8 @@ class EL_Model: SENT_WIDTH = 64 DROP = 0.1 + LEARN_RATE = 0.01 + EPOCHS = 10 name = "entity_linker" @@ -67,6 +70,12 @@ class EL_Model: train_ent, train_gold, train_desc, train_art, train_art_texts, train_sent, train_sent_texts = \ self._get_training_data(training_dir, entity_descr_output, False, trainlimit, to_print=False) + dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts = \ + self._get_training_data(training_dir, entity_descr_output, True, devlimit, to_print=False) + + dev_pos_count = len([g for g in dev_gold.values() if g]) + dev_neg_count = len([g for g in dev_gold.values() if not g]) + # inspect data if self.PRINT_INSPECT: for entity in train_ent: @@ -85,28 +94,20 @@ class EL_Model: train_pos_count = len(train_pos_entities) train_neg_count = len(train_neg_entities) - if to_print: - print() - print("Upsampling, original training instances pos/neg:", train_pos_count, train_neg_count) + if self.UPSAMPLE: + if to_print: + print() + print("Upsampling, original training instances pos/neg:", train_pos_count, train_neg_count) - # upsample positives to 50-50 distribution - while train_pos_count < train_neg_count: - train_ent.append(random.choice(train_pos_entities)) - train_pos_count += 1 + # upsample positives to 50-50 distribution + while train_pos_count < train_neg_count: + train_ent.append(random.choice(train_pos_entities)) + train_pos_count += 1 - # upsample negatives to 50-50 distribution - while train_neg_count < train_pos_count: - train_ent.append(random.choice(train_neg_entities)) - train_neg_count += 1 - - shuffle(train_ent) - - dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts = \ - self._get_training_data(training_dir, entity_descr_output, 
True, devlimit, to_print=False) - shuffle(dev_ent) - - dev_pos_count = len([g for g in dev_gold.values() if g]) - dev_neg_count = len([g for g in dev_gold.values() if not g]) + # upsample negatives to 50-50 distribution + while train_neg_count < train_pos_count: + train_ent.append(random.choice(train_neg_entities)) + train_neg_count += 1 self._begin_training() @@ -135,30 +136,34 @@ class EL_Model: print_string="dev_pre", avg=True) print() - start = 0 - stop = min(self.BATCH_SIZE, len(train_ent)) - processed = 0 + for i in range(self.EPOCHS): + print("EPOCH", i) + shuffle(train_ent) - while start < len(train_ent): - next_batch = train_ent[start:stop] + start = 0 + stop = min(self.BATCH_SIZE, len(train_ent)) + processed = 0 - golds = [train_gold[e] for e in next_batch] - descs = [train_desc[e] for e in next_batch] - article_texts = [train_art_texts[train_art[e]] for e in next_batch] - sent_texts = [train_sent_texts[train_sent[e]] for e in next_batch] + while start < len(train_ent): + next_batch = train_ent[start:stop] - self.update(entities=next_batch, golds=golds, descs=descs, art_texts=article_texts, sent_texts=sent_texts) - self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts, - print_string="dev_inter", avg=True) + golds = [train_gold[e] for e in next_batch] + descs = [train_desc[e] for e in next_batch] + article_texts = [train_art_texts[train_art[e]] for e in next_batch] + sent_texts = [train_sent_texts[train_sent[e]] for e in next_batch] - processed += len(next_batch) + self.update(entities=next_batch, golds=golds, descs=descs, art_texts=article_texts, sent_texts=sent_texts) + self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts, + print_string="dev_inter", avg=True) - start = start + self.BATCH_SIZE - stop = min(stop + self.BATCH_SIZE, len(train_ent)) + processed += len(next_batch) - if to_print: - print() - print("Trained on", processed, "entities in total") + start = start + self.BATCH_SIZE + stop = min(stop + self.BATCH_SIZE, len(train_ent)) + + if to_print: + print() + print("Trained on", processed, "entities in total") def _test_dev(self, entities, gold_by_entity, desc_by_entity, art_by_entity, art_texts, sent_by_entity, sent_texts, print_string, avg=True, calc_random=False): @@ -257,9 +262,13 @@ class EL_Model: def _begin_training(self): self.sgd_article = create_default_optimizer(self.article_encoder.ops) + self.sgd_article.learn_rate = self.LEARN_RATE self.sgd_sent = create_default_optimizer(self.sent_encoder.ops) + self.sgd_sent.learn_rate = self.LEARN_RATE self.sgd_desc = create_default_optimizer(self.desc_encoder.ops) + self.sgd_desc.learn_rate = self.LEARN_RATE self.sgd = create_default_optimizer(self.model.ops) + self.sgd.learn_rate = self.LEARN_RATE @staticmethod def get_loss(predictions, golds): diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index ec1f66d81..cd7804ca4 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: training", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=1000, devlimit=50) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=20, devlimit=20) print() # 
STEP 7: apply the EL algorithm on the dev dataset From cfc27d7ff92abdc2962df4a61e9a38b3d383693f Mon Sep 17 00:00:00 2001 From: svlandeg Date: Sun, 26 May 2019 23:39:46 +0200 Subject: [PATCH 056/148] using Tok2Vec instead --- .../pipeline/wiki_entity_linking/train_el.py | 82 +++++++++++++------ .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- 2 files changed, 56 insertions(+), 28 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index efad36362..e0bea3f08 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -11,7 +11,7 @@ from thinc.neural._classes.convolution import ExtractWindow from examples.pipeline.wiki_entity_linking import run_el, training_set_creator, kb_creator -from spacy._ml import SpacyVectors, create_default_optimizer, zero_init, logistic +from spacy._ml import SpacyVectors, create_default_optimizer, zero_init, logistic, Tok2Vec from thinc.api import chain, concatenate, flatten_add_lengths, clone, with_flatten from thinc.v2v import Model, Maxout, Affine, ReLu @@ -39,15 +39,15 @@ class EL_Model: DOC_CUTOFF = 300 # number of characters from the doc context INPUT_DIM = 300 # dimension of pre-trained vectors - HIDDEN_1_WIDTH = 32 # 10 - HIDDEN_2_WIDTH = 32 # 6 + # HIDDEN_1_WIDTH = 32 # 10 + # HIDDEN_2_WIDTH = 32 # 6 DESC_WIDTH = 64 # 4 ARTICLE_WIDTH = 64 # 8 SENT_WIDTH = 64 DROP = 0.1 - LEARN_RATE = 0.01 - EPOCHS = 10 + LEARN_RATE = 0.001 + EPOCHS = 20 name = "entity_linker" @@ -56,12 +56,9 @@ class EL_Model: self.nlp = nlp self.kb = kb - self._build_cnn(in_width=self.INPUT_DIM, - desc_width=self.DESC_WIDTH, + self._build_cnn(desc_width=self.DESC_WIDTH, article_width=self.ARTICLE_WIDTH, - sent_width=self.SENT_WIDTH, - hidden_1_width=self.HIDDEN_1_WIDTH, - hidden_2_width=self.HIDDEN_2_WIDTH) + sent_width=self.SENT_WIDTH) def train_model(self, training_dir, entity_descr_output, trainlimit=None, devlimit=None, to_print=True): # raise errors instead of runtime warnings in case of int/float overflow @@ -122,27 +119,29 @@ class EL_Model: print(" CUTOFF", self.CUTOFF) print(" DOC_CUTOFF", self.DOC_CUTOFF) print(" INPUT_DIM", self.INPUT_DIM) - print(" HIDDEN_1_WIDTH", self.HIDDEN_1_WIDTH) + # print(" HIDDEN_1_WIDTH", self.HIDDEN_1_WIDTH) print(" DESC_WIDTH", self.DESC_WIDTH) print(" ARTICLE_WIDTH", self.ARTICLE_WIDTH) print(" SENT_WIDTH", self.SENT_WIDTH) - print(" HIDDEN_2_WIDTH", self.HIDDEN_2_WIDTH) + # print(" HIDDEN_2_WIDTH", self.HIDDEN_2_WIDTH) print(" DROP", self.DROP) + print(" LEARNING RATE", self.LEARN_RATE) + print(" UPSAMPLE", self.UPSAMPLE) print() self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts, print_string="dev_random", calc_random=True) + self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts, print_string="dev_pre", avg=True) print() + processed = 0 for i in range(self.EPOCHS): - print("EPOCH", i) shuffle(train_ent) start = 0 stop = min(self.BATCH_SIZE, len(train_ent)) - processed = 0 while start < len(train_ent): next_batch = train_ent[start:stop] @@ -153,17 +152,22 @@ class EL_Model: sent_texts = [train_sent_texts[train_sent[e]] for e in next_batch] self.update(entities=next_batch, golds=golds, descs=descs, art_texts=article_texts, sent_texts=sent_texts) - self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts, - print_string="dev_inter", avg=True) processed += len(next_batch) start = start + 
self.BATCH_SIZE stop = min(stop + self.BATCH_SIZE, len(train_ent)) - if to_print: - print() - print("Trained on", processed, "entities in total") + if self.PRINT_TRAIN: + self._test_dev(train_ent, train_gold, train_desc, train_art, train_art_texts, train_sent, train_sent_texts, + print_string="train_inter_epoch " + str(i), avg=True) + + self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts, + print_string="dev_inter_epoch " + str(i), avg=True) + + if to_print: + print() + print("Trained on", processed, "entities across", self.EPOCHS, "epochs") def _test_dev(self, entities, gold_by_entity, desc_by_entity, art_by_entity, art_texts, sent_by_entity, sent_texts, print_string, avg=True, calc_random=False): @@ -224,11 +228,11 @@ class EL_Model: else: return [float(1.0) if random.uniform(0, 1) > self.CUTOFF else float(0.0) for _ in entities] - def _build_cnn(self, in_width, desc_width, article_width, sent_width, hidden_1_width, hidden_2_width): + def _build_cnn_depr(self, embed_width, desc_width, article_width, sent_width, hidden_1_width, hidden_2_width): with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): - self.desc_encoder = self._encoder(in_width=in_width, hidden_with=hidden_1_width, end_width=desc_width) - self.article_encoder = self._encoder(in_width=in_width, hidden_with=hidden_1_width, end_width=article_width) - self.sent_encoder = self._encoder(in_width=in_width, hidden_with=hidden_1_width, end_width=sent_width) + self.desc_encoder = self._encoder_depr(in_width=embed_width, hidden_with=hidden_1_width, end_width=desc_width) + self.article_encoder = self._encoder_depr(in_width=embed_width, hidden_with=hidden_1_width, end_width=article_width) + self.sent_encoder = self._encoder_depr(in_width=embed_width, hidden_with=hidden_1_width, end_width=sent_width) in_width = article_width + sent_width + desc_width out_width = hidden_2_width @@ -238,8 +242,28 @@ class EL_Model: >> Affine(1, out_width) \ >> logistic + def _build_cnn(self, desc_width, article_width, sent_width): + with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): + self.desc_encoder = self._encoder(width=desc_width) + self.article_encoder = self._encoder(width=article_width) + self.sent_encoder = self._encoder(width=sent_width) + + in_width = desc_width + article_width + sent_width + + output_layer = ( + zero_init(Affine(1, in_width, drop_factor=0.0)) >> logistic + ) + self.model = output_layer + self.model.nO = 1 + + def _encoder(self, width): + tok2vec = Tok2Vec(width=width, embed_size=2000, pretrained_vectors=self.nlp.vocab.vectors.name, cnn_maxout_pieces=3, + subword_features=True, conv_depth=4, bilstm_depth=0) + + return tok2vec >> flatten_add_lengths >> Pooling(mean_pool) + @staticmethod - def _encoder(in_width, hidden_with, end_width): + def _encoder_depr(in_width, hidden_with, end_width): conv_depth = 2 cnn_maxout_pieces = 3 @@ -263,12 +287,19 @@ class EL_Model: def _begin_training(self): self.sgd_article = create_default_optimizer(self.article_encoder.ops) self.sgd_article.learn_rate = self.LEARN_RATE + self.sgd_article.L2 = 0 + self.sgd_sent = create_default_optimizer(self.sent_encoder.ops) self.sgd_sent.learn_rate = self.LEARN_RATE + self.sgd_sent.L2 = 0 + self.sgd_desc = create_default_optimizer(self.desc_encoder.ops) self.sgd_desc.learn_rate = self.LEARN_RATE + self.sgd_desc.L2 = 0 + self.sgd = create_default_optimizer(self.model.ops) self.sgd.learn_rate = self.LEARN_RATE + self.sgd.L2 = 0 @staticmethod def get_loss(predictions, golds): @@ 
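The new Tok2Vec-based `_encoder` above ends in `flatten_add_lengths >> Pooling(mean_pool)`: the per-token vectors of the whole batch are concatenated into one matrix, the per-document lengths are kept alongside it, and each document is then reduced to a single fixed-width vector by averaging its token rows. A NumPy sketch of just that pooling step (toy shapes, no thinc involved):

    import numpy as np

    def mean_pool_batch(flat_vectors, lengths):
        """flat_vectors: (sum(lengths), width) matrix of token vectors for a batch.
        Returns one (width,) mean vector per document."""
        flat_vectors = np.asarray(flat_vectors, dtype="float32")
        pooled, start = [], 0
        for n in lengths:
            pooled.append(flat_vectors[start:start + n].mean(axis=0))
            start += n
        return np.stack(pooled)

    # two documents with 2 and 3 tokens each, width 4 -> a (2, 4) batch of encodings
    tokens = np.arange(20, dtype="float32").reshape(5, 4)
    print(mean_pool_batch(tokens, lengths=[2, 3]))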
-300,9 +331,6 @@ class EL_Model: loss, gradient = self.get_loss(predictions, golds) - if self.PRINT_TRAIN: - print("loss train", round(loss, 5)) - gradient = float(gradient) # print("gradient", gradient) # print("loss", loss) diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index cd7804ca4..70fc200ab 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: training", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=20, devlimit=20) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=10000, devlimit=1000) print() # STEP 7: apply the EL algorithm on the dev dataset From 8c4aa076bcb57ad7970de72229beb2d2e10335e4 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 27 May 2019 14:29:38 +0200 Subject: [PATCH 057/148] small fixes --- .../pipeline/wiki_entity_linking/train_el.py | 36 ++++++++++++------- .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- 2 files changed, 24 insertions(+), 14 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index e0bea3f08..e7d80d52b 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -29,7 +29,7 @@ from spacy.tokens import Doc class EL_Model: PRINT_INSPECT = False - PRINT_TRAIN = False + PRINT_TRAIN = True EPS = 0.0000000005 CUTOFF = 0.5 @@ -40,14 +40,15 @@ class EL_Model: INPUT_DIM = 300 # dimension of pre-trained vectors # HIDDEN_1_WIDTH = 32 # 10 - # HIDDEN_2_WIDTH = 32 # 6 + HIDDEN_2_WIDTH = 32 # 6 DESC_WIDTH = 64 # 4 ARTICLE_WIDTH = 64 # 8 SENT_WIDTH = 64 DROP = 0.1 - LEARN_RATE = 0.001 + LEARN_RATE = 0.0001 EPOCHS = 20 + L2 = 1e-6 name = "entity_linker" @@ -62,7 +63,10 @@ class EL_Model: def train_model(self, training_dir, entity_descr_output, trainlimit=None, devlimit=None, to_print=True): # raise errors instead of runtime warnings in case of int/float overflow - np.seterr(all='raise') + # (not sure if we need this. 
set L2 to 0 because it throws an error otherwsise) + # np.seterr(all='raise') + # alternative: + np.seterr(divide="raise", over="warn", under="ignore", invalid="raise") train_ent, train_gold, train_desc, train_art, train_art_texts, train_sent, train_sent_texts = \ self._get_training_data(training_dir, entity_descr_output, False, trainlimit, to_print=False) @@ -159,6 +163,7 @@ class EL_Model: stop = min(stop + self.BATCH_SIZE, len(train_ent)) if self.PRINT_TRAIN: + print() self._test_dev(train_ent, train_gold, train_desc, train_art, train_art_texts, train_sent, train_sent_texts, print_string="train_inter_epoch " + str(i), avg=True) @@ -250,15 +255,20 @@ class EL_Model: in_width = desc_width + article_width + sent_width - output_layer = ( - zero_init(Affine(1, in_width, drop_factor=0.0)) >> logistic - ) - self.model = output_layer + self.model = Affine(self.HIDDEN_2_WIDTH, in_width) \ + >> LN(Maxout(self.HIDDEN_2_WIDTH, self.HIDDEN_2_WIDTH)) \ + >> Affine(1, self.HIDDEN_2_WIDTH) \ + >> logistic + + # output_layer = ( + # zero_init(Affine(1, in_width, drop_factor=0.0)) >> logistic + # ) + # self.model = output_layer self.model.nO = 1 def _encoder(self, width): tok2vec = Tok2Vec(width=width, embed_size=2000, pretrained_vectors=self.nlp.vocab.vectors.name, cnn_maxout_pieces=3, - subword_features=True, conv_depth=4, bilstm_depth=0) + subword_features=False, conv_depth=4, bilstm_depth=0) return tok2vec >> flatten_add_lengths >> Pooling(mean_pool) @@ -287,19 +297,19 @@ class EL_Model: def _begin_training(self): self.sgd_article = create_default_optimizer(self.article_encoder.ops) self.sgd_article.learn_rate = self.LEARN_RATE - self.sgd_article.L2 = 0 + self.sgd_article.L2 = self.L2 self.sgd_sent = create_default_optimizer(self.sent_encoder.ops) self.sgd_sent.learn_rate = self.LEARN_RATE - self.sgd_sent.L2 = 0 + self.sgd_sent.L2 = self.L2 self.sgd_desc = create_default_optimizer(self.desc_encoder.ops) self.sgd_desc.learn_rate = self.LEARN_RATE - self.sgd_desc.L2 = 0 + self.sgd_desc.L2 = self.L2 self.sgd = create_default_optimizer(self.model.ops) self.sgd.learn_rate = self.LEARN_RATE - self.sgd.L2 = 0 + self.sgd.L2 = self.L2 @staticmethod def get_loss(predictions, golds): diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 70fc200ab..319b1e1c8 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: training", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=10000, devlimit=1000) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=100, devlimit=20) print() # STEP 7: apply the EL algorithm on the dev dataset From 992fa92b6630ec8eb78ad378602a5774d7327de3 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 28 May 2019 00:05:22 +0200 Subject: [PATCH 058/148] refactor again to clusters of entities and cosine similarity --- .../pipeline/wiki_entity_linking/train_el.py | 428 +++++++++--------- .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- 2 files changed, 206 insertions(+), 224 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index e7d80d52b..ac8cae4a4 100644 --- 
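The `_begin_training` changes in these patches mostly tune two optimizer knobs per encoder: `learn_rate` (step size) and `L2` (weight decay). spaCy's `create_default_optimizer` returns a full-featured optimizer, so the sketch below only illustrates what the two settings control, not the actual update rule used:

    import numpy as np

    def gradient_step(weights, gradient, learn_rate=0.0001, L2=1e-6):
        """One plain SGD step with L2 weight decay: learn_rate scales the step,
        L2 adds a pull of the weights towards zero."""
        weights = np.asarray(weights, dtype="float32")
        gradient = np.asarray(gradient, dtype="float32")
        return weights - learn_rate * (gradient + L2 * weights)

    w = np.array([0.5, -0.3], dtype="float32")
    print(gradient_step(w, gradient=np.array([0.1, -0.2], dtype="float32")))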
a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -11,7 +11,7 @@ from thinc.neural._classes.convolution import ExtractWindow from examples.pipeline.wiki_entity_linking import run_el, training_set_creator, kb_creator -from spacy._ml import SpacyVectors, create_default_optimizer, zero_init, logistic, Tok2Vec +from spacy._ml import SpacyVectors, create_default_optimizer, zero_init, logistic, Tok2Vec, cosine from thinc.api import chain, concatenate, flatten_add_lengths, clone, with_flatten from thinc.v2v import Model, Maxout, Affine, ReLu @@ -20,6 +20,7 @@ from thinc.t2t import ParametricAttention from thinc.misc import Residual from thinc.misc import LayerNorm as LN +from spacy.cli.pretrain import get_cossim_loss from spacy.matcher import PhraseMatcher from spacy.tokens import Doc @@ -34,20 +35,20 @@ class EL_Model: CUTOFF = 0.5 BATCH_SIZE = 5 - UPSAMPLE = True + # UPSAMPLE = True DOC_CUTOFF = 300 # number of characters from the doc context INPUT_DIM = 300 # dimension of pre-trained vectors - # HIDDEN_1_WIDTH = 32 # 10 - HIDDEN_2_WIDTH = 32 # 6 - DESC_WIDTH = 64 # 4 - ARTICLE_WIDTH = 64 # 8 + HIDDEN_1_WIDTH = 32 + # HIDDEN_2_WIDTH = 32 # 6 + DESC_WIDTH = 64 + ARTICLE_WIDTH = 64 SENT_WIDTH = 64 DROP = 0.1 LEARN_RATE = 0.0001 - EPOCHS = 20 + EPOCHS = 10 L2 = 1e-6 name = "entity_linker" @@ -57,9 +58,10 @@ class EL_Model: self.nlp = nlp self.kb = kb - self._build_cnn(desc_width=self.DESC_WIDTH, + self._build_cnn(embed_width=self.INPUT_DIM, + desc_width=self.DESC_WIDTH, article_width=self.ARTICLE_WIDTH, - sent_width=self.SENT_WIDTH) + sent_width=self.SENT_WIDTH, hidden_1_width=self.HIDDEN_1_WIDTH) def train_model(self, training_dir, entity_descr_output, trainlimit=None, devlimit=None, to_print=True): # raise errors instead of runtime warnings in case of int/float overflow @@ -70,24 +72,28 @@ class EL_Model: train_ent, train_gold, train_desc, train_art, train_art_texts, train_sent, train_sent_texts = \ self._get_training_data(training_dir, entity_descr_output, False, trainlimit, to_print=False) + train_clusters = list(train_ent.keys()) dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts = \ self._get_training_data(training_dir, entity_descr_output, True, devlimit, to_print=False) + dev_clusters = list(dev_ent.keys()) dev_pos_count = len([g for g in dev_gold.values() if g]) dev_neg_count = len([g for g in dev_gold.values() if not g]) # inspect data if self.PRINT_INSPECT: - for entity in train_ent: - print("entity", entity) - print("gold", train_gold[entity]) - print("desc", train_desc[entity]) - print("sentence ID", train_sent[entity]) - print("sentence text", train_sent_texts[train_sent[entity]]) - print("article ID", train_art[entity]) - print("article text", train_art_texts[train_art[entity]]) + for cluster, entities in train_ent.items(): print() + for entity in entities: + print("entity", entity) + print("gold", train_gold[entity]) + print("desc", train_desc[entity]) + print("sentence ID", train_sent[entity]) + print("sentence text", train_sent_texts[train_sent[entity]]) + print("article ID", train_art[entity]) + print("article text", train_art_texts[train_art[entity]]) + print() train_pos_entities = [k for k, v in train_gold.items() if v] train_neg_entities = [k for k, v in train_gold.items() if not v] @@ -95,29 +101,29 @@ class EL_Model: train_pos_count = len(train_pos_entities) train_neg_count = len(train_neg_entities) - if self.UPSAMPLE: - if to_print: - print() - print("Upsampling, original training 
instances pos/neg:", train_pos_count, train_neg_count) - - # upsample positives to 50-50 distribution - while train_pos_count < train_neg_count: - train_ent.append(random.choice(train_pos_entities)) - train_pos_count += 1 - + # if self.UPSAMPLE: + # if to_print: + # print() + # print("Upsampling, original training instances pos/neg:", train_pos_count, train_neg_count) + # + # # upsample positives to 50-50 distribution + # while train_pos_count < train_neg_count: + # train_ent.append(random.choice(train_pos_entities)) + # train_pos_count += 1 + # # upsample negatives to 50-50 distribution - while train_neg_count < train_pos_count: - train_ent.append(random.choice(train_neg_entities)) - train_neg_count += 1 + # while train_neg_count < train_pos_count: + # train_ent.append(random.choice(train_neg_entities)) + # train_neg_count += 1 self._begin_training() if to_print: print() - print("Training on", len(train_ent), "entities in", len(train_art_texts), "articles") + print("Training on", len(train_clusters), "entity clusters in", len(train_art_texts), "articles") print("Training instances pos/neg:", train_pos_count, train_neg_count) print() - print("Dev test on", len(dev_ent), "entities in", len(dev_art_texts), "articles") + print("Dev test on", len(dev_clusters), "entity clusters in", len(dev_art_texts), "articles") print("Dev instances pos/neg:", dev_pos_count, dev_neg_count) print() print(" CUTOFF", self.CUTOFF) @@ -138,94 +144,104 @@ class EL_Model: self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts, print_string="dev_pre", avg=True) - print() processed = 0 for i in range(self.EPOCHS): - shuffle(train_ent) + shuffle(train_clusters) start = 0 - stop = min(self.BATCH_SIZE, len(train_ent)) + stop = min(self.BATCH_SIZE, len(train_clusters)) - while start < len(train_ent): - next_batch = train_ent[start:stop] + while start < len(train_clusters): + next_batch = {c: train_ent[c] for c in train_clusters[start:stop]} + processed += len(next_batch.keys()) - golds = [train_gold[e] for e in next_batch] - descs = [train_desc[e] for e in next_batch] - article_texts = [train_art_texts[train_art[e]] for e in next_batch] - sent_texts = [train_sent_texts[train_sent[e]] for e in next_batch] - - self.update(entities=next_batch, golds=golds, descs=descs, art_texts=article_texts, sent_texts=sent_texts) - - processed += len(next_batch) + self.update(entity_clusters=next_batch, golds=train_gold, descs=train_desc, + art_texts=train_art_texts, arts=train_art, + sent_texts=train_sent_texts, sents=train_sent) start = start + self.BATCH_SIZE - stop = min(stop + self.BATCH_SIZE, len(train_ent)) + stop = min(stop + self.BATCH_SIZE, len(train_clusters)) if self.PRINT_TRAIN: print() self._test_dev(train_ent, train_gold, train_desc, train_art, train_art_texts, train_sent, train_sent_texts, - print_string="train_inter_epoch " + str(i), avg=True) + print_string="train_inter_epoch " + str(i), avg=True) self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts, print_string="dev_inter_epoch " + str(i), avg=True) if to_print: print() - print("Trained on", processed, "entities across", self.EPOCHS, "epochs") + print("Trained on", processed, "entity clusters across", self.EPOCHS, "epochs") - def _test_dev(self, entities, gold_by_entity, desc_by_entity, art_by_entity, art_texts, sent_by_entity, sent_texts, + def _test_dev(self, entity_clusters, golds, descs, arts, art_texts, sents, sent_texts, print_string, avg=True, calc_random=False): - golds = 
[gold_by_entity[e] for e in entities] - if calc_random: - predictions = self._predict_random(entities=entities) + correct = 0 + incorrect = 0 - else: - desc_docs = self.nlp.pipe([desc_by_entity[e] for e in entities]) - article_docs = self.nlp.pipe([art_texts[art_by_entity[e]] for e in entities]) - sent_docs = self.nlp.pipe([sent_texts[sent_by_entity[e]] for e in entities]) - predictions = self._predict(entities=entities, article_docs=article_docs, sent_docs=sent_docs, - desc_docs=desc_docs, avg=avg) + for cluster, entities in entity_clusters.items(): + correct_entities = [e for e in entities if golds[e]] + incorrect_entities = [e for e in entities if not golds[e]] + assert len(correct_entities) == 1 - # TODO: combine with prior probability - p, r, f, acc = run_el.evaluate(predictions, golds, to_print=False, times_hundred=False) - loss, gradient = self.get_loss(self.model.ops.asarray(predictions), self.model.ops.asarray(golds)) + entities = list(entities) + shuffle(entities) - print("p/r/F/acc/loss", print_string, round(p, 2), round(r, 2), round(f, 2), round(acc, 2), round(loss, 2)) + if calc_random: + predicted_entity = random.choice(entities) + if predicted_entity in correct_entities: + correct += 1 + else: + incorrect += 1 - return loss, p, r, f + else: + desc_docs = self.nlp.pipe([descs[e] for e in entities]) + # article_texts = [art_texts[arts[e]] for e in entities] - def _predict(self, entities, article_docs, sent_docs, desc_docs, avg=True, apply_threshold=True): + sent_doc = self.nlp(sent_texts[sents[cluster]]) + article_doc = self.nlp(art_texts[arts[cluster]]) + + predicted_index = self._predict(article_doc=article_doc, sent_doc=sent_doc, + desc_docs=desc_docs, avg=avg) + if entities[predicted_index] in correct_entities: + correct += 1 + else: + incorrect += 1 + + if correct == incorrect == 0: + print("acc", print_string, "NA") + return 0 + + acc = correct / (correct + incorrect) + print("acc", print_string, round(acc, 2)) + return acc + + def _predict(self, article_doc, sent_doc, desc_docs, avg=True, apply_threshold=True): if avg: with self.article_encoder.use_params(self.sgd_article.averages) \ - and self.desc_encoder.use_params(self.sgd_desc.averages): - doc_encodings = self.article_encoder(article_docs) + and self.desc_encoder.use_params(self.sgd_desc.averages)\ + and self.sent_encoder.use_params(self.sgd_sent.averages): + # doc_encoding = self.article_encoder(article_doc) desc_encodings = self.desc_encoder(desc_docs) - sent_encodings = self.sent_encoder(sent_docs) + sent_encoding = self.sent_encoder([sent_doc]) else: - doc_encodings = self.article_encoder(article_docs) + # doc_encodings = self.article_encoder(article_docs) desc_encodings = self.desc_encoder(desc_docs) - sent_encodings = self.sent_encoder(sent_docs) + sent_encoding = self.sent_encoder([sent_doc]) - concat_encodings = [list(doc_encodings[i]) + list(sent_encodings[i]) + list(desc_encodings[i]) for i in - range(len(entities))] + sent_enc = np.transpose(sent_encoding) + highest_sim = -5 + best_i = -1 + for i, desc_enc in enumerate(desc_encodings): + sim = cosine(desc_enc, sent_enc) + if sim >= highest_sim: + best_i = i + highest_sim = sim - np_array_list = np.asarray(concat_encodings) - - if avg: - with self.model.use_params(self.sgd.averages): - predictions = self.model(np_array_list) - else: - predictions = self.model(np_array_list) - - predictions = self.model.ops.flatten(predictions) - predictions = [float(p) for p in predictions] - if apply_threshold: - predictions = [float(1.0) if p > self.CUTOFF else 
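The rewritten `_predict` above turns entity linking into a ranking problem: every candidate description is encoded, the mention's context is encoded, and the candidate whose description is most cosine-similar to the context wins. The selection logic in isolation (NumPy sketch; spaCy's `cosine` helper is replaced by an explicit formula, and the vectors are made up):

    import numpy as np

    def cosine(u, v):
        u = np.asarray(u, dtype="float32")
        v = np.asarray(v, dtype="float32")
        return float(u @ v / (np.linalg.norm(u) * np.linalg.norm(v) + 1e-8))

    def rank_candidates(context_encoding, desc_encodings):
        """Return the index of the candidate description closest to the context."""
        sims = [cosine(desc, context_encoding) for desc in desc_encodings]
        return int(np.argmax(sims)), sims

    context = [0.9, 0.1, 0.0]                      # encoded mention context (toy)
    candidates = [[1.0, 0.0, 0.0],                 # encoded candidate descriptions (toy)
                  [0.0, 1.0, 0.0],
                  [0.0, 0.0, 1.0]]
    best, sims = rank_candidates(context, candidates)
    print(best, [round(s, 2) for s in sims])       # 0 [0.99, 0.11, 0.0]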
float(0.0) for p in predictions] - - return predictions + return best_i def _predict_random(self, entities, apply_threshold=True): if not apply_threshold: @@ -233,47 +249,23 @@ class EL_Model: else: return [float(1.0) if random.uniform(0, 1) > self.CUTOFF else float(0.0) for _ in entities] - def _build_cnn_depr(self, embed_width, desc_width, article_width, sent_width, hidden_1_width, hidden_2_width): + def _build_cnn(self, embed_width, desc_width, article_width, sent_width, hidden_1_width): with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): - self.desc_encoder = self._encoder_depr(in_width=embed_width, hidden_with=hidden_1_width, end_width=desc_width) - self.article_encoder = self._encoder_depr(in_width=embed_width, hidden_with=hidden_1_width, end_width=article_width) - self.sent_encoder = self._encoder_depr(in_width=embed_width, hidden_with=hidden_1_width, end_width=sent_width) + self.desc_encoder = self._encoder(in_width=embed_width, hidden_with=hidden_1_width, + end_width=desc_width) + self.article_encoder = self._encoder(in_width=embed_width, hidden_with=hidden_1_width, + end_width=article_width) + self.sent_encoder = self._encoder(in_width=embed_width, hidden_with=hidden_1_width, + end_width=sent_width) - in_width = article_width + sent_width + desc_width - out_width = hidden_2_width - - self.model = Affine(out_width, in_width) \ - >> LN(Maxout(out_width, out_width)) \ - >> Affine(1, out_width) \ - >> logistic - - def _build_cnn(self, desc_width, article_width, sent_width): - with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): - self.desc_encoder = self._encoder(width=desc_width) - self.article_encoder = self._encoder(width=article_width) - self.sent_encoder = self._encoder(width=sent_width) - - in_width = desc_width + article_width + sent_width - - self.model = Affine(self.HIDDEN_2_WIDTH, in_width) \ - >> LN(Maxout(self.HIDDEN_2_WIDTH, self.HIDDEN_2_WIDTH)) \ - >> Affine(1, self.HIDDEN_2_WIDTH) \ - >> logistic - - # output_layer = ( - # zero_init(Affine(1, in_width, drop_factor=0.0)) >> logistic - # ) - # self.model = output_layer - self.model.nO = 1 - - def _encoder(self, width): - tok2vec = Tok2Vec(width=width, embed_size=2000, pretrained_vectors=self.nlp.vocab.vectors.name, cnn_maxout_pieces=3, - subword_features=False, conv_depth=4, bilstm_depth=0) - - return tok2vec >> flatten_add_lengths >> Pooling(mean_pool) + # def _encoder(self, width): + # tok2vec = Tok2Vec(width=width, embed_size=2000, pretrained_vectors=self.nlp.vocab.vectors.name, cnn_maxout_pieces=3, + # subword_features=False, conv_depth=4, bilstm_depth=0) + # + # return tok2vec >> flatten_add_lengths >> Pooling(mean_pool) @staticmethod - def _encoder_depr(in_width, hidden_with, end_width): + def _encoder(in_width, hidden_with, end_width): conv_depth = 2 cnn_maxout_pieces = 3 @@ -307,64 +299,58 @@ class EL_Model: self.sgd_desc.learn_rate = self.LEARN_RATE self.sgd_desc.L2 = self.L2 - self.sgd = create_default_optimizer(self.model.ops) - self.sgd.learn_rate = self.LEARN_RATE - self.sgd.L2 = self.L2 + # self.sgd = create_default_optimizer(self.model.ops) + # self.sgd.learn_rate = self.LEARN_RATE + # self.sgd.L2 = self.L2 @staticmethod def get_loss(predictions, golds): - d_scores = (predictions - golds) - gradient = d_scores.mean() - loss = (d_scores ** 2).mean() - return loss, gradient + loss, gradients = get_cossim_loss(predictions, golds) + return loss, gradients - def update(self, entities, golds, descs, art_texts, sent_texts): - golds = self.model.ops.asarray(golds) + 
def update(self, entity_clusters, golds, descs, art_texts, arts, sent_texts, sents): + for cluster, entities in entity_clusters.items(): + correct_entities = [e for e in entities if golds[e]] + incorrect_entities = [e for e in entities if not golds[e]] - art_docs = self.nlp.pipe(art_texts) - sent_docs = self.nlp.pipe(sent_texts) - desc_docs = self.nlp.pipe(descs) + assert len(correct_entities) == 1 + entities = list(entities) + shuffle(entities) - doc_encodings, bp_doc = self.article_encoder.begin_update(art_docs, drop=self.DROP) - sent_encodings, bp_sent = self.sent_encoder.begin_update(sent_docs, drop=self.DROP) - desc_encodings, bp_desc = self.desc_encoder.begin_update(desc_docs, drop=self.DROP) + # article_text = art_texts[arts[cluster]] + cluster_sent = sent_texts[sents[cluster]] - concat_encodings = [list(doc_encodings[i]) + list(sent_encodings[i]) + list(desc_encodings[i]) - for i in range(len(entities))] + # art_docs = self.nlp.pipe(article_text) + sent_doc = self.nlp(cluster_sent) - predictions, bp_model = self.model.begin_update(np.asarray(concat_encodings), drop=self.DROP) - predictions = self.model.ops.flatten(predictions) + for e in entities: + if golds[e]: + # TODO: more appropriate loss for the whole cluster (currently only pos entities) + # TODO: speed up + desc_doc = self.nlp(descs[e]) - # print("entities", entities) - # print("predictions", predictions) - # print("golds", golds) + # doc_encodings, bp_doc = self.article_encoder.begin_update(art_docs, drop=self.DROP) + sent_encodings, bp_sent = self.sent_encoder.begin_update([sent_doc], drop=self.DROP) + desc_encodings, bp_desc = self.desc_encoder.begin_update([desc_doc], drop=self.DROP) - loss, gradient = self.get_loss(predictions, golds) + sent_encoding = sent_encodings[0] + desc_encoding = desc_encodings[0] - gradient = float(gradient) - # print("gradient", gradient) - # print("loss", loss) + sent_enc = self.sent_encoder.ops.asarray([sent_encoding]) + desc_enc = self.sent_encoder.ops.asarray([desc_encoding]) - model_gradient = bp_model(gradient, sgd=self.sgd) - # print("model_gradient", model_gradient) + # print("sent_encoding", type(sent_encoding), sent_encoding) + # print("desc_encoding", type(desc_encoding), desc_encoding) + # print("getting los for entity", e) - # concat = doc + sent + desc, but doc is the same within this function - sent_start = self.ARTICLE_WIDTH - desc_start = self.ARTICLE_WIDTH + self.SENT_WIDTH - doc_gradient = model_gradient[0][0:sent_start] - sent_gradients = list() - desc_gradients = list() - for x in model_gradient: - sent_gradients.append(list(x[sent_start:desc_start])) - desc_gradients.append(list(x[desc_start:])) + loss, gradient = self.get_loss(sent_enc, desc_enc) - # print("doc_gradient", doc_gradient) - # print("sent_gradients", sent_gradients) - # print("desc_gradients", desc_gradients) + # print("gradient", gradient) + # print("loss", loss) - bp_doc([doc_gradient], sgd=self.sgd_article) - bp_sent(sent_gradients, sgd=self.sgd_sent) - bp_desc(desc_gradients, sgd=self.sgd_desc) + bp_sent(gradient, sgd=self.sgd_sent) + # bp_desc(desc_gradients, sgd=self.sgd_desc) TODO + # print() def _get_training_data(self, training_dir, entity_descr_output, dev, limit, to_print): id_to_descr = kb_creator._get_id_to_description(entity_descr_output) @@ -373,13 +359,14 @@ class EL_Model: collect_correct=True, collect_incorrect=True) - entities = set() + entities_by_cluster = dict() gold_by_entity = dict() desc_by_entity = dict() - article_by_entity = dict() + article_by_cluster = dict() text_by_article = 
dict() - sentence_by_entity = dict() + sentence_by_cluster = dict() text_by_sentence = dict() + sentence_by_text = dict() cnt = 0 next_entity_nr = 1 @@ -402,74 +389,69 @@ class EL_Model: text_by_article[article_id] = truncated_text # process all positive and negative entities, collect all relevant mentions in this article - article_terms = set() - entities_by_mention = dict() - for mention, entity_pos in correct_entries[article_id].items(): + cluster = article_id + "_" + mention descr = id_to_descr.get(entity_pos) + entities = set() if descr: - entity = "E_" + str(next_entity_nr) + "_" + article_id + "_" + mention + entity = "E_" + str(next_entity_nr) + "_" + cluster next_entity_nr += 1 gold_by_entity[entity] = 1 desc_by_entity[entity] = descr - article_terms.add(mention) - mention_entities = entities_by_mention.get(mention, set()) - mention_entities.add(entity) - entities_by_mention[mention] = mention_entities - - for mention, entity_negs in incorrect_entries[article_id].items(): - for entity_neg in entity_negs: - descr = id_to_descr.get(entity_neg) - if descr: - entity = "E_" + str(next_entity_nr) + "_" + article_id + "_" + mention - next_entity_nr += 1 - gold_by_entity[entity] = 0 - desc_by_entity[entity] = descr - article_terms.add(mention) - mention_entities = entities_by_mention.get(mention, set()) - mention_entities.add(entity) - entities_by_mention[mention] = mention_entities - - # find all matches in the doc for the mentions - # TODO: fix this - doesn't look like all entities are found - matcher = PhraseMatcher(self.nlp.vocab) - patterns = list(self.nlp.tokenizer.pipe(article_terms)) - - matcher.add("TerminologyList", None, *patterns) - matches = matcher(article_doc) - - # store sentences - sentence_to_id = dict() - for match_id, start, end in matches: - span = article_doc[start:end] - sent_text = span.sent.text - sent_nr = sentence_to_id.get(sent_text, None) - mention = span.text - if sent_nr is None: - sent_nr = "S_" + str(next_sent_nr) + article_id - next_sent_nr += 1 - text_by_sentence[sent_nr] = sent_text - sentence_to_id[sent_text] = sent_nr - mention_entities = entities_by_mention[mention] - for entity in mention_entities: entities.add(entity) - sentence_by_entity[entity] = sent_nr - article_by_entity[entity] = article_id - # remove entities that didn't have all data - gold_by_entity = {k: v for k, v in gold_by_entity.items() if k in entities} - desc_by_entity = {k: v for k, v in desc_by_entity.items() if k in entities} + entity_negs = incorrect_entries[article_id][mention] + for entity_neg in entity_negs: + descr = id_to_descr.get(entity_neg) + if descr: + entity = "E_" + str(next_entity_nr) + "_" + cluster + next_entity_nr += 1 + gold_by_entity[entity] = 0 + desc_by_entity[entity] = descr + entities.add(entity) - article_by_entity = {k: v for k, v in article_by_entity.items() if k in entities} - text_by_article = {k: v for k, v in text_by_article.items() if k in article_by_entity.values()} + found_matches = 0 + if len(entities) > 1: + entities_by_cluster[cluster] = entities + + # find all matches in the doc for the mentions + # TODO: fix this - doesn't look like all entities are found + matcher = PhraseMatcher(self.nlp.vocab) + patterns = list(self.nlp.tokenizer.pipe([mention])) + + matcher.add("TerminologyList", None, *patterns) + matches = matcher(article_doc) + + + # store sentences + for match_id, start, end in matches: + found_matches += 1 + span = article_doc[start:end] + assert mention == span.text + sent_text = span.sent.text + sent_nr = 
sentence_by_text.get(sent_text, None) + if sent_nr is None: + sent_nr = "S_" + str(next_sent_nr) + article_id + next_sent_nr += 1 + text_by_sentence[sent_nr] = sent_text + sentence_by_text[sent_text] = sent_nr + article_by_cluster[cluster] = article_id + sentence_by_cluster[cluster] = sent_nr + + if found_matches == 0: + # TODO print("Could not find neg instances or sentence matches for", mention, "in", article_id) + entities_by_cluster.pop(cluster, None) + article_by_cluster.pop(cluster, None) + sentence_by_cluster.pop(cluster, None) + for entity in entities: + gold_by_entity.pop(entity, None) + desc_by_entity.pop(entity, None) - sentence_by_entity = {k: v for k, v in sentence_by_entity.items() if k in entities} - text_by_sentence = {k: v for k, v in text_by_sentence.items() if k in sentence_by_entity.values()} if to_print: print() print("Processed", cnt, "training articles, dev=" + str(dev)) print() - return list(entities), gold_by_entity, desc_by_entity, article_by_entity, text_by_article, \ - sentence_by_entity, text_by_sentence + return entities_by_cluster, gold_by_entity, desc_by_entity, article_by_cluster, text_by_article, \ + sentence_by_cluster, text_by_sentence diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 319b1e1c8..a24ff30c5 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: training", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=100, devlimit=20) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=1000, devlimit=100) print() # STEP 7: apply the EL algorithm on the dev dataset From a761929fa50365663c8e897c8e5664a22438b3bd Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 28 May 2019 18:14:49 +0200 Subject: [PATCH 059/148] context encoder combining sentence and article --- .../pipeline/wiki_entity_linking/train_el.py | 257 ++++++++++-------- .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- 2 files changed, 138 insertions(+), 121 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index ac8cae4a4..ea42f9ab6 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -11,11 +11,11 @@ from thinc.neural._classes.convolution import ExtractWindow from examples.pipeline.wiki_entity_linking import run_el, training_set_creator, kb_creator -from spacy._ml import SpacyVectors, create_default_optimizer, zero_init, logistic, Tok2Vec, cosine +from spacy._ml import SpacyVectors, create_default_optimizer, zero_init, cosine from thinc.api import chain, concatenate, flatten_add_lengths, clone, with_flatten -from thinc.v2v import Model, Maxout, Affine, ReLu -from thinc.t2v import Pooling, mean_pool, sum_pool +from thinc.v2v import Model, Maxout, Affine +from thinc.t2v import Pooling, mean_pool from thinc.t2t import ParametricAttention from thinc.misc import Residual from thinc.misc import LayerNorm as LN @@ -30,24 +30,21 @@ from spacy.tokens import Doc class EL_Model: PRINT_INSPECT = False - PRINT_TRAIN = True + PRINT_BATCH_LOSS = False EPS = 0.0000000005 - CUTOFF = 0.5 BATCH_SIZE = 5 - # UPSAMPLE = True DOC_CUTOFF = 300 # number 
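The training-data collection above relies on spaCy's `PhraseMatcher` to find each mention in the article and then takes `span.sent` to recover the sentence it occurs in. A self-contained example of that lookup, using the same spaCy 2.x call signature as the patch (the model name and example text are placeholders; any model with a parser works, since `.sent` needs sentence boundaries):

    import spacy
    from spacy.matcher import PhraseMatcher

    nlp = spacy.load("en_core_web_md")  # placeholder; the pipeline script loads this model
    doc = nlp("Douglas Adams wrote the guide. The guide made Adams famous.")

    matcher = PhraseMatcher(nlp.vocab)
    patterns = list(nlp.tokenizer.pipe(["Adams"]))
    matcher.add("TerminologyList", None, *patterns)

    for match_id, start, end in matcher(doc):
        span = doc[start:end]
        print(span.text, "->", span.sent.text)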
of characters from the doc context INPUT_DIM = 300 # dimension of pre-trained vectors HIDDEN_1_WIDTH = 32 - # HIDDEN_2_WIDTH = 32 # 6 DESC_WIDTH = 64 - ARTICLE_WIDTH = 64 + ARTICLE_WIDTH = 128 SENT_WIDTH = 64 DROP = 0.1 - LEARN_RATE = 0.0001 + LEARN_RATE = 0.001 EPOCHS = 10 L2 = 1e-6 @@ -61,13 +58,10 @@ class EL_Model: self._build_cnn(embed_width=self.INPUT_DIM, desc_width=self.DESC_WIDTH, article_width=self.ARTICLE_WIDTH, - sent_width=self.SENT_WIDTH, hidden_1_width=self.HIDDEN_1_WIDTH) + sent_width=self.SENT_WIDTH, + hidden_1_width=self.HIDDEN_1_WIDTH) def train_model(self, training_dir, entity_descr_output, trainlimit=None, devlimit=None, to_print=True): - # raise errors instead of runtime warnings in case of int/float overflow - # (not sure if we need this. set L2 to 0 because it throws an error otherwsise) - # np.seterr(all='raise') - # alternative: np.seterr(divide="raise", over="warn", under="ignore", invalid="raise") train_ent, train_gold, train_desc, train_art, train_art_texts, train_sent, train_sent_texts = \ @@ -101,21 +95,6 @@ class EL_Model: train_pos_count = len(train_pos_entities) train_neg_count = len(train_neg_entities) - # if self.UPSAMPLE: - # if to_print: - # print() - # print("Upsampling, original training instances pos/neg:", train_pos_count, train_neg_count) - # - # # upsample positives to 50-50 distribution - # while train_pos_count < train_neg_count: - # train_ent.append(random.choice(train_pos_entities)) - # train_pos_count += 1 - # - # upsample negatives to 50-50 distribution - # while train_neg_count < train_pos_count: - # train_ent.append(random.choice(train_neg_entities)) - # train_neg_count += 1 - self._begin_training() if to_print: @@ -126,24 +105,25 @@ class EL_Model: print("Dev test on", len(dev_clusters), "entity clusters in", len(dev_art_texts), "articles") print("Dev instances pos/neg:", dev_pos_count, dev_neg_count) print() - print(" CUTOFF", self.CUTOFF) print(" DOC_CUTOFF", self.DOC_CUTOFF) print(" INPUT_DIM", self.INPUT_DIM) - # print(" HIDDEN_1_WIDTH", self.HIDDEN_1_WIDTH) + print(" HIDDEN_1_WIDTH", self.HIDDEN_1_WIDTH) print(" DESC_WIDTH", self.DESC_WIDTH) print(" ARTICLE_WIDTH", self.ARTICLE_WIDTH) print(" SENT_WIDTH", self.SENT_WIDTH) - # print(" HIDDEN_2_WIDTH", self.HIDDEN_2_WIDTH) print(" DROP", self.DROP) print(" LEARNING RATE", self.LEARN_RATE) - print(" UPSAMPLE", self.UPSAMPLE) + print(" BATCH SIZE", self.BATCH_SIZE) print() - self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts, - print_string="dev_random", calc_random=True) + dev_random = self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts, + calc_random=True) + print("acc", "dev_random", round(dev_random, 2)) - self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts, - print_string="dev_pre", avg=True) + dev_pre = self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts, + avg=True) + print("acc", "dev_pre", round(dev_pre, 2)) + print() processed = 0 for i in range(self.EPOCHS): @@ -163,45 +143,58 @@ class EL_Model: start = start + self.BATCH_SIZE stop = min(stop + self.BATCH_SIZE, len(train_clusters)) - if self.PRINT_TRAIN: - print() - self._test_dev(train_ent, train_gold, train_desc, train_art, train_art_texts, train_sent, train_sent_texts, - print_string="train_inter_epoch " + str(i), avg=True) + train_acc = self._test_dev(train_ent, train_gold, train_desc, train_art, train_art_texts, train_sent, train_sent_texts, avg=True) + 
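The `dev_random`, `dev_pre` and per-epoch `train/dev` numbers printed above all come from the same cluster-level evaluation: each mention cluster contains exactly one gold candidate, and accuracy is the fraction of clusters where the chosen candidate is that gold one. Restated standalone (sketch; the cluster and entity IDs below are made up):

    import random

    def cluster_accuracy(entity_clusters, golds, choose):
        """choose(entities) returns the predicted entity for one candidate cluster."""
        correct = incorrect = 0
        for cluster, entities in entity_clusters.items():
            gold = [e for e in entities if golds[e]]
            assert len(gold) == 1
            if choose(list(entities)) in gold:
                correct += 1
            else:
                incorrect += 1
        return correct / (correct + incorrect) if (correct + incorrect) else 0.0

    clusters = {"art1_Adams": {"e1", "e2", "e3"}, "art2_Guide": {"e4", "e5"}}
    golds = {"e1": 1, "e2": 0, "e3": 0, "e4": 0, "e5": 1}
    print(cluster_accuracy(clusters, golds, choose=random.choice))  # random baseline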
dev_acc = self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts, avg=True) - self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts, - print_string="dev_inter_epoch " + str(i), avg=True) + print(i, "acc train/dev", round(train_acc, 2), round(dev_acc, 2)) if to_print: print() print("Trained on", processed, "entity clusters across", self.EPOCHS, "epochs") - def _test_dev(self, entity_clusters, golds, descs, arts, art_texts, sents, sent_texts, - print_string, avg=True, calc_random=False): - + def _test_dev(self, entity_clusters, golds, descs, arts, art_texts, sents, sent_texts, avg=True, calc_random=False): correct = 0 incorrect = 0 - for cluster, entities in entity_clusters.items(): - correct_entities = [e for e in entities if golds[e]] - incorrect_entities = [e for e in entities if not golds[e]] - assert len(correct_entities) == 1 + if calc_random: + for cluster, entities in entity_clusters.items(): + correct_entities = [e for e in entities if golds[e]] + assert len(correct_entities) == 1 - entities = list(entities) - shuffle(entities) + entities = list(entities) + shuffle(entities) - if calc_random: - predicted_entity = random.choice(entities) - if predicted_entity in correct_entities: - correct += 1 - else: - incorrect += 1 + if calc_random: + predicted_entity = random.choice(entities) + if predicted_entity in correct_entities: + correct += 1 + else: + incorrect += 1 + + else: + all_clusters = list() + arts_list = list() + sents_list = list() + + for cluster in entity_clusters.keys(): + all_clusters.append(cluster) + arts_list.append(art_texts[arts[cluster]]) + sents_list.append(sent_texts[sents[cluster]]) + + art_docs = list(self.nlp.pipe(arts_list)) + sent_docs = list(self.nlp.pipe(sents_list)) + + for i, cluster in enumerate(all_clusters): + entities = entity_clusters[cluster] + correct_entities = [e for e in entities if golds[e]] + assert len(correct_entities) == 1 + + entities = list(entities) + shuffle(entities) - else: desc_docs = self.nlp.pipe([descs[e] for e in entities]) - # article_texts = [art_texts[arts[e]] for e in entities] - - sent_doc = self.nlp(sent_texts[sents[cluster]]) - article_doc = self.nlp(art_texts[arts[cluster]]) + sent_doc = sent_docs[i] + article_doc = art_docs[i] predicted_index = self._predict(article_doc=article_doc, sent_doc=sent_doc, desc_docs=desc_docs, avg=avg) @@ -211,52 +204,56 @@ class EL_Model: incorrect += 1 if correct == incorrect == 0: - print("acc", print_string, "NA") return 0 acc = correct / (correct + incorrect) - print("acc", print_string, round(acc, 2)) return acc def _predict(self, article_doc, sent_doc, desc_docs, avg=True, apply_threshold=True): + # print() + # print("predicting article") + if avg: with self.article_encoder.use_params(self.sgd_article.averages) \ and self.desc_encoder.use_params(self.sgd_desc.averages)\ - and self.sent_encoder.use_params(self.sgd_sent.averages): - # doc_encoding = self.article_encoder(article_doc) + and self.sent_encoder.use_params(self.sgd_sent.averages)\ + and self.cont_encoder.use_params(self.sgd_cont.averages): desc_encodings = self.desc_encoder(desc_docs) + doc_encoding = self.article_encoder([article_doc]) sent_encoding = self.sent_encoder([sent_doc]) else: - # doc_encodings = self.article_encoder(article_docs) desc_encodings = self.desc_encoder(desc_docs) + doc_encoding = self.article_encoder([article_doc]) sent_encoding = self.sent_encoder([sent_doc]) - sent_enc = np.transpose(sent_encoding) + # print("desc_encodings", 
desc_encodings) + # print("doc_encoding", doc_encoding) + # print("sent_encoding", sent_encoding) + concat_encoding = [list(doc_encoding[0]) + list(sent_encoding[0])] + # print("concat_encoding", concat_encoding) + + cont_encodings = self.cont_encoder(np.asarray([concat_encoding[0]])) + # print("cont_encodings", cont_encodings) + context_enc = np.transpose(cont_encodings) + # print("context_enc", context_enc) + highest_sim = -5 best_i = -1 for i, desc_enc in enumerate(desc_encodings): - sim = cosine(desc_enc, sent_enc) + sim = cosine(desc_enc, context_enc) if sim >= highest_sim: best_i = i highest_sim = sim return best_i - def _predict_random(self, entities, apply_threshold=True): - if not apply_threshold: - return [float(random.uniform(0, 1)) for _ in entities] - else: - return [float(1.0) if random.uniform(0, 1) > self.CUTOFF else float(0.0) for _ in entities] - def _build_cnn(self, embed_width, desc_width, article_width, sent_width, hidden_1_width): - with Model.define_operators({">>": chain, "|": concatenate, "**": clone}): - self.desc_encoder = self._encoder(in_width=embed_width, hidden_with=hidden_1_width, - end_width=desc_width) - self.article_encoder = self._encoder(in_width=embed_width, hidden_with=hidden_1_width, - end_width=article_width) - self.sent_encoder = self._encoder(in_width=embed_width, hidden_with=hidden_1_width, - end_width=sent_width) + self.desc_encoder = self._encoder(in_width=embed_width, hidden_with=hidden_1_width, end_width=desc_width) + self.cont_encoder = self._context_encoder(embed_width=embed_width, article_width=article_width, + sent_width=sent_width, hidden_width=hidden_1_width, + end_width=desc_width) + # def _encoder(self, width): # tok2vec = Tok2Vec(width=width, embed_size=2000, pretrained_vectors=self.nlp.vocab.vectors.name, cnn_maxout_pieces=3, @@ -264,12 +261,19 @@ class EL_Model: # # return tok2vec >> flatten_add_lengths >> Pooling(mean_pool) + def _context_encoder(self, embed_width, article_width, sent_width, hidden_width, end_width): + self.article_encoder = self._encoder(in_width=embed_width, hidden_with=hidden_width, end_width=article_width) + self.sent_encoder = self._encoder(in_width=embed_width, hidden_with=hidden_width, end_width=sent_width) + + model = Affine(end_width, article_width+sent_width, drop_factor=0.0) + return model + @staticmethod def _encoder(in_width, hidden_with, end_width): conv_depth = 2 cnn_maxout_pieces = 3 - with Model.define_operators({">>": chain}): + with Model.define_operators({">>": chain, "**": clone}): convolution = Residual((ExtractWindow(nW=1) >> LN(Maxout(hidden_with, hidden_with * 3, pieces=cnn_maxout_pieces)))) @@ -295,62 +299,75 @@ class EL_Model: self.sgd_sent.learn_rate = self.LEARN_RATE self.sgd_sent.L2 = self.L2 + self.sgd_cont = create_default_optimizer(self.cont_encoder.ops) + self.sgd_cont.learn_rate = self.LEARN_RATE + self.sgd_cont.L2 = self.L2 + self.sgd_desc = create_default_optimizer(self.desc_encoder.ops) self.sgd_desc.learn_rate = self.LEARN_RATE self.sgd_desc.L2 = self.L2 - # self.sgd = create_default_optimizer(self.model.ops) - # self.sgd.learn_rate = self.LEARN_RATE - # self.sgd.L2 = self.L2 - @staticmethod def get_loss(predictions, golds): loss, gradients = get_cossim_loss(predictions, golds) return loss, gradients def update(self, entity_clusters, golds, descs, art_texts, arts, sent_texts, sents): + all_clusters = list(entity_clusters.keys()) + + arts_list = list() + sents_list = list() + descs_list = list() + for cluster, entities in entity_clusters.items(): - correct_entities = [e for e 
in entities if golds[e]] - incorrect_entities = [e for e in entities if not golds[e]] - - assert len(correct_entities) == 1 - entities = list(entities) - shuffle(entities) - - # article_text = art_texts[arts[cluster]] - cluster_sent = sent_texts[sents[cluster]] - - # art_docs = self.nlp.pipe(article_text) - sent_doc = self.nlp(cluster_sent) - + art = art_texts[arts[cluster]] + sent = sent_texts[sents[cluster]] for e in entities: + # TODO: more appropriate loss for the whole cluster (currently only pos entities) if golds[e]: - # TODO: more appropriate loss for the whole cluster (currently only pos entities) - # TODO: speed up - desc_doc = self.nlp(descs[e]) + arts_list.append(art) + sents_list.append(sent) + descs_list.append(descs[e]) - # doc_encodings, bp_doc = self.article_encoder.begin_update(art_docs, drop=self.DROP) - sent_encodings, bp_sent = self.sent_encoder.begin_update([sent_doc], drop=self.DROP) - desc_encodings, bp_desc = self.desc_encoder.begin_update([desc_doc], drop=self.DROP) + desc_docs = self.nlp.pipe(descs_list) + desc_encodings, bp_desc = self.desc_encoder.begin_update(desc_docs, drop=self.DROP) - sent_encoding = sent_encodings[0] - desc_encoding = desc_encodings[0] + art_docs = self.nlp.pipe(arts_list) + sent_docs = self.nlp.pipe(sents_list) - sent_enc = self.sent_encoder.ops.asarray([sent_encoding]) - desc_enc = self.sent_encoder.ops.asarray([desc_encoding]) + doc_encodings, bp_doc = self.article_encoder.begin_update(art_docs, drop=self.DROP) + sent_encodings, bp_sent = self.sent_encoder.begin_update(sent_docs, drop=self.DROP) - # print("sent_encoding", type(sent_encoding), sent_encoding) - # print("desc_encoding", type(desc_encoding), desc_encoding) - # print("getting los for entity", e) + concat_encodings = [list(doc_encodings[i]) + list(sent_encodings[i]) for i in + range(len(all_clusters))] + cont_encodings, bp_cont = self.cont_encoder.begin_update(np.asarray(concat_encodings), drop=self.DROP) - loss, gradient = self.get_loss(sent_enc, desc_enc) + # print("sent_encodings", type(sent_encodings), sent_encodings) + # print("desc_encodings", type(desc_encodings), desc_encodings) + # print("doc_encodings", type(doc_encodings), doc_encodings) + # print("getting los for", len(arts_list), "entities") - # print("gradient", gradient) - # print("loss", loss) + loss, gradient = self.get_loss(cont_encodings, desc_encodings) - bp_sent(gradient, sgd=self.sgd_sent) - # bp_desc(desc_gradients, sgd=self.sgd_desc) TODO - # print() + # print("gradient", gradient) + if self.PRINT_BATCH_LOSS: + print("batch loss", loss) + + context_gradient = bp_cont(gradient, sgd=self.sgd_cont) + + # gradient : concat (doc+sent) vs. 
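In the batched `update` above, the article encoding and the sentence encoding are concatenated and pushed through the affine context encoder, so the gradient coming back from the loss is a single vector over the concatenation; the lines that follow slice it back into an article part and a sentence part before calling each encoder's backprop. The forward/backward bookkeeping in miniature (NumPy, toy widths):

    import numpy as np

    ARTICLE_WIDTH, SENT_WIDTH, DESC_WIDTH = 3, 2, 4
    rng = np.random.RandomState(0)
    W = rng.normal(size=(DESC_WIDTH, ARTICLE_WIDTH + SENT_WIDTH)).astype("float32")
    b = np.zeros(DESC_WIDTH, dtype="float32")

    def context_encoder(doc_enc, sent_enc):
        """Affine projection of the concatenated (article, sentence) encodings."""
        x = np.concatenate([doc_enc, sent_enc])
        return W @ x + b

    doc_enc = rng.normal(size=ARTICLE_WIDTH).astype("float32")
    sent_enc = rng.normal(size=SENT_WIDTH).astype("float32")
    out = context_encoder(doc_enc, sent_enc)

    d_out = np.ones(DESC_WIDTH, dtype="float32")   # stand-in gradient from the loss
    d_x = W.T @ d_out                              # gradient w.r.t. the concatenation
    d_doc, d_sent = d_x[:ARTICLE_WIDTH], d_x[ARTICLE_WIDTH:]  # routed to each encoder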
desc + sent_start = self.ARTICLE_WIDTH + sent_gradients = list() + doc_gradients = list() + for x in context_gradient: + doc_gradients.append(list(x[0:sent_start])) + sent_gradients.append(list(x[sent_start:])) + + # print("doc_gradients", doc_gradients) + # print("sent_gradients", sent_gradients) + + bp_doc(doc_gradients, sgd=self.sgd_article) + bp_sent(sent_gradients, sgd=self.sgd_sent) def _get_training_data(self, training_dir, entity_descr_output, dev, limit, to_print): id_to_descr = kb_creator._get_id_to_description(entity_descr_output) diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index a24ff30c5..25c1e4721 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: training", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=1000, devlimit=100) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=5000, devlimit=100) print() # STEP 7: apply the EL algorithm on the dev dataset From 268a52ead7bbad21a22df11e9446971102193bcf Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 29 May 2019 16:07:53 +0200 Subject: [PATCH 060/148] experimenting with cosine sim for negative examples (not OK yet) --- .../pipeline/wiki_entity_linking/train_el.py | 44 ++++++++++++++++--- .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- 2 files changed, 38 insertions(+), 8 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index ea42f9ab6..ba8a6a6c9 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -8,6 +8,7 @@ import numpy as np import random from random import shuffle from thinc.neural._classes.convolution import ExtractWindow +from thinc.neural.util import get_array_module from examples.pipeline.wiki_entity_linking import run_el, training_set_creator, kb_creator @@ -20,7 +21,7 @@ from thinc.t2t import ParametricAttention from thinc.misc import Residual from thinc.misc import LayerNorm as LN -from spacy.cli.pretrain import get_cossim_loss +# from spacy.cli.pretrain import get_cossim_loss from spacy.matcher import PhraseMatcher from spacy.tokens import Doc @@ -307,27 +308,56 @@ class EL_Model: self.sgd_desc.learn_rate = self.LEARN_RATE self.sgd_desc.L2 = self.L2 - @staticmethod - def get_loss(predictions, golds): - loss, gradients = get_cossim_loss(predictions, golds) + def get_loss(self, v1, v2, targets): + loss, gradients = self.get_cossim_loss(v1, v2, targets) return loss, gradients + def get_cossim_loss(self, yh, y, t): + # Add a small constant to avoid 0 vectors + # print() + # print("yh", yh) + # print("y", y) + # print("t", t) + yh = yh + 1e-8 + y = y + 1e-8 + # https://math.stackexchange.com/questions/1923613/partial-derivative-of-cosine-similarity + xp = get_array_module(yh) + norm_yh = xp.linalg.norm(yh, axis=1, keepdims=True) + norm_y = xp.linalg.norm(y, axis=1, keepdims=True) + mul_norms = norm_yh * norm_y + cos = (yh * y).sum(axis=1, keepdims=True) / mul_norms + # print("cos", cos) + d_yh = (y / mul_norms) - (cos * (yh / norm_yh ** 2)) + # print("abs", xp.abs(cos - t)) + loss = xp.abs(cos - t).sum() + # print("loss", loss) + # print("d_yh", 
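The custom `get_cossim_loss` above generalises the cosine loss to signed targets: t = +1 pulls a context encoding towards its description, t = -1 pushes it away, and the returned gradient is -t times the derivative of the cosine, which for t in {+1, -1} is the derivative of |cos - t|. A standalone NumPy restatement, with a toy check:

    import numpy as np

    def cossim_loss(yh, y, t):
        """Cosine-similarity loss with targets t in {+1, -1}; rows of yh and y are vectors."""
        yh = yh + 1e-8          # avoid zero vectors
        y = y + 1e-8
        norm_yh = np.linalg.norm(yh, axis=1, keepdims=True)
        norm_y = np.linalg.norm(y, axis=1, keepdims=True)
        cos = (yh * y).sum(axis=1, keepdims=True) / (norm_yh * norm_y)
        d_cos = y / (norm_yh * norm_y) - cos * yh / norm_yh ** 2   # d(cos)/d(yh)
        loss = np.abs(cos - t).sum()
        # for t = +1 this is -d_cos (minimising 1 - cos); for t = -1 it is +d_cos (minimising cos + 1)
        d_yh = -t * d_cos
        return loss, d_yh

    yh = np.array([[1.0, 0.0]])
    y = np.array([[0.0, 1.0]])
    loss, grad = cossim_loss(yh, y, np.array([[1.0]]))
    print(loss, grad)           # descending along grad moves the cosine towards +1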
d_yh) + inverse = np.asarray([int(t[i][0]) * d_yh[i] for i in range(len(t))]) + # print("inverse", inverse) + return loss, -inverse + def update(self, entity_clusters, golds, descs, art_texts, arts, sent_texts, sents): all_clusters = list(entity_clusters.keys()) arts_list = list() sents_list = list() descs_list = list() + targets = list() for cluster, entities in entity_clusters.items(): art = art_texts[arts[cluster]] sent = sent_texts[sents[cluster]] for e in entities: - # TODO: more appropriate loss for the whole cluster (currently only pos entities) if golds[e]: arts_list.append(art) sents_list.append(sent) descs_list.append(descs[e]) + targets.append([1]) + else: + arts_list.append(art) + sents_list.append(sent) + descs_list.append(descs[e]) + targets.append([-1]) desc_docs = self.nlp.pipe(descs_list) desc_encodings, bp_desc = self.desc_encoder.begin_update(desc_docs, drop=self.DROP) @@ -339,7 +369,7 @@ class EL_Model: sent_encodings, bp_sent = self.sent_encoder.begin_update(sent_docs, drop=self.DROP) concat_encodings = [list(doc_encodings[i]) + list(sent_encodings[i]) for i in - range(len(all_clusters))] + range(len(targets))] cont_encodings, bp_cont = self.cont_encoder.begin_update(np.asarray(concat_encodings), drop=self.DROP) # print("sent_encodings", type(sent_encodings), sent_encodings) @@ -347,7 +377,7 @@ class EL_Model: # print("doc_encodings", type(doc_encodings), doc_encodings) # print("getting los for", len(arts_list), "entities") - loss, gradient = self.get_loss(cont_encodings, desc_encodings) + loss, gradient = self.get_loss(cont_encodings, desc_encodings, targets) # print("gradient", gradient) if self.PRINT_BATCH_LOSS: diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 25c1e4721..a24ff30c5 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: training", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=5000, devlimit=100) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=1000, devlimit=100) print() # STEP 7: apply the EL algorithm on the dev dataset From 9e88763dab895d7ee86a21d78c0e2c950e8d6850 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 3 Jun 2019 08:04:49 +0200 Subject: [PATCH 061/148] 60% acc run --- .../pipeline/wiki_entity_linking/train_el.py | 159 ++++++++---------- .../wiki_entity_linking/wiki_nel_pipeline.py | 3 +- 2 files changed, 74 insertions(+), 88 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index ba8a6a6c9..a2db2dc95 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -23,7 +23,6 @@ from thinc.misc import LayerNorm as LN # from spacy.cli.pretrain import get_cossim_loss from spacy.matcher import PhraseMatcher -from spacy.tokens import Doc """ TODO: this code needs to be implemented in pipes.pyx""" @@ -46,7 +45,7 @@ class EL_Model: DROP = 0.1 LEARN_RATE = 0.001 - EPOCHS = 10 + EPOCHS = 20 L2 = 1e-6 name = "entity_linker" @@ -211,9 +210,6 @@ class EL_Model: return acc def _predict(self, article_doc, sent_doc, desc_docs, avg=True, apply_threshold=True): - # print() - # print("predicting 
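The `update` rewrite above is mostly bookkeeping: each candidate cluster contributes one (article, sentence, description) triple per candidate, with target +1 for the gold entity and -1 for the negatives, and those parallel lists are what get encoded in one batch. The bookkeeping on its own (sketch; note that the next patch, "60% acc run", drops the -1 rows again and trains on the gold candidates only):

    def make_training_rows(entity_clusters, golds, descs, art_texts, arts, sent_texts, sents):
        """Flatten candidate clusters into parallel lists for batched encoding."""
        arts_list, sents_list, descs_list, targets = [], [], [], []
        for cluster, entities in entity_clusters.items():
            art = art_texts[arts[cluster]]
            sent = sent_texts[sents[cluster]]
            for e in entities:
                arts_list.append(art)
                sents_list.append(sent)
                descs_list.append(descs[e])
                targets.append([1] if golds[e] else [-1])
        return arts_list, sents_list, descs_list, targets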
article") - if avg: with self.article_encoder.use_params(self.sgd_article.averages) \ and self.desc_encoder.use_params(self.sgd_desc.averages)\ @@ -228,16 +224,10 @@ class EL_Model: doc_encoding = self.article_encoder([article_doc]) sent_encoding = self.sent_encoder([sent_doc]) - # print("desc_encodings", desc_encodings) - # print("doc_encoding", doc_encoding) - # print("sent_encoding", sent_encoding) concat_encoding = [list(doc_encoding[0]) + list(sent_encoding[0])] - # print("concat_encoding", concat_encoding) cont_encodings = self.cont_encoder(np.asarray([concat_encoding[0]])) - # print("cont_encodings", cont_encodings) context_enc = np.transpose(cont_encodings) - # print("context_enc", context_enc) highest_sim = -5 best_i = -1 @@ -353,11 +343,11 @@ class EL_Model: sents_list.append(sent) descs_list.append(descs[e]) targets.append([1]) - else: - arts_list.append(art) - sents_list.append(sent) - descs_list.append(descs[e]) - targets.append([-1]) + # else: + # arts_list.append(art) + # sents_list.append(sent) + # descs_list.append(descs[e]) + # targets.append([-1]) desc_docs = self.nlp.pipe(descs_list) desc_encodings, bp_desc = self.desc_encoder.begin_update(desc_docs, drop=self.DROP) @@ -372,18 +362,17 @@ class EL_Model: range(len(targets))] cont_encodings, bp_cont = self.cont_encoder.begin_update(np.asarray(concat_encodings), drop=self.DROP) - # print("sent_encodings", type(sent_encodings), sent_encodings) - # print("desc_encodings", type(desc_encodings), desc_encodings) - # print("doc_encodings", type(doc_encodings), doc_encodings) - # print("getting los for", len(arts_list), "entities") + loss, cont_gradient = self.get_loss(cont_encodings, desc_encodings, targets) - loss, gradient = self.get_loss(cont_encodings, desc_encodings, targets) + # loss, desc_gradient = self.get_loss(desc_encodings, cont_encodings, targets) + # cont_gradient = cont_gradient / 2 + # desc_gradient = desc_gradient / 2 + # bp_desc(desc_gradient, sgd=self.sgd_desc) - # print("gradient", gradient) if self.PRINT_BATCH_LOSS: print("batch loss", loss) - context_gradient = bp_cont(gradient, sgd=self.sgd_cont) + context_gradient = bp_cont(cont_gradient, sgd=self.sgd_cont) # gradient : concat (doc+sent) vs. 
desc sent_start = self.ARTICLE_WIDTH @@ -393,9 +382,6 @@ class EL_Model: doc_gradients.append(list(x[0:sent_start])) sent_gradients.append(list(x[sent_start:])) - # print("doc_gradients", doc_gradients) - # print("sent_gradients", sent_gradients) - bp_doc(doc_gradients, sgd=self.sgd_article) bp_sent(sent_gradients, sgd=self.sgd_sent) @@ -426,74 +412,75 @@ class EL_Model: article_id = f.replace(".txt", "") if cnt % 500 == 0 and to_print: print(datetime.datetime.now(), "processed", cnt, "files in the training dataset") - cnt += 1 - # parse the article text - with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file: - text = file.read() - article_doc = self.nlp(text) - truncated_text = text[0:min(self.DOC_CUTOFF, len(text))] - text_by_article[article_id] = truncated_text + try: + # parse the article text + with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file: + text = file.read() + article_doc = self.nlp(text) + truncated_text = text[0:min(self.DOC_CUTOFF, len(text))] + text_by_article[article_id] = truncated_text - # process all positive and negative entities, collect all relevant mentions in this article - for mention, entity_pos in correct_entries[article_id].items(): - cluster = article_id + "_" + mention - descr = id_to_descr.get(entity_pos) - entities = set() - if descr: - entity = "E_" + str(next_entity_nr) + "_" + cluster - next_entity_nr += 1 - gold_by_entity[entity] = 1 - desc_by_entity[entity] = descr - entities.add(entity) + # process all positive and negative entities, collect all relevant mentions in this article + for mention, entity_pos in correct_entries[article_id].items(): + cluster = article_id + "_" + mention + descr = id_to_descr.get(entity_pos) + entities = set() + if descr: + entity = "E_" + str(next_entity_nr) + "_" + cluster + next_entity_nr += 1 + gold_by_entity[entity] = 1 + desc_by_entity[entity] = descr + entities.add(entity) - entity_negs = incorrect_entries[article_id][mention] - for entity_neg in entity_negs: - descr = id_to_descr.get(entity_neg) - if descr: - entity = "E_" + str(next_entity_nr) + "_" + cluster - next_entity_nr += 1 - gold_by_entity[entity] = 0 - desc_by_entity[entity] = descr - entities.add(entity) + entity_negs = incorrect_entries[article_id][mention] + for entity_neg in entity_negs: + descr = id_to_descr.get(entity_neg) + if descr: + entity = "E_" + str(next_entity_nr) + "_" + cluster + next_entity_nr += 1 + gold_by_entity[entity] = 0 + desc_by_entity[entity] = descr + entities.add(entity) - found_matches = 0 - if len(entities) > 1: - entities_by_cluster[cluster] = entities + found_matches = 0 + if len(entities) > 1: + entities_by_cluster[cluster] = entities - # find all matches in the doc for the mentions - # TODO: fix this - doesn't look like all entities are found - matcher = PhraseMatcher(self.nlp.vocab) - patterns = list(self.nlp.tokenizer.pipe([mention])) + # find all matches in the doc for the mentions + # TODO: fix this - doesn't look like all entities are found + matcher = PhraseMatcher(self.nlp.vocab) + patterns = list(self.nlp.tokenizer.pipe([mention])) - matcher.add("TerminologyList", None, *patterns) - matches = matcher(article_doc) + matcher.add("TerminologyList", None, *patterns) + matches = matcher(article_doc) + # store sentences + for match_id, start, end in matches: + span = article_doc[start:end] + if mention == span.text: + found_matches += 1 + sent_text = span.sent.text + sent_nr = sentence_by_text.get(sent_text, None) + if sent_nr is None: + sent_nr = "S_" + str(next_sent_nr) 
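A minimal, standalone sketch of the PhraseMatcher lookup that _get_training_data uses above to tie a mention string to the sentence it occurs in. The text, the mention and the "MENTION" key are invented for the example, and a plain sentencizer stands in for the full en_core_web_md pipeline; the matcher.add() call follows the v2-style signature used in this patch.

import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
nlp.add_pipe(nlp.create_pipe("sentencizer"))   # needed for span.sent below

text = "Douglas Adams wrote the book. The book made Douglas Adams famous."
mention = "Douglas Adams"

doc = nlp(text)
matcher = PhraseMatcher(nlp.vocab)
patterns = list(nlp.tokenizer.pipe([mention]))
matcher.add("MENTION", None, *patterns)

for match_id, start, end in matcher(doc):
    span = doc[start:end]
    print(span.text, "->", span.sent.text)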
+ article_id + next_sent_nr += 1 + text_by_sentence[sent_nr] = sent_text + sentence_by_text[sent_text] = sent_nr + article_by_cluster[cluster] = article_id + sentence_by_cluster[cluster] = sent_nr - # store sentences - for match_id, start, end in matches: - found_matches += 1 - span = article_doc[start:end] - assert mention == span.text - sent_text = span.sent.text - sent_nr = sentence_by_text.get(sent_text, None) - if sent_nr is None: - sent_nr = "S_" + str(next_sent_nr) + article_id - next_sent_nr += 1 - text_by_sentence[sent_nr] = sent_text - sentence_by_text[sent_text] = sent_nr - article_by_cluster[cluster] = article_id - sentence_by_cluster[cluster] = sent_nr - - if found_matches == 0: - # TODO print("Could not find neg instances or sentence matches for", mention, "in", article_id) - entities_by_cluster.pop(cluster, None) - article_by_cluster.pop(cluster, None) - sentence_by_cluster.pop(cluster, None) - for entity in entities: - gold_by_entity.pop(entity, None) - desc_by_entity.pop(entity, None) - + if found_matches == 0: + # print("Could not find neg instances or sentence matches for", mention, "in", article_id) + entities_by_cluster.pop(cluster, None) + article_by_cluster.pop(cluster, None) + sentence_by_cluster.pop(cluster, None) + for entity in entities: + gold_by_entity.pop(entity, None) + desc_by_entity.pop(entity, None) + cnt += 1 + except: + print("Problem parsing article", article_id) if to_print: print() diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index a24ff30c5..2ebf9973e 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: training", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=1000, devlimit=100) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=10000, devlimit=500) print() # STEP 7: apply the EL algorithm on the dev dataset @@ -120,7 +120,6 @@ if __name__ == "__main__": run_el.run_el_dev(kb=my_kb, nlp=my_nlp, training_dir=TRAINING_DIR, limit=2000) print() - # TODO coreference resolution # add_coref() From fb37cdb2d30a6ac3a66df9cddb39951c4bcc93e8 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 3 Jun 2019 21:32:54 +0200 Subject: [PATCH 062/148] implementing el pipe in pipes.pyx (not tested yet) --- .../pipeline/wiki_entity_linking/train_el.py | 21 +-- .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- spacy/_ml.py | 22 +++ spacy/pipeline/pipes.pyx | 156 ++++++++++++++---- 4 files changed, 160 insertions(+), 41 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index a2db2dc95..b9a0dc843 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -24,8 +24,6 @@ from thinc.misc import LayerNorm as LN # from spacy.cli.pretrain import get_cossim_loss from spacy.matcher import PhraseMatcher -""" TODO: this code needs to be implemented in pipes.pyx""" - class EL_Model: @@ -45,7 +43,7 @@ class EL_Model: DROP = 0.1 LEARN_RATE = 0.001 - EPOCHS = 20 + EPOCHS = 5 L2 = 1e-6 name = "entity_linker" @@ -213,8 +211,7 @@ class EL_Model: if avg: with self.article_encoder.use_params(self.sgd_article.averages) \ and 
self.desc_encoder.use_params(self.sgd_desc.averages)\ - and self.sent_encoder.use_params(self.sgd_sent.averages)\ - and self.cont_encoder.use_params(self.sgd_cont.averages): + and self.sent_encoder.use_params(self.sgd_sent.averages): desc_encodings = self.desc_encoder(desc_docs) doc_encoding = self.article_encoder([article_doc]) sent_encoding = self.sent_encoder([sent_doc]) @@ -226,7 +223,13 @@ class EL_Model: concat_encoding = [list(doc_encoding[0]) + list(sent_encoding[0])] - cont_encodings = self.cont_encoder(np.asarray([concat_encoding[0]])) + if avg: + with self.cont_encoder.use_params(self.sgd_cont.averages): + cont_encodings = self.cont_encoder(np.asarray([concat_encoding[0]])) + + else: + cont_encodings = self.cont_encoder(np.asarray([concat_encoding[0]])) + context_enc = np.transpose(cont_encodings) highest_sim = -5 @@ -298,8 +301,8 @@ class EL_Model: self.sgd_desc.learn_rate = self.LEARN_RATE self.sgd_desc.L2 = self.L2 - def get_loss(self, v1, v2, targets): - loss, gradients = self.get_cossim_loss(v1, v2, targets) + def get_loss(self, pred, gold, targets): + loss, gradients = self.get_cossim_loss(pred, gold, targets) return loss, gradients def get_cossim_loss(self, yh, y, t): @@ -327,8 +330,6 @@ class EL_Model: return loss, -inverse def update(self, entity_clusters, golds, descs, art_texts, arts, sent_texts, sents): - all_clusters = list(entity_clusters.keys()) - arts_list = list() sents_list = list() descs_list = list() diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 2ebf9973e..40d737a6f 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -111,7 +111,7 @@ if __name__ == "__main__": print("STEP 6: training", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=10000, devlimit=500) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=50, devlimit=20) print() # STEP 7: apply the EL algorithm on the dev dataset diff --git a/spacy/_ml.py b/spacy/_ml.py index 349b88df9..29772c5ee 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -652,6 +652,28 @@ def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False, return model +def build_nel_encoder(in_width, hidden_width, end_width, **cfg): + conv_depth = cfg.get("conv_depth", 2) + cnn_maxout_pieces = cfg.get("cnn_maxout_pieces", 3) + + with Model.define_operators({">>": chain, "**": clone}): + convolution = Residual((ExtractWindow(nW=1) >> + LN(Maxout(hidden_width, hidden_width * 3, pieces=cnn_maxout_pieces)))) + + encoder = SpacyVectors \ + >> with_flatten(LN(Maxout(hidden_width, in_width)) >> convolution ** conv_depth, pad=conv_depth) \ + >> flatten_add_lengths \ + >> ParametricAttention(hidden_width) \ + >> Pooling(mean_pool) \ + >> Residual(zero_init(Maxout(hidden_width, hidden_width))) \ + >> zero_init(Affine(end_width, hidden_width, drop_factor=0.0)) + + # TODO: ReLu or LN(Maxout) ? + # sum_pool or mean_pool ? 
+ + encoder.nO = end_width + return encoder + @layerize def flatten(seqs, drop=0.0): ops = Model.ops diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 69521c1b2..c8afd431e 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -13,6 +13,8 @@ from thinc.v2v import Affine, Maxout, Softmax from thinc.misc import LayerNorm from thinc.neural.util import to_categorical, copy_array +from spacy.cli.pretrain import get_cossim_loss + from ..tokens.doc cimport Doc from ..syntax.nn_parser cimport Parser from ..syntax.ner cimport BiluoPushDown @@ -23,14 +25,17 @@ from ..vocab cimport Vocab from ..syntax import nonproj from ..attrs import POS, ID from ..parts_of_speech import X -from .._ml import Tok2Vec, build_tagger_model +from .._ml import Tok2Vec, build_tagger_model, cosine from .._ml import build_text_classifier, build_simple_cnn_text_classifier -from .._ml import build_bow_text_classifier +from .._ml import build_bow_text_classifier, build_nel_encoder from .._ml import link_vectors_to_models, zero_init, flatten from .._ml import masked_language_model, create_default_optimizer from ..errors import Errors, TempErrors from .. import util +# TODO: remove +from examples.pipeline.wiki_entity_linking import kb_creator + def _load_cfg(path): if path.exists(): @@ -1065,50 +1070,141 @@ class EntityLinker(Pipe): name = 'entity_linker' @classmethod - def Model(cls, nr_class=1, **cfg): - # TODO: non-dummy EL implementation - return None + def Model(cls, **cfg): + embed_width = cfg.get("embed_width", 300) + hidden_width = cfg.get("hidden_width", 32) + entity_width = cfg.get("entity_width", 64) + article_width = cfg.get("article_width", 128) + sent_width = cfg.get("sent_width", 64) + + entity_encoder = build_nel_encoder(in_width=embed_width, hidden_with=hidden_width, end_width=entity_width) + + article_encoder = build_nel_encoder(in_width=embed_width, hidden_with=hidden_width, end_width=article_width) + sent_encoder = build_nel_encoder(in_width=embed_width, hidden_with=hidden_width, end_width=sent_width) + + # dimension of the mention encoder needs to match the dimension of the entity encoder + mention_width = entity_encoder.nO + mention_encoder = Affine(entity_width, mention_width, drop_factor=0.0) + + return entity_encoder, article_encoder, sent_encoder, mention_encoder + + def __init__(self, **cfg): + # TODO: bring-your-own-model + self.mention_encoder = True - def __init__(self, model=True, **cfg): - self.model = False self.cfg = dict(cfg) self.kb = self.cfg["kb"] + # TODO: fix this. store entity vectors in the KB ? 
+ self.id_to_descr = kb_creator._get_id_to_description('C:/Users/Sofie/Documents/data/wikipedia/entity_descriptions.csv') + + def use_avg_params(self): + """Modify the pipe's encoders/models, to use their average parameter values.""" + with self.article_encoder.use_params(self.sgd_article.averages) \ + and self.entity_encoder.use_params(self.sgd_entity.averages)\ + and self.sent_encoder.use_params(self.sgd_sent.averages) \ + and self.mention_encoder.use_params(self.sgd_mention.averages): + yield + + def require_model(self): + """Raise an error if the component's model is not initialized.""" + if getattr(self, "mention_encoder", None) in (None, True, False): + raise ValueError(Errors.E109.format(name=self.name)) + + def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs): + if self.mention_encoder is True: + self.entity_encoder, self.article_encoder, self.sent_encoder, self.mention_encoder = self.Model(**self.cfg) + self.sgd_article = create_default_optimizer(self.article_encoder.ops) + self.sgd_sent = create_default_optimizer(self.sent_encoder.ops) + self.sgd_mention = create_default_optimizer(self.mention_encoder.ops) + self.sgd_entity = create_default_optimizer(self.entity_encoder.ops) + + def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None): + """ docs should be a tuple of (entity_docs, article_docs, sentence_docs) """ + self.require_model() + + entity_docs, article_docs, sentence_docs = docs + assert len(entity_docs) == len(article_docs) == len(sentence_docs) + + if isinstance(entity_docs, Doc): + entity_docs = [entity_docs] + article_docs = [article_docs] + sentence_docs = [sentence_docs] + + entity_encodings, bp_entity = self.entity_encoder.begin_update(entity_docs, drop=drop) + doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=drop) + sent_encodings, bp_sent = self.sent_encoder.begin_update(sentence_docs, drop=drop) + + concat_encodings = [list(doc_encodings[i]) + list(sent_encodings[i]) for i in + range(len(article_docs))] + mention_encodings, bp_cont = self.mention_encoder.begin_update(np.asarray(concat_encodings), drop=self.DROP) + + loss, d_scores = self.get_loss(scores=mention_encodings, golds=entity_encodings, docs=None) + + mention_gradient = bp_cont(d_scores, sgd=self.sgd_cont) + + # gradient : concat (doc+sent) vs. desc + sent_start = self.article_encoder.nO + sent_gradients = list() + doc_gradients = list() + for x in mention_gradient: + doc_gradients.append(list(x[0:sent_start])) + sent_gradients.append(list(x[sent_start:])) + + bp_doc(doc_gradients, sgd=self.sgd_article) + bp_sent(sent_gradients, sgd=self.sgd_sent) + + if losses is not None: + losses.setdefault(self.name, 0.0) + losses[self.name] += loss + return loss + + def get_loss(self, docs, golds, scores): + loss, gradients = get_cossim_loss(scores, golds) + return loss, gradients + def __call__(self, doc): - self.set_annotations([doc], scores=None, tensors=None) + entities, kb_ids = self.predict([doc]) + self.set_annotations([doc], entities, kb_ids) return doc def pipe(self, stream, batch_size=128, n_threads=-1): - """Apply the pipe to a stream of documents. - Both __call__ and pipe should delegate to the `predict()` - and `set_annotations()` methods. 
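The gradient bookkeeping in the update() method above mirrors train_el.py: the mention encoder consumes the concatenation of the article and sentence encodings, so its input gradient has to be sliced back into those two blocks before it can be routed to bp_doc and bp_sent. A tiny numpy illustration, with the widths taken from the cfg defaults above and a random array standing in for mention_gradient:

import numpy as np

ARTICLE_WIDTH, SENT_WIDTH = 128, 64
mention_gradient = np.random.rand(4, ARTICLE_WIDTH + SENT_WIDTH)    # batch of 4

doc_gradients = mention_gradient[:, :ARTICLE_WIDTH]     # routed back to the article encoder
sent_gradients = mention_gradient[:, ARTICLE_WIDTH:]    # routed back to the sentence encoder
print(doc_gradients.shape, sent_gradients.shape)        # (4, 128) (4, 64)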
- """ for docs in util.minibatch(stream, size=batch_size): docs = list(docs) - self.set_annotations(docs, scores=None, tensors=None) + entities, kb_ids = self.predict(docs) + self.set_annotations(docs, entities, kb_ids) yield from docs - def set_annotations(self, docs, scores, tensors=None): - """ - Currently implemented as taking the KB entry with highest prior probability for each named entity - TODO: actually use context etc - """ - for i, doc in enumerate(docs): - for ent in doc.ents: + def predict(self, docs): + self.require_model() + for i, article_doc in enumerate(docs): + doc_encoding = self.article_encoder([article_doc]) + for ent in article_doc.ents: + sent_doc = ent.sent.as_doc() + sent_encoding = self.sent_encoder([sent_doc]) + concat_encoding = [list(doc_encoding[0]) + list(sent_encoding[0])] + mention_encoding = self.mention_encoder(np.asarray([concat_encoding[0]])) + mention_enc_t = np.transpose(mention_encoding) + candidates = self.kb.get_candidates(ent.text) if candidates: - best_candidate = max(candidates, key=lambda c: c.prior_prob) - for token in ent: - token.ent_kb_id_ = best_candidate.entity_ + highest_sim = -5 + best_i = -1 + with self.use_avg_params: + for c in candidates: + kb_id = c.entity_ + description = self.id_to_descr.get(kb_id) + entity_encodings = self.entity_encoder([description]) # TODO: static entity vectors ? + sim = cosine(entity_encodings, mention_enc_t) + if sim >= highest_sim: + best_i = i + highest_sim = sim - def get_loss(self, docs, golds, scores): - # TODO - pass - - def add_label(self, label): - # TODO - pass + # TODO best_candidate = max(candidates, key=lambda c: c.prior_prob) + def set_annotations(self, docs, entities, kb_ids=None): + for token, kb_id in zip(entities, kb_ids): + token.ent_kb_id_ = kb_id class Sentencizer(object): """Segment the Doc into sentences using a rule-based strategy. 
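Stripped of the thinc layers, the predict() logic introduced in this patch comes down to: encode the article and the local sentence, concatenate the two, project the concatenation into the entity-embedding space, and keep the candidate whose description encoding is most similar. A rough numpy sketch of that selection step, with random vectors standing in for the trained encoder outputs and an arbitrary matrix W standing in for the Affine mention encoder:

import numpy as np

rng = np.random.RandomState(0)
ARTICLE_WIDTH, SENT_WIDTH, ENTITY_WIDTH = 128, 64, 64

article_enc = rng.rand(ARTICLE_WIDTH)            # stand-in for article_encoder output
sentence_enc = rng.rand(SENT_WIDTH)              # stand-in for sent_encoder output
candidate_encs = rng.rand(3, ENTITY_WIDTH)       # one row per KB candidate description

W = rng.rand(ENTITY_WIDTH, ARTICLE_WIDTH + SENT_WIDTH)
mention_enc = W @ np.concatenate([article_enc, sentence_enc])

def cosine(a, b):
    return a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-8)

sims = [cosine(mention_enc, cand) for cand in candidate_encs]
best = int(np.argmax(sims))
print("best candidate index:", best, "similarity:", sims[best])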
From 9abbd0899fe2fb64601f02bca206dcad1431365c Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 5 Jun 2019 00:09:46 +0200 Subject: [PATCH 063/148] separate entity encoder to get 64D descriptions --- .../wiki_entity_linking/train_descriptions.py | 113 ++++++++++++++++++ .../pipeline/wiki_entity_linking/train_el.py | 18 +-- .../wiki_entity_linking/wiki_nel_pipeline.py | 20 +++- spacy/pipeline/pipes.pyx | 22 ++-- 4 files changed, 152 insertions(+), 21 deletions(-) create mode 100644 examples/pipeline/wiki_entity_linking/train_descriptions.py diff --git a/examples/pipeline/wiki_entity_linking/train_descriptions.py b/examples/pipeline/wiki_entity_linking/train_descriptions.py new file mode 100644 index 000000000..63149b5f7 --- /dev/null +++ b/examples/pipeline/wiki_entity_linking/train_descriptions.py @@ -0,0 +1,113 @@ +from random import shuffle + +from examples.pipeline.wiki_entity_linking import kb_creator + +import numpy as np + +from spacy._ml import zero_init, create_default_optimizer +from spacy.cli.pretrain import get_cossim_loss + +from thinc.v2v import Model +from thinc.api import chain +from thinc.neural._classes.affine import Affine + + +class EntityEncoder: + + INPUT_DIM = 300 # dimension of pre-trained vectors + DESC_WIDTH = 64 + + DROP = 0 + EPOCHS = 5 + STOP_THRESHOLD = 0.05 + + BATCH_SIZE = 1000 + + def __init__(self, kb, nlp): + self.nlp = nlp + self.kb = kb + + def run(self, entity_descr_output): + id_to_descr = kb_creator._get_id_to_description(entity_descr_output) + + processed, loss = self._train_model(entity_descr_output, id_to_descr) + print("Trained on", processed, "entities across", self.EPOCHS, "epochs") + print("Final loss:", loss) + print() + + # TODO: apply and write to file afterwards ! + # self._apply_encoder(id_to_descr) + + def _train_model(self, entity_descr_output, id_to_descr): + # TODO: when loss gets too low, a 'mean of empty slice' warning is thrown by numpy + + self._build_network(self.INPUT_DIM, self.DESC_WIDTH) + + processed = 0 + loss = 1 + + for i in range(self.EPOCHS): + entity_keys = list(id_to_descr.keys()) + shuffle(entity_keys) + + batch_nr = 0 + start = 0 + stop = min(self.BATCH_SIZE, len(entity_keys)) + + while loss > self.STOP_THRESHOLD and start < len(entity_keys): + batch = [] + for e in entity_keys[start:stop]: + descr = id_to_descr[e] + doc = self.nlp(descr) + doc_vector = self._get_doc_embedding(doc) + batch.append(doc_vector) + + loss = self.update(batch) + print(i, batch_nr, loss) + processed += len(batch) + + batch_nr += 1 + start = start + self.BATCH_SIZE + stop = min(stop + self.BATCH_SIZE, len(entity_keys)) + + return processed, loss + + def _apply_encoder(self, id_to_descr): + for id, descr in id_to_descr.items(): + doc = self.nlp(descr) + doc_vector = self._get_doc_embedding(doc) + encoding = self.encoder(np.asarray([doc_vector])) + + @staticmethod + def _get_doc_embedding(doc): + indices = np.zeros((len(doc),), dtype="i") + for i, word in enumerate(doc): + if word.orth in doc.vocab.vectors.key2row: + indices[i] = doc.vocab.vectors.key2row[word.orth] + else: + indices[i] = 0 + word_vectors = doc.vocab.vectors.data[indices] + doc_vector = np.mean(word_vectors, axis=0) # TODO: min? max? 
+ return doc_vector + + def _build_network(self, orig_width, hidden_with): + with Model.define_operators({">>": chain}): + self.encoder = ( + Affine(hidden_with, orig_width) + ) + self.model = self.encoder >> zero_init(Affine(orig_width, hidden_with, drop_factor=0.0)) + + self.sgd = create_default_optimizer(self.model.ops) + + def update(self, vectors): + predictions, bp_model = self.model.begin_update(np.asarray(vectors), drop=self.DROP) + + loss, d_scores = self.get_loss(scores=predictions, golds=np.asarray(vectors)) + bp_model(d_scores, sgd=self.sgd) + + return loss / len(vectors) + + @staticmethod + def get_loss(golds, scores): + loss, gradients = get_cossim_loss(scores, golds) + return loss, gradients diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index b9a0dc843..143e38d99 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -31,7 +31,7 @@ class EL_Model: PRINT_BATCH_LOSS = False EPS = 0.0000000005 - BATCH_SIZE = 5 + BATCH_SIZE = 100 DOC_CUTOFF = 300 # number of characters from the doc context INPUT_DIM = 300 # dimension of pre-trained vectors @@ -41,9 +41,9 @@ class EL_Model: ARTICLE_WIDTH = 128 SENT_WIDTH = 64 - DROP = 0.1 - LEARN_RATE = 0.001 - EPOCHS = 5 + DROP = 0.4 + LEARN_RATE = 0.005 + EPOCHS = 10 L2 = 1e-6 name = "entity_linker" @@ -62,12 +62,14 @@ class EL_Model: def train_model(self, training_dir, entity_descr_output, trainlimit=None, devlimit=None, to_print=True): np.seterr(divide="raise", over="warn", under="ignore", invalid="raise") + id_to_descr = kb_creator._get_id_to_description(entity_descr_output) + train_ent, train_gold, train_desc, train_art, train_art_texts, train_sent, train_sent_texts = \ - self._get_training_data(training_dir, entity_descr_output, False, trainlimit, to_print=False) + self._get_training_data(training_dir, id_to_descr, False, trainlimit, to_print=False) train_clusters = list(train_ent.keys()) dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts = \ - self._get_training_data(training_dir, entity_descr_output, True, devlimit, to_print=False) + self._get_training_data(training_dir, id_to_descr, True, devlimit, to_print=False) dev_clusters = list(dev_ent.keys()) dev_pos_count = len([g for g in dev_gold.values() if g]) @@ -386,9 +388,7 @@ class EL_Model: bp_doc(doc_gradients, sgd=self.sgd_article) bp_sent(sent_gradients, sgd=self.sgd_sent) - def _get_training_data(self, training_dir, entity_descr_output, dev, limit, to_print): - id_to_descr = kb_creator._get_id_to_description(entity_descr_output) - + def _get_training_data(self, training_dir, id_to_descr, dev, limit, to_print): correct_entries, incorrect_entries = training_set_creator.read_training_entities(training_output=training_dir, collect_correct=True, collect_incorrect=True) diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 40d737a6f..1f4b4b67e 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals from examples.pipeline.wiki_entity_linking import wikipedia_processor as wp, kb_creator, training_set_creator, run_el +from examples.pipeline.wiki_entity_linking.train_descriptions import EntityEncoder from examples.pipeline.wiki_entity_linking.train_el import EL_Model import spacy @@ -38,11 +39,14 @@ if __name__ == 
"__main__": to_read_kb = True to_test_kb = False + # run entity description pre-training + run_desc_training = True + # create training dataset create_wp_training = False - # run training - run_training = True + # run EL training + run_el_training = False # apply named entity linking to the dev dataset apply_to_dev = False @@ -101,17 +105,25 @@ if __name__ == "__main__": run_el.run_el_toy_example(kb=my_kb, nlp=my_nlp) print() + # STEP 4b : read KB back in from file, create entity descriptions + # TODO: write back to file + if run_desc_training: + print("STEP 4b: training entity descriptions", datetime.datetime.now()) + my_nlp = spacy.load('en_core_web_md') + EntityEncoder(my_kb, my_nlp).run(entity_descr_output=ENTITY_DESCR) + print() + # STEP 5: create a training dataset from WP if create_wp_training: print("STEP 5: create training dataset", datetime.datetime.now()) training_set_creator.create_training(kb=my_kb, entity_def_input=ENTITY_DEFS, training_output=TRAINING_DIR) # STEP 6: apply the EL algorithm on the training dataset - if run_training: + if run_el_training: print("STEP 6: training", datetime.datetime.now()) my_nlp = spacy.load('en_core_web_md') trainer = EL_Model(kb=my_kb, nlp=my_nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=50, devlimit=20) + trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=10000, devlimit=500) print() # STEP 7: apply the EL algorithm on the dev dataset diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index c8afd431e..d0c83b56e 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1177,6 +1177,8 @@ class EntityLinker(Pipe): def predict(self, docs): self.require_model() + final_entities = list() + final_kb_ids = list() for i, article_doc in enumerate(docs): doc_encoding = self.article_encoder([article_doc]) for ent in article_doc.ents: @@ -1188,23 +1190,27 @@ class EntityLinker(Pipe): candidates = self.kb.get_candidates(ent.text) if candidates: - highest_sim = -5 - best_i = -1 with self.use_avg_params: + scores = list() for c in candidates: + prior_prob = c.prior_prob kb_id = c.entity_ description = self.id_to_descr.get(kb_id) entity_encodings = self.entity_encoder([description]) # TODO: static entity vectors ? sim = cosine(entity_encodings, mention_enc_t) - if sim >= highest_sim: - best_i = i - highest_sim = sim + score = prior_prob + sim - (prior_prob*sim) # TODO: weights ? + scores.append(score) - # TODO best_candidate = max(candidates, key=lambda c: c.prior_prob) + best_index = scores.index(max(scores)) + best_candidate = candidates[best_index] + final_entities.append(ent) + final_kb_ids.append(best_candidate) + + return final_entities, final_kb_ids def set_annotations(self, docs, entities, kb_ids=None): - for token, kb_id in zip(entities, kb_ids): - token.ent_kb_id_ = kb_id + for entity, kb_id in zip(entities, kb_ids): + entity.ent_kb_id_ = kb_id class Sentencizer(object): """Segment the Doc into sentences using a rule-based strategy. 
From 5c723c32c3e5e639f99005130c050afcf8230346 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 5 Jun 2019 18:29:18 +0200 Subject: [PATCH 064/148] entity vectors in the KB + serialization of them --- examples/pipeline/dummy_entity_linking.py | 8 +- .../wiki_entity_linking/kb_creator.py | 5 +- .../wiki_entity_linking/train_descriptions.py | 41 ++++- .../wiki_entity_linking/wiki_nel_pipeline.py | 2 +- spacy/kb.pxd | 31 ++-- spacy/kb.pyx | 143 +++++++++++++----- spacy/pipeline/pipes.pyx | 2 +- spacy/structs.pxd | 14 +- spacy/tests/pipeline/test_entity_linker.py | 56 ++++--- spacy/tests/serialize/test_serialize_kb.py | 15 +- 10 files changed, 223 insertions(+), 94 deletions(-) diff --git a/examples/pipeline/dummy_entity_linking.py b/examples/pipeline/dummy_entity_linking.py index ae36a57b3..3f1fabdfd 100644 --- a/examples/pipeline/dummy_entity_linking.py +++ b/examples/pipeline/dummy_entity_linking.py @@ -9,20 +9,20 @@ from spacy.kb import KnowledgeBase def create_kb(vocab): - kb = KnowledgeBase(vocab=vocab) + kb = KnowledgeBase(vocab=vocab, entity_vector_length=1) # adding entities entity_0 = "Q1004791_Douglas" print("adding entity", entity_0) - kb.add_entity(entity=entity_0, prob=0.5) + kb.add_entity(entity=entity_0, prob=0.5, entity_vector=[0]) entity_1 = "Q42_Douglas_Adams" print("adding entity", entity_1) - kb.add_entity(entity=entity_1, prob=0.5) + kb.add_entity(entity=entity_1, prob=0.5, entity_vector=[1]) entity_2 = "Q5301561_Douglas_Haig" print("adding entity", entity_2) - kb.add_entity(entity=entity_2, prob=0.5) + kb.add_entity(entity=entity_2, prob=0.5, entity_vector=[2]) # adding aliases print() diff --git a/examples/pipeline/wiki_entity_linking/kb_creator.py b/examples/pipeline/wiki_entity_linking/kb_creator.py index bb00f918d..ae3422c91 100644 --- a/examples/pipeline/wiki_entity_linking/kb_creator.py +++ b/examples/pipeline/wiki_entity_linking/kb_creator.py @@ -16,7 +16,7 @@ def create_kb(vocab, max_entities_per_alias, min_occ, count_input, prior_prob_input, to_print=False, write_entity_defs=True): """ Create the knowledge base from Wikidata entries """ - kb = KnowledgeBase(vocab=vocab) + kb = KnowledgeBase(vocab=vocab, entity_vector_length=64) # TODO: entity vectors ! print() print("1. _read_wikidata_entities", datetime.datetime.now()) @@ -38,7 +38,8 @@ def create_kb(vocab, max_entities_per_alias, min_occ, print() print("3. adding", len(entity_list), "entities", datetime.datetime.now()) print() - kb.set_entities(entity_list=entity_list, prob_list=entity_frequencies, vector_list=None, feature_list=None) + # TODO: vector_list ! + kb.set_entities(entity_list=entity_list, prob_list=entity_frequencies, vector_list=None) print() print("4. adding aliases", datetime.datetime.now()) diff --git a/examples/pipeline/wiki_entity_linking/train_descriptions.py b/examples/pipeline/wiki_entity_linking/train_descriptions.py index 63149b5f7..88b1bf819 100644 --- a/examples/pipeline/wiki_entity_linking/train_descriptions.py +++ b/examples/pipeline/wiki_entity_linking/train_descriptions.py @@ -19,7 +19,7 @@ class EntityEncoder: DROP = 0 EPOCHS = 5 - STOP_THRESHOLD = 0.05 + STOP_THRESHOLD = 0.1 BATCH_SIZE = 1000 @@ -38,6 +38,8 @@ class EntityEncoder: # TODO: apply and write to file afterwards ! 
# self._apply_encoder(id_to_descr) + self._test_encoder() + def _train_model(self, entity_descr_output, id_to_descr): # TODO: when loss gets too low, a 'mean of empty slice' warning is thrown by numpy @@ -111,3 +113,40 @@ class EntityEncoder: def get_loss(golds, scores): loss, gradients = get_cossim_loss(scores, golds) return loss, gradients + + def _test_encoder(self): + """ Test encoder on some dummy examples """ + desc_A1 = "Fictional character in The Simpsons" + desc_A2 = "Simpsons - fictional human" + desc_A3 = "Fictional character in The Flintstones" + desc_A4 = "Politician from the US" + + A1_doc_vector = np.asarray([self._get_doc_embedding(self.nlp(desc_A1))]) + A2_doc_vector = np.asarray([self._get_doc_embedding(self.nlp(desc_A2))]) + A3_doc_vector = np.asarray([self._get_doc_embedding(self.nlp(desc_A3))]) + A4_doc_vector = np.asarray([self._get_doc_embedding(self.nlp(desc_A4))]) + + loss_a1_a1, _ = get_cossim_loss(A1_doc_vector, A1_doc_vector) + loss_a1_a2, _ = get_cossim_loss(A1_doc_vector, A2_doc_vector) + loss_a1_a3, _ = get_cossim_loss(A1_doc_vector, A3_doc_vector) + loss_a1_a4, _ = get_cossim_loss(A1_doc_vector, A4_doc_vector) + + print("sim doc A1 A1", loss_a1_a1) + print("sim doc A1 A2", loss_a1_a2) + print("sim doc A1 A3", loss_a1_a3) + print("sim doc A1 A4", loss_a1_a4) + + A1_encoded = self.encoder(A1_doc_vector) + A2_encoded = self.encoder(A2_doc_vector) + A3_encoded = self.encoder(A3_doc_vector) + A4_encoded = self.encoder(A4_doc_vector) + + loss_a1_a1, _ = get_cossim_loss(A1_encoded, A1_encoded) + loss_a1_a2, _ = get_cossim_loss(A1_encoded, A2_encoded) + loss_a1_a3, _ = get_cossim_loss(A1_encoded, A3_encoded) + loss_a1_a4, _ = get_cossim_loss(A1_encoded, A4_encoded) + + print("sim encoded A1 A1", loss_a1_a1) + print("sim encoded A1 A2", loss_a1_a2) + print("sim encoded A1 A3", loss_a1_a3) + print("sim encoded A1 A4", loss_a1_a4) diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 1f4b4b67e..d813238b7 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -93,7 +93,7 @@ if __name__ == "__main__": print("STEP 4: to_read_kb", datetime.datetime.now()) my_vocab = Vocab() my_vocab.from_disk(VOCAB_DIR) - my_kb = KnowledgeBase(vocab=my_vocab) + my_kb = KnowledgeBase(vocab=my_vocab, entity_vector_length=64) # TODO entity vectors my_kb.load_bulk(KB_FILE) print("kb entities:", my_kb.get_size_entities()) print("kb aliases:", my_kb.get_size_aliases()) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index 494848e5e..9c5a73d59 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -12,6 +12,8 @@ from .typedefs cimport hash_t from .structs cimport EntryC, AliasC ctypedef vector[EntryC] entry_vec ctypedef vector[AliasC] alias_vec +ctypedef vector[float] float_vec +ctypedef vector[float_vec] float_matrix # Object used by the Entity Linker that summarizes one entity-alias candidate combination. @@ -20,6 +22,7 @@ cdef class Candidate: cdef readonly KnowledgeBase kb cdef hash_t entity_hash cdef float entity_freq + cdef vector[float] entity_vector cdef hash_t alias_hash cdef float prior_prob @@ -27,6 +30,7 @@ cdef class Candidate: cdef class KnowledgeBase: cdef Pool mem cpdef readonly Vocab vocab + cdef int64_t entity_vector_length # This maps 64bit keys (hash of unique entity string) # to 64bit values (position of the _EntryC struct in the _entries vector). 
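To keep the indirection in this header straight: the PreshMap maps an entity hash to a position in the _entries vector, and each entry in turn stores a row number into the separate _vectors_table. Below is a pure-Python mock of the same bookkeeping, with the class name PyKBMock and the toy values invented for illustration; the real structures are C vectors and PreshMaps for speed and memory.

class PyKBMock(object):
    """Pure-Python stand-in for the hash -> entry -> vector indirection."""

    def __init__(self, entity_vector_length):
        self.entity_vector_length = entity_vector_length
        self.entry_index = {}     # entity hash -> position in self.entries
        self.entries = []         # (entity_hash, prob, vector_index) records
        self.vectors_table = []   # one fixed-length vector per entity

    def add_entity(self, entity_hash, prob, entity_vector):
        assert len(entity_vector) == self.entity_vector_length
        vector_index = len(self.vectors_table)
        self.vectors_table.append(list(entity_vector))
        self.entry_index[entity_hash] = len(self.entries)
        self.entries.append((entity_hash, prob, vector_index))

    def get_vector(self, entity_hash):
        _, _, vector_index = self.entries[self.entry_index[entity_hash]]
        return self.vectors_table[vector_index]

kb_mock = PyKBMock(entity_vector_length=3)
kb_mock.add_entity(entity_hash=12345, prob=0.5, entity_vector=[0.1, 0.2, 0.3])
print(kb_mock.get_vector(12345))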
@@ -59,7 +63,7 @@ cdef class KnowledgeBase: # model, that embeds different features of the entities into vectors. We'll # still want some per-entity features, like the Wikipedia text or entity # co-occurrence. Hopefully those vectors can be narrow, e.g. 64 dimensions. - cdef object _vectors_table + cdef float_matrix _vectors_table # It's very useful to track categorical features, at least for output, even # if they're not useful in the model itself. For instance, we should be @@ -69,8 +73,15 @@ cdef class KnowledgeBase: cdef object _features_table + cdef inline int64_t c_add_vector(self, vector[float] entity_vector) nogil: + """Add an entity vector to the vectors table.""" + cdef int64_t new_index = self._vectors_table.size() + self._vectors_table.push_back(entity_vector) + return new_index + + cdef inline int64_t c_add_entity(self, hash_t entity_hash, float prob, - int32_t* vector_rows, int feats_row) nogil: + int32_t vector_index, int feats_row) nogil: """Add an entry to the vector of entries. After calling this method, make sure to update also the _entry_index using the return value""" # This is what we'll map the entity hash key to. It's where the entry will sit @@ -80,7 +91,7 @@ cdef class KnowledgeBase: # Avoid struct initializer to enable nogil, cf https://github.com/cython/cython/issues/1642 cdef EntryC entry entry.entity_hash = entity_hash - entry.vector_rows = vector_rows + entry.vector_index = vector_index entry.feats_row = feats_row entry.prob = prob @@ -113,7 +124,7 @@ cdef class KnowledgeBase: # Avoid struct initializer to enable nogil cdef EntryC entry entry.entity_hash = dummy_hash - entry.vector_rows = &dummy_value + entry.vector_index = dummy_value entry.feats_row = dummy_value entry.prob = dummy_value @@ -131,15 +142,16 @@ cdef class KnowledgeBase: self._aliases_table.push_back(alias) cpdef load_bulk(self, loc) - cpdef set_entities(self, entity_list, prob_list, vector_list, feature_list) + cpdef set_entities(self, entity_list, prob_list, vector_list) cpdef set_aliases(self, alias_list, entities_list, probabilities_list) cdef class Writer: cdef FILE* _fp - cdef int write_header(self, int64_t nr_entries) except -1 - cdef int write_entry(self, hash_t entry_hash, float entry_prob) except -1 + cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1 + cdef int write_vector_element(self, float element) except -1 + cdef int write_entry(self, hash_t entry_hash, float entry_prob, int32_t vector_index) except -1 cdef int write_alias_length(self, int64_t alias_length) except -1 cdef int write_alias_header(self, hash_t alias_hash, int64_t candidate_length) except -1 @@ -150,8 +162,9 @@ cdef class Writer: cdef class Reader: cdef FILE* _fp - cdef int read_header(self, int64_t* nr_entries) except -1 - cdef int read_entry(self, hash_t* entity_hash, float* prob) except -1 + cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1 + cdef int read_vector_element(self, float* element) except -1 + cdef int read_entry(self, hash_t* entity_hash, float* prob, int32_t* vector_index) except -1 cdef int read_alias_length(self, int64_t* alias_length) except -1 cdef int read_alias_header(self, hash_t* alias_hash, int64_t* candidate_length) except -1 diff --git a/spacy/kb.pyx b/spacy/kb.pyx index d471130d0..790bb4992 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -26,10 +26,11 @@ from libcpp.vector cimport vector cdef class Candidate: - def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, alias_hash, prior_prob): + def 
__init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob): self.kb = kb self.entity_hash = entity_hash self.entity_freq = entity_freq + self.entity_vector = entity_vector self.alias_hash = alias_hash self.prior_prob = prior_prob @@ -57,19 +58,26 @@ cdef class Candidate: def entity_freq(self): return self.entity_freq + @property + def entity_vector(self): + return self.entity_vector + @property def prior_prob(self): return self.prior_prob cdef class KnowledgeBase: - def __init__(self, Vocab vocab): + + def __init__(self, Vocab vocab, entity_vector_length): self.vocab = vocab self.mem = Pool() + self.entity_vector_length = entity_vector_length + self._entry_index = PreshMap() self._alias_index = PreshMap() - # TODO initialize self._entries and self._aliases_table ? + # Should we initialize self._entries and self._aliases_table to specific starting size ? self.vocab.strings.add("") self._create_empty_vectors(dummy_hash=self.vocab.strings[""]) @@ -89,10 +97,10 @@ cdef class KnowledgeBase: def get_alias_strings(self): return [self.vocab.strings[x] for x in self._alias_index] - def add_entity(self, unicode entity, float prob=0.5, vectors=None, features=None): + def add_entity(self, unicode entity, float prob, vector[float] entity_vector): """ Add an entity to the KB, optionally specifying its log probability based on corpus frequency - Return the hash of the entity ID/name at the end + Return the hash of the entity ID/name at the end. """ cdef hash_t entity_hash = self.vocab.strings.add(entity) @@ -101,31 +109,41 @@ cdef class KnowledgeBase: user_warning(Warnings.W018.format(entity=entity)) return - cdef int32_t dummy_value = 342 - new_index = self.c_add_entity(entity_hash=entity_hash, prob=prob, - vector_rows=&dummy_value, feats_row=dummy_value) - self._entry_index[entity_hash] = new_index + if len(entity_vector) != self.entity_vector_length: + # TODO: proper error + raise ValueError("Entity vector length should have been", self.entity_vector_length) - # TODO self._vectors_table.get_pointer(vectors), - # self._features_table.get(features)) + vector_index = self.c_add_vector(entity_vector=entity_vector) + + new_index = self.c_add_entity(entity_hash=entity_hash, + prob=prob, + vector_index=vector_index, + feats_row=-1) # Features table currently not implemented + self._entry_index[entity_hash] = new_index return entity_hash - cpdef set_entities(self, entity_list, prob_list, vector_list, feature_list): + cpdef set_entities(self, entity_list, prob_list, vector_list): nr_entities = len(entity_list) self._entry_index = PreshMap(nr_entities+1) self._entries = entry_vec(nr_entities+1) i = 0 cdef EntryC entry - cdef int32_t dummy_value = 342 while i < nr_entities: - # TODO features and vectors - entity_hash = self.vocab.strings.add(entity_list[i]) + entity_vector = entity_list[i] + if len(entity_vector) != self.entity_vector_length: + # TODO: proper error + raise ValueError("Entity vector length should have been", self.entity_vector_length) + + entity_hash = self.vocab.strings.add(entity_vector) entry.entity_hash = entity_hash entry.prob = prob_list[i] - entry.vector_rows = &dummy_value - entry.feats_row = dummy_value + + vector_index = self.c_add_vector(entity_vector=vector_list[i]) + entry.vector_index = vector_index + + entry.feats_row = -1 # Features table currently not implemented self._entries[i+1] = entry self._entry_index[entity_hash] = i+1 @@ -186,7 +204,7 @@ cdef class KnowledgeBase: cdef hash_t alias_hash = self.vocab.strings.add(alias) - # Return if 
this alias was added before + # Check whether this alias was added before if alias_hash in self._alias_index: user_warning(Warnings.W017.format(alias=alias)) return @@ -208,9 +226,7 @@ cdef class KnowledgeBase: return alias_hash - def get_candidates(self, unicode alias): - """ TODO: where to put this functionality ?""" cdef hash_t alias_hash = self.vocab.strings[alias] alias_index = self._alias_index.get(alias_hash) alias_entry = self._aliases_table[alias_index] @@ -218,6 +234,7 @@ cdef class KnowledgeBase: return [Candidate(kb=self, entity_hash=self._entries[entry_index].entity_hash, entity_freq=self._entries[entry_index].prob, + entity_vector=self._vectors_table[self._entries[entry_index].vector_index], alias_hash=alias_hash, prior_prob=prob) for (entry_index, prob) in zip(alias_entry.entry_indices, alias_entry.probs) @@ -226,16 +243,23 @@ cdef class KnowledgeBase: def dump(self, loc): cdef Writer writer = Writer(loc) - writer.write_header(self.get_size_entities()) + writer.write_header(self.get_size_entities(), self.entity_vector_length) + + # dumping the entity vectors in their original order + i = 0 + for entity_vector in self._vectors_table: + for element in entity_vector: + writer.write_vector_element(element) + i = i+1 # dumping the entry records in the order in which they are in the _entries vector. # index 0 is a dummy object not stored in the _entry_index and can be ignored. i = 1 for entry_hash, entry_index in sorted(self._entry_index.items(), key=lambda x: x[1]): entry = self._entries[entry_index] - assert entry.entity_hash == entry_hash + assert entry.entity_hash == entry_hash assert entry_index == i - writer.write_entry(entry.entity_hash, entry.prob) + writer.write_entry(entry.entity_hash, entry.prob, entry.vector_index) i = i+1 writer.write_alias_length(self.get_size_aliases()) @@ -262,31 +286,47 @@ cdef class KnowledgeBase: cdef hash_t alias_hash cdef int64_t entry_index cdef float prob + cdef int32_t vector_index cdef EntryC entry cdef AliasC alias - cdef int32_t dummy_value = 342 + cdef float vector_element cdef Reader reader = Reader(loc) - # Step 1: load entities - + # STEP 0: load header and initialize KB cdef int64_t nr_entities - reader.read_header(&nr_entities) + cdef int64_t entity_vector_length + reader.read_header(&nr_entities, &entity_vector_length) + + self.entity_vector_length = entity_vector_length self._entry_index = PreshMap(nr_entities+1) self._entries = entry_vec(nr_entities+1) + self._vectors_table = float_matrix(nr_entities+1) + # STEP 1: load entity vectors + cdef int i = 0 + cdef int j = 0 + while i < nr_entities: + entity_vector = float_vec(entity_vector_length) + j = 0 + while j < entity_vector_length: + reader.read_vector_element(&vector_element) + entity_vector[j] = vector_element + j = j+1 + self._vectors_table[i] = entity_vector + i = i+1 + + # STEP 2: load entities # we assume that the entity data was written in sequence # index 0 is a dummy object not stored in the _entry_index and can be ignored. - # TODO: should we initialize the dummy objects ? 
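As a rough guide to the byte layout that dump() and load_bulk() above agree on, the standalone snippet below writes and reads the same sequence of fields with Python's struct module: a header of two int64 values (entity count and entity vector length), the flattened float32 entity vectors, and one (uint64 hash, float32 prob, int32 vector index) record per entry. The alias section that follows in the real format is omitted, the file name and toy values are arbitrary, and the Cython Writer/Reader emit the fields with fwrite/fread rather than struct, so this is only a conceptual mirror.

import struct

entities = [(11111, 0.5, 0), (22222, 0.3, 1)]   # (entity_hash, prob, vector_index)
vectors = [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]
vector_length = 3

with open("kb_dump_sketch.bin", "wb") as f:
    f.write(struct.pack("<qq", len(entities), vector_length))         # header
    for vec in vectors:                                               # entity vectors
        for element in vec:
            f.write(struct.pack("<f", element))
    for entity_hash, prob, vector_index in entities:                  # entry records
        f.write(struct.pack("<Qfi", entity_hash, prob, vector_index))

with open("kb_dump_sketch.bin", "rb") as f:
    nr_entities, vec_len = struct.unpack("<qq", f.read(16))
    read_vectors = [struct.unpack("<" + "f" * vec_len, f.read(4 * vec_len))
                    for _ in range(nr_entities)]
    read_entries = [struct.unpack("<Qfi", f.read(16)) for _ in range(nr_entities)]

print(nr_entities, vec_len, read_entries)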
- cdef int i = 1 + i = 1 while i <= nr_entities: - reader.read_entry(&entity_hash, &prob) + reader.read_entry(&entity_hash, &prob, &vector_index) - # TODO features and vectors entry.entity_hash = entity_hash entry.prob = prob - entry.vector_rows = &dummy_value - entry.feats_row = dummy_value + entry.vector_index = vector_index + entry.feats_row = -1 # Features table currently not implemented self._entries[i] = entry self._entry_index[entity_hash] = i @@ -296,7 +336,8 @@ cdef class KnowledgeBase: # check that all entities were read in properly assert nr_entities == self.get_size_entities() - # Step 2: load aliases + # STEP 3: load aliases + cdef int64_t nr_aliases reader.read_alias_length(&nr_aliases) self._alias_index = PreshMap(nr_aliases+1) @@ -344,13 +385,18 @@ cdef class Writer: cdef size_t status = fclose(self._fp) assert status == 0 - cdef int write_header(self, int64_t nr_entries) except -1: + cdef int write_header(self, int64_t nr_entries, int64_t entity_vector_length) except -1: self._write(&nr_entries, sizeof(nr_entries)) + self._write(&entity_vector_length, sizeof(entity_vector_length)) - cdef int write_entry(self, hash_t entry_hash, float entry_prob) except -1: - # TODO: feats_rows and vector rows + cdef int write_vector_element(self, float element) except -1: + self._write(&element, sizeof(element)) + + cdef int write_entry(self, hash_t entry_hash, float entry_prob, int32_t vector_index) except -1: self._write(&entry_hash, sizeof(entry_hash)) self._write(&entry_prob, sizeof(entry_prob)) + self._write(&vector_index, sizeof(vector_index)) + # Features table currently not implemented and not written to file cdef int write_alias_length(self, int64_t alias_length) except -1: self._write(&alias_length, sizeof(alias_length)) @@ -381,14 +427,27 @@ cdef class Reader: def __dealloc__(self): fclose(self._fp) - cdef int read_header(self, int64_t* nr_entries) except -1: + cdef int read_header(self, int64_t* nr_entries, int64_t* entity_vector_length) except -1: status = self._read(nr_entries, sizeof(int64_t)) if status < 1: if feof(self._fp): return 0 # end of file raise IOError("error reading header from input file") - cdef int read_entry(self, hash_t* entity_hash, float* prob) except -1: + status = self._read(entity_vector_length, sizeof(int64_t)) + if status < 1: + if feof(self._fp): + return 0 # end of file + raise IOError("error reading header from input file") + + cdef int read_vector_element(self, float* element) except -1: + status = self._read(element, sizeof(float)) + if status < 1: + if feof(self._fp): + return 0 # end of file + raise IOError("error reading entity vector from input file") + + cdef int read_entry(self, hash_t* entity_hash, float* prob, int32_t* vector_index) except -1: status = self._read(entity_hash, sizeof(hash_t)) if status < 1: if feof(self._fp): @@ -401,6 +460,12 @@ cdef class Reader: return 0 # end of file raise IOError("error reading entity prob from input file") + status = self._read(vector_index, sizeof(int32_t)) + if status < 1: + if feof(self._fp): + return 0 # end of file + raise IOError("error reading entity vector from input file") + if feof(self._fp): return 0 else: diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index d0c83b56e..d9fbe59ff 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -3,7 +3,7 @@ # coding: utf8 from __future__ import unicode_literals -cimport numpy as np +import numpy as np import numpy import srsly diff --git a/spacy/structs.pxd b/spacy/structs.pxd index 69a1f4961..8de4d5f4c 100644 --- 
a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -84,16 +84,12 @@ cdef struct EntryC: # The hash of this entry's unique ID/name in the kB hash_t entity_hash - # Allows retrieval of one or more vectors. - # Each element of vector_rows should be an index into a vectors table. - # Every entry should have the same number of vectors, so we can avoid storing - # the number of vectors in each knowledge-base struct - int32_t* vector_rows + # Allows retrieval of the entity vector, as an index into a vectors table of the KB. + # Can be expanded later to refer to multiple rows (compositional model to reduce storage footprint). + int32_t vector_index - # Allows retrieval of a struct of non-vector features. We could make this a - # pointer, but we have 32 bits left over in the struct after prob, so we'd - # like this to only be 32 bits. We can also set this to -1, for the common - # case where there are no features. + # Allows retrieval of a struct of non-vector features. + # This is currently not implemented and set to -1 for the common case where there are no features. int32_t feats_row # log probability of entity, based on corpus frequency diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 61baece68..b44332df4 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -14,12 +14,12 @@ def nlp(): def test_kb_valid_entities(nlp): """Test the valid construction of a KB with 3 entities and two aliases""" - mykb = KnowledgeBase(nlp.vocab) + mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities - mykb.add_entity(entity=u'Q1', prob=0.9) - mykb.add_entity(entity=u'Q2') - mykb.add_entity(entity=u'Q3', prob=0.5) + mykb.add_entity(entity=u'Q1', prob=0.9, entity_vector=[1]) + mykb.add_entity(entity=u'Q2', prob=0.5, entity_vector=[2]) + mykb.add_entity(entity=u'Q3', prob=0.5, entity_vector=[3]) # adding aliases mykb.add_alias(alias=u'douglas', entities=[u'Q2', u'Q3'], probabilities=[0.8, 0.2]) @@ -32,12 +32,12 @@ def test_kb_valid_entities(nlp): def test_kb_invalid_entities(nlp): """Test the invalid construction of a KB with an alias linked to a non-existing entity""" - mykb = KnowledgeBase(nlp.vocab) + mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities - mykb.add_entity(entity=u'Q1', prob=0.9) - mykb.add_entity(entity=u'Q2', prob=0.2) - mykb.add_entity(entity=u'Q3', prob=0.5) + mykb.add_entity(entity=u'Q1', prob=0.9, entity_vector=[1]) + mykb.add_entity(entity=u'Q2', prob=0.2, entity_vector=[2]) + mykb.add_entity(entity=u'Q3', prob=0.5, entity_vector=[3]) # adding aliases - should fail because one of the given IDs is not valid with pytest.raises(ValueError): @@ -46,12 +46,12 @@ def test_kb_invalid_entities(nlp): def test_kb_invalid_probabilities(nlp): """Test the invalid construction of a KB with wrong prior probabilities""" - mykb = KnowledgeBase(nlp.vocab) + mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities - mykb.add_entity(entity=u'Q1', prob=0.9) - mykb.add_entity(entity=u'Q2', prob=0.2) - mykb.add_entity(entity=u'Q3', prob=0.5) + mykb.add_entity(entity=u'Q1', prob=0.9, entity_vector=[1]) + mykb.add_entity(entity=u'Q2', prob=0.2, entity_vector=[2]) + mykb.add_entity(entity=u'Q3', prob=0.5, entity_vector=[3]) # adding aliases - should fail because the sum of the probabilities exceeds 1 with pytest.raises(ValueError): @@ -60,26 +60,38 @@ def test_kb_invalid_probabilities(nlp): def test_kb_invalid_combination(nlp): """Test the invalid 
construction of a KB with non-matching entity and probability lists""" - mykb = KnowledgeBase(nlp.vocab) + mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities - mykb.add_entity(entity=u'Q1', prob=0.9) - mykb.add_entity(entity=u'Q2', prob=0.2) - mykb.add_entity(entity=u'Q3', prob=0.5) + mykb.add_entity(entity=u'Q1', prob=0.9, entity_vector=[1]) + mykb.add_entity(entity=u'Q2', prob=0.2, entity_vector=[2]) + mykb.add_entity(entity=u'Q3', prob=0.5, entity_vector=[3]) # adding aliases - should fail because the entities and probabilities vectors are not of equal length with pytest.raises(ValueError): mykb.add_alias(alias=u'douglas', entities=[u'Q2', u'Q3'], probabilities=[0.3, 0.4, 0.1]) -def test_candidate_generation(nlp): - """Test correct candidate generation""" - mykb = KnowledgeBase(nlp.vocab) +def test_kb_invalid_entity_vector(nlp): + """Test the invalid construction of a KB with non-matching entity vector lengths""" + mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3) # adding entities - mykb.add_entity(entity=u'Q1', prob=0.9) - mykb.add_entity(entity=u'Q2', prob=0.2) - mykb.add_entity(entity=u'Q3', prob=0.5) + mykb.add_entity(entity=u'Q1', prob=0.9, entity_vector=[1, 2, 3]) + + # this should fail because the kb's expected entity vector length is 3 + with pytest.raises(ValueError): + mykb.add_entity(entity=u'Q2', prob=0.2, entity_vector=[2]) + + +def test_candidate_generation(nlp): + """Test correct candidate generation""" + mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) + + # adding entities + mykb.add_entity(entity=u'Q1', prob=0.9, entity_vector=[1]) + mykb.add_entity(entity=u'Q2', prob=0.2, entity_vector=[2]) + mykb.add_entity(entity=u'Q3', prob=0.5, entity_vector=[3]) # adding aliases mykb.add_alias(alias=u'douglas', entities=[u'Q2', u'Q3'], probabilities=[0.8, 0.2]) diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py index 7b1380623..7a8022890 100644 --- a/spacy/tests/serialize/test_serialize_kb.py +++ b/spacy/tests/serialize/test_serialize_kb.py @@ -20,7 +20,7 @@ def test_serialize_kb_disk(en_vocab): print(file_path, type(file_path)) kb1.dump(str(file_path)) - kb2 = KnowledgeBase(vocab=en_vocab) + kb2 = KnowledgeBase(vocab=en_vocab, entity_vector_length=3) kb2.load_bulk(str(file_path)) # final assertions @@ -28,12 +28,13 @@ def test_serialize_kb_disk(en_vocab): def _get_dummy_kb(vocab): - kb = KnowledgeBase(vocab=vocab) + kb = KnowledgeBase(vocab=vocab, entity_vector_length=3) + + kb.add_entity(entity="Q53", prob=0.33, entity_vector=[0, 5, 3]) + kb.add_entity(entity="Q17", prob=0.2, entity_vector=[7, 1, 0]) + kb.add_entity(entity="Q007", prob=0.7, entity_vector=[0, 0, 7]) + kb.add_entity(entity="Q44", prob=0.4, entity_vector=[4, 4, 4]) - kb.add_entity(entity="Q53", prob=0.33) - kb.add_entity(entity="Q17", prob=0.2) - kb.add_entity(entity="Q007", prob=0.7) - kb.add_entity(entity="Q44", prob=0.4) kb.add_alias(alias="double07", entities=["Q17", "Q007"], probabilities=[0.1, 0.9]) kb.add_alias(alias="guy", entities=["Q53", "Q007", "Q17", "Q44"], probabilities=[0.3, 0.3, 0.2, 0.1]) kb.add_alias(alias="random", entities=["Q007"], probabilities=[1.0]) @@ -62,10 +63,12 @@ def _check_kb(kb): assert candidates[0].entity_ == "Q007" assert 0.6999 < candidates[0].entity_freq < 0.701 + assert candidates[0].entity_vector == [0, 0, 7] assert candidates[0].alias_ == "double07" assert 0.899 < candidates[0].prior_prob < 0.901 assert candidates[1].entity_ == "Q17" assert 0.199 < candidates[1].entity_freq < 0.201 + 
assert candidates[1].entity_vector == [7, 1, 0] assert candidates[1].alias_ == "double07" assert 0.099 < candidates[1].prior_prob < 0.101 From d8b435ceffcf9143a78678d7c87a8e4e4216dcc5 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 6 Jun 2019 19:51:27 +0200 Subject: [PATCH 065/148] pretraining description vectors and storing them in the KB --- .../wiki_entity_linking/kb_creator.py | 68 ++++++++++++---- .../pipeline/wiki_entity_linking/run_el.py | 9 +++ .../wiki_entity_linking/train_descriptions.py | 79 +++++++++++-------- .../wiki_entity_linking/wiki_nel_pipeline.py | 39 ++++----- spacy/kb.pyx | 14 +++- spacy/language.py | 2 +- 6 files changed, 133 insertions(+), 78 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/kb_creator.py b/examples/pipeline/wiki_entity_linking/kb_creator.py index ae3422c91..74e8efabd 100644 --- a/examples/pipeline/wiki_entity_linking/kb_creator.py +++ b/examples/pipeline/wiki_entity_linking/kb_creator.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import spacy +from examples.pipeline.wiki_entity_linking.train_descriptions import EntityEncoder from spacy.kb import KnowledgeBase import csv @@ -10,25 +11,47 @@ import datetime from . import wikipedia_processor as wp from . import wikidata_processor as wd +INPUT_DIM = 300 # dimension of pre-trained vectors +DESC_WIDTH = 64 -def create_kb(vocab, max_entities_per_alias, min_occ, +def create_kb(nlp, max_entities_per_alias, min_occ, entity_def_output, entity_descr_output, - count_input, prior_prob_input, - to_print=False, write_entity_defs=True): + count_input, prior_prob_input, to_print=False): """ Create the knowledge base from Wikidata entries """ - kb = KnowledgeBase(vocab=vocab, entity_vector_length=64) # TODO: entity vectors ! + kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=DESC_WIDTH) - print() - print("1. _read_wikidata_entities", datetime.datetime.now()) - print() - title_to_id, id_to_descr = wd.read_wikidata_entities_json(limit=None) + # disable parts of the pipeline when rerunning + read_raw_data = False - # write the title-ID and ID-description mappings to file - if write_entity_defs: + if read_raw_data: + print() + print("1. _read_wikidata_entities", datetime.datetime.now()) + print() + title_to_id, id_to_descr = wd.read_wikidata_entities_json(limit=None) + + # write the title-ID and ID-description mappings to file _write_entity_files(entity_def_output, entity_descr_output, title_to_id, id_to_descr) + else: + # read the mappings from file + title_to_id = _get_entity_to_id(entity_def_output) + id_to_descr = _get_id_to_description(entity_descr_output) + title_list = list(title_to_id.keys()) + + # TODO: remove this filter (just for quicker testing of code) + title_list = title_list[0:34200] + title_to_id = {t: title_to_id[t] for t in title_list} + + # print("title_list", len(title_list), title_list[0:3]) + entity_list = [title_to_id[x] for x in title_list] + # print("entity_list", len(entity_list), entity_list[0:3]) + + # TODO: should we remove entities from the KB where there is no description ? + description_list = [id_to_descr.get(x, "No description defined") for x in entity_list] + # print("description_list", len(description_list), description_list[0:3]) + print() print("2. _get_entity_frequencies", datetime.datetime.now()) @@ -36,13 +59,27 @@ def create_kb(vocab, max_entities_per_alias, min_occ, entity_frequencies = wp.get_entity_frequencies(count_input=count_input, entities=title_list) print() - print("3. 
adding", len(entity_list), "entities", datetime.datetime.now()) + print("3. train entity encoder", datetime.datetime.now()) print() - # TODO: vector_list ! - kb.set_entities(entity_list=entity_list, prob_list=entity_frequencies, vector_list=None) + + encoder = EntityEncoder(nlp, INPUT_DIM, DESC_WIDTH) + encoder.train(description_list=description_list, to_print=True) + print() + + print("4. get entity embeddings", datetime.datetime.now()) + print() + embeddings = encoder.apply_encoder(description_list) + # print("descriptions", description_list[0:3]) + # print("embeddings", len(embeddings), embeddings[0:3]) + #print("embeddings[0]", len(embeddings[0]), embeddings[0][0:3]) print() - print("4. adding aliases", datetime.datetime.now()) + print("5. adding", len(entity_list), "entities", datetime.datetime.now()) + print() + kb.set_entities(entity_list=entity_list, prob_list=entity_frequencies, vector_list=embeddings) + + print() + print("6. adding aliases", datetime.datetime.now()) print() _add_aliases(kb, title_to_id=title_to_id, max_entities_per_alias=max_entities_per_alias, min_occ=min_occ, @@ -67,7 +104,6 @@ def _write_entity_files(entity_def_output, entity_descr_output, title_to_id, id_ for qid, descr in id_to_descr.items(): descr_file.write(str(qid) + "|" + descr + "\n") - def _get_entity_to_id(entity_def_output): entity_to_id = dict() with open(entity_def_output, 'r', encoding='utf8') as csvfile: @@ -99,11 +135,11 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_in print("wp titles:", wp_titles) # adding aliases with prior probabilities + # we can read this file sequentially, it's sorted by alias, and then by count with open(prior_prob_input, mode='r', encoding='utf8') as prior_file: # skip header prior_file.readline() line = prior_file.readline() - # we can read this file sequentially, it's sorted by alias, and then by count previous_alias = None total_count = 0 counts = list() diff --git a/examples/pipeline/wiki_entity_linking/run_el.py b/examples/pipeline/wiki_entity_linking/run_el.py index c0c219829..f6797587e 100644 --- a/examples/pipeline/wiki_entity_linking/run_el.py +++ b/examples/pipeline/wiki_entity_linking/run_el.py @@ -12,6 +12,15 @@ from examples.pipeline.wiki_entity_linking import training_set_creator # import neuralcoref +def run_kb_toy_example(kb): + for mention in ("Bush", "President", "Homer"): + candidates = kb.get_candidates(mention) + + print("generating candidates for " + mention + " :") + for c in candidates: + print(" ", c.prior_prob, c.alias_, "-->", c.entity_ + " (freq=" + str(c.entity_freq) + ")") + print() + def run_el_toy_example(nlp, kb): _prepare_pipeline(nlp, kb) diff --git a/examples/pipeline/wiki_entity_linking/train_descriptions.py b/examples/pipeline/wiki_entity_linking/train_descriptions.py index 88b1bf819..8513a25fd 100644 --- a/examples/pipeline/wiki_entity_linking/train_descriptions.py +++ b/examples/pipeline/wiki_entity_linking/train_descriptions.py @@ -14,72 +14,83 @@ from thinc.neural._classes.affine import Affine class EntityEncoder: - INPUT_DIM = 300 # dimension of pre-trained vectors - DESC_WIDTH = 64 - DROP = 0 EPOCHS = 5 - STOP_THRESHOLD = 0.1 + STOP_THRESHOLD = 0.9 # 0.1 BATCH_SIZE = 1000 - def __init__(self, kb, nlp): + def __init__(self, nlp, input_dim, desc_width): self.nlp = nlp - self.kb = kb + self.input_dim = input_dim + self.desc_width = desc_width - def run(self, entity_descr_output): - id_to_descr = kb_creator._get_id_to_description(entity_descr_output) + def apply_encoder(self, description_list): + 
if self.encoder is None: + raise ValueError("Can not apply encoder before training it") - processed, loss = self._train_model(entity_descr_output, id_to_descr) - print("Trained on", processed, "entities across", self.EPOCHS, "epochs") - print("Final loss:", loss) - print() + print("Encoding", len(description_list), "entities") - # TODO: apply and write to file afterwards ! - # self._apply_encoder(id_to_descr) + batch_size = 10000 - self._test_encoder() + start = 0 + stop = min(batch_size, len(description_list)) + encodings = [] - def _train_model(self, entity_descr_output, id_to_descr): + while start < len(description_list): + docs = list(self.nlp.pipe(description_list[start:stop])) + doc_embeddings = [self._get_doc_embedding(doc) for doc in docs] + enc = self.encoder(np.asarray(doc_embeddings)) + encodings.extend(enc.tolist()) + + start = start + batch_size + stop = min(stop + batch_size, len(description_list)) + print("encoded :", len(encodings)) + + return encodings + + def train(self, description_list, to_print=False): + processed, loss = self._train_model(description_list) + + if to_print: + print("Trained on", processed, "entities across", self.EPOCHS, "epochs") + print("Final loss:", loss) + + # self._test_encoder() + + def _train_model(self, description_list): # TODO: when loss gets too low, a 'mean of empty slice' warning is thrown by numpy - self._build_network(self.INPUT_DIM, self.DESC_WIDTH) + self._build_network(self.input_dim, self.desc_width) processed = 0 loss = 1 + descriptions = description_list.copy() # copy this list so that shuffling does not affect other functions for i in range(self.EPOCHS): - entity_keys = list(id_to_descr.keys()) - shuffle(entity_keys) + shuffle(descriptions) batch_nr = 0 start = 0 - stop = min(self.BATCH_SIZE, len(entity_keys)) + stop = min(self.BATCH_SIZE, len(descriptions)) - while loss > self.STOP_THRESHOLD and start < len(entity_keys): + while loss > self.STOP_THRESHOLD and start < len(descriptions): batch = [] - for e in entity_keys[start:stop]: - descr = id_to_descr[e] + for descr in descriptions[start:stop]: doc = self.nlp(descr) doc_vector = self._get_doc_embedding(doc) batch.append(doc_vector) - loss = self.update(batch) + loss = self._update(batch) print(i, batch_nr, loss) processed += len(batch) batch_nr += 1 start = start + self.BATCH_SIZE - stop = min(stop + self.BATCH_SIZE, len(entity_keys)) + stop = min(stop + self.BATCH_SIZE, len(descriptions)) return processed, loss - def _apply_encoder(self, id_to_descr): - for id, descr in id_to_descr.items(): - doc = self.nlp(descr) - doc_vector = self._get_doc_embedding(doc) - encoding = self.encoder(np.asarray([doc_vector])) - @staticmethod def _get_doc_embedding(doc): indices = np.zeros((len(doc),), dtype="i") @@ -101,16 +112,16 @@ class EntityEncoder: self.sgd = create_default_optimizer(self.model.ops) - def update(self, vectors): + def _update(self, vectors): predictions, bp_model = self.model.begin_update(np.asarray(vectors), drop=self.DROP) - loss, d_scores = self.get_loss(scores=predictions, golds=np.asarray(vectors)) + loss, d_scores = self._get_loss(scores=predictions, golds=np.asarray(vectors)) bp_model(d_scores, sgd=self.sgd) return loss / len(vectors) @staticmethod - def get_loss(golds, scores): + def _get_loss(golds, scores): loss, gradients = get_cossim_loss(scores, golds) return loss, gradients diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index d813238b7..a669634f9 100644 --- 
a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals from examples.pipeline.wiki_entity_linking import wikipedia_processor as wp, kb_creator, training_set_creator, run_el -from examples.pipeline.wiki_entity_linking.train_descriptions import EntityEncoder from examples.pipeline.wiki_entity_linking.train_el import EL_Model import spacy @@ -28,6 +27,7 @@ TRAINING_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_data_nel/' if __name__ == "__main__": print("START", datetime.datetime.now()) print() + nlp = spacy.load('en_core_web_lg') my_kb = None # one-time methods to create KB and write to file @@ -37,10 +37,7 @@ if __name__ == "__main__": # read KB back in from file to_read_kb = True - to_test_kb = False - - # run entity description pre-training - run_desc_training = True + to_test_kb = True # create training dataset create_wp_training = False @@ -51,6 +48,8 @@ if __name__ == "__main__": # apply named entity linking to the dev dataset apply_to_dev = False + to_test_pipeline = False + # STEP 1 : create prior probabilities from WP # run only once ! if to_create_prior_probs: @@ -69,9 +68,7 @@ if __name__ == "__main__": # run only once ! if to_create_kb: print("STEP 3a: to_create_kb", datetime.datetime.now()) - my_nlp = spacy.load('en_core_web_sm') - my_vocab = my_nlp.vocab - my_kb = kb_creator.create_kb(my_vocab, + my_kb = kb_creator.create_kb(nlp, max_entities_per_alias=10, min_occ=5, entity_def_output=ENTITY_DEFS, @@ -85,7 +82,7 @@ if __name__ == "__main__": print("STEP 3b: write KB", datetime.datetime.now()) my_kb.dump(KB_FILE) - my_vocab.to_disk(VOCAB_DIR) + nlp.vocab.to_disk(VOCAB_DIR) print() # STEP 4 : read KB back in from file @@ -101,18 +98,9 @@ if __name__ == "__main__": # test KB if to_test_kb: - my_nlp = spacy.load('en_core_web_sm') - run_el.run_el_toy_example(kb=my_kb, nlp=my_nlp) + run_el.run_kb_toy_example(kb=my_kb) print() - # STEP 4b : read KB back in from file, create entity descriptions - # TODO: write back to file - if run_desc_training: - print("STEP 4b: training entity descriptions", datetime.datetime.now()) - my_nlp = spacy.load('en_core_web_md') - EntityEncoder(my_kb, my_nlp).run(entity_descr_output=ENTITY_DESCR) - print() - # STEP 5: create a training dataset from WP if create_wp_training: print("STEP 5: create training dataset", datetime.datetime.now()) @@ -121,15 +109,18 @@ if __name__ == "__main__": # STEP 6: apply the EL algorithm on the training dataset if run_el_training: print("STEP 6: training", datetime.datetime.now()) - my_nlp = spacy.load('en_core_web_md') - trainer = EL_Model(kb=my_kb, nlp=my_nlp) + trainer = EL_Model(kb=my_kb, nlp=nlp) trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=10000, devlimit=500) print() - # STEP 7: apply the EL algorithm on the dev dataset + # STEP 7: apply the EL algorithm on the dev dataset (TODO: overlaps with code from run_el_training ?) 
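
Dumping the KB (step 3b) and reading it back in (step 4) only round-trips cleanly when the loading KnowledgeBase is constructed with the same entity_vector_length as the one that was dumped, which is what the serialization test earlier in this series exercises. A minimal sketch of that roundtrip, outside the pipeline script; the path and the toy entity/alias values are invented for illustration and the blank Vocab just stands in for nlp.vocab:

    from spacy.kb import KnowledgeBase
    from spacy.vocab import Vocab

    vocab = Vocab()
    kb1 = KnowledgeBase(vocab=vocab, entity_vector_length=3)
    kb1.add_entity(entity="Q42", prob=0.5, entity_vector=[1, 2, 3])
    kb1.add_alias(alias="Douglas", entities=["Q42"], probabilities=[1.0])
    kb1.dump("/tmp/kb")  # placeholder path

    # the second KB must use the same entity_vector_length to load the dump
    kb2 = KnowledgeBase(vocab=vocab, entity_vector_length=3)
    kb2.load_bulk("/tmp/kb")
    print([c.entity_ for c in kb2.get_candidates("Douglas")])  # expected: ["Q42"]
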
if apply_to_dev: - my_nlp = spacy.load('en_core_web_md') - run_el.run_el_dev(kb=my_kb, nlp=my_nlp, training_dir=TRAINING_DIR, limit=2000) + run_el.run_el_dev(kb=my_kb, nlp=nlp, training_dir=TRAINING_DIR, limit=2000) + print() + + # test KB + if to_test_pipeline: + run_el.run_el_toy_example(kb=my_kb, nlp=nlp) print() # TODO coreference resolution diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 790bb4992..30440227f 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -124,6 +124,14 @@ cdef class KnowledgeBase: return entity_hash cpdef set_entities(self, entity_list, prob_list, vector_list): + if len(entity_list) != len(prob_list): + # TODO: proper error + raise ValueError("Entity list and prob list should have the same length") + + if len(entity_list) != len(vector_list): + # TODO: proper error + raise ValueError("Entity list and vector list should have the same length") + nr_entities = len(entity_list) self._entry_index = PreshMap(nr_entities+1) self._entries = entry_vec(nr_entities+1) @@ -131,12 +139,12 @@ cdef class KnowledgeBase: i = 0 cdef EntryC entry while i < nr_entities: - entity_vector = entity_list[i] + entity_vector = vector_list[i] if len(entity_vector) != self.entity_vector_length: # TODO: proper error - raise ValueError("Entity vector length should have been", self.entity_vector_length) + raise ValueError("Entity vector is", len(entity_vector), "length but should have been", self.entity_vector_length) - entity_hash = self.vocab.strings.add(entity_vector) + entity_hash = self.vocab.strings.add(entity_list[i]) entry.entity_hash = entity_hash entry.prob = prob_list[i] diff --git a/spacy/language.py b/spacy/language.py index 39d95c689..ec3232bd5 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -117,7 +117,7 @@ class Language(object): "tagger": lambda nlp, **cfg: Tagger(nlp.vocab, **cfg), "parser": lambda nlp, **cfg: DependencyParser(nlp.vocab, **cfg), "ner": lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg), - "entity_linker": lambda nlp, **cfg: EntityLinker(nlp.vocab, **cfg), + "entity_linker": lambda nlp, **cfg: EntityLinker(**cfg), "similarity": lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg), "textcat": lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg), "sentencizer": lambda nlp, **cfg: Sentencizer(**cfg), From 61f0e2af654ae6202a9b283794021c84d458fd5b Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 6 Jun 2019 20:22:14 +0200 Subject: [PATCH 066/148] code cleanup --- .../wiki_entity_linking/kb_creator.py | 13 ++------ .../pipeline/wiki_entity_linking/run_el.py | 12 ++----- .../training_set_creator.py | 1 - .../wiki_entity_linking/wiki_nel_pipeline.py | 14 ++++++-- spacy/pipeline/pipes.pyx | 32 ++++++++----------- 5 files changed, 31 insertions(+), 41 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/kb_creator.py b/examples/pipeline/wiki_entity_linking/kb_creator.py index 74e8efabd..ee632bd48 100644 --- a/examples/pipeline/wiki_entity_linking/kb_creator.py +++ b/examples/pipeline/wiki_entity_linking/kb_creator.py @@ -43,15 +43,10 @@ def create_kb(nlp, max_entities_per_alias, min_occ, title_list = title_list[0:34200] title_to_id = {t: title_to_id[t] for t in title_list} - # print("title_list", len(title_list), title_list[0:3]) - entity_list = [title_to_id[x] for x in title_list] - # print("entity_list", len(entity_list), entity_list[0:3]) - # TODO: should we remove entities from the KB where there is no description ? 
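
The length checks added to set_entities() above make the bulk-loading contract explicit: three parallel lists of the same length, with every vector matching the KB's entity_vector_length. A toy usage sketch (IDs, frequencies and vectors invented, not taken from the Wikidata pipeline):

    from spacy.kb import KnowledgeBase
    from spacy.vocab import Vocab

    kb = KnowledgeBase(vocab=Vocab(), entity_vector_length=3)

    entity_ids = ["Q42", "Q7259"]                  # Wikidata-style IDs (toy values)
    freqs = [0.8, 0.2]                             # corpus frequencies used as priors
    vectors = [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]   # one 3-d vector per entity

    kb.set_entities(entity_list=entity_ids, prob_list=freqs, vector_list=vectors)

    # a length mismatch between the three lists, or a vector that is not
    # entity_vector_length long, raises the ValueError added in this patch
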
+ # Currently keeping entities from the KB where there is no description - putting a default void description description_list = [id_to_descr.get(x, "No description defined") for x in entity_list] - # print("description_list", len(description_list), description_list[0:3]) - print() print("2. _get_entity_frequencies", datetime.datetime.now()) @@ -69,9 +64,6 @@ def create_kb(nlp, max_entities_per_alias, min_occ, print("4. get entity embeddings", datetime.datetime.now()) print() embeddings = encoder.apply_encoder(description_list) - # print("descriptions", description_list[0:3]) - # print("embeddings", len(embeddings), embeddings[0:3]) - #print("embeddings[0]", len(embeddings[0]), embeddings[0][0:3]) print() print("5. adding", len(entity_list), "entities", datetime.datetime.now()) @@ -104,6 +96,7 @@ def _write_entity_files(entity_def_output, entity_descr_output, title_to_id, id_ for qid, descr in id_to_descr.items(): descr_file.write(str(qid) + "|" + descr + "\n") + def _get_entity_to_id(entity_def_output): entity_to_id = dict() with open(entity_def_output, 'r', encoding='utf8') as csvfile: @@ -135,7 +128,7 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_in print("wp titles:", wp_titles) # adding aliases with prior probabilities - # we can read this file sequentially, it's sorted by alias, and then by count + # we can read this file sequentially, it's sorted by alias, and then by count with open(prior_prob_input, mode='r', encoding='utf8') as prior_file: # skip header prior_file.readline() diff --git a/examples/pipeline/wiki_entity_linking/run_el.py b/examples/pipeline/wiki_entity_linking/run_el.py index f6797587e..c3074ab5c 100644 --- a/examples/pipeline/wiki_entity_linking/run_el.py +++ b/examples/pipeline/wiki_entity_linking/run_el.py @@ -13,7 +13,7 @@ from examples.pipeline.wiki_entity_linking import training_set_creator def run_kb_toy_example(kb): - for mention in ("Bush", "President", "Homer"): + for mention in ("Bush", "Douglas Adams", "Homer"): candidates = kb.get_candidates(mention) print("generating candidates for " + mention + " :") @@ -128,18 +128,12 @@ def evaluate(predictions, golds, to_print=True, times_hundred=True): return precision, recall, fscore, accuracy -def _prepare_pipeline(nlp, kb): - # TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO - el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb}) - nlp.add_pipe(el_pipe, last=True) + # TODO -def add_coref(): +def add_coref(nlp): """ Add coreference resolution to our model """ - nlp = spacy.load('en_core_web_sm') - # nlp = spacy.load('en') - # TODO: this doesn't work yet # neuralcoref.add_to_pipe(nlp) print("done adding to pipe") diff --git a/examples/pipeline/wiki_entity_linking/training_set_creator.py b/examples/pipeline/wiki_entity_linking/training_set_creator.py index b1c63c55c..ac8ad0744 100644 --- a/examples/pipeline/wiki_entity_linking/training_set_creator.py +++ b/examples/pipeline/wiki_entity_linking/training_set_creator.py @@ -18,7 +18,6 @@ ENTITY_FILE = "gold_entities.csv" def create_training(kb, entity_def_input, training_output): if not kb: raise ValueError("kb should be defined") - # nlp = spacy.load('en_core_web_sm') wp_to_id = kb_creator._get_entity_to_id(entity_def_input) _process_wikipedia_texts(kb, wp_to_id, training_output, limit=100000000) # TODO: full dataset diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 
a669634f9..390a6800b 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -37,11 +37,13 @@ if __name__ == "__main__": # read KB back in from file to_read_kb = True - to_test_kb = True + to_test_kb = False # create training dataset create_wp_training = False + train_pipe = True + # run EL training run_el_training = False @@ -106,7 +108,15 @@ if __name__ == "__main__": print("STEP 5: create training dataset", datetime.datetime.now()) training_set_creator.create_training(kb=my_kb, entity_def_input=ENTITY_DEFS, training_output=TRAINING_DIR) - # STEP 6: apply the EL algorithm on the training dataset + # STEP 6: create the entity linking pipe + if train_pipe: + # TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO + el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": my_kb}) + nlp.add_pipe(el_pipe, last=True) + + ### BELOW CODE IS DEPRECATED ### + + # STEP 6: apply the EL algorithm on the training dataset - TODO deprecated - code moved to pipes.pyx if run_el_training: print("STEP 6: training", datetime.datetime.now()) trainer = EL_Model(kb=my_kb, nlp=nlp) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index d9fbe59ff..c5187a593 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1067,41 +1067,37 @@ cdef class EntityRecognizer(Parser): class EntityLinker(Pipe): + """Pipeline component for named entity linking. + + DOCS: TODO + """ name = 'entity_linker' @classmethod def Model(cls, **cfg): embed_width = cfg.get("embed_width", 300) hidden_width = cfg.get("hidden_width", 32) - entity_width = cfg.get("entity_width", 64) article_width = cfg.get("article_width", 128) sent_width = cfg.get("sent_width", 64) - - entity_encoder = build_nel_encoder(in_width=embed_width, hidden_with=hidden_width, end_width=entity_width) + entity_width = cfg["kb"].entity_vector_length article_encoder = build_nel_encoder(in_width=embed_width, hidden_with=hidden_width, end_width=article_width) sent_encoder = build_nel_encoder(in_width=embed_width, hidden_with=hidden_width, end_width=sent_width) # dimension of the mention encoder needs to match the dimension of the entity encoder - mention_width = entity_encoder.nO + mention_width = article_width + sent_width mention_encoder = Affine(entity_width, mention_width, drop_factor=0.0) - return entity_encoder, article_encoder, sent_encoder, mention_encoder + return article_encoder, sent_encoder, mention_encoder def __init__(self, **cfg): - # TODO: bring-your-own-model self.mention_encoder = True - self.cfg = dict(cfg) self.kb = self.cfg["kb"] - # TODO: fix this. store entity vectors in the KB ? 
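
The shape bookkeeping in the Model() factory above is easiest to see with plain arrays: the article and sentence encodings are concatenated (mention_width = article_width + sent_width) and then projected by the Affine layer down to the KB's entity vector length, so mention and entity vectors can be compared directly. A toy numpy sketch of that flow with random weights, using the default widths from these patches (128/64 encoders, 64-dimensional entity vectors); it mimics the dimensions only, not the actual Thinc layers:

    import numpy as np

    article_width, sent_width, entity_width = 128, 64, 64

    doc_enc = np.random.rand(article_width)     # stand-in for article_encoder output
    sent_enc = np.random.rand(sent_width)       # stand-in for sent_encoder output

    concat = np.concatenate([doc_enc, sent_enc])          # mention_width = 192
    W = np.random.rand(entity_width, concat.shape[0])     # Affine weights (random here)
    b = np.zeros(entity_width)
    mention_vector = W @ concat + b             # same length as a KB entity vector

    assert mention_vector.shape == (entity_width,)
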
- self.id_to_descr = kb_creator._get_id_to_description('C:/Users/Sofie/Documents/data/wikipedia/entity_descriptions.csv') - def use_avg_params(self): """Modify the pipe's encoders/models, to use their average parameter values.""" with self.article_encoder.use_params(self.sgd_article.averages) \ - and self.entity_encoder.use_params(self.sgd_entity.averages)\ and self.sent_encoder.use_params(self.sgd_sent.averages) \ and self.mention_encoder.use_params(self.sgd_mention.averages): yield @@ -1113,14 +1109,13 @@ class EntityLinker(Pipe): def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs): if self.mention_encoder is True: - self.entity_encoder, self.article_encoder, self.sent_encoder, self.mention_encoder = self.Model(**self.cfg) + self.article_encoder, self.sent_encoder, self.mention_encoder = self.Model(**self.cfg) self.sgd_article = create_default_optimizer(self.article_encoder.ops) self.sgd_sent = create_default_optimizer(self.sent_encoder.ops) self.sgd_mention = create_default_optimizer(self.mention_encoder.ops) - self.sgd_entity = create_default_optimizer(self.entity_encoder.ops) def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None): - """ docs should be a tuple of (entity_docs, article_docs, sentence_docs) """ + """ docs should be a tuple of (entity_docs, article_docs, sentence_docs) TODO """ self.require_model() entity_docs, article_docs, sentence_docs = docs @@ -1131,7 +1126,7 @@ class EntityLinker(Pipe): article_docs = [article_docs] sentence_docs = [sentence_docs] - entity_encodings, bp_entity = self.entity_encoder.begin_update(entity_docs, drop=drop) + entity_encodings = None #TODO doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=drop) sent_encodings, bp_sent = self.sent_encoder.begin_update(sentence_docs, drop=drop) @@ -1195,10 +1190,9 @@ class EntityLinker(Pipe): for c in candidates: prior_prob = c.prior_prob kb_id = c.entity_ - description = self.id_to_descr.get(kb_id) - entity_encodings = self.entity_encoder([description]) # TODO: static entity vectors ? - sim = cosine(entity_encodings, mention_enc_t) - score = prior_prob + sim - (prior_prob*sim) # TODO: weights ? + entity_encoding = c.entity_vector + sim = cosine([entity_encoding], mention_enc_t) + score = prior_prob + sim - (prior_prob*sim) # put weights on the different factors ? 
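
The candidate score computed just above, prior_prob + sim - prior_prob*sim, is the probabilistic OR of the two signals (1 - (1 - prior_prob)(1 - sim)): either a strong prior or a strong context match is enough to push the score up. Cosine similarity is not strictly bounded to [0, 1], so the probabilistic reading is only approximate. A tiny illustration with invented values:

    def combine(prior_prob, sim):
        # probabilistic OR: 1 - (1 - prior_prob) * (1 - sim)
        return prior_prob + sim - prior_prob * sim

    print(combine(0.9, 0.1))   # ~0.91 -> carried by the prior
    print(combine(0.1, 0.9))   # ~0.91 -> carried by the context match
    print(combine(0.5, 0.5))   # 0.75
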
scores.append(score) best_index = scores.index(max(scores)) From a5c061f50633831ce49e4cc6660d177569bb9767 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 7 Jun 2019 12:58:42 +0200 Subject: [PATCH 067/148] storing NEL training data in GoldParse objects --- .../wiki_entity_linking/train_descriptions.py | 1 + .../training_set_creator.py | 63 ++++++++++++++++++- .../wiki_entity_linking/wiki_nel_pipeline.py | 28 ++++++++- spacy/kb.pyx | 5 ++ spacy/pipeline/pipes.pyx | 8 ++- 5 files changed, 99 insertions(+), 6 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_descriptions.py b/examples/pipeline/wiki_entity_linking/train_descriptions.py index 8513a25fd..f2c3fa05d 100644 --- a/examples/pipeline/wiki_entity_linking/train_descriptions.py +++ b/examples/pipeline/wiki_entity_linking/train_descriptions.py @@ -1,3 +1,4 @@ +# coding: utf-8 from random import shuffle from examples.pipeline.wiki_entity_linking import kb_creator diff --git a/examples/pipeline/wiki_entity_linking/training_set_creator.py b/examples/pipeline/wiki_entity_linking/training_set_creator.py index ac8ad0744..c1879e2fb 100644 --- a/examples/pipeline/wiki_entity_linking/training_set_creator.py +++ b/examples/pipeline/wiki_entity_linking/training_set_creator.py @@ -1,11 +1,15 @@ # coding: utf-8 from __future__ import unicode_literals +import os import re -import csv import bz2 import datetime +from os import listdir +from examples.pipeline.wiki_entity_linking import run_el +from spacy.gold import GoldParse +from spacy.matcher import PhraseMatcher from . import wikipedia_processor as wp, kb_creator """ @@ -294,5 +298,62 @@ def read_training_entities(training_output, collect_correct=True, collect_incorr return correct_entries_per_article, incorrect_entries_per_article +def read_training(nlp, training_dir, id_to_descr, doc_cutoff, dev, limit, to_print): + correct_entries, incorrect_entries = read_training_entities(training_output=training_dir, + collect_correct=True, + collect_incorrect=True) + + docs = list() + golds = list() + + cnt = 0 + next_entity_nr = 1 + files = listdir(training_dir) + for f in files: + if not limit or cnt < limit: + if dev == run_el.is_dev(f): + article_id = f.replace(".txt", "") + if cnt % 500 == 0 and to_print: + print(datetime.datetime.now(), "processed", cnt, "files in the training dataset") + + try: + # parse the article text + with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file: + text = file.read() + article_doc = nlp(text) + truncated_text = text[0:min(doc_cutoff, len(text))] + + gold_entities = dict() + + # process all positive and negative entities, collect all relevant mentions in this article + for mention, entity_pos in correct_entries[article_id].items(): + # find all matches in the doc for the mentions + # TODO: fix this - doesn't look like all entities are found + matcher = PhraseMatcher(nlp.vocab) + patterns = list(nlp.tokenizer.pipe([mention])) + + matcher.add("TerminologyList", None, *patterns) + matches = matcher(article_doc) + + # store gold entities + for match_id, start, end in matches: + gold_entities[(start, end, entity_pos)] = 1.0 + + gold = GoldParse(doc=article_doc, cats=gold_entities) + docs.append(article_doc) + golds.append(gold) + + cnt += 1 + except Exception as e: + print("Problem parsing article", article_id) + print(e) + + if to_print: + print() + print("Processed", cnt, "training articles, dev=" + str(dev)) + print() + return docs, golds + + diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py 
b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 390a6800b..08f4adda0 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -23,6 +23,9 @@ VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab' TRAINING_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_data_nel/' +MAX_CANDIDATES=10 +MIN_PAIR_OCC=5 +DOC_CHAR_CUTOFF=300 if __name__ == "__main__": print("START", datetime.datetime.now()) @@ -71,8 +74,8 @@ if __name__ == "__main__": if to_create_kb: print("STEP 3a: to_create_kb", datetime.datetime.now()) my_kb = kb_creator.create_kb(nlp, - max_entities_per_alias=10, - min_occ=5, + max_entities_per_alias=MAX_CANDIDATES, + min_occ=MIN_PAIR_OCC, entity_def_output=ENTITY_DEFS, entity_descr_output=ENTITY_DESCR, count_input=ENTITY_COUNTS, @@ -110,10 +113,29 @@ if __name__ == "__main__": # STEP 6: create the entity linking pipe if train_pipe: - # TODO: the vocab objects are now different between nlp and kb - will be fixed when KB is written as part of NLP IO + id_to_descr = kb_creator._get_id_to_description(ENTITY_DESCR) + + docs, golds = training_set_creator.read_training(nlp=nlp, + training_dir=TRAINING_DIR, + id_to_descr=id_to_descr, + doc_cutoff=DOC_CHAR_CUTOFF, + dev=False, + limit=10, + to_print=False) + + # for doc, gold in zip(docs, golds): + # print("doc", doc) + # for entity, label in gold.cats.items(): + # print("entity", entity, label) + # print() + el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": my_kb}) nlp.add_pipe(el_pipe, last=True) + other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"] + with nlp.disable_pipes(*other_pipes): # only train Entity Linking + nlp.begin_training() + ### BELOW CODE IS DEPRECATED ### # STEP 6: apply the EL algorithm on the training dataset - TODO deprecated - code moved to pipes.pyx diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 30440227f..ade2360be 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -82,6 +82,11 @@ cdef class KnowledgeBase: self.vocab.strings.add("") self._create_empty_vectors(dummy_hash=self.vocab.strings[""]) + @property + def entity_vector_length(self): + """RETURNS (uint64): length of the entity vectors""" + return self.entity_vector_length + def __len__(self): return self.get_size_entities() diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index c5187a593..a3caae455 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1081,8 +1081,8 @@ class EntityLinker(Pipe): sent_width = cfg.get("sent_width", 64) entity_width = cfg["kb"].entity_vector_length - article_encoder = build_nel_encoder(in_width=embed_width, hidden_with=hidden_width, end_width=article_width) - sent_encoder = build_nel_encoder(in_width=embed_width, hidden_with=hidden_width, end_width=sent_width) + article_encoder = build_nel_encoder(in_width=embed_width, hidden_width=hidden_width, end_width=article_width, **cfg) + sent_encoder = build_nel_encoder(in_width=embed_width, hidden_width=hidden_width, end_width=sent_width, **cfg) # dimension of the mention encoder needs to match the dimension of the entity encoder mention_width = article_width + sent_width @@ -1118,6 +1118,10 @@ class EntityLinker(Pipe): """ docs should be a tuple of (entity_docs, article_docs, sentence_docs) TODO """ self.require_model() + if len(docs) != len(golds): + raise ValueError(Errors.E077.format(value="loss", n_docs=len(docs), + n_golds=len(golds))) + entity_docs, article_docs, sentence_docs = docs assert len(entity_docs) == 
len(article_docs) == len(sentence_docs) From 0486ccabfdbfd6ee4531574ad18b5dde085b43be Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 7 Jun 2019 13:54:45 +0200 Subject: [PATCH 068/148] introduce goldparse.links --- .../training_set_creator.py | 14 ++-- .../wiki_entity_linking/wiki_nel_pipeline.py | 34 +++++--- spacy/gold.pxd | 1 + spacy/gold.pyx | 5 +- spacy/pipeline/pipes.pyx | 81 +++++++++++-------- 5 files changed, 82 insertions(+), 53 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/training_set_creator.py b/examples/pipeline/wiki_entity_linking/training_set_creator.py index c1879e2fb..156bce05f 100644 --- a/examples/pipeline/wiki_entity_linking/training_set_creator.py +++ b/examples/pipeline/wiki_entity_linking/training_set_creator.py @@ -303,8 +303,7 @@ def read_training(nlp, training_dir, id_to_descr, doc_cutoff, dev, limit, to_pri collect_correct=True, collect_incorrect=True) - docs = list() - golds = list() + data = [] cnt = 0 next_entity_nr = 1 @@ -323,7 +322,7 @@ def read_training(nlp, training_dir, id_to_descr, doc_cutoff, dev, limit, to_pri article_doc = nlp(text) truncated_text = text[0:min(doc_cutoff, len(text))] - gold_entities = dict() + gold_entities = list() # process all positive and negative entities, collect all relevant mentions in this article for mention, entity_pos in correct_entries[article_id].items(): @@ -337,11 +336,10 @@ def read_training(nlp, training_dir, id_to_descr, doc_cutoff, dev, limit, to_pri # store gold entities for match_id, start, end in matches: - gold_entities[(start, end, entity_pos)] = 1.0 + gold_entities.append((start, end, entity_pos)) - gold = GoldParse(doc=article_doc, cats=gold_entities) - docs.append(article_doc) - golds.append(gold) + gold = GoldParse(doc=article_doc, links=gold_entities) + data.append((article_doc, gold)) cnt += 1 except Exception as e: @@ -352,7 +350,7 @@ def read_training(nlp, training_dir, id_to_descr, doc_cutoff, dev, limit, to_pri print() print("Processed", cnt, "training articles, dev=" + str(dev)) print() - return docs, golds + return data diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 08f4adda0..b66f8b316 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -1,6 +1,10 @@ # coding: utf-8 from __future__ import unicode_literals +import random + +from spacy.util import minibatch, compounding + from examples.pipeline.wiki_entity_linking import wikipedia_processor as wp, kb_creator, training_set_creator, run_el from examples.pipeline.wiki_entity_linking.train_el import EL_Model @@ -23,9 +27,11 @@ VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab' TRAINING_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_data_nel/' -MAX_CANDIDATES=10 -MIN_PAIR_OCC=5 -DOC_CHAR_CUTOFF=300 +MAX_CANDIDATES = 10 +MIN_PAIR_OCC = 5 +DOC_CHAR_CUTOFF = 300 +EPOCHS = 5 +DROPOUT = 0.1 if __name__ == "__main__": print("START", datetime.datetime.now()) @@ -115,7 +121,7 @@ if __name__ == "__main__": if train_pipe: id_to_descr = kb_creator._get_id_to_description(ENTITY_DESCR) - docs, golds = training_set_creator.read_training(nlp=nlp, + train_data = training_set_creator.read_training(nlp=nlp, training_dir=TRAINING_DIR, id_to_descr=id_to_descr, doc_cutoff=DOC_CHAR_CUTOFF, @@ -123,12 +129,6 @@ if __name__ == "__main__": limit=10, to_print=False) - # for doc, gold in zip(docs, golds): - # print("doc", doc) - # for entity, label in 
gold.cats.items(): - # print("entity", entity, label) - # print() - el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": my_kb}) nlp.add_pipe(el_pipe, last=True) @@ -136,6 +136,20 @@ if __name__ == "__main__": with nlp.disable_pipes(*other_pipes): # only train Entity Linking nlp.begin_training() + for itn in range(EPOCHS): + random.shuffle(train_data) + losses = {} + batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) + for batch in batches: + docs, golds = zip(*batch) + nlp.update( + docs, + golds, + drop=DROPOUT, + losses=losses, + ) + print("Losses", losses) + ### BELOW CODE IS DEPRECATED ### # STEP 6: apply the EL algorithm on the training dataset - TODO deprecated - code moved to pipes.pyx diff --git a/spacy/gold.pxd b/spacy/gold.pxd index a1550b1ef..8943a155a 100644 --- a/spacy/gold.pxd +++ b/spacy/gold.pxd @@ -31,6 +31,7 @@ cdef class GoldParse: cdef public list ents cdef public dict brackets cdef public object cats + cdef public list links cdef readonly list cand_to_gold cdef readonly list gold_to_cand diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 569979a5f..4fb22f3f0 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -427,7 +427,7 @@ cdef class GoldParse: def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None, deps=None, entities=None, make_projective=False, - cats=None, **_): + cats=None, links=None, **_): """Create a GoldParse. doc (Doc): The document the annotations refer to. @@ -450,6 +450,8 @@ cdef class GoldParse: examples of a label to have the value 0.0. Labels not in the dictionary are treated as missing - the gradient for those labels will be zero. + links (iterable): A sequence of `(start_char, end_char, kb_id)` tuples, + representing the external ID of an entity in a knowledge base. RETURNS (GoldParse): The newly constructed object. 
""" if words is None: @@ -485,6 +487,7 @@ cdef class GoldParse: self.c.ner = self.mem.alloc(len(doc), sizeof(Transition)) self.cats = {} if cats is None else dict(cats) + self.links = links self.words = [None] * len(doc) self.tags = [None] * len(doc) self.heads = [None] * len(doc) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index a3caae455..f15ffd036 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1115,48 +1115,61 @@ class EntityLinker(Pipe): self.sgd_mention = create_default_optimizer(self.mention_encoder.ops) def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None): - """ docs should be a tuple of (entity_docs, article_docs, sentence_docs) TODO """ self.require_model() if len(docs) != len(golds): - raise ValueError(Errors.E077.format(value="loss", n_docs=len(docs), + raise ValueError(Errors.E077.format(value="EL training", n_docs=len(docs), n_golds=len(golds))) - entity_docs, article_docs, sentence_docs = docs - assert len(entity_docs) == len(article_docs) == len(sentence_docs) + if isinstance(docs, Doc): + docs = [docs] + golds = [golds] - if isinstance(entity_docs, Doc): - entity_docs = [entity_docs] - article_docs = [article_docs] - sentence_docs = [sentence_docs] + for doc, gold in zip(docs, golds): + print("doc", doc) + for entity in gold.links: + start, end, gold_kb = entity + print("entity", entity) + mention = doc[start:end].text + print("mention", mention) + candidates = self.kb.get_candidates(mention) + for c in candidates: + prior_prob = c.prior_prob + kb_id = c.entity_ + print("candidate", kb_id, prior_prob) + entity_encoding = c.entity_vector + print() - entity_encodings = None #TODO - doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=drop) - sent_encodings, bp_sent = self.sent_encoder.begin_update(sentence_docs, drop=drop) + print() - concat_encodings = [list(doc_encodings[i]) + list(sent_encodings[i]) for i in - range(len(article_docs))] - mention_encodings, bp_cont = self.mention_encoder.begin_update(np.asarray(concat_encodings), drop=self.DROP) - - loss, d_scores = self.get_loss(scores=mention_encodings, golds=entity_encodings, docs=None) - - mention_gradient = bp_cont(d_scores, sgd=self.sgd_cont) - - # gradient : concat (doc+sent) vs. desc - sent_start = self.article_encoder.nO - sent_gradients = list() - doc_gradients = list() - for x in mention_gradient: - doc_gradients.append(list(x[0:sent_start])) - sent_gradients.append(list(x[sent_start:])) - - bp_doc(doc_gradients, sgd=self.sgd_article) - bp_sent(sent_gradients, sgd=self.sgd_sent) - - if losses is not None: - losses.setdefault(self.name, 0.0) - losses[self.name] += loss - return loss + # entity_encodings = None #TODO + # doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=drop) + # sent_encodings, bp_sent = self.sent_encoder.begin_update(sentence_docs, drop=drop) + # + # concat_encodings = [list(doc_encodings[i]) + list(sent_encodings[i]) for i in + # range(len(article_docs))] + # mention_encodings, bp_cont = self.mention_encoder.begin_update(np.asarray(concat_encodings), drop=self.DROP) + # + # loss, d_scores = self.get_loss(scores=mention_encodings, golds=entity_encodings, docs=None) + # + # mention_gradient = bp_cont(d_scores, sgd=self.sgd_cont) + # + # # gradient : concat (doc+sent) vs. 
desc + # sent_start = self.article_encoder.nO + # sent_gradients = list() + # doc_gradients = list() + # for x in mention_gradient: + # doc_gradients.append(list(x[0:sent_start])) + # sent_gradients.append(list(x[sent_start:])) + # + # bp_doc(doc_gradients, sgd=self.sgd_article) + # bp_sent(sent_gradients, sgd=self.sgd_sent) + # + # if losses is not None: + # losses.setdefault(self.name, 0.0) + # losses[self.name] += loss + # return loss + return None def get_loss(self, docs, golds, scores): loss, gradients = get_cossim_loss(scores, golds) From 7de1ee69b819cba8b66db370dcb1ec169b4a7b74 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 7 Jun 2019 15:55:10 +0200 Subject: [PATCH 069/148] training loop in proper pipe format --- .../wiki_entity_linking/wiki_nel_pipeline.py | 13 +-- spacy/pipeline/pipes.pyx | 84 ++++++++++--------- 2 files changed, 49 insertions(+), 48 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index b66f8b316..ded4bdc24 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -126,7 +126,7 @@ if __name__ == "__main__": id_to_descr=id_to_descr, doc_cutoff=DOC_CHAR_CUTOFF, dev=False, - limit=10, + limit=100, to_print=False) el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": my_kb}) @@ -137,6 +137,8 @@ if __name__ == "__main__": nlp.begin_training() for itn in range(EPOCHS): + print() + print("EPOCH", itn) random.shuffle(train_data) losses = {} batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) @@ -150,15 +152,6 @@ if __name__ == "__main__": ) print("Losses", losses) - ### BELOW CODE IS DEPRECATED ### - - # STEP 6: apply the EL algorithm on the training dataset - TODO deprecated - code moved to pipes.pyx - if run_el_training: - print("STEP 6: training", datetime.datetime.now()) - trainer = EL_Model(kb=my_kb, nlp=nlp) - trainer.train_model(training_dir=TRAINING_DIR, entity_descr_output=ENTITY_DESCR, trainlimit=10000, devlimit=500) - print() - # STEP 7: apply the EL algorithm on the dev dataset (TODO: overlaps with code from run_el_training ?) 
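
The training loop above consumes (doc, gold) pairs in which the GoldParse carries the entity links introduced in the previous patch. A stripped-down sketch of building one such pair and running one training pass; the helper names are invented, the doc/KB id are toy values, and start/end are token offsets, matching how the pipe slices doc[start:end] in these patches:

    import random
    from spacy.gold import GoldParse
    from spacy.util import minibatch, compounding

    def make_example(doc, start, end, kb_id):
        # one gold entity link, e.g. make_example(doc, 0, 2, "Q42")
        return doc, GoldParse(doc, links=[(start, end, kb_id)])

    def train_entity_linker(nlp, train_data, epochs=5, dropout=0.1):
        # train_data is a list of (doc, GoldParse) pairs as built above
        for _ in range(epochs):
            random.shuffle(train_data)
            losses = {}
            for batch in minibatch(train_data, size=compounding(4.0, 32.0, 1.001)):
                docs, golds = zip(*batch)
                nlp.update(docs, golds, drop=dropout, losses=losses)
            print("Losses", losses)
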
if apply_to_dev: run_el.run_el_dev(kb=my_kb, nlp=nlp, training_dir=TRAINING_DIR, limit=2000) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index f15ffd036..01302b618 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1125,51 +1125,59 @@ class EntityLinker(Pipe): docs = [docs] golds = [golds] + article_docs = list() + sentence_docs = list() + entity_encodings = list() + for doc, gold in zip(docs, golds): - print("doc", doc) for entity in gold.links: start, end, gold_kb = entity - print("entity", entity) - mention = doc[start:end].text - print("mention", mention) - candidates = self.kb.get_candidates(mention) + mention = doc[start:end] + sentence = mention.sent + + candidates = self.kb.get_candidates(mention.text) for c in candidates: - prior_prob = c.prior_prob kb_id = c.entity_ - print("candidate", kb_id, prior_prob) - entity_encoding = c.entity_vector - print() + # TODO: currently only training on the positive instances + if kb_id == gold_kb: + prior_prob = c.prior_prob + entity_encoding = c.entity_vector - print() + entity_encodings.append(entity_encoding) + article_docs.append(doc) + sentence_docs.append(sentence.as_doc()) - # entity_encodings = None #TODO - # doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=drop) - # sent_encodings, bp_sent = self.sent_encoder.begin_update(sentence_docs, drop=drop) - # - # concat_encodings = [list(doc_encodings[i]) + list(sent_encodings[i]) for i in - # range(len(article_docs))] - # mention_encodings, bp_cont = self.mention_encoder.begin_update(np.asarray(concat_encodings), drop=self.DROP) - # - # loss, d_scores = self.get_loss(scores=mention_encodings, golds=entity_encodings, docs=None) - # - # mention_gradient = bp_cont(d_scores, sgd=self.sgd_cont) - # - # # gradient : concat (doc+sent) vs. desc - # sent_start = self.article_encoder.nO - # sent_gradients = list() - # doc_gradients = list() - # for x in mention_gradient: - # doc_gradients.append(list(x[0:sent_start])) - # sent_gradients.append(list(x[sent_start:])) - # - # bp_doc(doc_gradients, sgd=self.sgd_article) - # bp_sent(sent_gradients, sgd=self.sgd_sent) - # - # if losses is not None: - # losses.setdefault(self.name, 0.0) - # losses[self.name] += loss - # return loss - return None + if len(entity_encodings) > 0: + doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=drop) + sent_encodings, bp_sent = self.sent_encoder.begin_update(sentence_docs, drop=drop) + + concat_encodings = [list(doc_encodings[i]) + list(sent_encodings[i]) for i in + range(len(article_docs))] + mention_encodings, bp_mention = self.mention_encoder.begin_update(np.asarray(concat_encodings), drop=drop) + + entity_encodings = np.asarray(entity_encodings, dtype=np.float32) + + loss, d_scores = self.get_loss(scores=mention_encodings, golds=entity_encodings, docs=None) + + mention_gradient = bp_mention(d_scores, sgd=self.sgd_mention) + + # gradient : concat (doc+sent) vs. 
desc + sent_start = self.article_encoder.nO + sent_gradients = list() + doc_gradients = list() + for x in mention_gradient: + doc_gradients.append(list(x[0:sent_start])) + sent_gradients.append(list(x[sent_start:])) + + bp_doc(doc_gradients, sgd=self.sgd_article) + bp_sent(sent_gradients, sgd=self.sgd_sent) + + if losses is not None: + losses.setdefault(self.name, 0.0) + losses[self.name] += loss + return loss + + return 0 def get_loss(self, docs, golds, scores): loss, gradients = get_cossim_loss(scores, golds) From 83dc7b46fd1b39023c6eb883471c961d9e5bd51c Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 10 Jun 2019 21:25:26 +0200 Subject: [PATCH 070/148] first tests with EL pipe --- .../wiki_entity_linking/kb_creator.py | 4 +-- .../wiki_entity_linking/train_descriptions.py | 4 +-- .../training_set_creator.py | 4 +-- .../wiki_entity_linking/wiki_nel_pipeline.py | 19 +++++----- spacy/pipeline/pipes.pyx | 36 ++++++++++++++----- 5 files changed, 43 insertions(+), 24 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/kb_creator.py b/examples/pipeline/wiki_entity_linking/kb_creator.py index ee632bd48..e7e3d077d 100644 --- a/examples/pipeline/wiki_entity_linking/kb_creator.py +++ b/examples/pipeline/wiki_entity_linking/kb_creator.py @@ -40,8 +40,8 @@ def create_kb(nlp, max_entities_per_alias, min_occ, title_list = list(title_to_id.keys()) # TODO: remove this filter (just for quicker testing of code) - title_list = title_list[0:34200] - title_to_id = {t: title_to_id[t] for t in title_list} + # title_list = title_list[0:34200] + # title_to_id = {t: title_to_id[t] for t in title_list} entity_list = [title_to_id[x] for x in title_list] diff --git a/examples/pipeline/wiki_entity_linking/train_descriptions.py b/examples/pipeline/wiki_entity_linking/train_descriptions.py index f2c3fa05d..e1a2f1797 100644 --- a/examples/pipeline/wiki_entity_linking/train_descriptions.py +++ b/examples/pipeline/wiki_entity_linking/train_descriptions.py @@ -17,7 +17,7 @@ class EntityEncoder: DROP = 0 EPOCHS = 5 - STOP_THRESHOLD = 0.9 # 0.1 + STOP_THRESHOLD = 0.1 BATCH_SIZE = 1000 @@ -32,7 +32,7 @@ class EntityEncoder: print("Encoding", len(description_list), "entities") - batch_size = 10000 + batch_size = 100000 start = 0 stop = min(batch_size, len(description_list)) diff --git a/examples/pipeline/wiki_entity_linking/training_set_creator.py b/examples/pipeline/wiki_entity_linking/training_set_creator.py index 156bce05f..38a86058d 100644 --- a/examples/pipeline/wiki_entity_linking/training_set_creator.py +++ b/examples/pipeline/wiki_entity_linking/training_set_creator.py @@ -298,7 +298,7 @@ def read_training_entities(training_output, collect_correct=True, collect_incorr return correct_entries_per_article, incorrect_entries_per_article -def read_training(nlp, training_dir, id_to_descr, doc_cutoff, dev, limit, to_print): +def read_training(nlp, training_dir, dev, limit, to_print): correct_entries, incorrect_entries = read_training_entities(training_output=training_dir, collect_correct=True, collect_incorrect=True) @@ -306,7 +306,6 @@ def read_training(nlp, training_dir, id_to_descr, doc_cutoff, dev, limit, to_pri data = [] cnt = 0 - next_entity_nr = 1 files = listdir(training_dir) for f in files: if not limit or cnt < limit: @@ -320,7 +319,6 @@ def read_training(nlp, training_dir, id_to_descr, doc_cutoff, dev, limit, to_pri with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file: text = file.read() article_doc = nlp(text) - truncated_text = text[0:min(doc_cutoff, len(text))] gold_entities = 
list() diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index ded4bdc24..4be1ae2fb 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -121,15 +121,16 @@ if __name__ == "__main__": if train_pipe: id_to_descr = kb_creator._get_id_to_description(ENTITY_DESCR) - train_data = training_set_creator.read_training(nlp=nlp, - training_dir=TRAINING_DIR, - id_to_descr=id_to_descr, - doc_cutoff=DOC_CHAR_CUTOFF, - dev=False, - limit=100, - to_print=False) + train_limit = 10 + print("Training on", train_limit, "articles") - el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": my_kb}) + train_data = training_set_creator.read_training(nlp=nlp, + training_dir=TRAINING_DIR, + dev=False, + limit=train_limit, + to_print=False) + + el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": my_kb, "doc_cutoff": DOC_CHAR_CUTOFF}) nlp.add_pipe(el_pipe, last=True) other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"] @@ -141,7 +142,7 @@ if __name__ == "__main__": print("EPOCH", itn) random.shuffle(train_data) losses = {} - batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) + batches = minibatch(train_data, size=compounding(4.0, 128.0, 1.001)) for batch in batches: docs, golds = zip(*batch) nlp.update( diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 01302b618..e5ed2ec23 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -11,9 +11,8 @@ from collections import OrderedDict from thinc.api import chain from thinc.v2v import Affine, Maxout, Softmax from thinc.misc import LayerNorm -from thinc.neural.util import to_categorical, copy_array - -from spacy.cli.pretrain import get_cossim_loss +from thinc.neural.util import to_categorical +from thinc.neural.util import get_array_module from ..tokens.doc cimport Doc from ..syntax.nn_parser cimport Parser @@ -33,9 +32,6 @@ from .._ml import masked_language_model, create_default_optimizer from ..errors import Errors, TempErrors from .. 
import util -# TODO: remove -from examples.pipeline.wiki_entity_linking import kb_creator - def _load_cfg(path): if path.exists(): @@ -1094,6 +1090,7 @@ class EntityLinker(Pipe): self.mention_encoder = True self.cfg = dict(cfg) self.kb = self.cfg["kb"] + self.doc_cutoff = self.cfg["doc_cutoff"] def use_avg_params(self): """Modify the pipe's encoders/models, to use their average parameter values.""" @@ -1134,6 +1131,7 @@ class EntityLinker(Pipe): start, end, gold_kb = entity mention = doc[start:end] sentence = mention.sent + first_par = doc[0:self.doc_cutoff].as_doc() candidates = self.kb.get_candidates(mention.text) for c in candidates: @@ -1144,7 +1142,7 @@ class EntityLinker(Pipe): entity_encoding = c.entity_vector entity_encodings.append(entity_encoding) - article_docs.append(doc) + article_docs.append(first_par) sentence_docs.append(sentence.as_doc()) if len(entity_encodings) > 0: @@ -1158,6 +1156,10 @@ class EntityLinker(Pipe): entity_encodings = np.asarray(entity_encodings, dtype=np.float32) loss, d_scores = self.get_loss(scores=mention_encodings, golds=entity_encodings, docs=None) + # print("scores", mention_encodings) + # print("golds", entity_encodings) + # print("loss", loss) + # print("d_scores", d_scores) mention_gradient = bp_mention(d_scores, sgd=self.sgd_mention) @@ -1180,9 +1182,26 @@ class EntityLinker(Pipe): return 0 def get_loss(self, docs, golds, scores): - loss, gradients = get_cossim_loss(scores, golds) + targets = [[1] for _ in golds] # assuming we're only using positive examples + loss, gradients = self.get_cossim_loss_2(yh=scores, y=golds, t=targets) + #loss = loss / len(golds) return loss, gradients + def get_cossim_loss_2(self, yh, y, t): + # Add a small constant to avoid 0 vectors + yh = yh + 1e-8 + y = y + 1e-8 + # https://math.stackexchange.com/questions/1923613/partial-derivative-of-cosine-similarity + xp = get_array_module(yh) + norm_yh = xp.linalg.norm(yh, axis=1, keepdims=True) + norm_y = xp.linalg.norm(y, axis=1, keepdims=True) + mul_norms = norm_yh * norm_y + cos = (yh * y).sum(axis=1, keepdims=True) / mul_norms + d_yh = (y / mul_norms) - (cos * (yh / norm_yh ** 2)) + loss = xp.abs(cos - t).sum() + inverse = np.asarray([int(t[i][0]) * d_yh[i] for i in range(len(t))]) + return loss, -inverse + def __call__(self, doc): entities, kb_ids = self.predict([doc]) self.set_annotations([doc], entities, kb_ids) @@ -1220,6 +1239,7 @@ class EntityLinker(Pipe): score = prior_prob + sim - (prior_prob*sim) # put weights on the different factors ? 
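
The analytic gradient in get_cossim_loss_2 above follows the standard partial derivative of cosine similarity, d cos/d yh = y/(|yh||y|) - cos * yh/|yh|^2. A quick numpy sanity check of that formula against central finite differences, independent of the patch itself:

    import numpy as np

    def cos_sim(yh, y):
        return np.dot(yh, y) / (np.linalg.norm(yh) * np.linalg.norm(y))

    def cos_grad(yh, y):
        norm_yh, norm_y = np.linalg.norm(yh), np.linalg.norm(y)
        cos = np.dot(yh, y) / (norm_yh * norm_y)
        return y / (norm_yh * norm_y) - cos * yh / norm_yh ** 2

    rng = np.random.RandomState(0)
    yh, y = rng.rand(5) + 1e-8, rng.rand(5) + 1e-8
    eps = 1e-6
    numeric = np.array([(cos_sim(yh + eps * e, y) - cos_sim(yh - eps * e, y)) / (2 * eps)
                        for e in np.eye(5)])
    assert np.allclose(numeric, cos_grad(yh, y), atol=1e-6)
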
scores.append(score) + # TODO: thresholding best_index = scores.index(max(scores)) best_candidate = candidates[best_index] final_entities.append(ent) From fe1ed432eff61f087a06c89840f37dc75d24ee59 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 11 Jun 2019 11:40:58 +0200 Subject: [PATCH 071/148] eval on dev set, varying combo's of prior and context scores --- .../wiki_entity_linking/kb_creator.py | 2 +- .../pipeline/wiki_entity_linking/run_el.py | 19 --- .../wiki_entity_linking/wiki_nel_pipeline.py | 130 ++++++++++++++---- spacy/pipeline/pipes.pyx | 46 ++++--- 4 files changed, 127 insertions(+), 70 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/kb_creator.py b/examples/pipeline/wiki_entity_linking/kb_creator.py index e7e3d077d..d097ac449 100644 --- a/examples/pipeline/wiki_entity_linking/kb_creator.py +++ b/examples/pipeline/wiki_entity_linking/kb_creator.py @@ -20,7 +20,7 @@ def create_kb(nlp, max_entities_per_alias, min_occ, """ Create the knowledge base from Wikidata entries """ kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=DESC_WIDTH) - # disable parts of the pipeline when rerunning + # disable this part of the pipeline when rerunning the KB generation from preprocessed files read_raw_data = False if read_raw_data: diff --git a/examples/pipeline/wiki_entity_linking/run_el.py b/examples/pipeline/wiki_entity_linking/run_el.py index c3074ab5c..52ccccfda 100644 --- a/examples/pipeline/wiki_entity_linking/run_el.py +++ b/examples/pipeline/wiki_entity_linking/run_el.py @@ -21,29 +21,10 @@ def run_kb_toy_example(kb): print(" ", c.prior_prob, c.alias_, "-->", c.entity_ + " (freq=" + str(c.entity_freq) + ")") print() -def run_el_toy_example(nlp, kb): - _prepare_pipeline(nlp, kb) - candidates = kb.get_candidates("Bush") - - print("generating candidates for 'Bush' :") - for c in candidates: - print(" ", c.prior_prob, c.alias_, "-->", c.entity_ + " (freq=" + str(c.entity_freq) + ")") - print() - - text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \ - "Douglas reminds us to always bring our towel. " \ - "The main character in Doug's novel is the man Arthur Dent, " \ - "but Douglas doesn't write about George Washington or Homer Simpson." 
- doc = nlp(text) - - for ent in doc.ents: - print("ent", ent.text, ent.label_, ent.kb_id_) def run_el_dev(nlp, kb, training_dir, limit=None): - _prepare_pipeline(nlp, kb) - correct_entries_per_article, _ = training_set_creator.read_training_entities(training_output=training_dir, collect_correct=True, collect_incorrect=False) diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 4be1ae2fb..6e4ca6970 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -6,7 +6,6 @@ import random from spacy.util import minibatch, compounding from examples.pipeline.wiki_entity_linking import wikipedia_processor as wp, kb_creator, training_set_creator, run_el -from examples.pipeline.wiki_entity_linking.train_el import EL_Model import spacy from spacy.vocab import Vocab @@ -30,10 +29,11 @@ TRAINING_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_data_nel/' MAX_CANDIDATES = 10 MIN_PAIR_OCC = 5 DOC_CHAR_CUTOFF = 300 -EPOCHS = 5 +EPOCHS = 10 DROPOUT = 0.1 -if __name__ == "__main__": + +def run_pipeline(): print("START", datetime.datetime.now()) print() nlp = spacy.load('en_core_web_lg') @@ -51,15 +51,11 @@ if __name__ == "__main__": # create training dataset create_wp_training = False + # train the EL pipe train_pipe = True - # run EL training - run_el_training = False - - # apply named entity linking to the dev dataset - apply_to_dev = False - - to_test_pipeline = False + # test the EL pipe on a simple example + to_test_pipeline = True # STEP 1 : create prior probabilities from WP # run only once ! @@ -119,10 +115,11 @@ if __name__ == "__main__": # STEP 6: create the entity linking pipe if train_pipe: - id_to_descr = kb_creator._get_id_to_description(ENTITY_DESCR) - - train_limit = 10 + train_limit = 5 + dev_limit = 2 print("Training on", train_limit, "articles") + print("Dev testing on", dev_limit, "articles") + print() train_data = training_set_creator.read_training(nlp=nlp, training_dir=TRAINING_DIR, @@ -130,6 +127,12 @@ if __name__ == "__main__": limit=train_limit, to_print=False) + dev_data = training_set_creator.read_training(nlp=nlp, + training_dir=TRAINING_DIR, + dev=True, + limit=dev_limit, + to_print=False) + el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": my_kb, "doc_cutoff": DOC_CHAR_CUTOFF}) nlp.add_pipe(el_pipe, last=True) @@ -137,12 +140,12 @@ if __name__ == "__main__": with nlp.disable_pipes(*other_pipes): # only train Entity Linking nlp.begin_training() - for itn in range(EPOCHS): - print() - print("EPOCH", itn) - random.shuffle(train_data) - losses = {} - batches = minibatch(train_data, size=compounding(4.0, 128.0, 1.001)) + for itn in range(EPOCHS): + random.shuffle(train_data) + losses = {} + batches = minibatch(train_data, size=compounding(4.0, 128.0, 1.001)) + + with nlp.disable_pipes(*other_pipes): for batch in batches: docs, golds = zip(*batch) nlp.update( @@ -151,20 +154,89 @@ if __name__ == "__main__": drop=DROPOUT, losses=losses, ) - print("Losses", losses) - # STEP 7: apply the EL algorithm on the dev dataset (TODO: overlaps with code from run_el_training ?) 
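
The three accuracy columns reported above (1-1, 0-1, 1-0) come from toggling the pipe's context_weight and prior_weight and re-running the same accuracy measurement on the train and dev sets. A sketch of the same idea written as a loop over the weight combinations; evaluate_weight_combos is an invented helper, and measure_accuracy stands in for the _measure_accuracy function in this patch, wrapped so it only takes the data:

    def evaluate_weight_combos(el_pipe, measure_accuracy, datasets):
        # datasets could be {"train": train_data, "dev": dev_data}
        results = {}
        for context_weight, prior_weight in [(1, 1), (0, 1), (1, 0)]:
            el_pipe.context_weight = context_weight
            el_pipe.prior_weight = prior_weight
            key = "{}-{}".format(context_weight, prior_weight)
            results[key] = {name: measure_accuracy(data)
                            for name, data in datasets.items()}
        return results
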
- if apply_to_dev: - run_el.run_el_dev(kb=my_kb, nlp=nlp, training_dir=TRAINING_DIR, limit=2000) - print() + el_pipe.context_weight = 1 + el_pipe.prior_weight = 1 + dev_acc_1_1 = _measure_accuracy(dev_data, nlp) + train_acc_1_1 = _measure_accuracy(train_data, nlp) - # test KB + el_pipe.context_weight = 0 + el_pipe.prior_weight = 1 + dev_acc_0_1 = _measure_accuracy(dev_data, nlp) + train_acc_0_1 = _measure_accuracy(train_data, nlp) + + el_pipe.context_weight = 1 + el_pipe.prior_weight = 0 + dev_acc_1_0 = _measure_accuracy(dev_data, nlp) + train_acc_1_0 = _measure_accuracy(train_data, nlp) + + print("Epoch, train loss, train/dev acc, 1-1, 0-1, 1-0:", itn, losses['entity_linker'], + round(train_acc_1_1, 2), round(train_acc_0_1, 2), round(train_acc_1_0, 2), "/", + round(dev_acc_1_1, 2), round(dev_acc_0_1, 2), round(dev_acc_1_0, 2)) + + # test Entity Linker if to_test_pipeline: - run_el.run_el_toy_example(kb=my_kb, nlp=nlp) print() - - # TODO coreference resolution - # add_coref() + run_el_toy_example(kb=my_kb, nlp=nlp) + print() print() print("STOP", datetime.datetime.now()) + + +def _measure_accuracy(data, nlp): + correct = 0 + incorrect = 0 + + texts = [d.text for d, g in data] + docs = list(nlp.pipe(texts)) + golds = [g for d, g in data] + + for doc, gold in zip(docs, golds): + correct_entries_per_article = dict() + for entity in gold.links: + start, end, gold_kb = entity + correct_entries_per_article[str(start) + "-" + str(end)] = gold_kb + + for ent in doc.ents: + if ent.label_ == "PERSON": # TODO: expand to other types + pred_entity = ent.kb_id_ + start = ent.start + end = ent.end + gold_entity = correct_entries_per_article.get(str(start) + "-" + str(end), None) + if gold_entity is not None: + if gold_entity == pred_entity: + correct += 1 + else: + incorrect += 1 + + if correct == incorrect == 0: + return 0 + + acc = correct / (correct + incorrect) + return acc + + +def run_el_toy_example(nlp, kb): + text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \ + "Douglas reminds us to always bring our towel. " \ + "The main character in Doug's novel is the man Arthur Dent, " \ + "but Douglas doesn't write about George Washington or Homer Simpson." + doc = nlp(text) + + for ent in doc.ents: + print("ent", ent.text, ent.label_, ent.kb_id_) + + print() + + # Q4426480 is her husband, Q3568763 her tutor + text = "Ada Lovelace loved her husband William King dearly. " \ + "Ada Lovelace was tutored by her favorite physics tutor William King." + doc = nlp(text) + + for ent in doc.ents: + print("ent", ent.text, ent.label_, ent.kb_id_) + + +if __name__ == "__main__": + run_pipeline() diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index e5ed2ec23..9ef9df601 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1068,6 +1068,8 @@ class EntityLinker(Pipe): DOCS: TODO """ name = 'entity_linker' + context_weight = 1 + prior_weight = 1 @classmethod def Model(cls, **cfg): @@ -1093,14 +1095,15 @@ class EntityLinker(Pipe): self.doc_cutoff = self.cfg["doc_cutoff"] def use_avg_params(self): - """Modify the pipe's encoders/models, to use their average parameter values.""" - with self.article_encoder.use_params(self.sgd_article.averages) \ - and self.sent_encoder.use_params(self.sgd_sent.averages) \ - and self.mention_encoder.use_params(self.sgd_mention.averages): - yield + # Modify the pipe's encoders/models, to use their average parameter values. 
+ # TODO: this doesn't work yet because there's no exit method + self.article_encoder.use_params(self.sgd_article.averages) + self.sent_encoder.use_params(self.sgd_sent.averages) + self.mention_encoder.use_params(self.sgd_mention.averages) + def require_model(self): - """Raise an error if the component's model is not initialized.""" + # Raise an error if the component's model is not initialized. if getattr(self, "mention_encoder", None) in (None, True, False): raise ValueError(Errors.E109.format(name=self.name)) @@ -1110,6 +1113,7 @@ class EntityLinker(Pipe): self.sgd_article = create_default_optimizer(self.article_encoder.ops) self.sgd_sent = create_default_optimizer(self.sent_encoder.ops) self.sgd_mention = create_default_optimizer(self.mention_encoder.ops) + return self.sgd_article def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None): self.require_model() @@ -1229,27 +1233,27 @@ class EntityLinker(Pipe): candidates = self.kb.get_candidates(ent.text) if candidates: - with self.use_avg_params: - scores = list() - for c in candidates: - prior_prob = c.prior_prob - kb_id = c.entity_ - entity_encoding = c.entity_vector - sim = cosine([entity_encoding], mention_enc_t) - score = prior_prob + sim - (prior_prob*sim) # put weights on the different factors ? - scores.append(score) + scores = list() + for c in candidates: + prior_prob = c.prior_prob * self.prior_weight + kb_id = c.entity_ + entity_encoding = c.entity_vector + sim = cosine(np.asarray([entity_encoding]), mention_enc_t) * self.context_weight + score = prior_prob + sim - (prior_prob*sim) # put weights on the different factors ? + scores.append(score) - # TODO: thresholding - best_index = scores.index(max(scores)) - best_candidate = candidates[best_index] - final_entities.append(ent) - final_kb_ids.append(best_candidate) + # TODO: thresholding + best_index = scores.index(max(scores)) + best_candidate = candidates[best_index] + final_entities.append(ent) + final_kb_ids.append(best_candidate.entity_) return final_entities, final_kb_ids def set_annotations(self, docs, entities, kb_ids=None): for entity, kb_id in zip(entities, kb_ids): - entity.ent_kb_id_ = kb_id + for token in entity: + token.ent_kb_id_ = kb_id class Sentencizer(object): """Segment the Doc into sentences using a rule-based strategy. 
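The change above to EntityLinker.predict scores each KB candidate as prior_prob + sim - prior_prob*sim, where sim is the cosine similarity between the candidate's entity vector and the mention encoding, and either term can be switched off through prior_weight / context_weight. The snippet below is a minimal, standalone sketch of just that scoring step; the candidate ids, prior probabilities and vectors are made-up values, and the cosine helper stands in for whatever similarity function the pipe actually imports:

    import numpy as np


    def cosine_sim(u, v):
        # plain cosine similarity between two 1-D vectors
        return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))


    def candidate_score(prior_prob, sim, prior_weight=1.0, context_weight=1.0):
        # same blend as in EntityLinker.predict: p + s - p*s, with optional weights
        p = prior_prob * prior_weight
        s = sim * context_weight
        return p + s - (p * s)


    # toy example: rank two hypothetical candidates for one mention encoding
    mention_enc = np.array([0.2, 0.9, 0.1])
    candidates = [
        ("Q42", 0.6, np.array([0.1, 0.8, 0.2])),  # (KB id, prior prob, entity vector) - illustrative only
        ("Q5", 0.1, np.array([0.9, 0.1, 0.0])),
    ]
    scores = [candidate_score(prior, cosine_sim(vec, mention_enc)) for _, prior, vec in candidates]
    best_id = candidates[scores.index(max(scores))][0]
    print(best_id, [round(s, 3) for s in scores])

Setting prior_weight=0 isolates the context model and context_weight=0 falls back to the corpus prior, which is exactly what the accuracy measurements in the pipeline script toggle later in this series; when both inputs lie in [0, 1], the blend behaves like a probabilistic OR, 1 - (1-p)(1-s).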
From 66813a1fdcfa2b1f2c9e3af0b8b3922427d1d73a Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 11 Jun 2019 14:18:20 +0200 Subject: [PATCH 072/148] speed up predictions --- .../wiki_entity_linking/wiki_nel_pipeline.py | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 6e4ca6970..8753450bb 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -115,8 +115,8 @@ def run_pipeline(): # STEP 6: create the entity linking pipe if train_pipe: - train_limit = 5 - dev_limit = 2 + train_limit = 100 + dev_limit = 20 print("Training on", train_limit, "articles") print("Dev testing on", dev_limit, "articles") print() @@ -155,22 +155,25 @@ def run_pipeline(): losses=losses, ) + # print(" measuring accuracy 1-1") el_pipe.context_weight = 1 el_pipe.prior_weight = 1 - dev_acc_1_1 = _measure_accuracy(dev_data, nlp) - train_acc_1_1 = _measure_accuracy(train_data, nlp) + dev_acc_1_1 = _measure_accuracy(dev_data, el_pipe) + train_acc_1_1 = _measure_accuracy(train_data, el_pipe) + # print(" measuring accuracy 0-1") el_pipe.context_weight = 0 el_pipe.prior_weight = 1 - dev_acc_0_1 = _measure_accuracy(dev_data, nlp) - train_acc_0_1 = _measure_accuracy(train_data, nlp) + dev_acc_0_1 = _measure_accuracy(dev_data, el_pipe) + train_acc_0_1 = _measure_accuracy(train_data, el_pipe) + # print(" measuring accuracy 1-0") el_pipe.context_weight = 1 el_pipe.prior_weight = 0 - dev_acc_1_0 = _measure_accuracy(dev_data, nlp) - train_acc_1_0 = _measure_accuracy(train_data, nlp) + dev_acc_1_0 = _measure_accuracy(dev_data, el_pipe) + train_acc_1_0 = _measure_accuracy(train_data, el_pipe) - print("Epoch, train loss, train/dev acc, 1-1, 0-1, 1-0:", itn, losses['entity_linker'], + print("Epoch, train loss, train/dev acc, 1-1, 0-1, 1-0:", itn, round(losses['entity_linker'], 2), round(train_acc_1_1, 2), round(train_acc_0_1, 2), round(train_acc_1_0, 2), "/", round(dev_acc_1_1, 2), round(dev_acc_0_1, 2), round(dev_acc_1_0, 2)) @@ -184,12 +187,13 @@ def run_pipeline(): print("STOP", datetime.datetime.now()) -def _measure_accuracy(data, nlp): +def _measure_accuracy(data, el_pipe): correct = 0 incorrect = 0 - texts = [d.text for d, g in data] - docs = list(nlp.pipe(texts)) + docs = [d for d, g in data] + docs = el_pipe.pipe(docs) + golds = [g for d, g in data] for doc, gold in zip(docs, golds): From 6521cfa1328605b012db60d6077725bc697edd58 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 12 Jun 2019 13:37:05 +0200 Subject: [PATCH 073/148] speeding up training --- .../wiki_entity_linking/wiki_nel_pipeline.py | 95 ++++++++++--------- spacy/pipeline/pipes.pyx | 5 + 2 files changed, 57 insertions(+), 43 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 8753450bb..90218edda 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -115,6 +115,7 @@ def run_pipeline(): # STEP 6: create the entity linking pipe if train_pipe: + print("STEP 6: training Entity Linking pipe", datetime.datetime.now()) train_limit = 100 dev_limit = 20 print("Training on", train_limit, "articles") @@ -131,7 +132,7 @@ def run_pipeline(): training_dir=TRAINING_DIR, dev=True, limit=dev_limit, - to_print=False) + to_print=False) el_pipe = 
nlp.create_pipe(name='entity_linker', config={"kb": my_kb, "doc_cutoff": DOC_CHAR_CUTOFF}) nlp.add_pipe(el_pipe, last=True) @@ -147,35 +148,40 @@ def run_pipeline(): with nlp.disable_pipes(*other_pipes): for batch in batches: - docs, golds = zip(*batch) - nlp.update( - docs, - golds, - drop=DROPOUT, - losses=losses, - ) + try: + docs, golds = zip(*batch) + nlp.update( + docs, + golds, + drop=DROPOUT, + losses=losses, + ) + except Exception as e: + print("Error updating batch", e) - # print(" measuring accuracy 1-1") - el_pipe.context_weight = 1 - el_pipe.prior_weight = 1 - dev_acc_1_1 = _measure_accuracy(dev_data, el_pipe) - train_acc_1_1 = _measure_accuracy(train_data, el_pipe) + print("Epoch, train loss", itn, round(losses['entity_linker'], 2)) - # print(" measuring accuracy 0-1") - el_pipe.context_weight = 0 - el_pipe.prior_weight = 1 - dev_acc_0_1 = _measure_accuracy(dev_data, el_pipe) - train_acc_0_1 = _measure_accuracy(train_data, el_pipe) + # baseline using only prior probabilities + el_pipe.context_weight = 0 + el_pipe.prior_weight = 1 + dev_acc_0_1 = _measure_accuracy(dev_data, el_pipe) + train_acc_0_1 = _measure_accuracy(train_data, el_pipe) - # print(" measuring accuracy 1-0") - el_pipe.context_weight = 1 - el_pipe.prior_weight = 0 - dev_acc_1_0 = _measure_accuracy(dev_data, el_pipe) - train_acc_1_0 = _measure_accuracy(train_data, el_pipe) + # print(" measuring accuracy 1-1") + el_pipe.context_weight = 1 + el_pipe.prior_weight = 1 + dev_acc_1_1 = _measure_accuracy(dev_data, el_pipe) + train_acc_1_1 = _measure_accuracy(train_data, el_pipe) - print("Epoch, train loss, train/dev acc, 1-1, 0-1, 1-0:", itn, round(losses['entity_linker'], 2), - round(train_acc_1_1, 2), round(train_acc_0_1, 2), round(train_acc_1_0, 2), "/", - round(dev_acc_1_1, 2), round(dev_acc_0_1, 2), round(dev_acc_1_0, 2)) + # print(" measuring accuracy 1-0") + el_pipe.context_weight = 1 + el_pipe.prior_weight = 0 + dev_acc_1_0 = _measure_accuracy(dev_data, el_pipe) + train_acc_1_0 = _measure_accuracy(train_data, el_pipe) + + print("train/dev acc, 1-1, 0-1, 1-0:" , + round(train_acc_1_1, 2), round(train_acc_0_1, 2), round(train_acc_1_0, 2), "/", + round(dev_acc_1_1, 2), round(dev_acc_0_1, 2), round(dev_acc_1_0, 2)) # test Entity Linker if to_test_pipeline: @@ -193,26 +199,29 @@ def _measure_accuracy(data, el_pipe): docs = [d for d, g in data] docs = el_pipe.pipe(docs) - golds = [g for d, g in data] for doc, gold in zip(docs, golds): - correct_entries_per_article = dict() - for entity in gold.links: - start, end, gold_kb = entity - correct_entries_per_article[str(start) + "-" + str(end)] = gold_kb + try: + correct_entries_per_article = dict() + for entity in gold.links: + start, end, gold_kb = entity + correct_entries_per_article[str(start) + "-" + str(end)] = gold_kb - for ent in doc.ents: - if ent.label_ == "PERSON": # TODO: expand to other types - pred_entity = ent.kb_id_ - start = ent.start - end = ent.end - gold_entity = correct_entries_per_article.get(str(start) + "-" + str(end), None) - if gold_entity is not None: - if gold_entity == pred_entity: - correct += 1 - else: - incorrect += 1 + for ent in doc.ents: + if ent.label_ == "PERSON": # TODO: expand to other types + pred_entity = ent.kb_id_ + start = ent.start + end = ent.end + gold_entity = correct_entries_per_article.get(str(start) + "-" + str(end), None) + if gold_entity is not None: + if gold_entity == pred_entity: + correct += 1 + else: + incorrect += 1 + + except Exception as e: + print("Error assessing accuracy", e) if correct == incorrect == 0: 
return 0 @@ -243,4 +252,4 @@ def run_el_toy_example(nlp, kb): if __name__ == "__main__": - run_pipeline() + run_pipeline() \ No newline at end of file diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 9ef9df601..deaab0a19 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1220,8 +1220,13 @@ class EntityLinker(Pipe): def predict(self, docs): self.require_model() + + if isinstance(docs, Doc): + docs = [docs] + final_entities = list() final_kb_ids = list() + for i, article_doc in enumerate(docs): doc_encoding = self.article_encoder([article_doc]) for ent in article_doc.ents: From b12001f368017b7f19ddb5b4f2f670d8dbf8e57b Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 12 Jun 2019 22:05:53 +0200 Subject: [PATCH 074/148] small fixes --- .../wiki_entity_linking/wiki_nel_pipeline.py | 37 ++++++++++++------- spacy/pipeline/pipes.pyx | 2 +- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 90218edda..ebad16ba5 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -116,8 +116,8 @@ def run_pipeline(): # STEP 6: create the entity linking pipe if train_pipe: print("STEP 6: training Entity Linking pipe", datetime.datetime.now()) - train_limit = 100 - dev_limit = 20 + train_limit = 5000 + dev_limit = 1000 print("Training on", train_limit, "articles") print("Dev testing on", dev_limit, "articles") print() @@ -145,6 +145,7 @@ def run_pipeline(): random.shuffle(train_data) losses = {} batches = minibatch(train_data, size=compounding(4.0, 128.0, 1.001)) + batchnr = 0 with nlp.disable_pipes(*other_pipes): for batch in batches: @@ -156,35 +157,43 @@ def run_pipeline(): drop=DROPOUT, losses=losses, ) + batchnr += 1 except Exception as e: print("Error updating batch", e) + losses['entity_linker'] = losses['entity_linker'] / batchnr print("Epoch, train loss", itn, round(losses['entity_linker'], 2)) - # baseline using only prior probabilities - el_pipe.context_weight = 0 - el_pipe.prior_weight = 1 - dev_acc_0_1 = _measure_accuracy(dev_data, el_pipe) - train_acc_0_1 = _measure_accuracy(train_data, el_pipe) + print() + print("STEP 7: performance measurement of Entity Linking pipe", datetime.datetime.now()) + print() # print(" measuring accuracy 1-1") el_pipe.context_weight = 1 el_pipe.prior_weight = 1 dev_acc_1_1 = _measure_accuracy(dev_data, el_pipe) train_acc_1_1 = _measure_accuracy(train_data, el_pipe) + print("train/dev acc combo:", round(train_acc_1_1, 2), round(dev_acc_1_1, 2)) - # print(" measuring accuracy 1-0") + # baseline using only prior probabilities + el_pipe.context_weight = 0 + el_pipe.prior_weight = 1 + dev_acc_0_1 = _measure_accuracy(dev_data, el_pipe) + train_acc_0_1 = _measure_accuracy(train_data, el_pipe) + print("train/dev acc prior:", round(train_acc_0_1, 2), round(dev_acc_0_1, 2)) + + # using only context el_pipe.context_weight = 1 el_pipe.prior_weight = 0 dev_acc_1_0 = _measure_accuracy(dev_data, el_pipe) train_acc_1_0 = _measure_accuracy(train_data, el_pipe) - print("train/dev acc, 1-1, 0-1, 1-0:" , - round(train_acc_1_1, 2), round(train_acc_0_1, 2), round(train_acc_1_0, 2), "/", - round(dev_acc_1_1, 2), round(dev_acc_0_1, 2), round(dev_acc_1_0, 2)) + print("train/dev acc context:", round(train_acc_1_0, 2), round(dev_acc_1_0, 2)) + print() - # test Entity Linker if to_test_pipeline: + print() + print("STEP 8: applying Entity 
Linking to toy example", datetime.datetime.now()) print() run_el_toy_example(kb=my_kb, nlp=nlp) print() @@ -197,9 +206,9 @@ def _measure_accuracy(data, el_pipe): correct = 0 incorrect = 0 - docs = [d for d, g in data] + docs = [d for d, g in data if len(d) > 0] docs = el_pipe.pipe(docs) - golds = [g for d, g in data] + golds = [g for d, g in data if len(d) > 0] for doc, gold in zip(docs, golds): try: diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index deaab0a19..f9043f0e4 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1188,7 +1188,7 @@ class EntityLinker(Pipe): def get_loss(self, docs, golds, scores): targets = [[1] for _ in golds] # assuming we're only using positive examples loss, gradients = self.get_cossim_loss_2(yh=scores, y=golds, t=targets) - #loss = loss / len(golds) + loss = loss / len(golds) return loss, gradients def get_cossim_loss_2(self, yh, y, t): From 78dd3e11da60532dc6f4c5cbcd76fa7577d3cb33 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 13 Jun 2019 16:25:39 +0200 Subject: [PATCH 075/148] write entity linking pipe to file and keep vocab consistent between kb and nlp --- .../wiki_entity_linking/kb_creator.py | 4 +- .../wiki_entity_linking/wiki_nel_pipeline.py | 145 +++++++++------- spacy/kb.pyx | 6 + spacy/language.py | 9 + spacy/pipeline/pipes.pyx | 155 ++++++++++++++---- 5 files changed, 226 insertions(+), 93 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/kb_creator.py b/examples/pipeline/wiki_entity_linking/kb_creator.py index d097ac449..785811ea6 100644 --- a/examples/pipeline/wiki_entity_linking/kb_creator.py +++ b/examples/pipeline/wiki_entity_linking/kb_creator.py @@ -40,8 +40,8 @@ def create_kb(nlp, max_entities_per_alias, min_occ, title_list = list(title_to_id.keys()) # TODO: remove this filter (just for quicker testing of code) - # title_list = title_list[0:34200] - # title_to_id = {t: title_to_id[t] for t in title_list} + title_list = title_list[0:342] + title_to_id = {t: title_to_id[t] for t in title_list} entity_list = [title_to_id[x] for x in title_list] diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index ebad16ba5..0c03784a1 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -6,6 +6,7 @@ import random from spacy.util import minibatch, compounding from examples.pipeline.wiki_entity_linking import wikipedia_processor as wp, kb_creator, training_set_creator, run_el +from examples.pipeline.wiki_entity_linking.kb_creator import DESC_WIDTH import spacy from spacy.vocab import Vocab @@ -22,41 +23,48 @@ ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv' ENTITY_DESCR = 'C:/Users/Sofie/Documents/data/wikipedia/entity_descriptions.csv' KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb' -VOCAB_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/vocab' +NLP_1_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/nlp_1' +NLP_2_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/nlp_2' TRAINING_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_data_nel/' MAX_CANDIDATES = 10 MIN_PAIR_OCC = 5 DOC_CHAR_CUTOFF = 300 -EPOCHS = 10 +EPOCHS = 2 DROPOUT = 0.1 def run_pipeline(): print("START", datetime.datetime.now()) print() - nlp = spacy.load('en_core_web_lg') - my_kb = None + nlp_1 = spacy.load('en_core_web_lg') + nlp_2 = None + kb_1 = None + kb_2 = None # one-time methods to create KB and write to file to_create_prior_probs = 
False to_create_entity_counts = False - to_create_kb = False + to_create_kb = True # read KB back in from file to_read_kb = True - to_test_kb = False + to_test_kb = True # create training dataset create_wp_training = False # train the EL pipe train_pipe = True + measure_performance = False # test the EL pipe on a simple example to_test_pipeline = True + # write the NLP object, read back in and test again + test_nlp_io = True + # STEP 1 : create prior probabilities from WP # run only once ! if to_create_prior_probs: @@ -75,7 +83,7 @@ def run_pipeline(): # run only once ! if to_create_kb: print("STEP 3a: to_create_kb", datetime.datetime.now()) - my_kb = kb_creator.create_kb(nlp, + kb_1 = kb_creator.create_kb(nlp_1, max_entities_per_alias=MAX_CANDIDATES, min_occ=MIN_PAIR_OCC, entity_def_output=ENTITY_DEFS, @@ -83,63 +91,66 @@ def run_pipeline(): count_input=ENTITY_COUNTS, prior_prob_input=PRIOR_PROB, to_print=False) - print("kb entities:", my_kb.get_size_entities()) - print("kb aliases:", my_kb.get_size_aliases()) + print("kb entities:", kb_1.get_size_entities()) + print("kb aliases:", kb_1.get_size_aliases()) print() - print("STEP 3b: write KB", datetime.datetime.now()) - my_kb.dump(KB_FILE) - nlp.vocab.to_disk(VOCAB_DIR) + print("STEP 3b: write KB and NLP", datetime.datetime.now()) + kb_1.dump(KB_FILE) + nlp_1.to_disk(NLP_1_DIR) print() # STEP 4 : read KB back in from file if to_read_kb: print("STEP 4: to_read_kb", datetime.datetime.now()) - my_vocab = Vocab() - my_vocab.from_disk(VOCAB_DIR) - my_kb = KnowledgeBase(vocab=my_vocab, entity_vector_length=64) # TODO entity vectors - my_kb.load_bulk(KB_FILE) - print("kb entities:", my_kb.get_size_entities()) - print("kb aliases:", my_kb.get_size_aliases()) + # my_vocab = Vocab() + # my_vocab.from_disk(VOCAB_DIR) + # my_kb = KnowledgeBase(vocab=my_vocab, entity_vector_length=64) + nlp_2 = spacy.load(NLP_1_DIR) + kb_2 = KnowledgeBase(vocab=nlp_2.vocab, entity_vector_length=DESC_WIDTH) + kb_2.load_bulk(KB_FILE) + print("kb entities:", kb_2.get_size_entities()) + print("kb aliases:", kb_2.get_size_aliases()) print() # test KB if to_test_kb: - run_el.run_kb_toy_example(kb=my_kb) + run_el.run_kb_toy_example(kb=kb_2) print() # STEP 5: create a training dataset from WP if create_wp_training: print("STEP 5: create training dataset", datetime.datetime.now()) - training_set_creator.create_training(kb=my_kb, entity_def_input=ENTITY_DEFS, training_output=TRAINING_DIR) + training_set_creator.create_training(kb=kb_2, entity_def_input=ENTITY_DEFS, training_output=TRAINING_DIR) # STEP 6: create the entity linking pipe if train_pipe: print("STEP 6: training Entity Linking pipe", datetime.datetime.now()) - train_limit = 5000 - dev_limit = 1000 + train_limit = 10 + dev_limit = 5 print("Training on", train_limit, "articles") print("Dev testing on", dev_limit, "articles") print() - train_data = training_set_creator.read_training(nlp=nlp, + train_data = training_set_creator.read_training(nlp=nlp_2, training_dir=TRAINING_DIR, dev=False, limit=train_limit, to_print=False) - dev_data = training_set_creator.read_training(nlp=nlp, + dev_data = training_set_creator.read_training(nlp=nlp_2, training_dir=TRAINING_DIR, dev=True, limit=dev_limit, to_print=False) - el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": my_kb, "doc_cutoff": DOC_CHAR_CUTOFF}) - nlp.add_pipe(el_pipe, last=True) + el_pipe = nlp_2.create_pipe(name='entity_linker', config={"doc_cutoff": DOC_CHAR_CUTOFF}) + el_pipe.set_kb(kb_2) + nlp_2.add_pipe(el_pipe, last=True) - other_pipes = [pipe for pipe 
in nlp.pipe_names if pipe != "entity_linker"] - with nlp.disable_pipes(*other_pipes): # only train Entity Linking - nlp.begin_training() + other_pipes = [pipe for pipe in nlp_2.pipe_names if pipe != "entity_linker"] + with nlp_2.disable_pipes(*other_pipes): # only train Entity Linking + nlp_2.begin_training() for itn in range(EPOCHS): random.shuffle(train_data) @@ -147,11 +158,11 @@ def run_pipeline(): batches = minibatch(train_data, size=compounding(4.0, 128.0, 1.001)) batchnr = 0 - with nlp.disable_pipes(*other_pipes): + with nlp_2.disable_pipes(*other_pipes): for batch in batches: try: docs, golds = zip(*batch) - nlp.update( + nlp_2.update( docs, golds, drop=DROPOUT, @@ -164,40 +175,62 @@ def run_pipeline(): losses['entity_linker'] = losses['entity_linker'] / batchnr print("Epoch, train loss", itn, round(losses['entity_linker'], 2)) - print() - print("STEP 7: performance measurement of Entity Linking pipe", datetime.datetime.now()) - print() + if measure_performance: + print() + print("STEP 7: performance measurement of Entity Linking pipe", datetime.datetime.now()) + print() - # print(" measuring accuracy 1-1") - el_pipe.context_weight = 1 - el_pipe.prior_weight = 1 - dev_acc_1_1 = _measure_accuracy(dev_data, el_pipe) - train_acc_1_1 = _measure_accuracy(train_data, el_pipe) - print("train/dev acc combo:", round(train_acc_1_1, 2), round(dev_acc_1_1, 2)) + # print(" measuring accuracy 1-1") + el_pipe.context_weight = 1 + el_pipe.prior_weight = 1 + dev_acc_1_1 = _measure_accuracy(dev_data, el_pipe) + train_acc_1_1 = _measure_accuracy(train_data, el_pipe) + print("train/dev acc combo:", round(train_acc_1_1, 2), round(dev_acc_1_1, 2)) - # baseline using only prior probabilities - el_pipe.context_weight = 0 - el_pipe.prior_weight = 1 - dev_acc_0_1 = _measure_accuracy(dev_data, el_pipe) - train_acc_0_1 = _measure_accuracy(train_data, el_pipe) - print("train/dev acc prior:", round(train_acc_0_1, 2), round(dev_acc_0_1, 2)) + # baseline using only prior probabilities + el_pipe.context_weight = 0 + el_pipe.prior_weight = 1 + dev_acc_0_1 = _measure_accuracy(dev_data, el_pipe) + train_acc_0_1 = _measure_accuracy(train_data, el_pipe) + print("train/dev acc prior:", round(train_acc_0_1, 2), round(dev_acc_0_1, 2)) - # using only context - el_pipe.context_weight = 1 - el_pipe.prior_weight = 0 - dev_acc_1_0 = _measure_accuracy(dev_data, el_pipe) - train_acc_1_0 = _measure_accuracy(train_data, el_pipe) + # using only context + el_pipe.context_weight = 1 + el_pipe.prior_weight = 0 + dev_acc_1_0 = _measure_accuracy(dev_data, el_pipe) + train_acc_1_0 = _measure_accuracy(train_data, el_pipe) - print("train/dev acc context:", round(train_acc_1_0, 2), round(dev_acc_1_0, 2)) - print() + print("train/dev acc context:", round(train_acc_1_0, 2), round(dev_acc_1_0, 2)) + print() if to_test_pipeline: print() print("STEP 8: applying Entity Linking to toy example", datetime.datetime.now()) print() - run_el_toy_example(kb=my_kb, nlp=nlp) + run_el_toy_example(nlp=nlp_2) print() + if test_nlp_io: + print() + print("STEP 9: testing NLP IO", datetime.datetime.now()) + print() + print("writing to", NLP_2_DIR) + print(" vocab len nlp_2", len(nlp_2.vocab)) + print(" vocab len kb_2", len(kb_2.vocab)) + nlp_2.to_disk(NLP_2_DIR) + print() + print("reading from", NLP_2_DIR) + nlp_3 = spacy.load(NLP_2_DIR) + print(" vocab len nlp_3", len(nlp_3.vocab)) + + for pipe_name, pipe in nlp_3.pipeline: + if pipe_name == "entity_linker": + print(" vocab len kb_3", len(pipe.kb.vocab)) + + print() + print("running toy example with NLP 2") + 
run_el_toy_example(nlp=nlp_3) + print() print("STOP", datetime.datetime.now()) @@ -239,7 +272,7 @@ def _measure_accuracy(data, el_pipe): return acc -def run_el_toy_example(nlp, kb): +def run_el_toy_example(nlp): text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \ "Douglas reminds us to always bring our towel. " \ "The main character in Doug's novel is the man Arthur Dent, " \ @@ -261,4 +294,4 @@ def run_el_toy_example(nlp, kb): if __name__ == "__main__": - run_pipeline() \ No newline at end of file + run_pipeline() diff --git a/spacy/kb.pyx b/spacy/kb.pyx index ade2360be..9a84439ea 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -2,6 +2,8 @@ # cython: profile=True # coding: utf8 from collections import OrderedDict +from pathlib import Path, WindowsPath + from cpython.exc cimport PyErr_CheckSignals from spacy import util @@ -389,6 +391,8 @@ cdef class Writer: def __init__(self, object loc): if path.exists(loc): assert not path.isdir(loc), "%s is directory." % loc + if isinstance(loc, Path): + loc = bytes(loc) cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc self._fp = fopen(bytes_loc, 'wb') assert self._fp != NULL @@ -431,6 +435,8 @@ cdef class Reader: def __init__(self, object loc): assert path.exists(loc) assert not path.isdir(loc) + if isinstance(loc, Path): + loc = bytes(loc) cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc self._fp = fopen(bytes_loc, 'rb') if not self._fp: diff --git a/spacy/language.py b/spacy/language.py index ec3232bd5..0e5e29244 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -11,6 +11,7 @@ from copy import copy, deepcopy from thinc.neural import Model import srsly +from spacy.kb import KnowledgeBase from .tokenizer import Tokenizer from .vocab import Vocab from .lemmatizer import Lemmatizer @@ -809,6 +810,14 @@ class Language(object): # Convert to list here in case exclude is (default) tuple exclude = list(exclude) + ["vocab"] util.from_disk(path, deserializers, exclude) + + # download the KB for the entity linking component - requires the vocab + for pipe_name, pipe in self.pipeline: + if pipe_name == "entity_linker": + kb = KnowledgeBase(vocab=self.vocab, entity_vector_length=pipe.cfg["entity_width"]) + kb.load_bulk(path / pipe_name / "kb") + pipe.set_kb(kb) + self._path = path return self diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index f9043f0e4..e73ff6a0e 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -14,6 +14,7 @@ from thinc.misc import LayerNorm from thinc.neural.util import to_categorical from thinc.neural.util import get_array_module +from spacy.kb import KnowledgeBase from ..tokens.doc cimport Doc from ..syntax.nn_parser cimport Parser from ..syntax.ner cimport BiluoPushDown @@ -1077,7 +1078,7 @@ class EntityLinker(Pipe): hidden_width = cfg.get("hidden_width", 32) article_width = cfg.get("article_width", 128) sent_width = cfg.get("sent_width", 64) - entity_width = cfg["kb"].entity_vector_length + entity_width = cfg.get("entity_width") # no default because this needs to correspond with the KB article_encoder = build_nel_encoder(in_width=embed_width, hidden_width=hidden_width, end_width=article_width, **cfg) sent_encoder = build_nel_encoder(in_width=embed_width, hidden_width=hidden_width, end_width=sent_width, **cfg) @@ -1089,34 +1090,41 @@ class EntityLinker(Pipe): return article_encoder, sent_encoder, mention_encoder def __init__(self, **cfg): + self.article_encoder = True + self.sent_encoder = True self.mention_encoder 
= True + self.kb = None self.cfg = dict(cfg) - self.kb = self.cfg["kb"] - self.doc_cutoff = self.cfg["doc_cutoff"] - - def use_avg_params(self): - # Modify the pipe's encoders/models, to use their average parameter values. - # TODO: this doesn't work yet because there's no exit method - self.article_encoder.use_params(self.sgd_article.averages) - self.sent_encoder.use_params(self.sgd_sent.averages) - self.mention_encoder.use_params(self.sgd_mention.averages) + self.doc_cutoff = self.cfg.get("doc_cutoff", 150) + def set_kb(self, kb): + self.kb = kb def require_model(self): # Raise an error if the component's model is not initialized. if getattr(self, "mention_encoder", None) in (None, True, False): raise ValueError(Errors.E109.format(name=self.name)) + def require_kb(self): + # Raise an error if the knowledge base is not initialized. + if getattr(self, "kb", None) in (None, True, False): + # TODO: custom error + raise ValueError(Errors.E109.format(name=self.name)) + def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs): + self.require_kb() + self.cfg["entity_width"] = self.kb.entity_vector_length + if self.mention_encoder is True: self.article_encoder, self.sent_encoder, self.mention_encoder = self.Model(**self.cfg) - self.sgd_article = create_default_optimizer(self.article_encoder.ops) - self.sgd_sent = create_default_optimizer(self.sent_encoder.ops) - self.sgd_mention = create_default_optimizer(self.mention_encoder.ops) + self.sgd_article = create_default_optimizer(self.article_encoder.ops) + self.sgd_sent = create_default_optimizer(self.sent_encoder.ops) + self.sgd_mention = create_default_optimizer(self.mention_encoder.ops) return self.sgd_article def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None): self.require_model() + self.require_kb() if len(docs) != len(golds): raise ValueError(Errors.E077.format(value="EL training", n_docs=len(docs), @@ -1220,6 +1228,7 @@ class EntityLinker(Pipe): def predict(self, docs): self.require_model() + self.require_kb() if isinstance(docs, Doc): docs = [docs] @@ -1228,30 +1237,32 @@ class EntityLinker(Pipe): final_kb_ids = list() for i, article_doc in enumerate(docs): - doc_encoding = self.article_encoder([article_doc]) - for ent in article_doc.ents: - sent_doc = ent.sent.as_doc() - sent_encoding = self.sent_encoder([sent_doc]) - concat_encoding = [list(doc_encoding[0]) + list(sent_encoding[0])] - mention_encoding = self.mention_encoder(np.asarray([concat_encoding[0]])) - mention_enc_t = np.transpose(mention_encoding) + if len(article_doc) > 0: + doc_encoding = self.article_encoder([article_doc]) + for ent in article_doc.ents: + sent_doc = ent.sent.as_doc() + if len(sent_doc) > 0: + sent_encoding = self.sent_encoder([sent_doc]) + concat_encoding = [list(doc_encoding[0]) + list(sent_encoding[0])] + mention_encoding = self.mention_encoder(np.asarray([concat_encoding[0]])) + mention_enc_t = np.transpose(mention_encoding) - candidates = self.kb.get_candidates(ent.text) - if candidates: - scores = list() - for c in candidates: - prior_prob = c.prior_prob * self.prior_weight - kb_id = c.entity_ - entity_encoding = c.entity_vector - sim = cosine(np.asarray([entity_encoding]), mention_enc_t) * self.context_weight - score = prior_prob + sim - (prior_prob*sim) # put weights on the different factors ? 
- scores.append(score) + candidates = self.kb.get_candidates(ent.text) + if candidates: + scores = list() + for c in candidates: + prior_prob = c.prior_prob * self.prior_weight + kb_id = c.entity_ + entity_encoding = c.entity_vector + sim = cosine(np.asarray([entity_encoding]), mention_enc_t) * self.context_weight + score = prior_prob + sim - (prior_prob*sim) # put weights on the different factors ? + scores.append(score) - # TODO: thresholding - best_index = scores.index(max(scores)) - best_candidate = candidates[best_index] - final_entities.append(ent) - final_kb_ids.append(best_candidate.entity_) + # TODO: thresholding + best_index = scores.index(max(scores)) + best_candidate = candidates[best_index] + final_entities.append(ent) + final_kb_ids.append(best_candidate.entity_) return final_entities, final_kb_ids @@ -1260,6 +1271,80 @@ class EntityLinker(Pipe): for token in entity: token.ent_kb_id_ = kb_id + def to_bytes(self, exclude=tuple(), **kwargs): + """Serialize the pipe to a bytestring. + + exclude (list): String names of serialization fields to exclude. + RETURNS (bytes): The serialized object. + """ + serialize = OrderedDict() + serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) + serialize["kb"] = self.kb.to_bytes # TODO + if self.mention_encoder not in (True, False, None): + serialize["article_encoder"] = self.article_encoder.to_bytes + serialize["sent_encoder"] = self.sent_encoder.to_bytes + serialize["mention_encoder"] = self.mention_encoder.to_bytes + exclude = util.get_serialization_exclude(serialize, exclude, kwargs) + return util.to_bytes(serialize, exclude) + + def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): + """Load the pipe from a bytestring.""" + deserialize = OrderedDict() + deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b)) + deserialize["kb"] = lambda b: self.kb.from_bytes(b) # TODO + deserialize["article_encoder"] = lambda b: self.article_encoder.from_bytes(b) + deserialize["sent_encoder"] = lambda b: self.sent_encoder.from_bytes(b) + deserialize["mention_encoder"] = lambda b: self.mention_encoder.from_bytes(b) + exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) + util.from_bytes(bytes_data, deserialize, exclude) + return self + + def to_disk(self, path, exclude=tuple(), **kwargs): + """Serialize the pipe to disk.""" + serialize = OrderedDict() + serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) + serialize["kb"] = lambda p: self.kb.dump(p) + if self.mention_encoder not in (None, True, False): + serialize["article_encoder"] = lambda p: p.open("wb").write(self.article_encoder.to_bytes()) + serialize["sent_encoder"] = lambda p: p.open("wb").write(self.sent_encoder.to_bytes()) + serialize["mention_encoder"] = lambda p: p.open("wb").write(self.mention_encoder.to_bytes()) + exclude = util.get_serialization_exclude(serialize, exclude, kwargs) + util.to_disk(path, serialize, exclude) + + def from_disk(self, path, exclude=tuple(), **kwargs): + """Load the pipe from disk.""" + def load_article_encoder(p): + if self.article_encoder is True: + self.article_encoder, _, _ = self.Model(**self.cfg) + self.article_encoder.from_bytes(p.open("rb").read()) + + def load_sent_encoder(p): + if self.sent_encoder is True: + _, self.sent_encoder, _ = self.Model(**self.cfg) + self.sent_encoder.from_bytes(p.open("rb").read()) + + def load_mention_encoder(p): + if self.mention_encoder is True: + _, _, self.mention_encoder = self.Model(**self.cfg) + self.mention_encoder.from_bytes(p.open("rb").read()) + + deserialize = 
OrderedDict() + deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p)) + deserialize["article_encoder"] = load_article_encoder + deserialize["sent_encoder"] = load_sent_encoder + deserialize["mention_encoder"] = load_mention_encoder + exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) + util.from_disk(path, deserialize, exclude) + return self + + def rehearse(self, docs, sgd=None, losses=None, **config): + # TODO + pass + + def add_label(self, label): + pass + + class Sentencizer(object): """Segment the Doc into sentences using a rule-based strategy. From 0b04d142de01806e15a696fcc667c8563d438005 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 13 Jun 2019 22:32:56 +0200 Subject: [PATCH 076/148] regenerating KB --- .../wiki_entity_linking/kb_creator.py | 19 +++++++++---------- .../wiki_entity_linking/train_descriptions.py | 4 ++-- .../wiki_entity_linking/wiki_nel_pipeline.py | 8 ++++---- 3 files changed, 15 insertions(+), 16 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/kb_creator.py b/examples/pipeline/wiki_entity_linking/kb_creator.py index 785811ea6..7b740216b 100644 --- a/examples/pipeline/wiki_entity_linking/kb_creator.py +++ b/examples/pipeline/wiki_entity_linking/kb_creator.py @@ -14,6 +14,7 @@ from . import wikidata_processor as wd INPUT_DIM = 300 # dimension of pre-trained vectors DESC_WIDTH = 64 + def create_kb(nlp, max_entities_per_alias, min_occ, entity_def_output, entity_descr_output, count_input, prior_prob_input, to_print=False): @@ -25,8 +26,7 @@ def create_kb(nlp, max_entities_per_alias, min_occ, if read_raw_data: print() - print("1. _read_wikidata_entities", datetime.datetime.now()) - print() + print(" * _read_wikidata_entities", datetime.datetime.now()) title_to_id, id_to_descr = wd.read_wikidata_entities_json(limit=None) # write the title-ID and ID-description mappings to file @@ -40,8 +40,8 @@ def create_kb(nlp, max_entities_per_alias, min_occ, title_list = list(title_to_id.keys()) # TODO: remove this filter (just for quicker testing of code) - title_list = title_list[0:342] - title_to_id = {t: title_to_id[t] for t in title_list} + # title_list = title_list[0:342] + # title_to_id = {t: title_to_id[t] for t in title_list} entity_list = [title_to_id[x] for x in title_list] @@ -49,29 +49,28 @@ def create_kb(nlp, max_entities_per_alias, min_occ, description_list = [id_to_descr.get(x, "No description defined") for x in entity_list] print() - print("2. _get_entity_frequencies", datetime.datetime.now()) + print(" * _get_entity_frequencies", datetime.datetime.now()) print() entity_frequencies = wp.get_entity_frequencies(count_input=count_input, entities=title_list) print() - print("3. train entity encoder", datetime.datetime.now()) + print(" * train entity encoder", datetime.datetime.now()) print() encoder = EntityEncoder(nlp, INPUT_DIM, DESC_WIDTH) encoder.train(description_list=description_list, to_print=True) print() - print("4. get entity embeddings", datetime.datetime.now()) + print(" * get entity embeddings", datetime.datetime.now()) print() embeddings = encoder.apply_encoder(description_list) print() - print("5. adding", len(entity_list), "entities", datetime.datetime.now()) - print() + print(" * adding", len(entity_list), "entities", datetime.datetime.now()) kb.set_entities(entity_list=entity_list, prob_list=entity_frequencies, vector_list=embeddings) print() - print("6. 
adding aliases", datetime.datetime.now()) + print(" * adding aliases", datetime.datetime.now()) print() _add_aliases(kb, title_to_id=title_to_id, max_entities_per_alias=max_entities_per_alias, min_occ=min_occ, diff --git a/examples/pipeline/wiki_entity_linking/train_descriptions.py b/examples/pipeline/wiki_entity_linking/train_descriptions.py index e1a2f1797..92859fd84 100644 --- a/examples/pipeline/wiki_entity_linking/train_descriptions.py +++ b/examples/pipeline/wiki_entity_linking/train_descriptions.py @@ -17,7 +17,7 @@ class EntityEncoder: DROP = 0 EPOCHS = 5 - STOP_THRESHOLD = 0.1 + STOP_THRESHOLD = 0.04 BATCH_SIZE = 1000 @@ -127,7 +127,7 @@ class EntityEncoder: return loss, gradients def _test_encoder(self): - """ Test encoder on some dummy examples """ + # Test encoder on some dummy examples desc_A1 = "Fictional character in The Simpsons" desc_A2 = "Simpsons - fictional human" desc_A3 = "Fictional character in The Flintstones" diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 0c03784a1..d5002e26f 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -22,7 +22,7 @@ ENTITY_COUNTS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_freq.csv' ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv' ENTITY_DESCR = 'C:/Users/Sofie/Documents/data/wikipedia/entity_descriptions.csv' -KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb' +KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb_1/kb' NLP_1_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/nlp_1' NLP_2_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/nlp_2' @@ -56,14 +56,14 @@ def run_pipeline(): create_wp_training = False # train the EL pipe - train_pipe = True + train_pipe = False measure_performance = False # test the EL pipe on a simple example - to_test_pipeline = True + to_test_pipeline = False # write the NLP object, read back in and test again - test_nlp_io = True + test_nlp_io = False # STEP 1 : create prior probabilities from WP # run only once ! 
From b312f2d0e79b886d0d824f9294ccc2f1f24b725a Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 14 Jun 2019 15:55:26 +0200 Subject: [PATCH 077/148] redo training data to be independent of KB and entity-level instead of doc-level --- .../wiki_entity_linking/train_descriptions.py | 2 - .../training_set_creator.py | 219 +++++++++--------- .../wiki_entity_linking/wiki_nel_pipeline.py | 44 ++-- spacy/pipeline/pipes.pyx | 96 ++++---- 4 files changed, 179 insertions(+), 182 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/train_descriptions.py b/examples/pipeline/wiki_entity_linking/train_descriptions.py index 92859fd84..bf4bcbc3d 100644 --- a/examples/pipeline/wiki_entity_linking/train_descriptions.py +++ b/examples/pipeline/wiki_entity_linking/train_descriptions.py @@ -1,8 +1,6 @@ # coding: utf-8 from random import shuffle -from examples.pipeline.wiki_entity_linking import kb_creator - import numpy as np from spacy._ml import zero_init, create_default_optimizer diff --git a/examples/pipeline/wiki_entity_linking/training_set_creator.py b/examples/pipeline/wiki_entity_linking/training_set_creator.py index 38a86058d..fc620a1d3 100644 --- a/examples/pipeline/wiki_entity_linking/training_set_creator.py +++ b/examples/pipeline/wiki_entity_linking/training_set_creator.py @@ -19,17 +19,15 @@ Process Wikipedia interlinks to generate a training dataset for the EL algorithm ENTITY_FILE = "gold_entities.csv" -def create_training(kb, entity_def_input, training_output): - if not kb: - raise ValueError("kb should be defined") +def create_training(entity_def_input, training_output): wp_to_id = kb_creator._get_entity_to_id(entity_def_input) - _process_wikipedia_texts(kb, wp_to_id, training_output, limit=100000000) # TODO: full dataset + _process_wikipedia_texts(wp_to_id, training_output, limit=100000000) # TODO: full dataset 100000000 -def _process_wikipedia_texts(kb, wp_to_id, training_output, limit=None): +def _process_wikipedia_texts(wp_to_id, training_output, limit=None): """ Read the XML wikipedia data to parse out training data: - raw text data + positive and negative instances + raw text data + positive instances """ title_regex = re.compile(r'(?<=).*(?=)') @@ -43,8 +41,9 @@ def _process_wikipedia_texts(kb, wp_to_id, training_output, limit=None): _write_training_entity(outputfile=entityfile, article_id="article_id", alias="alias", - entity="entity", - correct="correct") + entity="WD_id", + start="start", + end="end") with bz2.open(wp.ENWIKI_DUMP, mode='rb') as file: line = file.readline() @@ -75,14 +74,11 @@ def _process_wikipedia_texts(kb, wp_to_id, training_output, limit=None): elif clean_line == "": if article_id: try: - _process_wp_text(kb, wp_to_id, entityfile, article_id, article_text.strip(), training_output) - # on a previous run, an error occurred after 46M lines and 2h + _process_wp_text(wp_to_id, entityfile, article_id, article_title, article_text.strip(), training_output) except Exception as e: print("Error processing article", article_id, article_title, e) else: - print("Done processing a page, but couldn't find an article_id ?") - print(article_title) - print(article_text) + print("Done processing a page, but couldn't find an article_id ?", article_title) article_text = "" article_title = None article_id = None @@ -122,7 +118,14 @@ def _process_wikipedia_texts(kb, wp_to_id, training_output, limit=None): text_regex = re.compile(r'(?<=).*(?=", entity) - candidates = kb.get_candidates(alias) + if open_read > 2: + reading_special_case = True - # as training data, we only store 
entities that are sufficiently ambiguous - if len(candidates) > 1: - _write_training_article(article_id=article_id, clean_text=clean_text, training_output=training_output) - # print("alias", alias) + if open_read == 2 and reading_text: + reading_text = False + reading_entity = True + reading_mention = False - # print all incorrect candidates - for c in candidates: - if entity != c.entity_: + # we just finished reading an entity + if open_read == 0 and not reading_text: + if '#' in entity_buffer or entity_buffer.startswith(':'): + reading_special_case = True + # Ignore cases with nested structures like File: handles etc + if not reading_special_case: + if not mention_buffer: + mention_buffer = entity_buffer + start = len(final_text) + end = start + len(mention_buffer) + qid = wp_to_id.get(entity_buffer, None) + if qid: _write_training_entity(outputfile=entityfile, article_id=article_id, - alias=alias, - entity=c.entity_, - correct="0") + alias=mention_buffer, + entity=qid, + start=start, + end=end) + found_entities = True + final_text += mention_buffer - # print the one correct candidate - _write_training_entity(outputfile=entityfile, - article_id=article_id, - alias=alias, - entity=entity, - correct="1") + entity_buffer = "" + mention_buffer = "" - # print("gold entity", entity) - # print() + reading_text = True + reading_entity = False + reading_mention = False + reading_special_case = False - # _run_ner_depr(nlp, clean_text, article_dict) - # print() + if found_entities: + _write_training_article(article_id=article_id, clean_text=final_text, training_output=training_output) info_regex = re.compile(r'{[^{]*?}') -interwiki_regex = re.compile(r'\[\[([^|]*?)]]') -interwiki_2_regex = re.compile(r'\[\[[^|]*?\|([^|]*?)]]') -htlm_regex = re.compile(r'<!--[^!]*-->') +htlm_regex = re.compile(r'<!--[^-]*-->') category_regex = re.compile(r'\[\[Category:[^\[]*]]') file_regex = re.compile(r'\[\[File:[^[\]]+]]') ref_regex = re.compile(r'<ref.*?>') # non-greedy @@ -215,12 +242,6 @@ def _get_clean_wp_text(article_text): try_again = False previous_length = len(clean_text) - # remove simple interwiki links (no alternative name) - clean_text = interwiki_regex.sub(r'\1', clean_text) - - # remove simple interwiki links by picking the alternative name - clean_text = interwiki_2_regex.sub(r'\1', clean_text) - # remove HTML comments clean_text = htlm_regex.sub('', clean_text) @@ -265,43 +286,34 @@ def _write_training_article(article_id, clean_text, training_output): outputfile.write(clean_text) -def _write_training_entity(outputfile, article_id, alias, entity, correct): - outputfile.write(article_id + "|" + alias + "|" + entity + "|" + correct + "\n") +def _write_training_entity(outputfile, article_id, alias, entity, start, end): + outputfile.write(article_id + "|" + alias + "|" + entity + "|" + str(start) + "|" + str(end) + "\n") -def read_training_entities(training_output, collect_correct=True, collect_incorrect=False): +def read_training_entities(training_output): entityfile_loc = training_output + "/" + ENTITY_FILE - incorrect_entries_per_article = dict() - correct_entries_per_article = dict() + entries_per_article = dict() + with open(entityfile_loc, mode='r', encoding='utf8') as file: for line in file: fields = line.replace('\n', "").split(sep='|') article_id = fields[0] alias = fields[1] - entity = fields[2] - correct = fields[3] + wp_title = fields[2] + start = fields[3] + end = fields[4] - if correct == "1" and collect_correct: - entry_dict = correct_entries_per_article.get(article_id, dict()) - if 
alias in entry_dict: - raise ValueError("Found alias", alias, "multiple times for article", article_id, "in", ENTITY_FILE) - entry_dict[alias] = entity - correct_entries_per_article[article_id] = entry_dict + entries_by_offset = entries_per_article.get(article_id, dict()) + entries_by_offset[start + "-" + end] = (alias, wp_title) - if correct == "0" and collect_incorrect: - entry_dict = incorrect_entries_per_article.get(article_id, dict()) - entities = entry_dict.get(alias, set()) - entities.add(entity) - entry_dict[alias] = entities - incorrect_entries_per_article[article_id] = entry_dict + entries_per_article[article_id] = entries_by_offset - return correct_entries_per_article, incorrect_entries_per_article + return entries_per_article def read_training(nlp, training_dir, dev, limit, to_print): - correct_entries, incorrect_entries = read_training_entities(training_output=training_dir, - collect_correct=True, - collect_incorrect=True) + # This method will provide training examples that correspond to the entity annotations found by the nlp object + entries_per_article = read_training_entities(training_output=training_dir) data = [] @@ -320,36 +332,33 @@ def read_training(nlp, training_dir, dev, limit, to_print): text = file.read() article_doc = nlp(text) + entries_by_offset = entries_per_article.get(article_id, dict()) + gold_entities = list() + for ent in article_doc.ents: + start = ent.start_char + end = ent.end_char - # process all positive and negative entities, collect all relevant mentions in this article - for mention, entity_pos in correct_entries[article_id].items(): - # find all matches in the doc for the mentions - # TODO: fix this - doesn't look like all entities are found - matcher = PhraseMatcher(nlp.vocab) - patterns = list(nlp.tokenizer.pipe([mention])) + entity_tuple = entries_by_offset.get(str(start) + "-" + str(end), None) + if entity_tuple: + alias, wp_title = entity_tuple + if ent.text != alias: + print("Non-matching entity in", article_id, start, end) + else: + gold_entities.append((start, end, wp_title)) - matcher.add("TerminologyList", None, *patterns) - matches = matcher(article_doc) - - # store gold entities - for match_id, start, end in matches: - gold_entities.append((start, end, entity_pos)) - - gold = GoldParse(doc=article_doc, links=gold_entities) - data.append((article_doc, gold)) + if gold_entities: + gold = GoldParse(doc=article_doc, links=gold_entities) + data.append((article_doc, gold)) cnt += 1 except Exception as e: print("Problem parsing article", article_id) print(e) + raise e if to_print: print() print("Processed", cnt, "training articles, dev=" + str(dev)) print() return data - - - - diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index d5002e26f..faea93f53 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -30,8 +30,8 @@ TRAINING_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_data_nel/' MAX_CANDIDATES = 10 MIN_PAIR_OCC = 5 -DOC_CHAR_CUTOFF = 300 -EPOCHS = 2 +DOC_SENT_CUTOFF = 2 +EPOCHS = 10 DROPOUT = 0.1 @@ -46,14 +46,14 @@ def run_pipeline(): # one-time methods to create KB and write to file to_create_prior_probs = False to_create_entity_counts = False - to_create_kb = True + to_create_kb = False # TODO: entity_defs should also contain entities not in the KB # read KB back in from file - to_read_kb = True - to_test_kb = True + to_read_kb = False + to_test_kb = False 
# create training dataset - create_wp_training = False + create_wp_training = True # train the EL pipe train_pipe = False @@ -103,9 +103,6 @@ def run_pipeline(): # STEP 4 : read KB back in from file if to_read_kb: print("STEP 4: to_read_kb", datetime.datetime.now()) - # my_vocab = Vocab() - # my_vocab.from_disk(VOCAB_DIR) - # my_kb = KnowledgeBase(vocab=my_vocab, entity_vector_length=64) nlp_2 = spacy.load(NLP_1_DIR) kb_2 = KnowledgeBase(vocab=nlp_2.vocab, entity_vector_length=DESC_WIDTH) kb_2.load_bulk(KB_FILE) @@ -121,13 +118,13 @@ def run_pipeline(): # STEP 5: create a training dataset from WP if create_wp_training: print("STEP 5: create training dataset", datetime.datetime.now()) - training_set_creator.create_training(kb=kb_2, entity_def_input=ENTITY_DEFS, training_output=TRAINING_DIR) + training_set_creator.create_training(entity_def_input=ENTITY_DEFS, training_output=TRAINING_DIR) # STEP 6: create the entity linking pipe if train_pipe: print("STEP 6: training Entity Linking pipe", datetime.datetime.now()) - train_limit = 10 - dev_limit = 5 + train_limit = 50 + dev_limit = 10 print("Training on", train_limit, "articles") print("Dev testing on", dev_limit, "articles") print() @@ -144,7 +141,7 @@ def run_pipeline(): limit=dev_limit, to_print=False) - el_pipe = nlp_2.create_pipe(name='entity_linker', config={"doc_cutoff": DOC_CHAR_CUTOFF}) + el_pipe = nlp_2.create_pipe(name='entity_linker', config={"doc_cutoff": DOC_SENT_CUTOFF}) el_pipe.set_kb(kb_2) nlp_2.add_pipe(el_pipe, last=True) @@ -199,10 +196,14 @@ def run_pipeline(): el_pipe.prior_weight = 0 dev_acc_1_0 = _measure_accuracy(dev_data, el_pipe) train_acc_1_0 = _measure_accuracy(train_data, el_pipe) - print("train/dev acc context:", round(train_acc_1_0, 2), round(dev_acc_1_0, 2)) print() + # reset for follow-up tests + el_pipe.context_weight = 1 + el_pipe.prior_weight = 1 + + if to_test_pipeline: print() print("STEP 8: applying Entity Linking to toy example", datetime.datetime.now()) @@ -215,17 +216,10 @@ def run_pipeline(): print("STEP 9: testing NLP IO", datetime.datetime.now()) print() print("writing to", NLP_2_DIR) - print(" vocab len nlp_2", len(nlp_2.vocab)) - print(" vocab len kb_2", len(kb_2.vocab)) nlp_2.to_disk(NLP_2_DIR) print() print("reading from", NLP_2_DIR) nlp_3 = spacy.load(NLP_2_DIR) - print(" vocab len nlp_3", len(nlp_3.vocab)) - - for pipe_name, pipe in nlp_3.pipeline: - if pipe_name == "entity_linker": - print(" vocab len kb_3", len(pipe.kb.vocab)) print() print("running toy example with NLP 2") @@ -253,9 +247,10 @@ def _measure_accuracy(data, el_pipe): for ent in doc.ents: if ent.label_ == "PERSON": # TODO: expand to other types pred_entity = ent.kb_id_ - start = ent.start - end = ent.end + start = ent.start_char + end = ent.end_char gold_entity = correct_entries_per_article.get(str(start) + "-" + str(end), None) + # the gold annotations are not complete so we can't evaluate missing annotations as 'wrong' if gold_entity is not None: if gold_entity == pred_entity: correct += 1 @@ -285,7 +280,8 @@ def run_el_toy_example(nlp): print() # Q4426480 is her husband, Q3568763 her tutor - text = "Ada Lovelace loved her husband William King dearly. " \ + text = "Ada Lovelace was the countess of Lovelace. She is known for her programming work on the analytical engine."\ + "Ada Lovelace loved her husband William King dearly. " \ "Ada Lovelace was tutored by her favorite physics tutor William King." 
doc = nlp(text) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index e73ff6a0e..5d82da7ee 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1074,6 +1074,9 @@ class EntityLinker(Pipe): @classmethod def Model(cls, **cfg): + if "entity_width" not in cfg: + raise ValueError("entity_width not found") + embed_width = cfg.get("embed_width", 300) hidden_width = cfg.get("hidden_width", 32) article_width = cfg.get("article_width", 128) @@ -1095,7 +1098,10 @@ class EntityLinker(Pipe): self.mention_encoder = True self.kb = None self.cfg = dict(cfg) - self.doc_cutoff = self.cfg.get("doc_cutoff", 150) + self.doc_cutoff = self.cfg.get("doc_cutoff", 5) + self.sgd_article = None + self.sgd_sent = None + self.sgd_mention = None def set_kb(self, kb): self.kb = kb @@ -1126,6 +1132,12 @@ class EntityLinker(Pipe): self.require_model() self.require_kb() + if losses is not None: + losses.setdefault(self.name, 0.0) + + if not docs or not golds: + return 0 + if len(docs) != len(golds): raise ValueError(Errors.E077.format(value="EL training", n_docs=len(docs), n_golds=len(golds))) @@ -1141,21 +1153,30 @@ class EntityLinker(Pipe): for doc, gold in zip(docs, golds): for entity in gold.links: start, end, gold_kb = entity - mention = doc[start:end] - sentence = mention.sent - first_par = doc[0:self.doc_cutoff].as_doc() + mention = doc.text[start:end] + sent_start = 0 + sent_end = len(doc) + first_par_end = len(doc) + for index, sent in enumerate(doc.sents): + if start >= sent.start_char and end <= sent.end_char: + sent_start = sent.start + sent_end = sent.end + if index == self.doc_cutoff-1: + first_par_end = sent.end + sentence = doc[sent_start:sent_end].as_doc() + first_par = doc[0:first_par_end].as_doc() - candidates = self.kb.get_candidates(mention.text) + candidates = self.kb.get_candidates(mention) for c in candidates: kb_id = c.entity_ - # TODO: currently only training on the positive instances + # Currently only training on the positive instances if kb_id == gold_kb: prior_prob = c.prior_prob entity_encoding = c.entity_vector entity_encodings.append(entity_encoding) article_docs.append(first_par) - sentence_docs.append(sentence.as_doc()) + sentence_docs.append(sentence) if len(entity_encodings) > 0: doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=drop) @@ -1168,11 +1189,6 @@ class EntityLinker(Pipe): entity_encodings = np.asarray(entity_encodings, dtype=np.float32) loss, d_scores = self.get_loss(scores=mention_encodings, golds=entity_encodings, docs=None) - # print("scores", mention_encodings) - # print("golds", entity_encodings) - # print("loss", loss) - # print("d_scores", d_scores) - mention_gradient = bp_mention(d_scores, sgd=self.sgd_mention) # gradient : concat (doc+sent) vs. 
desc @@ -1187,7 +1203,6 @@ class EntityLinker(Pipe): bp_sent(sent_gradients, sgd=self.sgd_sent) if losses is not None: - losses.setdefault(self.name, 0.0) losses[self.name] += loss return loss @@ -1230,16 +1245,25 @@ class EntityLinker(Pipe): self.require_model() self.require_kb() - if isinstance(docs, Doc): - docs = [docs] - final_entities = list() final_kb_ids = list() - for i, article_doc in enumerate(docs): - if len(article_doc) > 0: - doc_encoding = self.article_encoder([article_doc]) - for ent in article_doc.ents: + if not docs: + return final_entities, final_kb_ids + + if isinstance(docs, Doc): + docs = [docs] + + for i, doc in enumerate(docs): + if len(doc) > 0: + first_par_end = len(doc) + for index, sent in enumerate(doc.sents): + if index == self.doc_cutoff-1: + first_par_end = sent.end + first_par = doc[0:first_par_end].as_doc() + + doc_encoding = self.article_encoder([first_par]) + for ent in doc.ents: sent_doc = ent.sent.as_doc() if len(sent_doc) > 0: sent_encoding = self.sent_encoder([sent_doc]) @@ -1254,7 +1278,7 @@ class EntityLinker(Pipe): prior_prob = c.prior_prob * self.prior_weight kb_id = c.entity_ entity_encoding = c.entity_vector - sim = cosine(np.asarray([entity_encoding]), mention_enc_t) * self.context_weight + sim = float(cosine(np.asarray([entity_encoding]), mention_enc_t)) * self.context_weight score = prior_prob + sim - (prior_prob*sim) # put weights on the different factors ? scores.append(score) @@ -1271,36 +1295,7 @@ class EntityLinker(Pipe): for token in entity: token.ent_kb_id_ = kb_id - def to_bytes(self, exclude=tuple(), **kwargs): - """Serialize the pipe to a bytestring. - - exclude (list): String names of serialization fields to exclude. - RETURNS (bytes): The serialized object. - """ - serialize = OrderedDict() - serialize["cfg"] = lambda: srsly.json_dumps(self.cfg) - serialize["kb"] = self.kb.to_bytes # TODO - if self.mention_encoder not in (True, False, None): - serialize["article_encoder"] = self.article_encoder.to_bytes - serialize["sent_encoder"] = self.sent_encoder.to_bytes - serialize["mention_encoder"] = self.mention_encoder.to_bytes - exclude = util.get_serialization_exclude(serialize, exclude, kwargs) - return util.to_bytes(serialize, exclude) - - def from_bytes(self, bytes_data, exclude=tuple(), **kwargs): - """Load the pipe from a bytestring.""" - deserialize = OrderedDict() - deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b)) - deserialize["kb"] = lambda b: self.kb.from_bytes(b) # TODO - deserialize["article_encoder"] = lambda b: self.article_encoder.from_bytes(b) - deserialize["sent_encoder"] = lambda b: self.sent_encoder.from_bytes(b) - deserialize["mention_encoder"] = lambda b: self.mention_encoder.from_bytes(b) - exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) - util.from_bytes(bytes_data, deserialize, exclude) - return self - def to_disk(self, path, exclude=tuple(), **kwargs): - """Serialize the pipe to disk.""" serialize = OrderedDict() serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) serialize["kb"] = lambda p: self.kb.dump(p) @@ -1312,7 +1307,6 @@ class EntityLinker(Pipe): util.to_disk(path, serialize, exclude) def from_disk(self, path, exclude=tuple(), **kwargs): - """Load the pipe from disk.""" def load_article_encoder(p): if self.article_encoder is True: self.article_encoder, _, _ = self.Model(**self.cfg) From 81731907ba0c3589c28367c5ec08f8a8f3eaeeae Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 14 Jun 2019 19:55:46 +0200 Subject: [PATCH 078/148] performance per entity 
type --- .../wiki_entity_linking/kb_creator.py | 37 +++--- .../training_set_creator.py | 2 +- .../wiki_entity_linking/wiki_nel_pipeline.py | 122 +++++++++++------- .../wiki_entity_linking/wikidata_processor.py | 28 ++-- .../wikipedia_processor.py | 5 +- 5 files changed, 114 insertions(+), 80 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/kb_creator.py b/examples/pipeline/wiki_entity_linking/kb_creator.py index 7b740216b..4d7bd646b 100644 --- a/examples/pipeline/wiki_entity_linking/kb_creator.py +++ b/examples/pipeline/wiki_entity_linking/kb_creator.py @@ -15,10 +15,10 @@ INPUT_DIM = 300 # dimension of pre-trained vectors DESC_WIDTH = 64 -def create_kb(nlp, max_entities_per_alias, min_occ, +def create_kb(nlp, max_entities_per_alias, min_entity_freq, min_occ, entity_def_output, entity_descr_output, count_input, prior_prob_input, to_print=False): - """ Create the knowledge base from Wikidata entries """ + # Create the knowledge base from Wikidata entries kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=DESC_WIDTH) # disable this part of the pipeline when rerunning the KB generation from preprocessed files @@ -37,21 +37,26 @@ def create_kb(nlp, max_entities_per_alias, min_occ, title_to_id = _get_entity_to_id(entity_def_output) id_to_descr = _get_id_to_description(entity_descr_output) - title_list = list(title_to_id.keys()) - - # TODO: remove this filter (just for quicker testing of code) - # title_list = title_list[0:342] - # title_to_id = {t: title_to_id[t] for t in title_list} - - entity_list = [title_to_id[x] for x in title_list] - - # Currently keeping entities from the KB where there is no description - putting a default void description - description_list = [id_to_descr.get(x, "No description defined") for x in entity_list] - print() print(" * _get_entity_frequencies", datetime.datetime.now()) print() - entity_frequencies = wp.get_entity_frequencies(count_input=count_input, entities=title_list) + entity_frequencies = wp.get_all_frequencies(count_input=count_input) + + # filter the entities for in the KB by frequency, because there's just too much data otherwise + filtered_title_to_id = dict() + entity_list = list() + description_list = list() + frequency_list = list() + for title, entity in title_to_id.items(): + freq = entity_frequencies.get(title, 0) + desc = id_to_descr.get(entity, None) + if desc and freq > min_entity_freq: + entity_list.append(entity) + description_list.append(desc) + frequency_list.append(freq) + filtered_title_to_id[title] = entity + + print("Kept", len(filtered_title_to_id.keys()), "out of", len(title_to_id.keys()), "titles") print() print(" * train entity encoder", datetime.datetime.now()) @@ -67,12 +72,12 @@ def create_kb(nlp, max_entities_per_alias, min_occ, print() print(" * adding", len(entity_list), "entities", datetime.datetime.now()) - kb.set_entities(entity_list=entity_list, prob_list=entity_frequencies, vector_list=embeddings) + kb.set_entities(entity_list=entity_list, prob_list=frequency_list, vector_list=embeddings) print() print(" * adding aliases", datetime.datetime.now()) print() - _add_aliases(kb, title_to_id=title_to_id, + _add_aliases(kb, title_to_id=filtered_title_to_id, max_entities_per_alias=max_entities_per_alias, min_occ=min_occ, prior_prob_input=prior_prob_input) diff --git a/examples/pipeline/wiki_entity_linking/training_set_creator.py b/examples/pipeline/wiki_entity_linking/training_set_creator.py index fc620a1d3..845ce62dc 100644 --- a/examples/pipeline/wiki_entity_linking/training_set_creator.py +++ 
b/examples/pipeline/wiki_entity_linking/training_set_creator.py @@ -21,7 +21,7 @@ ENTITY_FILE = "gold_entities.csv" def create_training(entity_def_input, training_output): wp_to_id = kb_creator._get_entity_to_id(entity_def_input) - _process_wikipedia_texts(wp_to_id, training_output, limit=100000000) # TODO: full dataset 100000000 + _process_wikipedia_texts(wp_to_id, training_output, limit=100000000) def _process_wikipedia_texts(wp_to_id, training_output, limit=None): diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index faea93f53..1e5280f89 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -29,6 +29,7 @@ NLP_2_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/nlp_2' TRAINING_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_data_nel/' MAX_CANDIDATES = 10 +MIN_ENTITY_FREQ = 200 MIN_PAIR_OCC = 5 DOC_SENT_CUTOFF = 2 EPOCHS = 10 @@ -46,14 +47,14 @@ def run_pipeline(): # one-time methods to create KB and write to file to_create_prior_probs = False to_create_entity_counts = False - to_create_kb = False # TODO: entity_defs should also contain entities not in the KB + to_create_kb = True # read KB back in from file to_read_kb = False to_test_kb = False # create training dataset - create_wp_training = True + create_wp_training = False # train the EL pipe train_pipe = False @@ -84,13 +85,14 @@ def run_pipeline(): if to_create_kb: print("STEP 3a: to_create_kb", datetime.datetime.now()) kb_1 = kb_creator.create_kb(nlp_1, - max_entities_per_alias=MAX_CANDIDATES, - min_occ=MIN_PAIR_OCC, - entity_def_output=ENTITY_DEFS, - entity_descr_output=ENTITY_DESCR, - count_input=ENTITY_COUNTS, - prior_prob_input=PRIOR_PROB, - to_print=False) + max_entities_per_alias=MAX_CANDIDATES, + min_entity_freq=MIN_ENTITY_FREQ, + min_occ=MIN_PAIR_OCC, + entity_def_output=ENTITY_DEFS, + entity_descr_output=ENTITY_DESCR, + count_input=ENTITY_COUNTS, + prior_prob_input=PRIOR_PROB, + to_print=False) print("kb entities:", kb_1.get_size_entities()) print("kb aliases:", kb_1.get_size_aliases()) print() @@ -112,7 +114,7 @@ def run_pipeline(): # test KB if to_test_kb: - run_el.run_kb_toy_example(kb=kb_2) + test_kb(kb_2) print() # STEP 5: create a training dataset from WP @@ -121,10 +123,18 @@ def run_pipeline(): training_set_creator.create_training(entity_def_input=ENTITY_DEFS, training_output=TRAINING_DIR) # STEP 6: create the entity linking pipe + el_pipe = nlp_2.create_pipe(name='entity_linker', config={"doc_cutoff": DOC_SENT_CUTOFF}) + el_pipe.set_kb(kb_2) + nlp_2.add_pipe(el_pipe, last=True) + + other_pipes = [pipe for pipe in nlp_2.pipe_names if pipe != "entity_linker"] + with nlp_2.disable_pipes(*other_pipes): # only train Entity Linking + nlp_2.begin_training() + if train_pipe: print("STEP 6: training Entity Linking pipe", datetime.datetime.now()) - train_limit = 50 - dev_limit = 10 + train_limit = 10 + dev_limit = 2 print("Training on", train_limit, "articles") print("Dev testing on", dev_limit, "articles") print() @@ -141,14 +151,6 @@ def run_pipeline(): limit=dev_limit, to_print=False) - el_pipe = nlp_2.create_pipe(name='entity_linker', config={"doc_cutoff": DOC_SENT_CUTOFF}) - el_pipe.set_kb(kb_2) - nlp_2.add_pipe(el_pipe, last=True) - - other_pipes = [pipe for pipe in nlp_2.pipe_names if pipe != "entity_linker"] - with nlp_2.disable_pipes(*other_pipes): # only train Entity Linking - nlp_2.begin_training() - for itn in range(EPOCHS): 
random.shuffle(train_data) losses = {} @@ -180,30 +182,32 @@ def run_pipeline(): # print(" measuring accuracy 1-1") el_pipe.context_weight = 1 el_pipe.prior_weight = 1 - dev_acc_1_1 = _measure_accuracy(dev_data, el_pipe) - train_acc_1_1 = _measure_accuracy(train_data, el_pipe) - print("train/dev acc combo:", round(train_acc_1_1, 2), round(dev_acc_1_1, 2)) + dev_acc_1_1, dev_acc_1_1_dict = _measure_accuracy(dev_data, el_pipe) + print("dev acc combo:", round(dev_acc_1_1, 3), [(x, round(y, 3)) for x, y in dev_acc_1_1_dict.items()]) + train_acc_1_1, train_acc_1_1_dict = _measure_accuracy(train_data, el_pipe) + print("train acc combo:", round(train_acc_1_1, 3), [(x, round(y, 3)) for x, y in train_acc_1_1_dict.items()]) # baseline using only prior probabilities el_pipe.context_weight = 0 el_pipe.prior_weight = 1 - dev_acc_0_1 = _measure_accuracy(dev_data, el_pipe) - train_acc_0_1 = _measure_accuracy(train_data, el_pipe) - print("train/dev acc prior:", round(train_acc_0_1, 2), round(dev_acc_0_1, 2)) + dev_acc_0_1, dev_acc_0_1_dict = _measure_accuracy(dev_data, el_pipe) + print("dev acc prior:", round(dev_acc_0_1, 3), [(x, round(y, 3)) for x, y in dev_acc_0_1_dict.items()]) + train_acc_0_1, train_acc_0_1_dict = _measure_accuracy(train_data, el_pipe) + print("train acc prior:", round(train_acc_0_1, 3), [(x, round(y, 3)) for x, y in train_acc_0_1_dict.items()]) # using only context el_pipe.context_weight = 1 el_pipe.prior_weight = 0 - dev_acc_1_0 = _measure_accuracy(dev_data, el_pipe) - train_acc_1_0 = _measure_accuracy(train_data, el_pipe) - print("train/dev acc context:", round(train_acc_1_0, 2), round(dev_acc_1_0, 2)) + dev_acc_1_0, dev_acc_1_0_dict = _measure_accuracy(dev_data, el_pipe) + print("dev acc context:", round(dev_acc_1_0, 3), [(x, round(y, 3)) for x, y in dev_acc_1_0_dict.items()]) + train_acc_1_0, train_acc_1_0_dict = _measure_accuracy(train_data, el_pipe) + print("train acc context:", round(train_acc_1_0, 3), [(x, round(y, 3)) for x, y in train_acc_1_0_dict.items()]) print() # reset for follow-up tests el_pipe.context_weight = 1 el_pipe.prior_weight = 1 - if to_test_pipeline: print() print("STEP 8: applying Entity Linking to toy example", datetime.datetime.now()) @@ -230,8 +234,8 @@ def run_pipeline(): def _measure_accuracy(data, el_pipe): - correct = 0 - incorrect = 0 + correct_by_label = dict() + incorrect_by_label = dict() docs = [d for d, g in data if len(d) > 0] docs = el_pipe.pipe(docs) @@ -245,31 +249,53 @@ def _measure_accuracy(data, el_pipe): correct_entries_per_article[str(start) + "-" + str(end)] = gold_kb for ent in doc.ents: - if ent.label_ == "PERSON": # TODO: expand to other types - pred_entity = ent.kb_id_ - start = ent.start_char - end = ent.end_char - gold_entity = correct_entries_per_article.get(str(start) + "-" + str(end), None) - # the gold annotations are not complete so we can't evaluate missing annotations as 'wrong' - if gold_entity is not None: - if gold_entity == pred_entity: - correct += 1 - else: - incorrect += 1 + ent_label = ent.label_ + pred_entity = ent.kb_id_ + start = ent.start_char + end = ent.end_char + gold_entity = correct_entries_per_article.get(str(start) + "-" + str(end), None) + # the gold annotations are not complete so we can't evaluate missing annotations as 'wrong' + if gold_entity is not None: + if gold_entity == pred_entity: + correct = correct_by_label.get(ent_label, 0) + correct_by_label[ent_label] = correct + 1 + else: + incorrect = incorrect_by_label.get(ent_label, 0) + incorrect_by_label[ent_label] = incorrect + 1 except 
Exception as e: print("Error assessing accuracy", e) - if correct == incorrect == 0: - return 0 + acc_by_label = dict() + total_correct = 0 + total_incorrect = 0 + for label, correct in correct_by_label.items(): + incorrect = incorrect_by_label.get(label, 0) + total_correct += correct + total_incorrect += incorrect + if correct == incorrect == 0: + acc_by_label[label] = 0 + else: + acc_by_label[label] = correct / (correct + incorrect) + acc = 0 + if not (total_correct == total_incorrect == 0): + acc = total_correct / (total_correct + total_incorrect) + return acc, acc_by_label - acc = correct / (correct + incorrect) - return acc + +def test_kb(kb): + for mention in ("Bush", "Douglas Adams", "Homer", "Brazil", "China"): + candidates = kb.get_candidates(mention) + + print("generating candidates for " + mention + " :") + for c in candidates: + print(" ", c.prior_prob, c.alias_, "-->", c.entity_ + " (freq=" + str(c.entity_freq) + ")") + print() def run_el_toy_example(nlp): text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \ - "Douglas reminds us to always bring our towel. " \ + "Douglas reminds us to always bring our towel, even in China or Brazil. " \ "The main character in Doug's novel is the man Arthur Dent, " \ "but Douglas doesn't write about George Washington or Homer Simpson." doc = nlp(text) diff --git a/examples/pipeline/wiki_entity_linking/wikidata_processor.py b/examples/pipeline/wiki_entity_linking/wikidata_processor.py index 7d84b1a2a..f6a6cbe23 100644 --- a/examples/pipeline/wiki_entity_linking/wikidata_processor.py +++ b/examples/pipeline/wiki_entity_linking/wikidata_processor.py @@ -1,7 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import re import bz2 import json import datetime @@ -14,7 +13,7 @@ def read_wikidata_entities_json(limit=None, to_print=False): """ Read the JSON wiki data and parse out the entities. Takes about 7u30 to parse 55M lines. 
""" lang = 'en' - prop_filter = {'P31': {'Q5', 'Q15632617'}} # currently defined as OR: one property suffices to be selected + # prop_filter = {'P31': {'Q5', 'Q15632617'}} # currently defined as OR: one property suffices to be selected site_filter = 'enwiki' title_to_id = dict() @@ -41,18 +40,19 @@ def read_wikidata_entities_json(limit=None, to_print=False): entry_type = obj["type"] if entry_type == "item": - # filtering records on their properties - keep = False + # filtering records on their properties (currently disabled to get ALL data) + # keep = False + keep = True claims = obj["claims"] - for prop, value_set in prop_filter.items(): - claim_property = claims.get(prop, None) - if claim_property: - for cp in claim_property: - cp_id = cp['mainsnak'].get('datavalue', {}).get('value', {}).get('id') - cp_rank = cp['rank'] - if cp_rank != "deprecated" and cp_id in value_set: - keep = True + # for prop, value_set in prop_filter.items(): + # claim_property = claims.get(prop, None) + # if claim_property: + # for cp in claim_property: + # cp_id = cp['mainsnak'].get('datavalue', {}).get('value', {}).get('id') + # cp_rank = cp['rank'] + # if cp_rank != "deprecated" and cp_id in value_set: + # keep = True if keep: unique_id = obj["id"] @@ -70,6 +70,7 @@ def read_wikidata_entities_json(limit=None, to_print=False): if to_print: print("prop:", prop, cp_values) + found_link = False if parse_sitelinks: site_value = obj["sitelinks"].get(site_filter, None) if site_value: @@ -77,6 +78,7 @@ def read_wikidata_entities_json(limit=None, to_print=False): if to_print: print(site_filter, ":", site) title_to_id[site] = unique_id + found_link = True if parse_labels: labels = obj["labels"] @@ -86,7 +88,7 @@ def read_wikidata_entities_json(limit=None, to_print=False): if to_print: print("label (" + lang + "):", lang_label["value"]) - if parse_descriptions: + if found_link and parse_descriptions: descriptions = obj["descriptions"] if descriptions: lang_descr = descriptions.get(lang, None) diff --git a/examples/pipeline/wiki_entity_linking/wikipedia_processor.py b/examples/pipeline/wiki_entity_linking/wikipedia_processor.py index 0461cb19f..e53423487 100644 --- a/examples/pipeline/wiki_entity_linking/wikipedia_processor.py +++ b/examples/pipeline/wiki_entity_linking/wikipedia_processor.py @@ -175,7 +175,7 @@ def write_entity_counts(prior_prob_input, count_output, to_print=False): print("Total count:", total_count) -def get_entity_frequencies(count_input, entities): +def get_all_frequencies(count_input): entity_to_count = dict() with open(count_input, 'r', encoding='utf8') as csvfile: csvreader = csv.reader(csvfile, delimiter='|') @@ -184,4 +184,5 @@ def get_entity_frequencies(count_input, entities): for row in csvreader: entity_to_count[row[0]] = int(row[1]) - return [entity_to_count.get(e, 0) for e in entities] + return entity_to_count + From 24db1392b9fad37fc532bf53d7f152611f319e70 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Sun, 16 Jun 2019 21:14:45 +0200 Subject: [PATCH 079/148] reprocessing all of wikipedia for training data --- .../wiki_entity_linking/kb_creator.py | 2 +- .../pipeline/wiki_entity_linking/run_el.py | 4 +- .../pipeline/wiki_entity_linking/train_el.py | 4 +- .../training_set_creator.py | 108 +++++++++--------- .../wiki_entity_linking/wiki_nel_pipeline.py | 78 +++++++------ .../wiki_entity_linking/wikidata_processor.py | 2 +- 6 files changed, 98 insertions(+), 100 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/kb_creator.py b/examples/pipeline/wiki_entity_linking/kb_creator.py 
index 4d7bd646b..80d0e21e9 100644 --- a/examples/pipeline/wiki_entity_linking/kb_creator.py +++ b/examples/pipeline/wiki_entity_linking/kb_creator.py @@ -56,7 +56,7 @@ def create_kb(nlp, max_entities_per_alias, min_entity_freq, min_occ, frequency_list.append(freq) filtered_title_to_id[title] = entity - print("Kept", len(filtered_title_to_id.keys()), "out of", len(title_to_id.keys()), "titles") + print("Kept", len(filtered_title_to_id.keys()), "out of", len(title_to_id.keys()), "titles with filter frequency", min_entity_freq) print() print(" * train entity encoder", datetime.datetime.now()) diff --git a/examples/pipeline/wiki_entity_linking/run_el.py b/examples/pipeline/wiki_entity_linking/run_el.py index 52ccccfda..c26e8d65a 100644 --- a/examples/pipeline/wiki_entity_linking/run_el.py +++ b/examples/pipeline/wiki_entity_linking/run_el.py @@ -25,9 +25,7 @@ def run_kb_toy_example(kb): def run_el_dev(nlp, kb, training_dir, limit=None): - correct_entries_per_article, _ = training_set_creator.read_training_entities(training_output=training_dir, - collect_correct=True, - collect_incorrect=False) + correct_entries_per_article, _ = training_set_creator.read_training_entities(training_output=training_dir) predictions = list() golds = list() diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py index 143e38d99..a4026d935 100644 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ b/examples/pipeline/wiki_entity_linking/train_el.py @@ -389,9 +389,7 @@ class EL_Model: bp_sent(sent_gradients, sgd=self.sgd_sent) def _get_training_data(self, training_dir, id_to_descr, dev, limit, to_print): - correct_entries, incorrect_entries = training_set_creator.read_training_entities(training_output=training_dir, - collect_correct=True, - collect_incorrect=True) + correct_entries, incorrect_entries = training_set_creator.read_training_entities(training_output=training_dir) entities_by_cluster = dict() gold_by_entity = dict() diff --git a/examples/pipeline/wiki_entity_linking/training_set_creator.py b/examples/pipeline/wiki_entity_linking/training_set_creator.py index 845ce62dc..5d089c620 100644 --- a/examples/pipeline/wiki_entity_linking/training_set_creator.py +++ b/examples/pipeline/wiki_entity_linking/training_set_creator.py @@ -16,12 +16,13 @@ from . 
import wikipedia_processor as wp, kb_creator Process Wikipedia interlinks to generate a training dataset for the EL algorithm """ -ENTITY_FILE = "gold_entities.csv" +# ENTITY_FILE = "gold_entities.csv" +ENTITY_FILE = "gold_entities_100000.csv" # use this file for faster processing def create_training(entity_def_input, training_output): wp_to_id = kb_creator._get_entity_to_id(entity_def_input) - _process_wikipedia_texts(wp_to_id, training_output, limit=100000000) + _process_wikipedia_texts(wp_to_id, training_output, limit=None) def _process_wikipedia_texts(wp_to_id, training_output, limit=None): @@ -290,75 +291,72 @@ def _write_training_entity(outputfile, article_id, alias, entity, start, end): outputfile.write(article_id + "|" + alias + "|" + entity + "|" + str(start) + "|" + str(end) + "\n") -def read_training_entities(training_output): +def is_dev(article_id): + return article_id.endswith("3") + + +def read_training_entities(training_output, dev, limit): entityfile_loc = training_output + "/" + ENTITY_FILE entries_per_article = dict() + article_ids = set() with open(entityfile_loc, mode='r', encoding='utf8') as file: for line in file: - fields = line.replace('\n', "").split(sep='|') - article_id = fields[0] - alias = fields[1] - wp_title = fields[2] - start = fields[3] - end = fields[4] + if not limit or len(article_ids) < limit: + fields = line.replace('\n', "").split(sep='|') + article_id = fields[0] + if dev == is_dev(article_id) and article_id != "article_id": + article_ids.add(article_id) - entries_by_offset = entries_per_article.get(article_id, dict()) - entries_by_offset[start + "-" + end] = (alias, wp_title) + alias = fields[1] + wp_title = fields[2] + start = fields[3] + end = fields[4] - entries_per_article[article_id] = entries_by_offset + entries_by_offset = entries_per_article.get(article_id, dict()) + entries_by_offset[start + "-" + end] = (alias, wp_title) + + entries_per_article[article_id] = entries_by_offset return entries_per_article -def read_training(nlp, training_dir, dev, limit, to_print): - # This method will provide training examples that correspond to the entity annotations found by the nlp object - entries_per_article = read_training_entities(training_output=training_dir) +def read_training(nlp, training_dir, dev, limit): + # This method provides training examples that correspond to the entity annotations found by the nlp object + + print("reading training entities") + entries_per_article = read_training_entities(training_output=training_dir, dev=dev, limit=limit) + print("done reading training entities") data = [] + for article_id, entries_by_offset in entries_per_article.items(): + file_name = article_id + ".txt" + try: + # parse the article text + with open(os.path.join(training_dir, file_name), mode="r", encoding='utf8') as file: + text = file.read() + article_doc = nlp(text) - cnt = 0 - files = listdir(training_dir) - for f in files: - if not limit or cnt < limit: - if dev == run_el.is_dev(f): - article_id = f.replace(".txt", "") - if cnt % 500 == 0 and to_print: - print(datetime.datetime.now(), "processed", cnt, "files in the training dataset") + gold_entities = list() + for ent in article_doc.ents: + start = ent.start_char + end = ent.end_char - try: - # parse the article text - with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file: - text = file.read() - article_doc = nlp(text) + entity_tuple = entries_by_offset.get(str(start) + "-" + str(end), None) + if entity_tuple: + alias, wp_title = entity_tuple + if ent.text != alias: + 
print("Non-matching entity in", article_id, start, end) + else: + gold_entities.append((start, end, wp_title)) - entries_by_offset = entries_per_article.get(article_id, dict()) + if gold_entities: + gold = GoldParse(doc=article_doc, links=gold_entities) + data.append((article_doc, gold)) - gold_entities = list() - for ent in article_doc.ents: - start = ent.start_char - end = ent.end_char + except Exception as e: + print("Problem parsing article", article_id) + print(e) + raise e - entity_tuple = entries_by_offset.get(str(start) + "-" + str(end), None) - if entity_tuple: - alias, wp_title = entity_tuple - if ent.text != alias: - print("Non-matching entity in", article_id, start, end) - else: - gold_entities.append((start, end, wp_title)) - - if gold_entities: - gold = GoldParse(doc=article_doc, links=gold_entities) - data.append((article_doc, gold)) - - cnt += 1 - except Exception as e: - print("Problem parsing article", article_id) - print(e) - raise e - - if to_print: - print() - print("Processed", cnt, "training articles, dev=" + str(dev)) - print() return data diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 1e5280f89..b3b3479e2 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -29,7 +29,7 @@ NLP_2_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/nlp_2' TRAINING_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_data_nel/' MAX_CANDIDATES = 10 -MIN_ENTITY_FREQ = 200 +MIN_ENTITY_FREQ = 20 MIN_PAIR_OCC = 5 DOC_SENT_CUTOFF = 2 EPOCHS = 10 @@ -47,21 +47,21 @@ def run_pipeline(): # one-time methods to create KB and write to file to_create_prior_probs = False to_create_entity_counts = False - to_create_kb = True + to_create_kb = False # read KB back in from file - to_read_kb = False + to_read_kb = True to_test_kb = False # create training dataset create_wp_training = False # train the EL pipe - train_pipe = False - measure_performance = False + train_pipe = True + measure_performance = True # test the EL pipe on a simple example - to_test_pipeline = False + to_test_pipeline = True # write the NLP object, read back in and test again test_nlp_io = False @@ -135,46 +135,50 @@ def run_pipeline(): print("STEP 6: training Entity Linking pipe", datetime.datetime.now()) train_limit = 10 dev_limit = 2 - print("Training on", train_limit, "articles") - print("Dev testing on", dev_limit, "articles") - print() train_data = training_set_creator.read_training(nlp=nlp_2, training_dir=TRAINING_DIR, dev=False, - limit=train_limit, - to_print=False) + limit=train_limit) + + print("Training on", len(train_data), "articles") + print() + + if not train_data: + print("Did not find any training data") + + else: + for itn in range(EPOCHS): + random.shuffle(train_data) + losses = {} + batches = minibatch(train_data, size=compounding(4.0, 128.0, 1.001)) + batchnr = 0 + + with nlp_2.disable_pipes(*other_pipes): + for batch in batches: + try: + docs, golds = zip(*batch) + nlp_2.update( + docs, + golds, + drop=DROPOUT, + losses=losses, + ) + batchnr += 1 + except Exception as e: + print("Error updating batch", e) + + losses['entity_linker'] = losses['entity_linker'] / batchnr + print("Epoch, train loss", itn, round(losses['entity_linker'], 2)) dev_data = training_set_creator.read_training(nlp=nlp_2, training_dir=TRAINING_DIR, dev=True, - limit=dev_limit, - to_print=False) + limit=dev_limit) + print("Dev testing on", len(dev_data), "articles") 
+ print() - for itn in range(EPOCHS): - random.shuffle(train_data) - losses = {} - batches = minibatch(train_data, size=compounding(4.0, 128.0, 1.001)) - batchnr = 0 - - with nlp_2.disable_pipes(*other_pipes): - for batch in batches: - try: - docs, golds = zip(*batch) - nlp_2.update( - docs, - golds, - drop=DROPOUT, - losses=losses, - ) - batchnr += 1 - except Exception as e: - print("Error updating batch", e) - - losses['entity_linker'] = losses['entity_linker'] / batchnr - print("Epoch, train loss", itn, round(losses['entity_linker'], 2)) - - if measure_performance: + if len(dev_data) and measure_performance: print() print("STEP 7: performance measurement of Entity Linking pipe", datetime.datetime.now()) print() diff --git a/examples/pipeline/wiki_entity_linking/wikidata_processor.py b/examples/pipeline/wiki_entity_linking/wikidata_processor.py index f6a6cbe23..967849abb 100644 --- a/examples/pipeline/wiki_entity_linking/wikidata_processor.py +++ b/examples/pipeline/wiki_entity_linking/wikidata_processor.py @@ -104,7 +104,7 @@ def read_wikidata_entities_json(limit=None, to_print=False): if lang_aliases: for item in lang_aliases: if to_print: - print("alias (" + lang + "):", item["value"]) + print("alias (" + lang + "):", item["value"]) if to_print: print() From 6332af40de10b221ec7ef4354b3d51bf6f80ca71 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 17 Jun 2019 14:39:40 +0200 Subject: [PATCH 080/148] baseline performances: oracle KB, random and prior prob --- .../training_set_creator.py | 116 ++++++-------- .../wiki_entity_linking/wiki_nel_pipeline.py | 143 ++++++++++++++---- 2 files changed, 161 insertions(+), 98 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/training_set_creator.py b/examples/pipeline/wiki_entity_linking/training_set_creator.py index 5d089c620..4ce69e75d 100644 --- a/examples/pipeline/wiki_entity_linking/training_set_creator.py +++ b/examples/pipeline/wiki_entity_linking/training_set_creator.py @@ -5,11 +5,8 @@ import os import re import bz2 import datetime -from os import listdir -from examples.pipeline.wiki_entity_linking import run_el from spacy.gold import GoldParse -from spacy.matcher import PhraseMatcher from . 
import wikipedia_processor as wp, kb_creator """ @@ -17,7 +14,7 @@ Process Wikipedia interlinks to generate a training dataset for the EL algorithm """ # ENTITY_FILE = "gold_entities.csv" -ENTITY_FILE = "gold_entities_100000.csv" # use this file for faster processing +ENTITY_FILE = "gold_entities_1000000.csv" # use this file for faster processing def create_training(entity_def_input, training_output): @@ -58,7 +55,6 @@ def _process_wikipedia_texts(wp_to_id, training_output, limit=None): if cnt % 1000000 == 0: print(datetime.datetime.now(), "processed", cnt, "lines of Wikipedia dump") clean_line = line.strip().decode("utf-8") - # print(clean_line) if clean_line == "": reading_revision = True @@ -121,7 +117,6 @@ text_regex = re.compile(r'(?<=).*(?= 0 and len(data) % 50 == 0: + print("Read", total_entities, "entities in", len(data), "articles") + fields = line.replace('\n', "").split(sep='|') + article_id = fields[0] + alias = fields[1] + wp_title = fields[2] + start = fields[3] + end = fields[4] - if gold_entities: - gold = GoldParse(doc=article_doc, links=gold_entities) - data.append((article_doc, gold)) + if dev == is_dev(article_id) and article_id != "article_id" and article_id not in skip_articles: + if not current_doc or (current_article_id != article_id): + # store the data from the previous article + if gold_entities and current_doc: + gold = GoldParse(doc=current_doc, links=gold_entities) + data.append((current_doc, gold)) + total_entities += len(gold_entities) - except Exception as e: - print("Problem parsing article", article_id) - print(e) - raise e + # parse the new article text + file_name = article_id + ".txt" + try: + with open(os.path.join(training_dir, file_name), mode="r", encoding='utf8') as f: + text = f.read() + current_doc = nlp(text) + for ent in current_doc.ents: + ents_by_offset[str(ent.start_char) + "_" + str(ent.end_char)] = ent.text + except Exception as e: + print("Problem parsing article", article_id, e) + current_article_id = article_id + gold_entities = list() + + # repeat checking this condition in case an exception was thrown + if current_doc and (current_article_id == article_id): + found_ent = ents_by_offset.get(start + "_" + end, None) + if found_ent: + if found_ent != alias: + skip_articles.add(current_article_id) + else: + gold_entities.append((int(start), int(end), wp_title)) + + print("Read", total_entities, "entities in", len(data), "articles") return data diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index b3b3479e2..7b54df527 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -64,7 +64,8 @@ def run_pipeline(): to_test_pipeline = True # write the NLP object, read back in and test again - test_nlp_io = False + to_write_nlp = True + to_read_nlp = True # STEP 1 : create prior probabilities from WP # run only once ! 
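Once the KB has been read back in, the to_test_kb path exercises candidate lookup directly; condensed from test_kb above, this is also the lookup the baselines further down are built on:

    for c in kb_2.get_candidates("Douglas Adams"):
        print(c.alias_, "-->", c.entity_, "prior:", c.prior_prob, "freq:", c.entity_freq)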
@@ -133,7 +134,7 @@ def run_pipeline(): if train_pipe: print("STEP 6: training Entity Linking pipe", datetime.datetime.now()) - train_limit = 10 + train_limit = 5 dev_limit = 2 train_data = training_set_creator.read_training(nlp=nlp_2, @@ -166,46 +167,42 @@ def run_pipeline(): ) batchnr += 1 except Exception as e: - print("Error updating batch", e) + print("Error updating batch:", e) + raise(e) - losses['entity_linker'] = losses['entity_linker'] / batchnr - print("Epoch, train loss", itn, round(losses['entity_linker'], 2)) + if batchnr > 0: + losses['entity_linker'] = losses['entity_linker'] / batchnr + print("Epoch, train loss", itn, round(losses['entity_linker'], 2)) dev_data = training_set_creator.read_training(nlp=nlp_2, training_dir=TRAINING_DIR, dev=True, limit=dev_limit) - print("Dev testing on", len(dev_data), "articles") + print() + print("Dev testing on", len(dev_data), "articles") if len(dev_data) and measure_performance: print() print("STEP 7: performance measurement of Entity Linking pipe", datetime.datetime.now()) print() + acc_random, acc_random_by_label, acc_prior, acc_prior_by_label, acc_oracle, acc_oracle_by_label = _measure_baselines(dev_data, kb_2) + print("dev acc oracle:", round(acc_oracle, 3), [(x, round(y, 3)) for x, y in acc_oracle_by_label.items()]) + print("dev acc random:", round(acc_random, 3), [(x, round(y, 3)) for x, y in acc_random_by_label.items()]) + print("dev acc prior:", round(acc_prior, 3), [(x, round(y, 3)) for x, y in acc_prior_by_label.items()]) + # print(" measuring accuracy 1-1") el_pipe.context_weight = 1 el_pipe.prior_weight = 1 - dev_acc_1_1, dev_acc_1_1_dict = _measure_accuracy(dev_data, el_pipe) - print("dev acc combo:", round(dev_acc_1_1, 3), [(x, round(y, 3)) for x, y in dev_acc_1_1_dict.items()]) - train_acc_1_1, train_acc_1_1_dict = _measure_accuracy(train_data, el_pipe) - print("train acc combo:", round(train_acc_1_1, 3), [(x, round(y, 3)) for x, y in train_acc_1_1_dict.items()]) - - # baseline using only prior probabilities - el_pipe.context_weight = 0 - el_pipe.prior_weight = 1 - dev_acc_0_1, dev_acc_0_1_dict = _measure_accuracy(dev_data, el_pipe) - print("dev acc prior:", round(dev_acc_0_1, 3), [(x, round(y, 3)) for x, y in dev_acc_0_1_dict.items()]) - train_acc_0_1, train_acc_0_1_dict = _measure_accuracy(train_data, el_pipe) - print("train acc prior:", round(train_acc_0_1, 3), [(x, round(y, 3)) for x, y in train_acc_0_1_dict.items()]) + dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data, el_pipe) + print("dev acc combo:", round(dev_acc_combo, 3), [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()]) # using only context el_pipe.context_weight = 1 el_pipe.prior_weight = 0 - dev_acc_1_0, dev_acc_1_0_dict = _measure_accuracy(dev_data, el_pipe) - print("dev acc context:", round(dev_acc_1_0, 3), [(x, round(y, 3)) for x, y in dev_acc_1_0_dict.items()]) - train_acc_1_0, train_acc_1_0_dict = _measure_accuracy(train_data, el_pipe) - print("train acc context:", round(train_acc_1_0, 3), [(x, round(y, 3)) for x, y in train_acc_1_0_dict.items()]) + dev_acc_context, dev_acc_1_0_dict = _measure_accuracy(dev_data, el_pipe) + print("dev acc context:", round(dev_acc_context, 3), [(x, round(y, 3)) for x, y in dev_acc_1_0_dict.items()]) print() # reset for follow-up tests @@ -219,7 +216,7 @@ def run_pipeline(): run_el_toy_example(nlp=nlp_2) print() - if test_nlp_io: + if to_write_nlp: print() print("STEP 9: testing NLP IO", datetime.datetime.now()) print() @@ -229,9 +226,10 @@ def run_pipeline(): print("reading from", NLP_2_DIR) nlp_3 = 
spacy.load(NLP_2_DIR) - print() - print("running toy example with NLP 2") - run_el_toy_example(nlp=nlp_3) + if to_read_nlp: + print() + print("running toy example with NLP 2") + run_el_toy_example(nlp=nlp_3) print() print("STOP", datetime.datetime.now()) @@ -270,6 +268,80 @@ def _measure_accuracy(data, el_pipe): except Exception as e: print("Error assessing accuracy", e) + acc, acc_by_label = calculate_acc(correct_by_label, incorrect_by_label) + return acc, acc_by_label + + +def _measure_baselines(data, kb): + random_correct_by_label = dict() + random_incorrect_by_label = dict() + + oracle_correct_by_label = dict() + oracle_incorrect_by_label = dict() + + prior_correct_by_label = dict() + prior_incorrect_by_label = dict() + + docs = [d for d, g in data if len(d) > 0] + golds = [g for d, g in data if len(d) > 0] + + for doc, gold in zip(docs, golds): + try: + correct_entries_per_article = dict() + for entity in gold.links: + start, end, gold_kb = entity + correct_entries_per_article[str(start) + "-" + str(end)] = gold_kb + + for ent in doc.ents: + ent_label = ent.label_ + start = ent.start_char + end = ent.end_char + gold_entity = correct_entries_per_article.get(str(start) + "-" + str(end), None) + + # the gold annotations are not complete so we can't evaluate missing annotations as 'wrong' + if gold_entity is not None: + candidates = kb.get_candidates(ent.text) + oracle_candidate = "" + best_candidate = "" + random_candidate = "" + if candidates: + scores = list() + + for c in candidates: + scores.append(c.prior_prob) + if c.entity_ == gold_entity: + oracle_candidate = c.entity_ + + best_index = scores.index(max(scores)) + best_candidate = candidates[best_index].entity_ + random_candidate = random.choice(candidates).entity_ + + if gold_entity == best_candidate: + prior_correct_by_label[ent_label] = prior_correct_by_label.get(ent_label, 0) + 1 + else: + prior_incorrect_by_label[ent_label] = prior_incorrect_by_label.get(ent_label, 0) + 1 + + if gold_entity == random_candidate: + random_correct_by_label[ent_label] = random_correct_by_label.get(ent_label, 0) + 1 + else: + random_incorrect_by_label[ent_label] = random_incorrect_by_label.get(ent_label, 0) + 1 + + if gold_entity == oracle_candidate: + oracle_correct_by_label[ent_label] = oracle_correct_by_label.get(ent_label, 0) + 1 + else: + oracle_incorrect_by_label[ent_label] = oracle_incorrect_by_label.get(ent_label, 0) + 1 + + except Exception as e: + print("Error assessing accuracy", e) + + acc_prior, acc_prior_by_label = calculate_acc(prior_correct_by_label, prior_incorrect_by_label) + acc_random, acc_random_by_label = calculate_acc(random_correct_by_label, random_incorrect_by_label) + acc_oracle, acc_oracle_by_label = calculate_acc(oracle_correct_by_label, oracle_incorrect_by_label) + + return acc_random, acc_random_by_label, acc_prior, acc_prior_by_label, acc_oracle, acc_oracle_by_label + + +def calculate_acc(correct_by_label, incorrect_by_label): acc_by_label = dict() total_correct = 0 total_incorrect = 0 @@ -303,18 +375,25 @@ def run_el_toy_example(nlp): "The main character in Doug's novel is the man Arthur Dent, " \ "but Douglas doesn't write about George Washington or Homer Simpson." doc = nlp(text) - + print(text) for ent in doc.ents: print("ent", ent.text, ent.label_, ent.kb_id_) - print() - # Q4426480 is her husband, Q3568763 her tutor - text = "Ada Lovelace was the countess of Lovelace. She is known for her programming work on the analytical engine."\ - "Ada Lovelace loved her husband William King dearly. 
" \ - "Ada Lovelace was tutored by her favorite physics tutor William King." + # Q4426480 is her husband + text = "Ada Lovelace was the countess of Lovelace. She is known for her programming work on the analytical engine. "\ + "She loved her husband William King dearly. " doc = nlp(text) + print(text) + for ent in doc.ents: + print("ent", ent.text, ent.label_, ent.kb_id_) + print() + # Q3568763 is her tutor + text = "Ada Lovelace was the countess of Lovelace. She is known for her programming work on the analytical engine. "\ + "She was tutored by her favorite physics tutor William King." + doc = nlp(text) + print(text) for ent in doc.ents: print("ent", ent.text, ent.label_, ent.kb_id_) From ffae7d35552476adc14e2be6d66f64edd6ae06ed Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 18 Jun 2019 00:05:47 +0200 Subject: [PATCH 081/148] sentence encoder only (removing article/mention encoder) --- .../training_set_creator.py | 41 +++--- .../wiki_entity_linking/wiki_nel_pipeline.py | 15 +- spacy/pipeline/pipes.pyx | 129 +++++++++--------- 3 files changed, 95 insertions(+), 90 deletions(-) diff --git a/examples/pipeline/wiki_entity_linking/training_set_creator.py b/examples/pipeline/wiki_entity_linking/training_set_creator.py index 4ce69e75d..cc985202c 100644 --- a/examples/pipeline/wiki_entity_linking/training_set_creator.py +++ b/examples/pipeline/wiki_entity_linking/training_set_creator.py @@ -294,7 +294,6 @@ def read_training(nlp, training_dir, dev, limit): # we assume the data is written sequentially current_article_id = None current_doc = None - gold_entities = list() ents_by_offset = dict() skip_articles = set() total_entities = 0 @@ -302,8 +301,6 @@ def read_training(nlp, training_dir, dev, limit): with open(entityfile_loc, mode='r', encoding='utf8') as file: for line in file: if not limit or len(data) < limit: - if len(data) > 0 and len(data) % 50 == 0: - print("Read", total_entities, "entities in", len(data), "articles") fields = line.replace('\n', "").split(sep='|') article_id = fields[0] alias = fields[1] @@ -313,34 +310,42 @@ def read_training(nlp, training_dir, dev, limit): if dev == is_dev(article_id) and article_id != "article_id" and article_id not in skip_articles: if not current_doc or (current_article_id != article_id): - # store the data from the previous article - if gold_entities and current_doc: - gold = GoldParse(doc=current_doc, links=gold_entities) - data.append((current_doc, gold)) - total_entities += len(gold_entities) - # parse the new article text file_name = article_id + ".txt" try: with open(os.path.join(training_dir, file_name), mode="r", encoding='utf8') as f: text = f.read() - current_doc = nlp(text) - for ent in current_doc.ents: - ents_by_offset[str(ent.start_char) + "_" + str(ent.end_char)] = ent.text + if len(text) < 30000: # threshold for convenience / speed of processing + current_doc = nlp(text) + current_article_id = article_id + ents_by_offset = dict() + for ent in current_doc.ents: + ents_by_offset[str(ent.start_char) + "_" + str(ent.end_char)] = ent + else: + skip_articles.add(current_article_id) + current_doc = None except Exception as e: print("Problem parsing article", article_id, e) - current_article_id = article_id - gold_entities = list() - # repeat checking this condition in case an exception was thrown if current_doc and (current_article_id == article_id): found_ent = ents_by_offset.get(start + "_" + end, None) if found_ent: - if found_ent != alias: + if found_ent.text != alias: skip_articles.add(current_article_id) + current_doc = None else: 
- gold_entities.append((int(start), int(end), wp_title)) + sent = found_ent.sent.as_doc() + # currently feeding the gold data one entity per sentence at a time + gold_start = int(start) - found_ent.sent.start_char + gold_end = int(end) - found_ent.sent.start_char + gold_entities = list() + gold_entities.append((gold_start, gold_end, wp_title)) + gold = GoldParse(doc=current_doc, links=gold_entities) + data.append((sent, gold)) + total_entities += 1 + if len(data) % 500 == 0: + print(" -read", total_entities, "entities") - print("Read", total_entities, "entities in", len(data), "articles") + print(" -read", total_entities, "entities") return data diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py index 7b54df527..bdae023b9 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py @@ -9,7 +9,6 @@ from examples.pipeline.wiki_entity_linking import wikipedia_processor as wp, kb_ from examples.pipeline.wiki_entity_linking.kb_creator import DESC_WIDTH import spacy -from spacy.vocab import Vocab from spacy.kb import KnowledgeBase import datetime @@ -64,8 +63,8 @@ def run_pipeline(): to_test_pipeline = True # write the NLP object, read back in and test again - to_write_nlp = True - to_read_nlp = True + to_write_nlp = False + to_read_nlp = False # STEP 1 : create prior probabilities from WP # run only once ! @@ -134,8 +133,8 @@ def run_pipeline(): if train_pipe: print("STEP 6: training Entity Linking pipe", datetime.datetime.now()) - train_limit = 5 - dev_limit = 2 + train_limit = 25000 + dev_limit = 1000 train_data = training_set_creator.read_training(nlp=nlp_2, training_dir=TRAINING_DIR, @@ -345,7 +344,11 @@ def calculate_acc(correct_by_label, incorrect_by_label): acc_by_label = dict() total_correct = 0 total_incorrect = 0 - for label, correct in correct_by_label.items(): + all_keys = set() + all_keys.update(correct_by_label.keys()) + all_keys.update(incorrect_by_label.keys()) + for label in sorted(all_keys): + correct = correct_by_label.get(label, 0) incorrect = incorrect_by_label.get(label, 0) total_correct += correct total_incorrect += incorrect diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 5d82da7ee..fbdca8280 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1079,36 +1079,39 @@ class EntityLinker(Pipe): embed_width = cfg.get("embed_width", 300) hidden_width = cfg.get("hidden_width", 32) - article_width = cfg.get("article_width", 128) - sent_width = cfg.get("sent_width", 64) entity_width = cfg.get("entity_width") # no default because this needs to correspond with the KB + sent_width = entity_width - article_encoder = build_nel_encoder(in_width=embed_width, hidden_width=hidden_width, end_width=article_width, **cfg) - sent_encoder = build_nel_encoder(in_width=embed_width, hidden_width=hidden_width, end_width=sent_width, **cfg) + model = build_nel_encoder(in_width=embed_width, hidden_width=hidden_width, end_width=sent_width, **cfg) # dimension of the mention encoder needs to match the dimension of the entity encoder - mention_width = article_width + sent_width - mention_encoder = Affine(entity_width, mention_width, drop_factor=0.0) + # article_width = cfg.get("article_width", 128) + # sent_width = cfg.get("sent_width", 64) + # article_encoder = build_nel_encoder(in_width=embed_width, hidden_width=hidden_width, end_width=article_width, **cfg) + # mention_width = article_width + sent_width + # 
mention_encoder = Affine(entity_width, mention_width, drop_factor=0.0) + # return article_encoder, sent_encoder, mention_encoder - return article_encoder, sent_encoder, mention_encoder + return model def __init__(self, **cfg): - self.article_encoder = True - self.sent_encoder = True - self.mention_encoder = True + # self.article_encoder = True + # self.sent_encoder = True + # self.mention_encoder = True + self.model = True self.kb = None self.cfg = dict(cfg) self.doc_cutoff = self.cfg.get("doc_cutoff", 5) - self.sgd_article = None - self.sgd_sent = None - self.sgd_mention = None + # self.sgd_article = None + # self.sgd_sent = None + # self.sgd_mention = None def set_kb(self, kb): self.kb = kb def require_model(self): # Raise an error if the component's model is not initialized. - if getattr(self, "mention_encoder", None) in (None, True, False): + if getattr(self, "model", None) in (None, True, False): raise ValueError(Errors.E109.format(name=self.name)) def require_kb(self): @@ -1121,12 +1124,19 @@ class EntityLinker(Pipe): self.require_kb() self.cfg["entity_width"] = self.kb.entity_vector_length - if self.mention_encoder is True: - self.article_encoder, self.sent_encoder, self.mention_encoder = self.Model(**self.cfg) - self.sgd_article = create_default_optimizer(self.article_encoder.ops) - self.sgd_sent = create_default_optimizer(self.sent_encoder.ops) - self.sgd_mention = create_default_optimizer(self.mention_encoder.ops) - return self.sgd_article + if self.model is True: + self.model = self.Model(**self.cfg) + + if sgd is None: + sgd = self.create_optimizer() + return sgd + + # if self.mention_encoder is True: + # self.article_encoder, self.sent_encoder, self.mention_encoder = self.Model(**self.cfg) + # self.sgd_article = create_default_optimizer(self.article_encoder.ops) + # self.sgd_sent = create_default_optimizer(self.sent_encoder.ops) + # self.sgd_mention = create_default_optimizer(self.mention_encoder.ops) + # return self.sgd_article def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None): self.require_model() @@ -1146,7 +1156,7 @@ class EntityLinker(Pipe): docs = [docs] golds = [golds] - article_docs = list() + # article_docs = list() sentence_docs = list() entity_encodings = list() @@ -1173,34 +1183,32 @@ class EntityLinker(Pipe): if kb_id == gold_kb: prior_prob = c.prior_prob entity_encoding = c.entity_vector - entity_encodings.append(entity_encoding) - article_docs.append(first_par) + # article_docs.append(first_par) sentence_docs.append(sentence) if len(entity_encodings) > 0: - doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=drop) - sent_encodings, bp_sent = self.sent_encoder.begin_update(sentence_docs, drop=drop) + # doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=drop) + # sent_encodings, bp_sent = self.sent_encoder.begin_update(sentence_docs, drop=drop) - concat_encodings = [list(doc_encodings[i]) + list(sent_encodings[i]) for i in - range(len(article_docs))] - mention_encodings, bp_mention = self.mention_encoder.begin_update(np.asarray(concat_encodings), drop=drop) + # concat_encodings = [list(doc_encodings[i]) + list(sent_encodings[i]) for i in range(len(article_docs))] + # mention_encodings, bp_mention = self.mention_encoder.begin_update(np.asarray(concat_encodings), drop=drop) + sent_encodings, bp_sent = self.model.begin_update(sentence_docs, drop=drop) entity_encodings = np.asarray(entity_encodings, dtype=np.float32) - loss, d_scores = self.get_loss(scores=mention_encodings, 
golds=entity_encodings, docs=None) - mention_gradient = bp_mention(d_scores, sgd=self.sgd_mention) + loss, d_scores = self.get_loss(scores=sent_encodings, golds=entity_encodings, docs=None) + bp_sent(d_scores, sgd=sgd) # gradient : concat (doc+sent) vs. desc - sent_start = self.article_encoder.nO - sent_gradients = list() - doc_gradients = list() - for x in mention_gradient: - doc_gradients.append(list(x[0:sent_start])) - sent_gradients.append(list(x[sent_start:])) - - bp_doc(doc_gradients, sgd=self.sgd_article) - bp_sent(sent_gradients, sgd=self.sgd_sent) + # sent_start = self.article_encoder.nO + # sent_gradients = list() + # doc_gradients = list() + # for x in mention_gradient: + # doc_gradients.append(list(x[0:sent_start])) + # sent_gradients.append(list(x[sent_start:])) + # bp_doc(doc_gradients, sgd=self.sgd_article) + # bp_sent(sent_gradients, sgd=self.sgd_sent) if losses is not None: losses[self.name] += loss @@ -1262,14 +1270,17 @@ class EntityLinker(Pipe): first_par_end = sent.end first_par = doc[0:first_par_end].as_doc() - doc_encoding = self.article_encoder([first_par]) + # doc_encoding = self.article_encoder([first_par]) for ent in doc.ents: sent_doc = ent.sent.as_doc() if len(sent_doc) > 0: - sent_encoding = self.sent_encoder([sent_doc]) - concat_encoding = [list(doc_encoding[0]) + list(sent_encoding[0])] - mention_encoding = self.mention_encoder(np.asarray([concat_encoding[0]])) - mention_enc_t = np.transpose(mention_encoding) + # sent_encoding = self.sent_encoder([sent_doc]) + # concat_encoding = [list(doc_encoding[0]) + list(sent_encoding[0])] + # mention_encoding = self.mention_encoder(np.asarray([concat_encoding[0]])) + # mention_enc_t = np.transpose(mention_encoding) + + sent_encoding = self.model([sent_doc]) + sent_enc_t = np.transpose(sent_encoding) candidates = self.kb.get_candidates(ent.text) if candidates: @@ -1278,7 +1289,7 @@ class EntityLinker(Pipe): prior_prob = c.prior_prob * self.prior_weight kb_id = c.entity_ entity_encoding = c.entity_vector - sim = float(cosine(np.asarray([entity_encoding]), mention_enc_t)) * self.context_weight + sim = float(cosine(np.asarray([entity_encoding]), sent_enc_t)) * self.context_weight score = prior_prob + sim - (prior_prob*sim) # put weights on the different factors ? 
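# The line above combines the two signals like a probabilistic OR when both weights
# are 1 (as the pipeline script sets them): e.g. prior_prob = 0.8 and sim = 0.5 give
# score = 0.8 + 0.5 - 0.4 = 0.9, so either a strong prior or a close context match
# alone is enough to rank a candidate highly.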
scores.append(score) @@ -1299,34 +1310,20 @@ class EntityLinker(Pipe): serialize = OrderedDict() serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) serialize["kb"] = lambda p: self.kb.dump(p) - if self.mention_encoder not in (None, True, False): - serialize["article_encoder"] = lambda p: p.open("wb").write(self.article_encoder.to_bytes()) - serialize["sent_encoder"] = lambda p: p.open("wb").write(self.sent_encoder.to_bytes()) - serialize["mention_encoder"] = lambda p: p.open("wb").write(self.mention_encoder.to_bytes()) + if self.model not in (None, True, False): + serialize["model"] = lambda p: p.open("wb").write(self.model.to_bytes()) exclude = util.get_serialization_exclude(serialize, exclude, kwargs) util.to_disk(path, serialize, exclude) def from_disk(self, path, exclude=tuple(), **kwargs): - def load_article_encoder(p): - if self.article_encoder is True: - self.article_encoder, _, _ = self.Model(**self.cfg) - self.article_encoder.from_bytes(p.open("rb").read()) - - def load_sent_encoder(p): - if self.sent_encoder is True: - _, self.sent_encoder, _ = self.Model(**self.cfg) - self.sent_encoder.from_bytes(p.open("rb").read()) - - def load_mention_encoder(p): - if self.mention_encoder is True: - _, _, self.mention_encoder = self.Model(**self.cfg) - self.mention_encoder.from_bytes(p.open("rb").read()) + def load_model(p): + if self.model is True: + self.model = self.Model(**self.cfg) + self.model.from_bytes(p.open("rb").read()) deserialize = OrderedDict() deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p)) - deserialize["article_encoder"] = load_article_encoder - deserialize["sent_encoder"] = load_sent_encoder - deserialize["mention_encoder"] = load_mention_encoder + deserialize["model"] = load_model exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) util.from_disk(path, deserialize, exclude) return self From 0d177c1146d6384737a20400a2218a411fd8ab81 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 18 Jun 2019 13:20:40 +0200 Subject: [PATCH 082/148] clean up code, remove old code, move to bin --- .../wiki_entity_linking => bin}/__init__.py | 0 bin/wiki_entity_linking/__init__.py | 0 .../wiki_entity_linking/kb_creator.py | 13 +- .../wiki_entity_linking/train_descriptions.py | 39 -- .../training_set_creator.py | 7 +- .../wiki_entity_linking/wikidata_processor.py | 31 +- .../wikipedia_processor.py | 4 +- .../pipeline/wiki_entity_linking/run_el.py | 136 ----- .../pipeline/wiki_entity_linking/train_el.py | 490 ------------------ ...pipeline.py => wikidata_entity_linking.py} | 92 ++-- spacy/_ml.py | 5 +- spacy/pipeline/pipes.pyx | 62 +-- 12 files changed, 92 insertions(+), 787 deletions(-) rename {examples/pipeline/wiki_entity_linking => bin}/__init__.py (100%) create mode 100644 bin/wiki_entity_linking/__init__.py rename {examples/pipeline => bin}/wiki_entity_linking/kb_creator.py (94%) rename {examples/pipeline => bin}/wiki_entity_linking/train_descriptions.py (69%) rename {examples/pipeline => bin}/wiki_entity_linking/training_set_creator.py (98%) rename {examples/pipeline => bin}/wiki_entity_linking/wikidata_processor.py (80%) rename {examples/pipeline => bin}/wiki_entity_linking/wikipedia_processor.py (98%) delete mode 100644 examples/pipeline/wiki_entity_linking/run_el.py delete mode 100644 examples/pipeline/wiki_entity_linking/train_el.py rename examples/pipeline/{wiki_entity_linking/wiki_nel_pipeline.py => wikidata_entity_linking.py} (82%) diff --git a/examples/pipeline/wiki_entity_linking/__init__.py b/bin/__init__.py similarity index 100% rename 
from examples/pipeline/wiki_entity_linking/__init__.py rename to bin/__init__.py diff --git a/bin/wiki_entity_linking/__init__.py b/bin/wiki_entity_linking/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/pipeline/wiki_entity_linking/kb_creator.py b/bin/wiki_entity_linking/kb_creator.py similarity index 94% rename from examples/pipeline/wiki_entity_linking/kb_creator.py rename to bin/wiki_entity_linking/kb_creator.py index 80d0e21e9..8d293a0a1 100644 --- a/examples/pipeline/wiki_entity_linking/kb_creator.py +++ b/bin/wiki_entity_linking/kb_creator.py @@ -1,15 +1,13 @@ # coding: utf-8 from __future__ import unicode_literals -import spacy -from examples.pipeline.wiki_entity_linking.train_descriptions import EntityEncoder +from bin.wiki_entity_linking.train_descriptions import EntityEncoder from spacy.kb import KnowledgeBase import csv import datetime -from . import wikipedia_processor as wp -from . import wikidata_processor as wd +from bin.wiki_entity_linking import wikidata_processor as wd, wikipedia_processor as wp INPUT_DIM = 300 # dimension of pre-trained vectors DESC_WIDTH = 64 @@ -34,7 +32,7 @@ def create_kb(nlp, max_entities_per_alias, min_entity_freq, min_occ, else: # read the mappings from file - title_to_id = _get_entity_to_id(entity_def_output) + title_to_id = get_entity_to_id(entity_def_output) id_to_descr = _get_id_to_description(entity_descr_output) print() @@ -56,7 +54,8 @@ def create_kb(nlp, max_entities_per_alias, min_entity_freq, min_occ, frequency_list.append(freq) filtered_title_to_id[title] = entity - print("Kept", len(filtered_title_to_id.keys()), "out of", len(title_to_id.keys()), "titles with filter frequency", min_entity_freq) + print("Kept", len(filtered_title_to_id.keys()), "out of", len(title_to_id.keys()), + "titles with filter frequency", min_entity_freq) print() print(" * train entity encoder", datetime.datetime.now()) @@ -101,7 +100,7 @@ def _write_entity_files(entity_def_output, entity_descr_output, title_to_id, id_ descr_file.write(str(qid) + "|" + descr + "\n") -def _get_entity_to_id(entity_def_output): +def get_entity_to_id(entity_def_output): entity_to_id = dict() with open(entity_def_output, 'r', encoding='utf8') as csvfile: csvreader = csv.reader(csvfile, delimiter='|') diff --git a/examples/pipeline/wiki_entity_linking/train_descriptions.py b/bin/wiki_entity_linking/train_descriptions.py similarity index 69% rename from examples/pipeline/wiki_entity_linking/train_descriptions.py rename to bin/wiki_entity_linking/train_descriptions.py index bf4bcbc3d..cc5016237 100644 --- a/examples/pipeline/wiki_entity_linking/train_descriptions.py +++ b/bin/wiki_entity_linking/train_descriptions.py @@ -55,8 +55,6 @@ class EntityEncoder: print("Trained on", processed, "entities across", self.EPOCHS, "epochs") print("Final loss:", loss) - # self._test_encoder() - def _train_model(self, description_list): # TODO: when loss gets too low, a 'mean of empty slice' warning is thrown by numpy @@ -123,40 +121,3 @@ class EntityEncoder: def _get_loss(golds, scores): loss, gradients = get_cossim_loss(scores, golds) return loss, gradients - - def _test_encoder(self): - # Test encoder on some dummy examples - desc_A1 = "Fictional character in The Simpsons" - desc_A2 = "Simpsons - fictional human" - desc_A3 = "Fictional character in The Flintstones" - desc_A4 = "Politician from the US" - - A1_doc_vector = np.asarray([self._get_doc_embedding(self.nlp(desc_A1))]) - A2_doc_vector = np.asarray([self._get_doc_embedding(self.nlp(desc_A2))]) - A3_doc_vector 
= np.asarray([self._get_doc_embedding(self.nlp(desc_A3))]) - A4_doc_vector = np.asarray([self._get_doc_embedding(self.nlp(desc_A4))]) - - loss_a1_a1, _ = get_cossim_loss(A1_doc_vector, A1_doc_vector) - loss_a1_a2, _ = get_cossim_loss(A1_doc_vector, A2_doc_vector) - loss_a1_a3, _ = get_cossim_loss(A1_doc_vector, A3_doc_vector) - loss_a1_a4, _ = get_cossim_loss(A1_doc_vector, A4_doc_vector) - - print("sim doc A1 A1", loss_a1_a1) - print("sim doc A1 A2", loss_a1_a2) - print("sim doc A1 A3", loss_a1_a3) - print("sim doc A1 A4", loss_a1_a4) - - A1_encoded = self.encoder(A1_doc_vector) - A2_encoded = self.encoder(A2_doc_vector) - A3_encoded = self.encoder(A3_doc_vector) - A4_encoded = self.encoder(A4_doc_vector) - - loss_a1_a1, _ = get_cossim_loss(A1_encoded, A1_encoded) - loss_a1_a2, _ = get_cossim_loss(A1_encoded, A2_encoded) - loss_a1_a3, _ = get_cossim_loss(A1_encoded, A3_encoded) - loss_a1_a4, _ = get_cossim_loss(A1_encoded, A4_encoded) - - print("sim encoded A1 A1", loss_a1_a1) - print("sim encoded A1 A2", loss_a1_a2) - print("sim encoded A1 A3", loss_a1_a3) - print("sim encoded A1 A4", loss_a1_a4) diff --git a/examples/pipeline/wiki_entity_linking/training_set_creator.py b/bin/wiki_entity_linking/training_set_creator.py similarity index 98% rename from examples/pipeline/wiki_entity_linking/training_set_creator.py rename to bin/wiki_entity_linking/training_set_creator.py index cc985202c..a0d130824 100644 --- a/examples/pipeline/wiki_entity_linking/training_set_creator.py +++ b/bin/wiki_entity_linking/training_set_creator.py @@ -7,7 +7,7 @@ import bz2 import datetime from spacy.gold import GoldParse -from . import wikipedia_processor as wp, kb_creator +from bin.wiki_entity_linking import kb_creator, wikipedia_processor as wp """ Process Wikipedia interlinks to generate a training dataset for the EL algorithm @@ -18,7 +18,7 @@ ENTITY_FILE = "gold_entities_1000000.csv" # use this file for faster processin def create_training(entity_def_input, training_output): - wp_to_id = kb_creator._get_entity_to_id(entity_def_input) + wp_to_id = kb_creator.get_entity_to_id(entity_def_input) _process_wikipedia_texts(wp_to_id, training_output, limit=None) @@ -71,7 +71,8 @@ def _process_wikipedia_texts(wp_to_id, training_output, limit=None): elif clean_line == "": if article_id: try: - _process_wp_text(wp_to_id, entityfile, article_id, article_title, article_text.strip(), training_output) + _process_wp_text(wp_to_id, entityfile, article_id, article_title, article_text.strip(), + training_output) except Exception as e: print("Error processing article", article_id, article_title, e) else: diff --git a/examples/pipeline/wiki_entity_linking/wikidata_processor.py b/bin/wiki_entity_linking/wikidata_processor.py similarity index 80% rename from examples/pipeline/wiki_entity_linking/wikidata_processor.py rename to bin/wiki_entity_linking/wikidata_processor.py index 967849abb..899c607cc 100644 --- a/examples/pipeline/wiki_entity_linking/wikidata_processor.py +++ b/bin/wiki_entity_linking/wikidata_processor.py @@ -13,9 +13,12 @@ def read_wikidata_entities_json(limit=None, to_print=False): """ Read the JSON wiki data and parse out the entities. Takes about 7u30 to parse 55M lines. 
""" lang = 'en' - # prop_filter = {'P31': {'Q5', 'Q15632617'}} # currently defined as OR: one property suffices to be selected site_filter = 'enwiki' + # filter currently disabled to get ALL data + prop_filter = dict() + # prop_filter = {'P31': {'Q5', 'Q15632617'}} # currently defined as OR: one property suffices to be selected + title_to_id = dict() id_to_descr = dict() @@ -25,6 +28,7 @@ def read_wikidata_entities_json(limit=None, to_print=False): parse_labels = False parse_descriptions = True parse_aliases = False + parse_claims = False with bz2.open(WIKIDATA_JSON, mode='rb') as file: line = file.readline() @@ -45,14 +49,15 @@ def read_wikidata_entities_json(limit=None, to_print=False): keep = True claims = obj["claims"] - # for prop, value_set in prop_filter.items(): - # claim_property = claims.get(prop, None) - # if claim_property: - # for cp in claim_property: - # cp_id = cp['mainsnak'].get('datavalue', {}).get('value', {}).get('id') - # cp_rank = cp['rank'] - # if cp_rank != "deprecated" and cp_id in value_set: - # keep = True + if parse_claims: + for prop, value_set in prop_filter.items(): + claim_property = claims.get(prop, None) + if claim_property: + for cp in claim_property: + cp_id = cp['mainsnak'].get('datavalue', {}).get('value', {}).get('id') + cp_rank = cp['rank'] + if cp_rank != "deprecated" and cp_id in value_set: + keep = True if keep: unique_id = obj["id"] @@ -64,8 +69,10 @@ def read_wikidata_entities_json(limit=None, to_print=False): # parsing all properties that refer to other entities if parse_properties: for prop, claim_property in claims.items(): - cp_dicts = [cp['mainsnak']['datavalue'].get('value') for cp in claim_property if cp['mainsnak'].get('datavalue')] - cp_values = [cp_dict.get('id') for cp_dict in cp_dicts if isinstance(cp_dict, dict) if cp_dict.get('id') is not None] + cp_dicts = [cp['mainsnak']['datavalue'].get('value') for cp in claim_property + if cp['mainsnak'].get('datavalue')] + cp_values = [cp_dict.get('id') for cp_dict in cp_dicts if isinstance(cp_dict, dict) + if cp_dict.get('id') is not None] if cp_values: if to_print: print("prop:", prop, cp_values) @@ -104,7 +111,7 @@ def read_wikidata_entities_json(limit=None, to_print=False): if lang_aliases: for item in lang_aliases: if to_print: - print("alias (" + lang + "):", item["value"]) + print("alias (" + lang + "):", item["value"]) if to_print: print() diff --git a/examples/pipeline/wiki_entity_linking/wikipedia_processor.py b/bin/wiki_entity_linking/wikipedia_processor.py similarity index 98% rename from examples/pipeline/wiki_entity_linking/wikipedia_processor.py rename to bin/wiki_entity_linking/wikipedia_processor.py index e53423487..0747c9db7 100644 --- a/examples/pipeline/wiki_entity_linking/wikipedia_processor.py +++ b/bin/wiki_entity_linking/wikipedia_processor.py @@ -26,8 +26,8 @@ wiki_namespaces = ["b", "betawikiversity", "Book", "c", "Category", "Commons", "mw", "n", "nost", "oldwikisource", "outreach", "outreachwiki", "otrs", "OTRSwiki", "Portal", "phab", "Phabricator", "Project", "q", "quality", "rev", "s", "spcom", "Special", "species", "Strategy", "sulutil", "svn", - "Talk", "Template", "Template talk", "Testwiki", "ticket", "TimedText", "Toollabs", "tools", "tswiki", - "User", "User talk", "v", "voy", + "Talk", "Template", "Template talk", "Testwiki", "ticket", "TimedText", "Toollabs", "tools", + "tswiki", "User", "User talk", "v", "voy", "w", "Wikibooks", "Wikidata", "wikiHow", "Wikinvest", "wikilivres", "Wikimedia", "Wikinews", "Wikipedia", "Wikipedia talk", "Wikiquote", 
"Wikisource", "Wikispecies", "Wikitech", "Wikiversity", "Wikivoyage", "wikt", "wiktionary", "wmf", "wmania", "WP"] diff --git a/examples/pipeline/wiki_entity_linking/run_el.py b/examples/pipeline/wiki_entity_linking/run_el.py deleted file mode 100644 index c26e8d65a..000000000 --- a/examples/pipeline/wiki_entity_linking/run_el.py +++ /dev/null @@ -1,136 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import os -import spacy -import datetime -from os import listdir - -from examples.pipeline.wiki_entity_linking import training_set_creator - -# requires: pip install neuralcoref --no-binary neuralcoref -# import neuralcoref - - -def run_kb_toy_example(kb): - for mention in ("Bush", "Douglas Adams", "Homer"): - candidates = kb.get_candidates(mention) - - print("generating candidates for " + mention + " :") - for c in candidates: - print(" ", c.prior_prob, c.alias_, "-->", c.entity_ + " (freq=" + str(c.entity_freq) + ")") - print() - - - - -def run_el_dev(nlp, kb, training_dir, limit=None): - correct_entries_per_article, _ = training_set_creator.read_training_entities(training_output=training_dir) - - predictions = list() - golds = list() - - cnt = 0 - for f in listdir(training_dir): - if not limit or cnt < limit: - if is_dev(f): - article_id = f.replace(".txt", "") - if cnt % 500 == 0: - print(datetime.datetime.now(), "processed", cnt, "files in the dev dataset") - cnt += 1 - with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file: - text = file.read() - doc = nlp(text) - for ent in doc.ents: - if ent.label_ == "PERSON": # TODO: expand to other types - gold_entity = correct_entries_per_article[article_id].get(ent.text, None) - # only evaluating gold entities we know, because the training data is not complete - if gold_entity: - predictions.append(ent.kb_id_) - golds.append(gold_entity) - - print("Processed", cnt, "dev articles") - print() - evaluate(predictions, golds) - - -def is_dev(file_name): - return file_name.endswith("3.txt") - - -def evaluate(predictions, golds, to_print=True, times_hundred=True): - if len(predictions) != len(golds): - raise ValueError("predictions and gold entities should have the same length") - - tp = 0 - fp = 0 - fn = 0 - - corrects = 0 - incorrects = 0 - - for pred, gold in zip(predictions, golds): - is_correct = pred == gold - if is_correct: - corrects += 1 - else: - incorrects += 1 - if not pred: - if not is_correct: # we don't care about tn - fn += 1 - elif is_correct: - tp += 1 - else: - fp += 1 - - if to_print: - print("Evaluating", len(golds), "entities") - print("tp", tp) - print("fp", fp) - print("fn", fn) - - precision = tp / (tp + fp + 0.0000001) - recall = tp / (tp + fn + 0.0000001) - if times_hundred: - precision = precision*100 - recall = recall*100 - fscore = 2 * recall * precision / (recall + precision + 0.0000001) - - accuracy = corrects / (corrects + incorrects) - - if to_print: - print("precision", round(precision, 1), "%") - print("recall", round(recall, 1), "%") - print("Fscore", round(fscore, 1), "%") - print("Accuracy", round(accuracy, 1), "%") - - return precision, recall, fscore, accuracy - - - - - -# TODO -def add_coref(nlp): - """ Add coreference resolution to our model """ - # TODO: this doesn't work yet - # neuralcoref.add_to_pipe(nlp) - print("done adding to pipe") - - doc = nlp(u'My sister has a dog. 
She loves him.') - print("done doc") - - print(doc._.has_coref) - print(doc._.coref_clusters) - - -# TODO -def _run_ner_depr(nlp, clean_text, article_dict): - doc = nlp(clean_text) - for ent in doc.ents: - if ent.label_ == "PERSON": # TODO: expand to non-persons - ent_id = article_dict.get(ent.text) - if ent_id: - print(" -", ent.text, ent.label_, ent_id) - else: - print(" -", ent.text, ent.label_, '???') # TODO: investigate these cases diff --git a/examples/pipeline/wiki_entity_linking/train_el.py b/examples/pipeline/wiki_entity_linking/train_el.py deleted file mode 100644 index a4026d935..000000000 --- a/examples/pipeline/wiki_entity_linking/train_el.py +++ /dev/null @@ -1,490 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import os -import datetime -from os import listdir -import numpy as np -import random -from random import shuffle -from thinc.neural._classes.convolution import ExtractWindow -from thinc.neural.util import get_array_module - -from examples.pipeline.wiki_entity_linking import run_el, training_set_creator, kb_creator - -from spacy._ml import SpacyVectors, create_default_optimizer, zero_init, cosine - -from thinc.api import chain, concatenate, flatten_add_lengths, clone, with_flatten -from thinc.v2v import Model, Maxout, Affine -from thinc.t2v import Pooling, mean_pool -from thinc.t2t import ParametricAttention -from thinc.misc import Residual -from thinc.misc import LayerNorm as LN - -# from spacy.cli.pretrain import get_cossim_loss -from spacy.matcher import PhraseMatcher - - -class EL_Model: - - PRINT_INSPECT = False - PRINT_BATCH_LOSS = False - EPS = 0.0000000005 - - BATCH_SIZE = 100 - - DOC_CUTOFF = 300 # number of characters from the doc context - INPUT_DIM = 300 # dimension of pre-trained vectors - - HIDDEN_1_WIDTH = 32 - DESC_WIDTH = 64 - ARTICLE_WIDTH = 128 - SENT_WIDTH = 64 - - DROP = 0.4 - LEARN_RATE = 0.005 - EPOCHS = 10 - L2 = 1e-6 - - name = "entity_linker" - - def __init__(self, kb, nlp): - run_el._prepare_pipeline(nlp, kb) - self.nlp = nlp - self.kb = kb - - self._build_cnn(embed_width=self.INPUT_DIM, - desc_width=self.DESC_WIDTH, - article_width=self.ARTICLE_WIDTH, - sent_width=self.SENT_WIDTH, - hidden_1_width=self.HIDDEN_1_WIDTH) - - def train_model(self, training_dir, entity_descr_output, trainlimit=None, devlimit=None, to_print=True): - np.seterr(divide="raise", over="warn", under="ignore", invalid="raise") - - id_to_descr = kb_creator._get_id_to_description(entity_descr_output) - - train_ent, train_gold, train_desc, train_art, train_art_texts, train_sent, train_sent_texts = \ - self._get_training_data(training_dir, id_to_descr, False, trainlimit, to_print=False) - train_clusters = list(train_ent.keys()) - - dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts = \ - self._get_training_data(training_dir, id_to_descr, True, devlimit, to_print=False) - dev_clusters = list(dev_ent.keys()) - - dev_pos_count = len([g for g in dev_gold.values() if g]) - dev_neg_count = len([g for g in dev_gold.values() if not g]) - - # inspect data - if self.PRINT_INSPECT: - for cluster, entities in train_ent.items(): - print() - for entity in entities: - print("entity", entity) - print("gold", train_gold[entity]) - print("desc", train_desc[entity]) - print("sentence ID", train_sent[entity]) - print("sentence text", train_sent_texts[train_sent[entity]]) - print("article ID", train_art[entity]) - print("article text", train_art_texts[train_art[entity]]) - print() - - train_pos_entities = [k for k, v in train_gold.items() if v] - 
train_neg_entities = [k for k, v in train_gold.items() if not v] - - train_pos_count = len(train_pos_entities) - train_neg_count = len(train_neg_entities) - - self._begin_training() - - if to_print: - print() - print("Training on", len(train_clusters), "entity clusters in", len(train_art_texts), "articles") - print("Training instances pos/neg:", train_pos_count, train_neg_count) - print() - print("Dev test on", len(dev_clusters), "entity clusters in", len(dev_art_texts), "articles") - print("Dev instances pos/neg:", dev_pos_count, dev_neg_count) - print() - print(" DOC_CUTOFF", self.DOC_CUTOFF) - print(" INPUT_DIM", self.INPUT_DIM) - print(" HIDDEN_1_WIDTH", self.HIDDEN_1_WIDTH) - print(" DESC_WIDTH", self.DESC_WIDTH) - print(" ARTICLE_WIDTH", self.ARTICLE_WIDTH) - print(" SENT_WIDTH", self.SENT_WIDTH) - print(" DROP", self.DROP) - print(" LEARNING RATE", self.LEARN_RATE) - print(" BATCH SIZE", self.BATCH_SIZE) - print() - - dev_random = self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts, - calc_random=True) - print("acc", "dev_random", round(dev_random, 2)) - - dev_pre = self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts, - avg=True) - print("acc", "dev_pre", round(dev_pre, 2)) - print() - - processed = 0 - for i in range(self.EPOCHS): - shuffle(train_clusters) - - start = 0 - stop = min(self.BATCH_SIZE, len(train_clusters)) - - while start < len(train_clusters): - next_batch = {c: train_ent[c] for c in train_clusters[start:stop]} - processed += len(next_batch.keys()) - - self.update(entity_clusters=next_batch, golds=train_gold, descs=train_desc, - art_texts=train_art_texts, arts=train_art, - sent_texts=train_sent_texts, sents=train_sent) - - start = start + self.BATCH_SIZE - stop = min(stop + self.BATCH_SIZE, len(train_clusters)) - - train_acc = self._test_dev(train_ent, train_gold, train_desc, train_art, train_art_texts, train_sent, train_sent_texts, avg=True) - dev_acc = self._test_dev(dev_ent, dev_gold, dev_desc, dev_art, dev_art_texts, dev_sent, dev_sent_texts, avg=True) - - print(i, "acc train/dev", round(train_acc, 2), round(dev_acc, 2)) - - if to_print: - print() - print("Trained on", processed, "entity clusters across", self.EPOCHS, "epochs") - - def _test_dev(self, entity_clusters, golds, descs, arts, art_texts, sents, sent_texts, avg=True, calc_random=False): - correct = 0 - incorrect = 0 - - if calc_random: - for cluster, entities in entity_clusters.items(): - correct_entities = [e for e in entities if golds[e]] - assert len(correct_entities) == 1 - - entities = list(entities) - shuffle(entities) - - if calc_random: - predicted_entity = random.choice(entities) - if predicted_entity in correct_entities: - correct += 1 - else: - incorrect += 1 - - else: - all_clusters = list() - arts_list = list() - sents_list = list() - - for cluster in entity_clusters.keys(): - all_clusters.append(cluster) - arts_list.append(art_texts[arts[cluster]]) - sents_list.append(sent_texts[sents[cluster]]) - - art_docs = list(self.nlp.pipe(arts_list)) - sent_docs = list(self.nlp.pipe(sents_list)) - - for i, cluster in enumerate(all_clusters): - entities = entity_clusters[cluster] - correct_entities = [e for e in entities if golds[e]] - assert len(correct_entities) == 1 - - entities = list(entities) - shuffle(entities) - - desc_docs = self.nlp.pipe([descs[e] for e in entities]) - sent_doc = sent_docs[i] - article_doc = art_docs[i] - - predicted_index = self._predict(article_doc=article_doc, sent_doc=sent_doc, - 
desc_docs=desc_docs, avg=avg) - if entities[predicted_index] in correct_entities: - correct += 1 - else: - incorrect += 1 - - if correct == incorrect == 0: - return 0 - - acc = correct / (correct + incorrect) - return acc - - def _predict(self, article_doc, sent_doc, desc_docs, avg=True, apply_threshold=True): - if avg: - with self.article_encoder.use_params(self.sgd_article.averages) \ - and self.desc_encoder.use_params(self.sgd_desc.averages)\ - and self.sent_encoder.use_params(self.sgd_sent.averages): - desc_encodings = self.desc_encoder(desc_docs) - doc_encoding = self.article_encoder([article_doc]) - sent_encoding = self.sent_encoder([sent_doc]) - - else: - desc_encodings = self.desc_encoder(desc_docs) - doc_encoding = self.article_encoder([article_doc]) - sent_encoding = self.sent_encoder([sent_doc]) - - concat_encoding = [list(doc_encoding[0]) + list(sent_encoding[0])] - - if avg: - with self.cont_encoder.use_params(self.sgd_cont.averages): - cont_encodings = self.cont_encoder(np.asarray([concat_encoding[0]])) - - else: - cont_encodings = self.cont_encoder(np.asarray([concat_encoding[0]])) - - context_enc = np.transpose(cont_encodings) - - highest_sim = -5 - best_i = -1 - for i, desc_enc in enumerate(desc_encodings): - sim = cosine(desc_enc, context_enc) - if sim >= highest_sim: - best_i = i - highest_sim = sim - - return best_i - - def _build_cnn(self, embed_width, desc_width, article_width, sent_width, hidden_1_width): - self.desc_encoder = self._encoder(in_width=embed_width, hidden_with=hidden_1_width, end_width=desc_width) - self.cont_encoder = self._context_encoder(embed_width=embed_width, article_width=article_width, - sent_width=sent_width, hidden_width=hidden_1_width, - end_width=desc_width) - - - # def _encoder(self, width): - # tok2vec = Tok2Vec(width=width, embed_size=2000, pretrained_vectors=self.nlp.vocab.vectors.name, cnn_maxout_pieces=3, - # subword_features=False, conv_depth=4, bilstm_depth=0) - # - # return tok2vec >> flatten_add_lengths >> Pooling(mean_pool) - - def _context_encoder(self, embed_width, article_width, sent_width, hidden_width, end_width): - self.article_encoder = self._encoder(in_width=embed_width, hidden_with=hidden_width, end_width=article_width) - self.sent_encoder = self._encoder(in_width=embed_width, hidden_with=hidden_width, end_width=sent_width) - - model = Affine(end_width, article_width+sent_width, drop_factor=0.0) - return model - - @staticmethod - def _encoder(in_width, hidden_with, end_width): - conv_depth = 2 - cnn_maxout_pieces = 3 - - with Model.define_operators({">>": chain, "**": clone}): - convolution = Residual((ExtractWindow(nW=1) >> - LN(Maxout(hidden_with, hidden_with * 3, pieces=cnn_maxout_pieces)))) - - encoder = SpacyVectors \ - >> with_flatten(LN(Maxout(hidden_with, in_width)) >> convolution ** conv_depth, pad=conv_depth) \ - >> flatten_add_lengths \ - >> ParametricAttention(hidden_with)\ - >> Pooling(mean_pool) \ - >> Residual(zero_init(Maxout(hidden_with, hidden_with))) \ - >> zero_init(Affine(end_width, hidden_with, drop_factor=0.0)) - - # TODO: ReLu or LN(Maxout) ? - # sum_pool or mean_pool ? 
- - return encoder - - def _begin_training(self): - self.sgd_article = create_default_optimizer(self.article_encoder.ops) - self.sgd_article.learn_rate = self.LEARN_RATE - self.sgd_article.L2 = self.L2 - - self.sgd_sent = create_default_optimizer(self.sent_encoder.ops) - self.sgd_sent.learn_rate = self.LEARN_RATE - self.sgd_sent.L2 = self.L2 - - self.sgd_cont = create_default_optimizer(self.cont_encoder.ops) - self.sgd_cont.learn_rate = self.LEARN_RATE - self.sgd_cont.L2 = self.L2 - - self.sgd_desc = create_default_optimizer(self.desc_encoder.ops) - self.sgd_desc.learn_rate = self.LEARN_RATE - self.sgd_desc.L2 = self.L2 - - def get_loss(self, pred, gold, targets): - loss, gradients = self.get_cossim_loss(pred, gold, targets) - return loss, gradients - - def get_cossim_loss(self, yh, y, t): - # Add a small constant to avoid 0 vectors - # print() - # print("yh", yh) - # print("y", y) - # print("t", t) - yh = yh + 1e-8 - y = y + 1e-8 - # https://math.stackexchange.com/questions/1923613/partial-derivative-of-cosine-similarity - xp = get_array_module(yh) - norm_yh = xp.linalg.norm(yh, axis=1, keepdims=True) - norm_y = xp.linalg.norm(y, axis=1, keepdims=True) - mul_norms = norm_yh * norm_y - cos = (yh * y).sum(axis=1, keepdims=True) / mul_norms - # print("cos", cos) - d_yh = (y / mul_norms) - (cos * (yh / norm_yh ** 2)) - # print("abs", xp.abs(cos - t)) - loss = xp.abs(cos - t).sum() - # print("loss", loss) - # print("d_yh", d_yh) - inverse = np.asarray([int(t[i][0]) * d_yh[i] for i in range(len(t))]) - # print("inverse", inverse) - return loss, -inverse - - def update(self, entity_clusters, golds, descs, art_texts, arts, sent_texts, sents): - arts_list = list() - sents_list = list() - descs_list = list() - targets = list() - - for cluster, entities in entity_clusters.items(): - art = art_texts[arts[cluster]] - sent = sent_texts[sents[cluster]] - for e in entities: - if golds[e]: - arts_list.append(art) - sents_list.append(sent) - descs_list.append(descs[e]) - targets.append([1]) - # else: - # arts_list.append(art) - # sents_list.append(sent) - # descs_list.append(descs[e]) - # targets.append([-1]) - - desc_docs = self.nlp.pipe(descs_list) - desc_encodings, bp_desc = self.desc_encoder.begin_update(desc_docs, drop=self.DROP) - - art_docs = self.nlp.pipe(arts_list) - sent_docs = self.nlp.pipe(sents_list) - - doc_encodings, bp_doc = self.article_encoder.begin_update(art_docs, drop=self.DROP) - sent_encodings, bp_sent = self.sent_encoder.begin_update(sent_docs, drop=self.DROP) - - concat_encodings = [list(doc_encodings[i]) + list(sent_encodings[i]) for i in - range(len(targets))] - cont_encodings, bp_cont = self.cont_encoder.begin_update(np.asarray(concat_encodings), drop=self.DROP) - - loss, cont_gradient = self.get_loss(cont_encodings, desc_encodings, targets) - - # loss, desc_gradient = self.get_loss(desc_encodings, cont_encodings, targets) - # cont_gradient = cont_gradient / 2 - # desc_gradient = desc_gradient / 2 - # bp_desc(desc_gradient, sgd=self.sgd_desc) - - if self.PRINT_BATCH_LOSS: - print("batch loss", loss) - - context_gradient = bp_cont(cont_gradient, sgd=self.sgd_cont) - - # gradient : concat (doc+sent) vs. 
desc - sent_start = self.ARTICLE_WIDTH - sent_gradients = list() - doc_gradients = list() - for x in context_gradient: - doc_gradients.append(list(x[0:sent_start])) - sent_gradients.append(list(x[sent_start:])) - - bp_doc(doc_gradients, sgd=self.sgd_article) - bp_sent(sent_gradients, sgd=self.sgd_sent) - - def _get_training_data(self, training_dir, id_to_descr, dev, limit, to_print): - correct_entries, incorrect_entries = training_set_creator.read_training_entities(training_output=training_dir) - - entities_by_cluster = dict() - gold_by_entity = dict() - desc_by_entity = dict() - article_by_cluster = dict() - text_by_article = dict() - sentence_by_cluster = dict() - text_by_sentence = dict() - sentence_by_text = dict() - - cnt = 0 - next_entity_nr = 1 - next_sent_nr = 1 - files = listdir(training_dir) - shuffle(files) - for f in files: - if not limit or cnt < limit: - if dev == run_el.is_dev(f): - article_id = f.replace(".txt", "") - if cnt % 500 == 0 and to_print: - print(datetime.datetime.now(), "processed", cnt, "files in the training dataset") - - try: - # parse the article text - with open(os.path.join(training_dir, f), mode="r", encoding='utf8') as file: - text = file.read() - article_doc = self.nlp(text) - truncated_text = text[0:min(self.DOC_CUTOFF, len(text))] - text_by_article[article_id] = truncated_text - - # process all positive and negative entities, collect all relevant mentions in this article - for mention, entity_pos in correct_entries[article_id].items(): - cluster = article_id + "_" + mention - descr = id_to_descr.get(entity_pos) - entities = set() - if descr: - entity = "E_" + str(next_entity_nr) + "_" + cluster - next_entity_nr += 1 - gold_by_entity[entity] = 1 - desc_by_entity[entity] = descr - entities.add(entity) - - entity_negs = incorrect_entries[article_id][mention] - for entity_neg in entity_negs: - descr = id_to_descr.get(entity_neg) - if descr: - entity = "E_" + str(next_entity_nr) + "_" + cluster - next_entity_nr += 1 - gold_by_entity[entity] = 0 - desc_by_entity[entity] = descr - entities.add(entity) - - found_matches = 0 - if len(entities) > 1: - entities_by_cluster[cluster] = entities - - # find all matches in the doc for the mentions - # TODO: fix this - doesn't look like all entities are found - matcher = PhraseMatcher(self.nlp.vocab) - patterns = list(self.nlp.tokenizer.pipe([mention])) - - matcher.add("TerminologyList", None, *patterns) - matches = matcher(article_doc) - - # store sentences - for match_id, start, end in matches: - span = article_doc[start:end] - if mention == span.text: - found_matches += 1 - sent_text = span.sent.text - sent_nr = sentence_by_text.get(sent_text, None) - if sent_nr is None: - sent_nr = "S_" + str(next_sent_nr) + article_id - next_sent_nr += 1 - text_by_sentence[sent_nr] = sent_text - sentence_by_text[sent_text] = sent_nr - article_by_cluster[cluster] = article_id - sentence_by_cluster[cluster] = sent_nr - - if found_matches == 0: - # print("Could not find neg instances or sentence matches for", mention, "in", article_id) - entities_by_cluster.pop(cluster, None) - article_by_cluster.pop(cluster, None) - sentence_by_cluster.pop(cluster, None) - for entity in entities: - gold_by_entity.pop(entity, None) - desc_by_entity.pop(entity, None) - cnt += 1 - except: - print("Problem parsing article", article_id) - - if to_print: - print() - print("Processed", cnt, "training articles, dev=" + str(dev)) - print() - return entities_by_cluster, gold_by_entity, desc_by_entity, article_by_cluster, text_by_article, \ - 
sentence_by_cluster, text_by_sentence - diff --git a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py b/examples/pipeline/wikidata_entity_linking.py similarity index 82% rename from examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py rename to examples/pipeline/wikidata_entity_linking.py index bdae023b9..d537cce7e 100644 --- a/examples/pipeline/wiki_entity_linking/wiki_nel_pipeline.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -5,8 +5,8 @@ import random from spacy.util import minibatch, compounding -from examples.pipeline.wiki_entity_linking import wikipedia_processor as wp, kb_creator, training_set_creator, run_el -from examples.pipeline.wiki_entity_linking.kb_creator import DESC_WIDTH +from bin.wiki_entity_linking import training_set_creator, kb_creator, wikipedia_processor as wp +from bin.wiki_entity_linking.kb_creator import DESC_WIDTH import spacy from spacy.kb import KnowledgeBase @@ -30,9 +30,11 @@ TRAINING_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_data_nel/' MAX_CANDIDATES = 10 MIN_ENTITY_FREQ = 20 MIN_PAIR_OCC = 5 -DOC_SENT_CUTOFF = 2 + EPOCHS = 10 DROPOUT = 0.1 +LEARN_RATE = 0.005 +L2 = 1e-6 def run_pipeline(): @@ -40,7 +42,6 @@ def run_pipeline(): print() nlp_1 = spacy.load('en_core_web_lg') nlp_2 = None - kb_1 = None kb_2 = None # one-time methods to create KB and write to file @@ -114,7 +115,7 @@ def run_pipeline(): # test KB if to_test_kb: - test_kb(kb_2) + check_kb(kb_2) print() # STEP 5: create a training dataset from WP @@ -122,19 +123,21 @@ def run_pipeline(): print("STEP 5: create training dataset", datetime.datetime.now()) training_set_creator.create_training(entity_def_input=ENTITY_DEFS, training_output=TRAINING_DIR) - # STEP 6: create the entity linking pipe - el_pipe = nlp_2.create_pipe(name='entity_linker', config={"doc_cutoff": DOC_SENT_CUTOFF}) + # STEP 6: create and train the entity linking pipe + el_pipe = nlp_2.create_pipe(name='entity_linker', config={}) el_pipe.set_kb(kb_2) nlp_2.add_pipe(el_pipe, last=True) other_pipes = [pipe for pipe in nlp_2.pipe_names if pipe != "entity_linker"] with nlp_2.disable_pipes(*other_pipes): # only train Entity Linking - nlp_2.begin_training() + optimizer = nlp_2.begin_training() + optimizer.learn_rate = LEARN_RATE + optimizer.L2 = L2 if train_pipe: print("STEP 6: training Entity Linking pipe", datetime.datetime.now()) train_limit = 25000 - dev_limit = 1000 + dev_limit = 5000 train_data = training_set_creator.read_training(nlp=nlp_2, training_dir=TRAINING_DIR, @@ -144,6 +147,14 @@ def run_pipeline(): print("Training on", len(train_data), "articles") print() + dev_data = training_set_creator.read_training(nlp=nlp_2, + training_dir=TRAINING_DIR, + dev=True, + limit=dev_limit) + + print("Dev testing on", len(dev_data), "articles") + print() + if not train_data: print("Did not find any training data") @@ -161,53 +172,55 @@ def run_pipeline(): nlp_2.update( docs, golds, + sgd=optimizer, drop=DROPOUT, losses=losses, ) batchnr += 1 except Exception as e: print("Error updating batch:", e) - raise(e) if batchnr > 0: - losses['entity_linker'] = losses['entity_linker'] / batchnr - print("Epoch, train loss", itn, round(losses['entity_linker'], 2)) - - dev_data = training_set_creator.read_training(nlp=nlp_2, - training_dir=TRAINING_DIR, - dev=True, - limit=dev_limit) - - print() - print("Dev testing on", len(dev_data), "articles") + with el_pipe.model.use_params(optimizer.averages): + el_pipe.context_weight = 1 + el_pipe.prior_weight = 0 + dev_acc_context, dev_acc_context_dict = 
_measure_accuracy(dev_data, el_pipe) + losses['entity_linker'] = losses['entity_linker'] / batchnr + print("Epoch, train loss", itn, round(losses['entity_linker'], 2), + " / dev acc context avg", round(dev_acc_context, 3)) + # STEP 7: measure the performance of our trained pipe on an independent dev set if len(dev_data) and measure_performance: print() print("STEP 7: performance measurement of Entity Linking pipe", datetime.datetime.now()) print() - acc_random, acc_random_by_label, acc_prior, acc_prior_by_label, acc_oracle, acc_oracle_by_label = _measure_baselines(dev_data, kb_2) - print("dev acc oracle:", round(acc_oracle, 3), [(x, round(y, 3)) for x, y in acc_oracle_by_label.items()]) - print("dev acc random:", round(acc_random, 3), [(x, round(y, 3)) for x, y in acc_random_by_label.items()]) - print("dev acc prior:", round(acc_prior, 3), [(x, round(y, 3)) for x, y in acc_prior_by_label.items()]) + acc_r, acc_r_by_label, acc_p, acc_p_by_label, acc_o, acc_o_by_label = _measure_baselines(dev_data, kb_2) + print("dev acc oracle:", round(acc_o, 3), [(x, round(y, 3)) for x, y in acc_o_by_label.items()]) + print("dev acc random:", round(acc_r, 3), [(x, round(y, 3)) for x, y in acc_r_by_label.items()]) + print("dev acc prior:", round(acc_p, 3), [(x, round(y, 3)) for x, y in acc_p_by_label.items()]) - # print(" measuring accuracy 1-1") - el_pipe.context_weight = 1 - el_pipe.prior_weight = 1 - dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data, el_pipe) - print("dev acc combo:", round(dev_acc_combo, 3), [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()]) + with el_pipe.model.use_params(optimizer.averages): + # measuring combined accuracy (prior + context) + el_pipe.context_weight = 1 + el_pipe.prior_weight = 1 + dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data, el_pipe) + print("dev acc combo avg:", round(dev_acc_combo, 3), + [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()]) - # using only context - el_pipe.context_weight = 1 - el_pipe.prior_weight = 0 - dev_acc_context, dev_acc_1_0_dict = _measure_accuracy(dev_data, el_pipe) - print("dev acc context:", round(dev_acc_context, 3), [(x, round(y, 3)) for x, y in dev_acc_1_0_dict.items()]) - print() + # using only context + el_pipe.context_weight = 1 + el_pipe.prior_weight = 0 + dev_acc_context, dev_acc_context_dict = _measure_accuracy(dev_data, el_pipe) + print("dev acc context avg:", round(dev_acc_context, 3), + [(x, round(y, 3)) for x, y in dev_acc_context_dict.items()]) + print() # reset for follow-up tests el_pipe.context_weight = 1 el_pipe.prior_weight = 1 + # STEP 8: apply the EL pipe on a toy example if to_test_pipeline: print() print("STEP 8: applying Entity Linking to toy example", datetime.datetime.now()) @@ -215,6 +228,7 @@ def run_pipeline(): run_el_toy_example(nlp=nlp_2) print() + # STEP 9: write the NLP pipeline (including entity linker) to file if to_write_nlp: print() print("STEP 9: testing NLP IO", datetime.datetime.now()) @@ -225,6 +239,7 @@ def run_pipeline(): print("reading from", NLP_2_DIR) nlp_3 = spacy.load(NLP_2_DIR) + # verify that the IO has gone correctly if to_read_nlp: print() print("running toy example with NLP 2") @@ -272,6 +287,7 @@ def _measure_accuracy(data, el_pipe): def _measure_baselines(data, kb): + # Measure 3 performance baselines: random selection, prior probabilities, and 'oracle' prediction for upper bound random_correct_by_label = dict() random_incorrect_by_label = dict() @@ -362,7 +378,7 @@ def calculate_acc(correct_by_label, incorrect_by_label): return acc, 
acc_by_label -def test_kb(kb): +def check_kb(kb): for mention in ("Bush", "Douglas Adams", "Homer", "Brazil", "China"): candidates = kb.get_candidates(mention) @@ -384,7 +400,7 @@ def run_el_toy_example(nlp): print() # Q4426480 is her husband - text = "Ada Lovelace was the countess of Lovelace. She is known for her programming work on the analytical engine. "\ + text = "Ada Lovelace was the countess of Lovelace. She's known for her programming work on the analytical engine. "\ "She loved her husband William King dearly. " doc = nlp(text) print(text) @@ -393,7 +409,7 @@ def run_el_toy_example(nlp): print() # Q3568763 is her tutor - text = "Ada Lovelace was the countess of Lovelace. She is known for her programming work on the analytical engine. "\ + text = "Ada Lovelace was the countess of Lovelace. She's known for her programming work on the analytical engine. "\ "She was tutored by her favorite physics tutor William King." doc = nlp(text) print(text) diff --git a/spacy/_ml.py b/spacy/_ml.py index 29772c5ee..9139152aa 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -661,10 +661,11 @@ def build_nel_encoder(in_width, hidden_width, end_width, **cfg): LN(Maxout(hidden_width, hidden_width * 3, pieces=cnn_maxout_pieces)))) encoder = SpacyVectors \ - >> with_flatten(LN(Maxout(hidden_width, in_width)) >> convolution ** conv_depth, pad=conv_depth) \ + >> with_flatten(Affine(hidden_width, in_width))\ + >> with_flatten(LN(Maxout(hidden_width, hidden_width)) >> convolution ** conv_depth, pad=conv_depth) \ >> flatten_add_lengths \ >> ParametricAttention(hidden_width) \ - >> Pooling(mean_pool) \ + >> Pooling(sum_pool) \ >> Residual(zero_init(Maxout(hidden_width, hidden_width))) \ >> zero_init(Affine(end_width, hidden_width, drop_factor=0.0)) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index fbdca8280..7d90c4438 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1078,33 +1078,19 @@ class EntityLinker(Pipe): raise ValueError("entity_width not found") embed_width = cfg.get("embed_width", 300) - hidden_width = cfg.get("hidden_width", 32) - entity_width = cfg.get("entity_width") # no default because this needs to correspond with the KB - sent_width = entity_width + hidden_width = cfg.get("hidden_width", 128) + + # no default because this needs to correspond with the KB entity length + sent_width = cfg.get("entity_width") model = build_nel_encoder(in_width=embed_width, hidden_width=hidden_width, end_width=sent_width, **cfg) - # dimension of the mention encoder needs to match the dimension of the entity encoder - # article_width = cfg.get("article_width", 128) - # sent_width = cfg.get("sent_width", 64) - # article_encoder = build_nel_encoder(in_width=embed_width, hidden_width=hidden_width, end_width=article_width, **cfg) - # mention_width = article_width + sent_width - # mention_encoder = Affine(entity_width, mention_width, drop_factor=0.0) - # return article_encoder, sent_encoder, mention_encoder - return model def __init__(self, **cfg): - # self.article_encoder = True - # self.sent_encoder = True - # self.mention_encoder = True self.model = True self.kb = None self.cfg = dict(cfg) - self.doc_cutoff = self.cfg.get("doc_cutoff", 5) - # self.sgd_article = None - # self.sgd_sent = None - # self.sgd_mention = None def set_kb(self, kb): self.kb = kb @@ -1131,13 +1117,6 @@ class EntityLinker(Pipe): sgd = self.create_optimizer() return sgd - # if self.mention_encoder is True: - # self.article_encoder, self.sent_encoder, self.mention_encoder = self.Model(**self.cfg) - # 
self.sgd_article = create_default_optimizer(self.article_encoder.ops) - # self.sgd_sent = create_default_optimizer(self.sent_encoder.ops) - # self.sgd_mention = create_default_optimizer(self.mention_encoder.ops) - # return self.sgd_article - def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None): self.require_model() self.require_kb() @@ -1166,15 +1145,11 @@ class EntityLinker(Pipe): mention = doc.text[start:end] sent_start = 0 sent_end = len(doc) - first_par_end = len(doc) for index, sent in enumerate(doc.sents): if start >= sent.start_char and end <= sent.end_char: sent_start = sent.start sent_end = sent.end - if index == self.doc_cutoff-1: - first_par_end = sent.end sentence = doc[sent_start:sent_end].as_doc() - first_par = doc[0:first_par_end].as_doc() candidates = self.kb.get_candidates(mention) for c in candidates: @@ -1184,32 +1159,15 @@ class EntityLinker(Pipe): prior_prob = c.prior_prob entity_encoding = c.entity_vector entity_encodings.append(entity_encoding) - # article_docs.append(first_par) sentence_docs.append(sentence) if len(entity_encodings) > 0: - # doc_encodings, bp_doc = self.article_encoder.begin_update(article_docs, drop=drop) - # sent_encodings, bp_sent = self.sent_encoder.begin_update(sentence_docs, drop=drop) - - # concat_encodings = [list(doc_encodings[i]) + list(sent_encodings[i]) for i in range(len(article_docs))] - # mention_encodings, bp_mention = self.mention_encoder.begin_update(np.asarray(concat_encodings), drop=drop) - sent_encodings, bp_sent = self.model.begin_update(sentence_docs, drop=drop) entity_encodings = np.asarray(entity_encodings, dtype=np.float32) loss, d_scores = self.get_loss(scores=sent_encodings, golds=entity_encodings, docs=None) bp_sent(d_scores, sgd=sgd) - # gradient : concat (doc+sent) vs. 
desc - # sent_start = self.article_encoder.nO - # sent_gradients = list() - # doc_gradients = list() - # for x in mention_gradient: - # doc_gradients.append(list(x[0:sent_start])) - # sent_gradients.append(list(x[sent_start:])) - # bp_doc(doc_gradients, sgd=self.sgd_article) - # bp_sent(sent_gradients, sgd=self.sgd_sent) - if losses is not None: losses[self.name] += loss return loss @@ -1264,21 +1222,9 @@ class EntityLinker(Pipe): for i, doc in enumerate(docs): if len(doc) > 0: - first_par_end = len(doc) - for index, sent in enumerate(doc.sents): - if index == self.doc_cutoff-1: - first_par_end = sent.end - first_par = doc[0:first_par_end].as_doc() - - # doc_encoding = self.article_encoder([first_par]) for ent in doc.ents: sent_doc = ent.sent.as_doc() if len(sent_doc) > 0: - # sent_encoding = self.sent_encoder([sent_doc]) - # concat_encoding = [list(doc_encoding[0]) + list(sent_encoding[0])] - # mention_encoding = self.mention_encoder(np.asarray([concat_encoding[0]])) - # mention_enc_t = np.transpose(mention_encoding) - sent_encoding = self.model([sent_doc]) sent_enc_t = np.transpose(sent_encoding) From 478305cd3f16cbfad2ea6cb9ccf49f434c3395aa Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 18 Jun 2019 18:38:09 +0200 Subject: [PATCH 083/148] small tweaks and documentation --- bin/wiki_entity_linking/train_descriptions.py | 5 ++ .../training_set_creator.py | 9 ++- bin/wiki_entity_linking/wikidata_processor.py | 2 +- .../wikipedia_processor.py | 3 +- examples/pipeline/wikidata_entity_linking.py | 14 +++-- spacy/language.py | 2 +- spacy/pipeline/pipes.pyx | 60 ++++++++----------- 7 files changed, 49 insertions(+), 46 deletions(-) diff --git a/bin/wiki_entity_linking/train_descriptions.py b/bin/wiki_entity_linking/train_descriptions.py index cc5016237..82db582dc 100644 --- a/bin/wiki_entity_linking/train_descriptions.py +++ b/bin/wiki_entity_linking/train_descriptions.py @@ -12,6 +12,10 @@ from thinc.neural._classes.affine import Affine class EntityEncoder: + """ + Train the embeddings of entity descriptions to fit a fixed-size entity vector (e.g. 64D). + This entity vector will be stored in the KB, and context vectors will be trained to be similar to them. + """ DROP = 0 EPOCHS = 5 @@ -102,6 +106,7 @@ class EntityEncoder: def _build_network(self, orig_width, hidden_with): with Model.define_operators({">>": chain}): + # very simple encoder-decoder model self.encoder = ( Affine(hidden_with, orig_width) ) diff --git a/bin/wiki_entity_linking/training_set_creator.py b/bin/wiki_entity_linking/training_set_creator.py index a0d130824..90df5d9fc 100644 --- a/bin/wiki_entity_linking/training_set_creator.py +++ b/bin/wiki_entity_linking/training_set_creator.py @@ -10,7 +10,8 @@ from spacy.gold import GoldParse from bin.wiki_entity_linking import kb_creator, wikipedia_processor as wp """ -Process Wikipedia interlinks to generate a training dataset for the EL algorithm +Process Wikipedia interlinks to generate a training dataset for the EL algorithm. +Gold-standard entities are stored in one file in standoff format (by character offset). 
""" # ENTITY_FILE = "gold_entities.csv" @@ -321,12 +322,16 @@ def read_training(nlp, training_dir, dev, limit): current_article_id = article_id ents_by_offset = dict() for ent in current_doc.ents: - ents_by_offset[str(ent.start_char) + "_" + str(ent.end_char)] = ent + sent_length = len(ent.sent) + # custom filtering to avoid too long or too short sentences + if 5 < sent_length < 100: + ents_by_offset[str(ent.start_char) + "_" + str(ent.end_char)] = ent else: skip_articles.add(current_article_id) current_doc = None except Exception as e: print("Problem parsing article", article_id, e) + skip_articles.add(current_article_id) # repeat checking this condition in case an exception was thrown if current_doc and (current_article_id == article_id): diff --git a/bin/wiki_entity_linking/wikidata_processor.py b/bin/wiki_entity_linking/wikidata_processor.py index 899c607cc..85d3d8488 100644 --- a/bin/wiki_entity_linking/wikidata_processor.py +++ b/bin/wiki_entity_linking/wikidata_processor.py @@ -10,7 +10,7 @@ WIKIDATA_JSON = 'C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.js def read_wikidata_entities_json(limit=None, to_print=False): - """ Read the JSON wiki data and parse out the entities. Takes about 7u30 to parse 55M lines. """ + # Read the JSON wiki data and parse out the entities. Takes about 7u30 to parse 55M lines. lang = 'en' site_filter = 'enwiki' diff --git a/bin/wiki_entity_linking/wikipedia_processor.py b/bin/wiki_entity_linking/wikipedia_processor.py index 0747c9db7..d957fc58c 100644 --- a/bin/wiki_entity_linking/wikipedia_processor.py +++ b/bin/wiki_entity_linking/wikipedia_processor.py @@ -8,6 +8,7 @@ import datetime """ Process a Wikipedia dump to calculate entity frequencies and prior probabilities in combination with certain mentions. +Write these results to file for downstream KB and training data generation. 
""" @@ -142,7 +143,7 @@ def _capitalize_first(text): def write_entity_counts(prior_prob_input, count_output, to_print=False): - """ Write entity counts for quick access later """ + # Write entity counts for quick access later entity_to_count = dict() total_count = 0 diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index d537cce7e..c282c7262 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -195,10 +195,11 @@ def run_pipeline(): print("STEP 7: performance measurement of Entity Linking pipe", datetime.datetime.now()) print() - acc_r, acc_r_by_label, acc_p, acc_p_by_label, acc_o, acc_o_by_label = _measure_baselines(dev_data, kb_2) - print("dev acc oracle:", round(acc_o, 3), [(x, round(y, 3)) for x, y in acc_o_by_label.items()]) - print("dev acc random:", round(acc_r, 3), [(x, round(y, 3)) for x, y in acc_r_by_label.items()]) - print("dev acc prior:", round(acc_p, 3), [(x, round(y, 3)) for x, y in acc_p_by_label.items()]) + counts, acc_r, acc_r_label, acc_p, acc_p_label, acc_o, acc_o_label = _measure_baselines(dev_data, kb_2) + print("dev counts:", sorted(counts)) + print("dev acc oracle:", round(acc_o, 3), [(x, round(y, 3)) for x, y in acc_o_label.items()]) + print("dev acc random:", round(acc_r, 3), [(x, round(y, 3)) for x, y in acc_r_label.items()]) + print("dev acc prior:", round(acc_p, 3), [(x, round(y, 3)) for x, y in acc_p_label.items()]) with el_pipe.model.use_params(optimizer.averages): # measuring combined accuracy (prior + context) @@ -288,6 +289,8 @@ def _measure_accuracy(data, el_pipe): def _measure_baselines(data, kb): # Measure 3 performance baselines: random selection, prior probabilities, and 'oracle' prediction for upper bound + counts_by_label = dict() + random_correct_by_label = dict() random_incorrect_by_label = dict() @@ -315,6 +318,7 @@ def _measure_baselines(data, kb): # the gold annotations are not complete so we can't evaluate missing annotations as 'wrong' if gold_entity is not None: + counts_by_label[ent_label] = counts_by_label.get(ent_label, 0) + 1 candidates = kb.get_candidates(ent.text) oracle_candidate = "" best_candidate = "" @@ -353,7 +357,7 @@ def _measure_baselines(data, kb): acc_random, acc_random_by_label = calculate_acc(random_correct_by_label, random_incorrect_by_label) acc_oracle, acc_oracle_by_label = calculate_acc(oracle_correct_by_label, oracle_incorrect_by_label) - return acc_random, acc_random_by_label, acc_prior, acc_prior_by_label, acc_oracle, acc_oracle_by_label + return counts_by_label, acc_random, acc_random_by_label, acc_prior, acc_prior_by_label, acc_oracle, acc_oracle_by_label def calculate_acc(correct_by_label, incorrect_by_label): diff --git a/spacy/language.py b/spacy/language.py index 0e5e29244..2225a763e 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -11,7 +11,7 @@ from copy import copy, deepcopy from thinc.neural import Model import srsly -from spacy.kb import KnowledgeBase +from .kb import KnowledgeBase from .tokenizer import Tokenizer from .vocab import Vocab from .lemmatizer import Lemmatizer diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 7d90c4438..99c361964 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -14,7 +14,6 @@ from thinc.misc import LayerNorm from thinc.neural.util import to_categorical from thinc.neural.util import get_array_module -from spacy.kb import KnowledgeBase from ..tokens.doc cimport Doc from ..syntax.nn_parser cimport Parser from 
..syntax.ner cimport BiluoPushDown @@ -1081,9 +1080,9 @@ class EntityLinker(Pipe): hidden_width = cfg.get("hidden_width", 128) # no default because this needs to correspond with the KB entity length - sent_width = cfg.get("entity_width") + entity_width = cfg.get("entity_width") - model = build_nel_encoder(in_width=embed_width, hidden_width=hidden_width, end_width=sent_width, **cfg) + model = build_nel_encoder(in_width=embed_width, hidden_width=hidden_width, end_width=entity_width, **cfg) return model @@ -1135,21 +1134,13 @@ class EntityLinker(Pipe): docs = [docs] golds = [golds] - # article_docs = list() - sentence_docs = list() + context_docs = list() entity_encodings = list() for doc, gold in zip(docs, golds): for entity in gold.links: start, end, gold_kb = entity mention = doc.text[start:end] - sent_start = 0 - sent_end = len(doc) - for index, sent in enumerate(doc.sents): - if start >= sent.start_char and end <= sent.end_char: - sent_start = sent.start - sent_end = sent.end - sentence = doc[sent_start:sent_end].as_doc() candidates = self.kb.get_candidates(mention) for c in candidates: @@ -1159,14 +1150,14 @@ class EntityLinker(Pipe): prior_prob = c.prior_prob entity_encoding = c.entity_vector entity_encodings.append(entity_encoding) - sentence_docs.append(sentence) + context_docs.append(doc) if len(entity_encodings) > 0: - sent_encodings, bp_sent = self.model.begin_update(sentence_docs, drop=drop) + context_encodings, bp_context = self.model.begin_update(context_docs, drop=drop) entity_encodings = np.asarray(entity_encodings, dtype=np.float32) - loss, d_scores = self.get_loss(scores=sent_encodings, golds=entity_encodings, docs=None) - bp_sent(d_scores, sgd=sgd) + loss, d_scores = self.get_loss(scores=context_encodings, golds=entity_encodings, docs=None) + bp_context(d_scores, sgd=sgd) if losses is not None: losses[self.name] += loss @@ -1222,28 +1213,25 @@ class EntityLinker(Pipe): for i, doc in enumerate(docs): if len(doc) > 0: + context_encoding = self.model([doc]) + context_enc_t = np.transpose(context_encoding) for ent in doc.ents: - sent_doc = ent.sent.as_doc() - if len(sent_doc) > 0: - sent_encoding = self.model([sent_doc]) - sent_enc_t = np.transpose(sent_encoding) + candidates = self.kb.get_candidates(ent.text) + if candidates: + scores = list() + for c in candidates: + prior_prob = c.prior_prob * self.prior_weight + kb_id = c.entity_ + entity_encoding = c.entity_vector + sim = float(cosine(np.asarray([entity_encoding]), context_enc_t)) * self.context_weight + score = prior_prob + sim - (prior_prob*sim) # put weights on the different factors ? + scores.append(score) - candidates = self.kb.get_candidates(ent.text) - if candidates: - scores = list() - for c in candidates: - prior_prob = c.prior_prob * self.prior_weight - kb_id = c.entity_ - entity_encoding = c.entity_vector - sim = float(cosine(np.asarray([entity_encoding]), sent_enc_t)) * self.context_weight - score = prior_prob + sim - (prior_prob*sim) # put weights on the different factors ? 
- scores.append(score) - - # TODO: thresholding - best_index = scores.index(max(scores)) - best_candidate = candidates[best_index] - final_entities.append(ent) - final_kb_ids.append(best_candidate.entity_) + # TODO: thresholding + best_index = scores.index(max(scores)) + best_candidate = candidates[best_index] + final_entities.append(ent) + final_kb_ids.append(best_candidate.entity_) return final_entities, final_kb_ids From a31648d28be3ed10a3f8ba5cefc85f94ce22b715 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 19 Jun 2019 09:15:43 +0200 Subject: [PATCH 084/148] further code cleanup --- bin/wiki_entity_linking/kb_creator.py | 36 ++++----- bin/wiki_entity_linking/train_descriptions.py | 7 -- .../training_set_creator.py | 27 +++---- bin/wiki_entity_linking/wikidata_processor.py | 10 +-- .../wikipedia_processor.py | 21 ++--- examples/pipeline/wikidata_entity_linking.py | 77 ++++++++----------- spacy/kb.pxd | 2 - spacy/kb.pyx | 50 +----------- spacy/pipeline/pipes.pyx | 12 +-- 9 files changed, 76 insertions(+), 166 deletions(-) diff --git a/bin/wiki_entity_linking/kb_creator.py b/bin/wiki_entity_linking/kb_creator.py index 8d293a0a1..bd82e5b4e 100644 --- a/bin/wiki_entity_linking/kb_creator.py +++ b/bin/wiki_entity_linking/kb_creator.py @@ -1,31 +1,31 @@ # coding: utf-8 from __future__ import unicode_literals -from bin.wiki_entity_linking.train_descriptions import EntityEncoder +from .train_descriptions import EntityEncoder +from . import wikidata_processor as wd, wikipedia_processor as wp from spacy.kb import KnowledgeBase import csv import datetime -from bin.wiki_entity_linking import wikidata_processor as wd, wikipedia_processor as wp -INPUT_DIM = 300 # dimension of pre-trained vectors -DESC_WIDTH = 64 +INPUT_DIM = 300 # dimension of pre-trained input vectors +DESC_WIDTH = 64 # dimension of output entity vectors def create_kb(nlp, max_entities_per_alias, min_entity_freq, min_occ, entity_def_output, entity_descr_output, - count_input, prior_prob_input, to_print=False): + count_input, prior_prob_input, wikidata_input): # Create the knowledge base from Wikidata entries kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=DESC_WIDTH) # disable this part of the pipeline when rerunning the KB generation from preprocessed files - read_raw_data = False + read_raw_data = True if read_raw_data: print() print(" * _read_wikidata_entities", datetime.datetime.now()) - title_to_id, id_to_descr = wd.read_wikidata_entities_json(limit=None) + title_to_id, id_to_descr = wd.read_wikidata_entities_json(wikidata_input) # write the title-ID and ID-description mappings to file _write_entity_files(entity_def_output, entity_descr_output, title_to_id, id_to_descr) @@ -40,7 +40,7 @@ def create_kb(nlp, max_entities_per_alias, min_entity_freq, min_occ, print() entity_frequencies = wp.get_all_frequencies(count_input=count_input) - # filter the entities for in the KB by frequency, because there's just too much data otherwise + # filter the entities for in the KB by frequency, because there's just too much data (8M entities) otherwise filtered_title_to_id = dict() entity_list = list() description_list = list() @@ -60,11 +60,10 @@ def create_kb(nlp, max_entities_per_alias, min_entity_freq, min_occ, print() print(" * train entity encoder", datetime.datetime.now()) print() - encoder = EntityEncoder(nlp, INPUT_DIM, DESC_WIDTH) encoder.train(description_list=description_list, to_print=True) - print() + print() print(" * get entity embeddings", datetime.datetime.now()) print() embeddings = 
encoder.apply_encoder(description_list) @@ -80,12 +79,10 @@ def create_kb(nlp, max_entities_per_alias, min_entity_freq, min_occ, max_entities_per_alias=max_entities_per_alias, min_occ=min_occ, prior_prob_input=prior_prob_input) - if to_print: - print() - print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases()) + print() + print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases()) print("done with kb", datetime.datetime.now()) - return kb @@ -94,6 +91,7 @@ def _write_entity_files(entity_def_output, entity_descr_output, title_to_id, id_ id_file.write("WP_title" + "|" + "WD_id" + "\n") for title, qid in title_to_id.items(): id_file.write(title + "|" + str(qid) + "\n") + with open(entity_descr_output, mode='w', encoding='utf8') as descr_file: descr_file.write("WD_id" + "|" + "description" + "\n") for qid, descr in id_to_descr.items(): @@ -108,7 +106,6 @@ def get_entity_to_id(entity_def_output): next(csvreader) for row in csvreader: entity_to_id[row[0]] = row[1] - return entity_to_id @@ -120,16 +117,12 @@ def _get_id_to_description(entity_descr_output): next(csvreader) for row in csvreader: id_to_desc[row[0]] = row[1] - return id_to_desc -def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_input, to_print=False): +def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_input): wp_titles = title_to_id.keys() - if to_print: - print("wp titles:", wp_titles) - # adding aliases with prior probabilities # we can read this file sequentially, it's sorted by alias, and then by count with open(prior_prob_input, mode='r', encoding='utf8') as prior_file: @@ -176,6 +169,3 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_in line = prior_file.readline() - if to_print: - print("added", kb.get_size_aliases(), "aliases:", kb.get_alias_strings()) - diff --git a/bin/wiki_entity_linking/train_descriptions.py b/bin/wiki_entity_linking/train_descriptions.py index 82db582dc..948a0e2d1 100644 --- a/bin/wiki_entity_linking/train_descriptions.py +++ b/bin/wiki_entity_linking/train_descriptions.py @@ -32,8 +32,6 @@ class EntityEncoder: if self.encoder is None: raise ValueError("Can not apply encoder before training it") - print("Encoding", len(description_list), "entities") - batch_size = 100000 start = 0 @@ -48,13 +46,11 @@ class EntityEncoder: start = start + batch_size stop = min(stop + batch_size, len(description_list)) - print("encoded :", len(encodings)) return encodings def train(self, description_list, to_print=False): processed, loss = self._train_model(description_list) - if to_print: print("Trained on", processed, "entities across", self.EPOCHS, "epochs") print("Final loss:", loss) @@ -111,15 +107,12 @@ class EntityEncoder: Affine(hidden_with, orig_width) ) self.model = self.encoder >> zero_init(Affine(orig_width, hidden_with, drop_factor=0.0)) - self.sgd = create_default_optimizer(self.model.ops) def _update(self, vectors): predictions, bp_model = self.model.begin_update(np.asarray(vectors), drop=self.DROP) - loss, d_scores = self._get_loss(scores=predictions, golds=np.asarray(vectors)) bp_model(d_scores, sgd=self.sgd) - return loss / len(vectors) @staticmethod diff --git a/bin/wiki_entity_linking/training_set_creator.py b/bin/wiki_entity_linking/training_set_creator.py index 90df5d9fc..eb9f8af78 100644 --- a/bin/wiki_entity_linking/training_set_creator.py +++ b/bin/wiki_entity_linking/training_set_creator.py @@ -18,23 +18,21 @@ Gold-standard entities are stored in one file in standoff format (by 
character o ENTITY_FILE = "gold_entities_1000000.csv" # use this file for faster processing -def create_training(entity_def_input, training_output): +def create_training(wikipedia_input, entity_def_input, training_output): wp_to_id = kb_creator.get_entity_to_id(entity_def_input) - _process_wikipedia_texts(wp_to_id, training_output, limit=None) + _process_wikipedia_texts(wikipedia_input, wp_to_id, training_output, limit=None) -def _process_wikipedia_texts(wp_to_id, training_output, limit=None): +def _process_wikipedia_texts(wikipedia_input, wp_to_id, training_output, limit=None): """ Read the XML wikipedia data to parse out training data: raw text data + positive instances """ - title_regex = re.compile(r'(?<=).*(?=)') id_regex = re.compile(r'(?<=)\d*(?=)') read_ids = set() - - entityfile_loc = training_output + "/" + ENTITY_FILE + entityfile_loc = training_output / ENTITY_FILE with open(entityfile_loc, mode="w", encoding='utf8') as entityfile: # write entity training header file _write_training_entity(outputfile=entityfile, @@ -44,7 +42,7 @@ def _process_wikipedia_texts(wp_to_id, training_output, limit=None): start="start", end="end") - with bz2.open(wp.ENWIKI_DUMP, mode='rb') as file: + with bz2.open(wikipedia_input, mode='rb') as file: line = file.readline() cnt = 0 article_text = "" @@ -104,7 +102,7 @@ def _process_wikipedia_texts(wp_to_id, training_output, limit=None): print("Found duplicate article ID", article_id, clean_line) # This should never happen ... read_ids.add(article_id) - # read the title of this article (outside the revision portion of the document) + # read the title of this article (outside the revision portion of the document) if not reading_revision: titles = title_regex.search(clean_line) if titles: @@ -134,7 +132,7 @@ def _process_wp_text(wp_to_id, entityfile, article_id, article_title, article_te # get the raw text without markup etc, keeping only interwiki links clean_text = _get_clean_wp_text(text) - # read the text char by char to get the right offsets of the interwiki links + # read the text char by char to get the right offsets for the interwiki links final_text = "" open_read = 0 reading_text = True @@ -274,7 +272,7 @@ def _get_clean_wp_text(article_text): def _write_training_article(article_id, clean_text, training_output): - file_loc = training_output + "/" + str(article_id) + ".txt" + file_loc = training_output / str(article_id) + ".txt" with open(file_loc, mode='w', encoding='utf8') as outputfile: outputfile.write(clean_text) @@ -289,11 +287,10 @@ def is_dev(article_id): def read_training(nlp, training_dir, dev, limit): # This method provides training examples that correspond to the entity annotations found by the nlp object - - entityfile_loc = training_dir + "/" + ENTITY_FILE + entityfile_loc = training_dir / ENTITY_FILE data = [] - # we assume the data is written sequentially + # assume the data is written sequentially, so we can reuse the article docs current_article_id = None current_doc = None ents_by_offset = dict() @@ -347,10 +344,10 @@ def read_training(nlp, training_dir, dev, limit): gold_end = int(end) - found_ent.sent.start_char gold_entities = list() gold_entities.append((gold_start, gold_end, wp_title)) - gold = GoldParse(doc=current_doc, links=gold_entities) + gold = GoldParse(doc=sent, links=gold_entities) data.append((sent, gold)) total_entities += 1 - if len(data) % 500 == 0: + if len(data) % 2500 == 0: print(" -read", total_entities, "entities") print(" -read", total_entities, "entities") diff --git 
a/bin/wiki_entity_linking/wikidata_processor.py b/bin/wiki_entity_linking/wikidata_processor.py index 85d3d8488..a32a0769a 100644 --- a/bin/wiki_entity_linking/wikidata_processor.py +++ b/bin/wiki_entity_linking/wikidata_processor.py @@ -5,17 +5,15 @@ import bz2 import json import datetime -# TODO: remove hardcoded paths -WIKIDATA_JSON = 'C:/Users/Sofie/Documents/data/wikidata/wikidata-20190304-all.json.bz2' - -def read_wikidata_entities_json(limit=None, to_print=False): +def read_wikidata_entities_json(wikidata_file, limit=None, to_print=False): # Read the JSON wiki data and parse out the entities. Takes about 7u30 to parse 55M lines. + # get latest-all.json.bz2 from https://dumps.wikimedia.org/wikidatawiki/entities/ lang = 'en' site_filter = 'enwiki' - # filter currently disabled to get ALL data + # properties filter (currently disabled to get ALL data) prop_filter = dict() # prop_filter = {'P31': {'Q5', 'Q15632617'}} # currently defined as OR: one property suffices to be selected @@ -30,7 +28,7 @@ def read_wikidata_entities_json(limit=None, to_print=False): parse_aliases = False parse_claims = False - with bz2.open(WIKIDATA_JSON, mode='rb') as file: + with bz2.open(wikidata_file, mode='rb') as file: line = file.readline() cnt = 0 while line and (not limit or cnt < limit): diff --git a/bin/wiki_entity_linking/wikipedia_processor.py b/bin/wiki_entity_linking/wikipedia_processor.py index d957fc58c..c02e472bc 100644 --- a/bin/wiki_entity_linking/wikipedia_processor.py +++ b/bin/wiki_entity_linking/wikipedia_processor.py @@ -11,11 +11,6 @@ Process a Wikipedia dump to calculate entity frequencies and prior probabilities Write these results to file for downstream KB and training data generation. """ - -# TODO: remove hardcoded paths -ENWIKI_DUMP = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream.xml.bz2' -ENWIKI_INDEX = 'C:/Users/Sofie/Documents/data/wikipedia/enwiki-20190320-pages-articles-multistream-index.txt.bz2' - map_alias_to_link = dict() # these will/should be matched ignoring case @@ -46,15 +41,13 @@ for ns in wiki_namespaces: ns_regex = re.compile(ns_regex, re.IGNORECASE) -def read_wikipedia_prior_probs(prior_prob_output): +def read_wikipedia_prior_probs(wikipedia_input, prior_prob_output): """ - Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities - The full file takes about 2h to parse 1100M lines (update printed every 5M lines). - It works relatively fast because we don't care about which article we parsed the interwiki from, - we just process line by line. + Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities. + The full file takes about 2h to parse 1100M lines. + It works relatively fast because it runs line by line, irrelevant of which article the intrawiki is from. 
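# Illustrative sketch, not part of the patch: once this function has written the
# "alias|count|entity" counts described here, the prior probability P(entity | alias)
# is just the per-entity count divided by the total count for that alias. The sample
# counts below are made up for the example.
from collections import defaultdict

link_counts = [
    ("Douglas", 10, "Douglas Adams"),
    ("Douglas", 5, "Douglas Hofstadter"),
    ("Douglas", 5, "Douglas fir"),
]

totals = defaultdict(int)
for alias, count, entity in link_counts:
    totals[alias] += count

prior_probs = {
    (alias, entity): count / float(totals[alias])
    for alias, count, entity in link_counts
}
print(prior_probs[("Douglas", "Douglas Adams")])  # 0.5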
""" - - with bz2.open(ENWIKI_DUMP, mode='rb') as file: + with bz2.open(wikipedia_input, mode='rb') as file: line = file.readline() cnt = 0 while line: @@ -70,7 +63,7 @@ def read_wikipedia_prior_probs(prior_prob_output): line = file.readline() cnt += 1 - # write all aliases and their entities and occurrences to file + # write all aliases and their entities and count occurrences to file with open(prior_prob_output, mode='w', encoding='utf8') as outputfile: outputfile.write("alias" + "|" + "count" + "|" + "entity" + "\n") for alias, alias_dict in sorted(map_alias_to_link.items(), key=lambda x: x[0]): @@ -108,7 +101,7 @@ def get_wp_links(text): if ns_regex.match(match): pass # ignore namespaces at the beginning of the string - # this is a simple link, with the alias the same as the mention + # this is a simple [[link]], with the alias the same as the mention elif "|" not in match: aliases.append(match) entities.append(match) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index c282c7262..aa1c00996 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -2,35 +2,45 @@ from __future__ import unicode_literals import random - -from spacy.util import minibatch, compounding +import datetime +from pathlib import Path from bin.wiki_entity_linking import training_set_creator, kb_creator, wikipedia_processor as wp from bin.wiki_entity_linking.kb_creator import DESC_WIDTH import spacy from spacy.kb import KnowledgeBase -import datetime +from spacy.util import minibatch, compounding """ Demonstrate how to build a knowledge base from WikiData and run an Entity Linking algorithm. """ -PRIOR_PROB = 'C:/Users/Sofie/Documents/data/wikipedia/prior_prob.csv' -ENTITY_COUNTS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_freq.csv' -ENTITY_DEFS = 'C:/Users/Sofie/Documents/data/wikipedia/entity_defs.csv' -ENTITY_DESCR = 'C:/Users/Sofie/Documents/data/wikipedia/entity_descriptions.csv' +ROOT_DIR = Path("C:/Users/Sofie/Documents/data/") +OUTPUT_DIR = ROOT_DIR / 'wikipedia' +TRAINING_DIR = OUTPUT_DIR / 'training_data_nel' -KB_FILE = 'C:/Users/Sofie/Documents/data/wikipedia/kb_1/kb' -NLP_1_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/nlp_1' -NLP_2_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/nlp_2' +PRIOR_PROB = OUTPUT_DIR / 'prior_prob.csv' +ENTITY_COUNTS = OUTPUT_DIR / 'entity_freq.csv' +ENTITY_DEFS = OUTPUT_DIR / 'entity_defs.csv' +ENTITY_DESCR = OUTPUT_DIR / 'entity_descriptions.csv' -TRAINING_DIR = 'C:/Users/Sofie/Documents/data/wikipedia/training_data_nel/' +KB_FILE = OUTPUT_DIR / 'kb_1' / 'kb' +NLP_1_DIR = OUTPUT_DIR / 'nlp_1' +NLP_2_DIR = OUTPUT_DIR / 'nlp_2' +# get latest-all.json.bz2 from https://dumps.wikimedia.org/wikidatawiki/entities/ +WIKIDATA_JSON = ROOT_DIR / 'wikidata' / 'wikidata-20190304-all.json.bz2' + +# get enwiki-latest-pages-articles-multistream.xml.bz2 from https://dumps.wikimedia.org/enwiki/latest/ +ENWIKI_DUMP = ROOT_DIR / 'wikipedia' / 'enwiki-20190320-pages-articles-multistream.xml.bz2' + +# KB construction parameters MAX_CANDIDATES = 10 MIN_ENTITY_FREQ = 20 MIN_PAIR_OCC = 5 +# model training parameters EPOCHS = 10 DROPOUT = 0.1 LEARN_RATE = 0.005 @@ -38,6 +48,7 @@ L2 = 1e-6 def run_pipeline(): + # set the appropriate booleans to define which parts of the pipeline should be re(run) print("START", datetime.datetime.now()) print() nlp_1 = spacy.load('en_core_web_lg') @@ -67,22 +78,19 @@ def run_pipeline(): to_write_nlp = False to_read_nlp = False - # STEP 1 : create prior 
probabilities from WP - # run only once ! + # STEP 1 : create prior probabilities from WP (run only once) if to_create_prior_probs: print("STEP 1: to_create_prior_probs", datetime.datetime.now()) - wp.read_wikipedia_prior_probs(prior_prob_output=PRIOR_PROB) + wp.read_wikipedia_prior_probs(wikipedia_input=ENWIKI_DUMP, prior_prob_output=PRIOR_PROB) print() - # STEP 2 : deduce entity frequencies from WP - # run only once ! + # STEP 2 : deduce entity frequencies from WP (run only once) if to_create_entity_counts: print("STEP 2: to_create_entity_counts", datetime.datetime.now()) wp.write_entity_counts(prior_prob_input=PRIOR_PROB, count_output=ENTITY_COUNTS, to_print=False) print() - # STEP 3 : create KB and write to file - # run only once ! + # STEP 3 : create KB and write to file (run only once) if to_create_kb: print("STEP 3a: to_create_kb", datetime.datetime.now()) kb_1 = kb_creator.create_kb(nlp_1, @@ -93,7 +101,7 @@ def run_pipeline(): entity_descr_output=ENTITY_DESCR, count_input=ENTITY_COUNTS, prior_prob_input=PRIOR_PROB, - to_print=False) + wikidata_input=WIKIDATA_JSON) print("kb entities:", kb_1.get_size_entities()) print("kb aliases:", kb_1.get_size_aliases()) print() @@ -121,7 +129,9 @@ def run_pipeline(): # STEP 5: create a training dataset from WP if create_wp_training: print("STEP 5: create training dataset", datetime.datetime.now()) - training_set_creator.create_training(entity_def_input=ENTITY_DEFS, training_output=TRAINING_DIR) + training_set_creator.create_training(wikipedia_input=ENWIKI_DUMP, + entity_def_input=ENTITY_DEFS, + training_output=TRAINING_DIR) # STEP 6: create and train the entity linking pipe el_pipe = nlp_2.create_pipe(name='entity_linker', config={}) @@ -136,7 +146,8 @@ def run_pipeline(): if train_pipe: print("STEP 6: training Entity Linking pipe", datetime.datetime.now()) - train_limit = 25000 + # define the size (nr of entities) of training and dev set + train_limit = 10000 dev_limit = 5000 train_data = training_set_creator.read_training(nlp=nlp_2, @@ -157,7 +168,6 @@ def run_pipeline(): if not train_data: print("Did not find any training data") - else: for itn in range(EPOCHS): random.shuffle(train_data) @@ -196,7 +206,7 @@ def run_pipeline(): print() counts, acc_r, acc_r_label, acc_p, acc_p_label, acc_o, acc_o_label = _measure_baselines(dev_data, kb_2) - print("dev counts:", sorted(counts)) + print("dev counts:", sorted(counts.items(), key=lambda x: x[0])) print("dev acc oracle:", round(acc_o, 3), [(x, round(y, 3)) for x, y in acc_o_label.items()]) print("dev acc random:", round(acc_r, 3), [(x, round(y, 3)) for x, y in acc_r_label.items()]) print("dev acc prior:", round(acc_p, 3), [(x, round(y, 3)) for x, y in acc_p_label.items()]) @@ -215,7 +225,6 @@ def run_pipeline(): dev_acc_context, dev_acc_context_dict = _measure_accuracy(dev_data, el_pipe) print("dev acc context avg:", round(dev_acc_context, 3), [(x, round(y, 3)) for x, y in dev_acc_context_dict.items()]) - print() # reset for follow-up tests el_pipe.context_weight = 1 @@ -227,7 +236,6 @@ def run_pipeline(): print("STEP 8: applying Entity Linking to toy example", datetime.datetime.now()) print() run_el_toy_example(nlp=nlp_2) - print() # STEP 9: write the NLP pipeline (including entity linker) to file if to_write_nlp: @@ -400,26 +408,9 @@ def run_el_toy_example(nlp): doc = nlp(text) print(text) for ent in doc.ents: - print("ent", ent.text, ent.label_, ent.kb_id_) + print(" ent", ent.text, ent.label_, ent.kb_id_) print() - # Q4426480 is her husband - text = "Ada Lovelace was the countess of Lovelace. 
She's known for her programming work on the analytical engine. "\ - "She loved her husband William King dearly. " - doc = nlp(text) - print(text) - for ent in doc.ents: - print("ent", ent.text, ent.label_, ent.kb_id_) - print() - - # Q3568763 is her tutor - text = "Ada Lovelace was the countess of Lovelace. She's known for her programming work on the analytical engine. "\ - "She was tutored by her favorite physics tutor William King." - doc = nlp(text) - print(text) - for ent in doc.ents: - print("ent", ent.text, ent.label_, ent.kb_id_) - if __name__ == "__main__": run_pipeline() diff --git a/spacy/kb.pxd b/spacy/kb.pxd index 9c5a73d59..ccf150cd2 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -18,7 +18,6 @@ ctypedef vector[float_vec] float_matrix # Object used by the Entity Linker that summarizes one entity-alias candidate combination. cdef class Candidate: - cdef readonly KnowledgeBase kb cdef hash_t entity_hash cdef float entity_freq @@ -143,7 +142,6 @@ cdef class KnowledgeBase: cpdef load_bulk(self, loc) cpdef set_entities(self, entity_list, prob_list, vector_list) - cpdef set_aliases(self, alias_list, entities_list, probabilities_list) cdef class Writer: diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 9a84439ea..72f66b107 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -1,23 +1,16 @@ # cython: infer_types=True # cython: profile=True # coding: utf8 -from collections import OrderedDict -from pathlib import Path, WindowsPath - -from cpython.exc cimport PyErr_CheckSignals - -from spacy import util from spacy.errors import Errors, Warnings, user_warning +from pathlib import Path from cymem.cymem cimport Pool from preshed.maps cimport PreshMap -from cpython.mem cimport PyMem_Malloc from cpython.exc cimport PyErr_SetFromErrno -from libc.stdio cimport FILE, fopen, fclose, fread, fwrite, feof, fseek +from libc.stdio cimport fopen, fclose, fread, fwrite, feof, fseek from libc.stdint cimport int32_t, int64_t -from libc.stdlib cimport qsort from .typedefs cimport hash_t @@ -25,7 +18,6 @@ from os import path from libcpp.vector cimport vector - cdef class Candidate: def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob): @@ -79,8 +71,6 @@ cdef class KnowledgeBase: self._entry_index = PreshMap() self._alias_index = PreshMap() - # Should we initialize self._entries and self._aliases_table to specific starting size ? 
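# Minimal usage sketch of the KnowledgeBase API exercised throughout this patch series,
# not part of the patch itself. It requires a spaCy build that includes these changes,
# and the IDs, probabilities and vectors below are made up for illustration.
from spacy.lang.en import English
from spacy.kb import KnowledgeBase

nlp = English()
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)

# every entity vector has to match the entity_vector_length declared above
kb.add_entity(entity="Q42", prob=0.7, entity_vector=[1.0, 2.0, 0.0])
kb.add_entity(entity="Q5301561", prob=0.1, entity_vector=[0.0, 1.0, 2.0])

# an alias maps to candidate entities with prior probabilities summing to at most 1
kb.add_alias(alias="Douglas", entities=["Q42", "Q5301561"], probabilities=[0.6, 0.1])

for candidate in kb.get_candidates("Douglas"):
    print(candidate.entity_, candidate.prior_prob)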
- self.vocab.strings.add("") self._create_empty_vectors(dummy_hash=self.vocab.strings[""]) @@ -165,47 +155,11 @@ cdef class KnowledgeBase: i += 1 - # TODO: this method is untested - cpdef set_aliases(self, alias_list, entities_list, probabilities_list): - nr_aliases = len(alias_list) - self._alias_index = PreshMap(nr_aliases+1) - self._aliases_table = alias_vec(nr_aliases+1) - - i = 0 - cdef AliasC alias - cdef int32_t dummy_value = 342 - while i <= nr_aliases: - alias_hash = self.vocab.strings.add(alias_list[i]) - entities = entities_list[i] - probabilities = probabilities_list[i] - - nr_candidates = len(entities) - entry_indices = vector[int64_t](nr_candidates) - probs = vector[float](nr_candidates) - - for j in range(0, nr_candidates): - entity = entities[j] - entity_hash = self.vocab.strings[entity] - if not entity_hash in self._entry_index: - raise ValueError(Errors.E134.format(alias=alias, entity=entity)) - - entry_index = self._entry_index.get(entity_hash) - entry_indices[j] = entry_index - - alias.entry_indices = entry_indices - alias.probs = probs - - self._aliases_table[i] = alias - self._alias_index[alias_hash] = i - - i += 1 - def add_alias(self, unicode alias, entities, probabilities): """ For a given alias, add its potential entities and prior probabilies to the KB. Return the alias_hash at the end """ - # Throw an error if the length of entities and probabilities are not the same if not len(entities) == len(probabilities): raise ValueError(Errors.E132.format(alias=alias, diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 99c361964..1c430a90b 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1068,8 +1068,6 @@ class EntityLinker(Pipe): DOCS: TODO """ name = 'entity_linker' - context_weight = 1 - prior_weight = 1 @classmethod def Model(cls, **cfg): @@ -1078,18 +1076,17 @@ class EntityLinker(Pipe): embed_width = cfg.get("embed_width", 300) hidden_width = cfg.get("hidden_width", 128) - - # no default because this needs to correspond with the KB entity length - entity_width = cfg.get("entity_width") + entity_width = cfg.get("entity_width") # this needs to correspond with the KB entity length model = build_nel_encoder(in_width=embed_width, hidden_width=hidden_width, end_width=entity_width, **cfg) - return model def __init__(self, **cfg): self.model = True self.kb = None self.cfg = dict(cfg) + self.context_weight = cfg.get("context_weight", 1) + self.prior_weight = cfg.get("prior_weight", 1) def set_kb(self, kb): self.kb = kb @@ -1162,7 +1159,6 @@ class EntityLinker(Pipe): if losses is not None: losses[self.name] += loss return loss - return 0 def get_loss(self, docs, golds, scores): @@ -1224,7 +1220,7 @@ class EntityLinker(Pipe): kb_id = c.entity_ entity_encoding = c.entity_vector sim = float(cosine(np.asarray([entity_encoding]), context_enc_t)) * self.context_weight - score = prior_prob + sim - (prior_prob*sim) # put weights on the different factors ? 
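# Worked example of the scoring on the surrounding lines (illustrative only, plain NumPy,
# made-up vectors): `sim` is the cosine similarity between the context encoding and a
# candidate's entity vector, and it is combined with the prior probability as
# prior_prob + sim - prior_prob * sim, i.e. 1 - (1 - prior)(1 - sim) when both lie in [0, 1].
import numpy as np

context_enc = np.array([0.5, 1.0, 0.0])
entity_vector = np.array([1.0, 1.0, 0.0])
prior_prob = 0.6

sim = np.dot(context_enc, entity_vector) / (
    np.linalg.norm(context_enc) * np.linalg.norm(entity_vector)
)
score = prior_prob + sim - (prior_prob * sim)
print(round(float(sim), 3), round(float(score), 3))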
+ score = prior_prob + sim - (prior_prob*sim) scores.append(score) # TODO: thresholding From cc9ae28a52df2bfc8ee96c38392522d9752a3058 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 19 Jun 2019 12:35:26 +0200 Subject: [PATCH 085/148] custom error and warning messages --- bin/wiki_entity_linking/training_set_creator.py | 7 ++++--- spacy/errors.py | 3 +++ spacy/kb.pyx | 16 +++++----------- spacy/pipeline/pipes.pyx | 5 ++--- 4 files changed, 14 insertions(+), 17 deletions(-) diff --git a/bin/wiki_entity_linking/training_set_creator.py b/bin/wiki_entity_linking/training_set_creator.py index eb9f8af78..d9600048c 100644 --- a/bin/wiki_entity_linking/training_set_creator.py +++ b/bin/wiki_entity_linking/training_set_creator.py @@ -324,18 +324,19 @@ def read_training(nlp, training_dir, dev, limit): if 5 < sent_length < 100: ents_by_offset[str(ent.start_char) + "_" + str(ent.end_char)] = ent else: - skip_articles.add(current_article_id) + skip_articles.add(article_id) current_doc = None except Exception as e: print("Problem parsing article", article_id, e) - skip_articles.add(current_article_id) + skip_articles.add(article_id) + raise e # repeat checking this condition in case an exception was thrown if current_doc and (current_article_id == article_id): found_ent = ents_by_offset.get(start + "_" + end, None) if found_ent: if found_ent.text != alias: - skip_articles.add(current_article_id) + skip_articles.add(article_id) current_doc = None else: sent = found_ent.sent.as_doc() diff --git a/spacy/errors.py b/spacy/errors.py index fcc3132c6..5684721ae 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -399,6 +399,9 @@ class Errors(object): E138 = ("Invalid JSONL format for raw text '{text}'. Make sure the input includes either the " "`text` or `tokens` key. For more info, see the docs:\n" "https://spacy.io/api/cli#pretrain-jsonl") + E139 = ("Knowledge base for component '{name}' not initialized. 
Did you forget to call set_kb()?") + E140 = ("The list of entities, prior probabilities and entity vectors should be of equal length.") + E141 = ("Entity vectors should be of length {required} instead of the provided {found}.") @add_codes diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 72f66b107..4d9d2b89b 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -106,9 +106,9 @@ cdef class KnowledgeBase: user_warning(Warnings.W018.format(entity=entity)) return + # Raise an error if the provided entity vector is not of the correct length if len(entity_vector) != self.entity_vector_length: - # TODO: proper error - raise ValueError("Entity vector length should have been", self.entity_vector_length) + raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length)) vector_index = self.c_add_vector(entity_vector=entity_vector) @@ -121,13 +121,8 @@ cdef class KnowledgeBase: return entity_hash cpdef set_entities(self, entity_list, prob_list, vector_list): - if len(entity_list) != len(prob_list): - # TODO: proper error - raise ValueError("Entity list and prob list should have the same length") - - if len(entity_list) != len(vector_list): - # TODO: proper error - raise ValueError("Entity list and vector list should have the same length") + if len(entity_list) != len(prob_list) or len(entity_list) != len(vector_list): + raise ValueError(Errors.E140) nr_entities = len(entity_list) self._entry_index = PreshMap(nr_entities+1) @@ -138,8 +133,7 @@ cdef class KnowledgeBase: while i < nr_entities: entity_vector = vector_list[i] if len(entity_vector) != self.entity_vector_length: - # TODO: proper error - raise ValueError("Entity vector is", len(entity_vector), "length but should have been", self.entity_vector_length) + raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length)) entity_hash = self.vocab.strings.add(entity_list[i]) entry.entity_hash = entity_hash diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index a191a7906..2f7856fe0 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1002,7 +1002,7 @@ cdef class DependencyParser(Parser): @property def postprocesses(self): - return [nonproj.deprojectivize, merge_subtokens] + return [nonproj.deprojectivize] # , merge_subtokens] def add_multitask_objective(self, target): if target == "cloze": @@ -1100,8 +1100,7 @@ class EntityLinker(Pipe): def require_kb(self): # Raise an error if the knowledge base is not initialized. 
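# Sketch of the wiring that avoids the E139 error raised just below (illustrative only):
# the knowledge base has to be attached with set_kb() before begin_training() or prediction.
# `nlp` and `kb` are assumed to be an existing pipeline and KnowledgeBase from earlier steps.
el_pipe = nlp.create_pipe(name="entity_linker", config={})
el_pipe.set_kb(kb)
el_pipe.begin_training()
nlp.add_pipe(el_pipe, last=True)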
if getattr(self, "kb", None) in (None, True, False): - # TODO: custom error - raise ValueError(Errors.E109.format(name=self.name)) + raise ValueError(Errors.E139.format(name=self.name)) def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs): self.require_kb() From 0b0959b363bd0e8eeb4a9b5aa8f24f618525dbbf Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 19 Jun 2019 13:11:39 +0200 Subject: [PATCH 086/148] UTF8 encoding --- spacy/tests/serialize/test_serialize_kb.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py index 7a8022890..bcf27990b 100644 --- a/spacy/tests/serialize/test_serialize_kb.py +++ b/spacy/tests/serialize/test_serialize_kb.py @@ -1,5 +1,4 @@ -import spacy -from spacy.lang.en import English +# coding: utf-8 from ..util import make_tempdir from ...util import ensure_path From b76a43bee4b085944f661f4cfce2bbcd11af138f Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 19 Jun 2019 13:26:33 +0200 Subject: [PATCH 087/148] unicode strings --- spacy/tests/serialize/test_serialize_kb.py | 32 +++++++++++----------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py index bcf27990b..26e912738 100644 --- a/spacy/tests/serialize/test_serialize_kb.py +++ b/spacy/tests/serialize/test_serialize_kb.py @@ -29,14 +29,14 @@ def test_serialize_kb_disk(en_vocab): def _get_dummy_kb(vocab): kb = KnowledgeBase(vocab=vocab, entity_vector_length=3) - kb.add_entity(entity="Q53", prob=0.33, entity_vector=[0, 5, 3]) - kb.add_entity(entity="Q17", prob=0.2, entity_vector=[7, 1, 0]) - kb.add_entity(entity="Q007", prob=0.7, entity_vector=[0, 0, 7]) - kb.add_entity(entity="Q44", prob=0.4, entity_vector=[4, 4, 4]) + kb.add_entity(entity=u'Q53', prob=0.33, entity_vector=[0, 5, 3]) + kb.add_entity(entity=u'Q17', prob=0.2, entity_vector=[7, 1, 0]) + kb.add_entity(entity=u'Q007', prob=0.7, entity_vector=[0, 0, 7]) + kb.add_entity(entity=u'Q44', prob=0.4, entity_vector=[4, 4, 4]) - kb.add_alias(alias="double07", entities=["Q17", "Q007"], probabilities=[0.1, 0.9]) - kb.add_alias(alias="guy", entities=["Q53", "Q007", "Q17", "Q44"], probabilities=[0.3, 0.3, 0.2, 0.1]) - kb.add_alias(alias="random", entities=["Q007"], probabilities=[1.0]) + kb.add_alias(alias=u'double07', entities=[u'Q17', u'Q007'], probabilities=[0.1, 0.9]) + kb.add_alias(alias=u'guy', entities=[u'Q53', u'Q007', u'Q17', u'Q44'], probabilities=[0.3, 0.3, 0.2, 0.1]) + kb.add_alias(alias=u'random', entities=[u'Q007'], probabilities=[1.0]) return kb @@ -44,30 +44,30 @@ def _get_dummy_kb(vocab): def _check_kb(kb): # check entities assert kb.get_size_entities() == 4 - for entity_string in ["Q53", "Q17", "Q007", "Q44"]: + for entity_string in [u'Q53', u'Q17', u'Q007', u'Q44']: assert entity_string in kb.get_entity_strings() - for entity_string in ["", "Q0"]: + for entity_string in [u'', u'Q0']: assert entity_string not in kb.get_entity_strings() # check aliases assert kb.get_size_aliases() == 3 - for alias_string in ["double07", "guy", "random"]: + for alias_string in [u'double07', u'guy', u'random']: assert alias_string in kb.get_alias_strings() - for alias_string in ["nothingness", "", "randomnoise"]: + for alias_string in [u'nothingness', u'', u'randomnoise']: assert alias_string not in kb.get_alias_strings() # check candidates & probabilities - candidates = sorted(kb.get_candidates("double07"), key=lambda x: 
x.entity_) + candidates = sorted(kb.get_candidates(u'double07'), key=lambda x: x.entity_) assert len(candidates) == 2 - assert candidates[0].entity_ == "Q007" + assert candidates[0].entity_ == u'Q007' assert 0.6999 < candidates[0].entity_freq < 0.701 assert candidates[0].entity_vector == [0, 0, 7] - assert candidates[0].alias_ == "double07" + assert candidates[0].alias_ == u'double07' assert 0.899 < candidates[0].prior_prob < 0.901 - assert candidates[1].entity_ == "Q17" + assert candidates[1].entity_ == u'Q17' assert 0.199 < candidates[1].entity_freq < 0.201 assert candidates[1].entity_vector == [7, 1, 0] - assert candidates[1].alias_ == "double07" + assert candidates[1].alias_ == u'double07' assert 0.099 < candidates[1].prior_prob < 0.101 From 872121955c1ba3e8b9d4b2ee9b9ac89b2e85d1d5 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 20 Jun 2019 10:35:51 +0200 Subject: [PATCH 088/148] Update error code --- spacy/cli/pretrain.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 7afd10520..a95a40980 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -252,7 +252,7 @@ def get_vectors_loss(ops, docs, prediction, objective="L2"): elif objective == "cosine": loss, d_target = get_cossim_loss(prediction, target) else: - raise ValueError(Errors.E139.format(loss_func=objective)) + raise ValueError(Errors.E142.format(loss_func=objective)) return loss, d_target From b58bace84b56cc3dcc4f78e0b9dae15effdcd51e Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 24 Jun 2019 10:55:04 +0200 Subject: [PATCH 089/148] small fixes --- bin/ud/conll17_ud_eval.py | 4 ++-- bin/wiki_entity_linking/kb_creator.py | 18 +++++++++--------- .../training_set_creator.py | 2 +- examples/pipeline/wikidata_entity_linking.py | 4 ++-- spacy/pipeline/pipes.pyx | 15 +++++++-------- 5 files changed, 21 insertions(+), 22 deletions(-) diff --git a/bin/ud/conll17_ud_eval.py b/bin/ud/conll17_ud_eval.py index 78a976a6d..88acfabac 100644 --- a/bin/ud/conll17_ud_eval.py +++ b/bin/ud/conll17_ud_eval.py @@ -292,8 +292,8 @@ def evaluate(gold_ud, system_ud, deprel_weights=None, check_parse=True): def spans_score(gold_spans, system_spans): correct, gi, si = 0, 0, 0 - undersegmented = list() - oversegmented = list() + undersegmented = [] + oversegmented = [] combo = 0 previous_end_si_earlier = False previous_end_gi_earlier = False diff --git a/bin/wiki_entity_linking/kb_creator.py b/bin/wiki_entity_linking/kb_creator.py index bd82e5b4e..6ee139174 100644 --- a/bin/wiki_entity_linking/kb_creator.py +++ b/bin/wiki_entity_linking/kb_creator.py @@ -42,9 +42,9 @@ def create_kb(nlp, max_entities_per_alias, min_entity_freq, min_occ, # filter the entities for in the KB by frequency, because there's just too much data (8M entities) otherwise filtered_title_to_id = dict() - entity_list = list() - description_list = list() - frequency_list = list() + entity_list = [] + description_list = [] + frequency_list = [] for title, entity in title_to_id.items(): freq = entity_frequencies.get(title, 0) desc = id_to_descr.get(entity, None) @@ -131,8 +131,8 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_in line = prior_file.readline() previous_alias = None total_count = 0 - counts = list() - entities = list() + counts = [] + entities = [] while line: splits = line.replace('\n', "").split(sep='|') new_alias = splits[0] @@ -142,8 +142,8 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_in if new_alias != previous_alias 
and previous_alias: # done reading the previous alias --> output if len(entities) > 0: - selected_entities = list() - prior_probs = list() + selected_entities = [] + prior_probs = [] for ent_count, ent_string in zip(counts, entities): if ent_string in wp_titles: wd_id = title_to_id[ent_string] @@ -157,8 +157,8 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_in except ValueError as e: print(e) total_count = 0 - counts = list() - entities = list() + counts = [] + entities = [] total_count += count diff --git a/bin/wiki_entity_linking/training_set_creator.py b/bin/wiki_entity_linking/training_set_creator.py index d9600048c..51105ce09 100644 --- a/bin/wiki_entity_linking/training_set_creator.py +++ b/bin/wiki_entity_linking/training_set_creator.py @@ -343,7 +343,7 @@ def read_training(nlp, training_dir, dev, limit): # currently feeding the gold data one entity per sentence at a time gold_start = int(start) - found_ent.sent.start_char gold_end = int(end) - found_ent.sent.start_char - gold_entities = list() + gold_entities = [] gold_entities.append((gold_start, gold_end, wp_title)) gold = GoldParse(doc=sent, links=gold_entities) data.append((sent, gold)) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index aa1c00996..2759da135 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -147,7 +147,7 @@ def run_pipeline(): if train_pipe: print("STEP 6: training Entity Linking pipe", datetime.datetime.now()) # define the size (nr of entities) of training and dev set - train_limit = 10000 + train_limit = 5000 dev_limit = 5000 train_data = training_set_creator.read_training(nlp=nlp_2, @@ -332,7 +332,7 @@ def _measure_baselines(data, kb): best_candidate = "" random_candidate = "" if candidates: - scores = list() + scores = [] for c in candidates: scores.append(c.prior_prob) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 2f7856fe0..2eaedd73a 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1131,8 +1131,8 @@ class EntityLinker(Pipe): docs = [docs] golds = [golds] - context_docs = list() - entity_encodings = list() + context_docs = [] + entity_encodings = [] for doc, gold in zip(docs, golds): for entity in gold.links: @@ -1198,8 +1198,8 @@ class EntityLinker(Pipe): self.require_model() self.require_kb() - final_entities = list() - final_kb_ids = list() + final_entities = [] + final_kb_ids = [] if not docs: return final_entities, final_kb_ids @@ -1214,7 +1214,7 @@ class EntityLinker(Pipe): for ent in doc.ents: candidates = self.kb.get_candidates(ent.text) if candidates: - scores = list() + scores = [] for c in candidates: prior_prob = c.prior_prob * self.prior_weight kb_id = c.entity_ @@ -1259,11 +1259,10 @@ class EntityLinker(Pipe): return self def rehearse(self, docs, sgd=None, losses=None, **config): - # TODO - pass + raise NotImplementedError def add_label(self, label): - pass + raise NotImplementedError class Sentencizer(object): From ddc73b11a9caae7497b5d8d90e97a5b13b9dc6fa Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 24 Jun 2019 12:58:18 +0200 Subject: [PATCH 090/148] fix unicode literals --- spacy/tests/pipeline/test_entity_linker.py | 54 +++++++++++----------- spacy/tests/serialize/test_serialize_kb.py | 35 +++++++------- 2 files changed, 45 insertions(+), 44 deletions(-) diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index b44332df4..b12ad3917 
100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -17,13 +17,13 @@ def test_kb_valid_entities(nlp): mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities - mykb.add_entity(entity=u'Q1', prob=0.9, entity_vector=[1]) - mykb.add_entity(entity=u'Q2', prob=0.5, entity_vector=[2]) - mykb.add_entity(entity=u'Q3', prob=0.5, entity_vector=[3]) + mykb.add_entity(entity='Q1', prob=0.9, entity_vector=[1]) + mykb.add_entity(entity='Q2', prob=0.5, entity_vector=[2]) + mykb.add_entity(entity='Q3', prob=0.5, entity_vector=[3]) # adding aliases - mykb.add_alias(alias=u'douglas', entities=[u'Q2', u'Q3'], probabilities=[0.8, 0.2]) - mykb.add_alias(alias=u'adam', entities=[u'Q2'], probabilities=[0.9]) + mykb.add_alias(alias='douglas', entities=['Q2', 'Q3'], probabilities=[0.8, 0.2]) + mykb.add_alias(alias='adam', entities=['Q2'], probabilities=[0.9]) # test the size of the corresponding KB assert(mykb.get_size_entities() == 3) @@ -35,13 +35,13 @@ def test_kb_invalid_entities(nlp): mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities - mykb.add_entity(entity=u'Q1', prob=0.9, entity_vector=[1]) - mykb.add_entity(entity=u'Q2', prob=0.2, entity_vector=[2]) - mykb.add_entity(entity=u'Q3', prob=0.5, entity_vector=[3]) + mykb.add_entity(entity='Q1', prob=0.9, entity_vector=[1]) + mykb.add_entity(entity='Q2', prob=0.2, entity_vector=[2]) + mykb.add_entity(entity='Q3', prob=0.5, entity_vector=[3]) # adding aliases - should fail because one of the given IDs is not valid with pytest.raises(ValueError): - mykb.add_alias(alias=u'douglas', entities=[u'Q2', u'Q342'], probabilities=[0.8, 0.2]) + mykb.add_alias(alias='douglas', entities=['Q2', 'Q342'], probabilities=[0.8, 0.2]) def test_kb_invalid_probabilities(nlp): @@ -49,13 +49,13 @@ def test_kb_invalid_probabilities(nlp): mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities - mykb.add_entity(entity=u'Q1', prob=0.9, entity_vector=[1]) - mykb.add_entity(entity=u'Q2', prob=0.2, entity_vector=[2]) - mykb.add_entity(entity=u'Q3', prob=0.5, entity_vector=[3]) + mykb.add_entity(entity='Q1', prob=0.9, entity_vector=[1]) + mykb.add_entity(entity='Q2', prob=0.2, entity_vector=[2]) + mykb.add_entity(entity='Q3', prob=0.5, entity_vector=[3]) # adding aliases - should fail because the sum of the probabilities exceeds 1 with pytest.raises(ValueError): - mykb.add_alias(alias=u'douglas', entities=[u'Q2', u'Q3'], probabilities=[0.8, 0.4]) + mykb.add_alias(alias='douglas', entities=['Q2', 'Q3'], probabilities=[0.8, 0.4]) def test_kb_invalid_combination(nlp): @@ -63,13 +63,13 @@ def test_kb_invalid_combination(nlp): mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities - mykb.add_entity(entity=u'Q1', prob=0.9, entity_vector=[1]) - mykb.add_entity(entity=u'Q2', prob=0.2, entity_vector=[2]) - mykb.add_entity(entity=u'Q3', prob=0.5, entity_vector=[3]) + mykb.add_entity(entity='Q1', prob=0.9, entity_vector=[1]) + mykb.add_entity(entity='Q2', prob=0.2, entity_vector=[2]) + mykb.add_entity(entity='Q3', prob=0.5, entity_vector=[3]) # adding aliases - should fail because the entities and probabilities vectors are not of equal length with pytest.raises(ValueError): - mykb.add_alias(alias=u'douglas', entities=[u'Q2', u'Q3'], probabilities=[0.3, 0.4, 0.1]) + mykb.add_alias(alias='douglas', entities=['Q2', 'Q3'], probabilities=[0.3, 0.4, 0.1]) def test_kb_invalid_entity_vector(nlp): @@ -77,11 +77,11 @@ def test_kb_invalid_entity_vector(nlp): mykb = 
KnowledgeBase(nlp.vocab, entity_vector_length=3) # adding entities - mykb.add_entity(entity=u'Q1', prob=0.9, entity_vector=[1, 2, 3]) + mykb.add_entity(entity='Q1', prob=0.9, entity_vector=[1, 2, 3]) # this should fail because the kb's expected entity vector length is 3 with pytest.raises(ValueError): - mykb.add_entity(entity=u'Q2', prob=0.2, entity_vector=[2]) + mykb.add_entity(entity='Q2', prob=0.2, entity_vector=[2]) def test_candidate_generation(nlp): @@ -89,15 +89,15 @@ def test_candidate_generation(nlp): mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities - mykb.add_entity(entity=u'Q1', prob=0.9, entity_vector=[1]) - mykb.add_entity(entity=u'Q2', prob=0.2, entity_vector=[2]) - mykb.add_entity(entity=u'Q3', prob=0.5, entity_vector=[3]) + mykb.add_entity(entity='Q1', prob=0.9, entity_vector=[1]) + mykb.add_entity(entity='Q2', prob=0.2, entity_vector=[2]) + mykb.add_entity(entity='Q3', prob=0.5, entity_vector=[3]) # adding aliases - mykb.add_alias(alias=u'douglas', entities=[u'Q2', u'Q3'], probabilities=[0.8, 0.2]) - mykb.add_alias(alias=u'adam', entities=[u'Q2'], probabilities=[0.9]) + mykb.add_alias(alias='douglas', entities=['Q2', 'Q3'], probabilities=[0.8, 0.2]) + mykb.add_alias(alias='adam', entities=['Q2'], probabilities=[0.9]) # test the size of the relevant candidates - assert(len(mykb.get_candidates(u'douglas')) == 2) - assert(len(mykb.get_candidates(u'adam')) == 1) - assert(len(mykb.get_candidates(u'shrubbery')) == 0) + assert(len(mykb.get_candidates('douglas')) == 2) + assert(len(mykb.get_candidates('adam')) == 1) + assert(len(mykb.get_candidates('shrubbery')) == 0) diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py index 26e912738..fa7253fa1 100644 --- a/spacy/tests/serialize/test_serialize_kb.py +++ b/spacy/tests/serialize/test_serialize_kb.py @@ -1,4 +1,6 @@ # coding: utf-8 +from __future__ import unicode_literals + from ..util import make_tempdir from ...util import ensure_path @@ -16,7 +18,6 @@ def test_serialize_kb_disk(en_vocab): if not dir_path.exists(): dir_path.mkdir() file_path = dir_path / "kb" - print(file_path, type(file_path)) kb1.dump(str(file_path)) kb2 = KnowledgeBase(vocab=en_vocab, entity_vector_length=3) @@ -29,14 +30,14 @@ def test_serialize_kb_disk(en_vocab): def _get_dummy_kb(vocab): kb = KnowledgeBase(vocab=vocab, entity_vector_length=3) - kb.add_entity(entity=u'Q53', prob=0.33, entity_vector=[0, 5, 3]) - kb.add_entity(entity=u'Q17', prob=0.2, entity_vector=[7, 1, 0]) - kb.add_entity(entity=u'Q007', prob=0.7, entity_vector=[0, 0, 7]) - kb.add_entity(entity=u'Q44', prob=0.4, entity_vector=[4, 4, 4]) + kb.add_entity(entity='Q53', prob=0.33, entity_vector=[0, 5, 3]) + kb.add_entity(entity='Q17', prob=0.2, entity_vector=[7, 1, 0]) + kb.add_entity(entity='Q007', prob=0.7, entity_vector=[0, 0, 7]) + kb.add_entity(entity='Q44', prob=0.4, entity_vector=[4, 4, 4]) - kb.add_alias(alias=u'double07', entities=[u'Q17', u'Q007'], probabilities=[0.1, 0.9]) - kb.add_alias(alias=u'guy', entities=[u'Q53', u'Q007', u'Q17', u'Q44'], probabilities=[0.3, 0.3, 0.2, 0.1]) - kb.add_alias(alias=u'random', entities=[u'Q007'], probabilities=[1.0]) + kb.add_alias(alias='double07', entities=['Q17', 'Q007'], probabilities=[0.1, 0.9]) + kb.add_alias(alias='guy', entities=['Q53', 'Q007', 'Q17', 'Q44'], probabilities=[0.3, 0.3, 0.2, 0.1]) + kb.add_alias(alias='random', entities=['Q007'], probabilities=[1.0]) return kb @@ -44,30 +45,30 @@ def _get_dummy_kb(vocab): def _check_kb(kb): # check entities assert 
kb.get_size_entities() == 4 - for entity_string in [u'Q53', u'Q17', u'Q007', u'Q44']: + for entity_string in ['Q53', 'Q17', 'Q007', 'Q44']: assert entity_string in kb.get_entity_strings() - for entity_string in [u'', u'Q0']: + for entity_string in ['', 'Q0']: assert entity_string not in kb.get_entity_strings() # check aliases assert kb.get_size_aliases() == 3 - for alias_string in [u'double07', u'guy', u'random']: + for alias_string in ['double07', 'guy', 'random']: assert alias_string in kb.get_alias_strings() - for alias_string in [u'nothingness', u'', u'randomnoise']: + for alias_string in ['nothingness', '', 'randomnoise']: assert alias_string not in kb.get_alias_strings() # check candidates & probabilities - candidates = sorted(kb.get_candidates(u'double07'), key=lambda x: x.entity_) + candidates = sorted(kb.get_candidates('double07'), key=lambda x: x.entity_) assert len(candidates) == 2 - assert candidates[0].entity_ == u'Q007' + assert candidates[0].entity_ == 'Q007' assert 0.6999 < candidates[0].entity_freq < 0.701 assert candidates[0].entity_vector == [0, 0, 7] - assert candidates[0].alias_ == u'double07' + assert candidates[0].alias_ == 'double07' assert 0.899 < candidates[0].prior_prob < 0.901 - assert candidates[1].entity_ == u'Q17' + assert candidates[1].entity_ == 'Q17' assert 0.199 < candidates[1].entity_freq < 0.201 assert candidates[1].entity_vector == [7, 1, 0] - assert candidates[1].alias_ == u'double07' + assert candidates[1].alias_ == 'double07' assert 0.099 < candidates[1].prior_prob < 0.101 From 58a5b40ef6e58d30afd57eec8189b54bded32a47 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 24 Jun 2019 15:19:58 +0200 Subject: [PATCH 091/148] clean up duplicate code --- spacy/pipeline/pipes.pyx | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 2eaedd73a..47ba4dc05 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -12,8 +12,8 @@ from thinc.api import chain from thinc.v2v import Affine, Maxout, Softmax from thinc.misc import LayerNorm from thinc.neural.util import to_categorical -from thinc.neural.util import get_array_module +from ..cli.pretrain import get_cossim_loss from .functions import merge_subtokens from ..tokens.doc cimport Doc from ..syntax.nn_parser cimport Parser @@ -1162,26 +1162,11 @@ class EntityLinker(Pipe): return 0 def get_loss(self, docs, golds, scores): - targets = [[1] for _ in golds] # assuming we're only using positive examples - loss, gradients = self.get_cossim_loss_2(yh=scores, y=golds, t=targets) + # this loss function assumes we're only using positive examples + loss, gradients = get_cossim_loss(yh=scores, y=golds) loss = loss / len(golds) return loss, gradients - def get_cossim_loss_2(self, yh, y, t): - # Add a small constant to avoid 0 vectors - yh = yh + 1e-8 - y = y + 1e-8 - # https://math.stackexchange.com/questions/1923613/partial-derivative-of-cosine-similarity - xp = get_array_module(yh) - norm_yh = xp.linalg.norm(yh, axis=1, keepdims=True) - norm_y = xp.linalg.norm(y, axis=1, keepdims=True) - mul_norms = norm_yh * norm_y - cos = (yh * y).sum(axis=1, keepdims=True) / mul_norms - d_yh = (y / mul_norms) - (cos * (yh / norm_yh ** 2)) - loss = xp.abs(cos - t).sum() - inverse = np.asarray([int(t[i][0]) * d_yh[i] for i in range(len(t))]) - return loss, -inverse - def __call__(self, doc): entities, kb_ids = self.predict([doc]) self.set_annotations([doc], entities, kb_ids) From 86086855436f15bf24a9d3d0993a1dafc4003d1a Mon Sep 17 
00:00:00 2001 From: svlandeg Date: Tue, 25 Jun 2019 15:28:51 +0200 Subject: [PATCH 092/148] ensure Span.as_doc keeps the entity links + unit test --- spacy/attrs.pxd | 1 + spacy/attrs.pyx | 1 + spacy/symbols.pxd | 1 + spacy/symbols.pyx | 1 + spacy/tests/pipeline/test_entity_linker.py | 42 ++++++++++++++++++++++ spacy/tokens/doc.pyx | 7 ++-- spacy/tokens/span.pyx | 2 +- spacy/tokens/token.pxd | 4 +++ 8 files changed, 56 insertions(+), 3 deletions(-) diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd index 79a177ba9..c5ba8d765 100644 --- a/spacy/attrs.pxd +++ b/spacy/attrs.pxd @@ -82,6 +82,7 @@ cdef enum attr_id_t: DEP ENT_IOB ENT_TYPE + ENT_KB_ID HEAD SENT_START SPACY diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx index ed1f39a3f..8eeea363f 100644 --- a/spacy/attrs.pyx +++ b/spacy/attrs.pyx @@ -84,6 +84,7 @@ IDS = { "DEP": DEP, "ENT_IOB": ENT_IOB, "ENT_TYPE": ENT_TYPE, + "ENT_KB_ID": ENT_KB_ID, "HEAD": HEAD, "SENT_START": SENT_START, "SPACY": SPACY, diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd index 051b92edb..4501861a2 100644 --- a/spacy/symbols.pxd +++ b/spacy/symbols.pxd @@ -81,6 +81,7 @@ cdef enum symbol_t: DEP ENT_IOB ENT_TYPE + ENT_KB_ID HEAD SENT_START SPACY diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx index 949621820..b65ae9628 100644 --- a/spacy/symbols.pyx +++ b/spacy/symbols.pyx @@ -86,6 +86,7 @@ IDS = { "DEP": DEP, "ENT_IOB": ENT_IOB, "ENT_TYPE": ENT_TYPE, + "ENT_KB_ID": ENT_KB_ID, "HEAD": HEAD, "SENT_START": SENT_START, "SPACY": SPACY, diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index b12ad3917..7ea893408 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -5,6 +5,7 @@ import pytest from spacy.kb import KnowledgeBase from spacy.lang.en import English +from spacy.pipeline import EntityRuler @pytest.fixture @@ -101,3 +102,44 @@ def test_candidate_generation(nlp): assert(len(mykb.get_candidates('douglas')) == 2) assert(len(mykb.get_candidates('adam')) == 1) assert(len(mykb.get_candidates('shrubbery')) == 0) + + +def test_preserving_links_asdoc(nlp): + """Test that Span.as_doc preserves the existing entity links""" + mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) + + # adding entities + mykb.add_entity(entity='Q1', prob=0.9, entity_vector=[1]) + mykb.add_entity(entity='Q2', prob=0.8, entity_vector=[1]) + + # adding aliases + mykb.add_alias(alias='Boston', entities=['Q1'], probabilities=[0.7]) + mykb.add_alias(alias='Denver', entities=['Q2'], probabilities=[0.6]) + + # set up pipeline with NER (Entity Ruler) and NEL (prior probability only, model not trained) + sentencizer = nlp.create_pipe("sentencizer") + nlp.add_pipe(sentencizer) + + ruler = EntityRuler(nlp) + patterns = [{"label": "GPE", "pattern": "Boston"}, + {"label": "GPE", "pattern": "Denver"}] + ruler.add_patterns(patterns) + nlp.add_pipe(ruler) + + el_pipe = nlp.create_pipe(name='entity_linker', config={}) + el_pipe.set_kb(mykb) + el_pipe.begin_training() + el_pipe.context_weight = 0 + el_pipe.prior_weight = 1 + nlp.add_pipe(el_pipe, last=True) + + # test whether the entity links are preserved by the `as_doc()` function + text = "She lives in Boston. He lives in Denver." 
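# Illustrative aside, not part of the test: Span.as_doc can only carry the link along
# because ENT_KB_ID is now a regular token attribute (see the attrs/symbols changes above),
# so it survives the to_array/from_array round trip that as_doc performs. A rough sketch of
# that round trip by hand, assuming a `doc` processed by a pipeline like the one above and
# a spaCy build where ENT_KB_ID is importable from spacy.attrs:
from spacy.attrs import ENT_IOB, ENT_TYPE, ENT_KB_ID
from spacy.tokens import Doc

def copy_entity_links(doc):
    # rebuild a Doc from plain words and copy the NER and KB id columns across
    attr_ids = [ENT_IOB, ENT_TYPE, ENT_KB_ID]
    arr = doc.to_array(attr_ids)
    new_doc = Doc(doc.vocab, words=[t.text for t in doc],
                  spaces=[bool(t.whitespace_) for t in doc])
    new_doc.from_array(attr_ids, arr)
    return new_doc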
+ doc = nlp(text) + for ent in doc.ents: + orig_text = ent.text + orig_kb_id = ent.kb_id_ + sent_doc = ent.sent.as_doc() + for s_ent in sent_doc.ents: + if s_ent.text == orig_text: + assert s_ent.kb_id_ == orig_kb_id diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 131c43d37..10f57ed60 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -22,7 +22,7 @@ from ..lexeme cimport Lexeme, EMPTY_LEXEME from ..typedefs cimport attr_t, flags_t from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, CLUSTER from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB -from ..attrs cimport ENT_TYPE, SENT_START, attr_id_t +from ..attrs cimport ENT_TYPE, ENT_KB_ID, SENT_START, attr_id_t from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t from ..attrs import intify_attrs, IDS @@ -64,6 +64,8 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil: return token.ent_iob elif feat_name == ENT_TYPE: return token.ent_type + elif feat_name == ENT_KB_ID: + return token.ent_kb_id else: return Lexeme.get_struct_attr(token.lex, feat_name) @@ -850,7 +852,7 @@ cdef class Doc: DOCS: https://spacy.io/api/doc#to_bytes """ - array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE] + array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE] # TODO: ENT_KB_ID ? if self.is_tagged: array_head.append(TAG) # If doc parsed add head and dep attribute @@ -1004,6 +1006,7 @@ cdef class Doc: """ cdef unicode tag, lemma, ent_type deprecation_warning(Warnings.W013.format(obj="Doc")) + # TODO: ENT_KB_ID ? if len(args) == 3: deprecation_warning(Warnings.W003) tag, lemma, ent_type = args diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 97b6a1adc..3f4f4418b 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -210,7 +210,7 @@ cdef class Span: words = [t.text for t in self] spaces = [bool(t.whitespace_) for t in self] cdef Doc doc = Doc(self.doc.vocab, words=words, spaces=spaces) - array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE] + array_head = [LENGTH, SPACY, LEMMA, ENT_IOB, ENT_TYPE, ENT_KB_ID] if self.doc.is_tagged: array_head.append(TAG) # If doc parsed add head and dep attribute diff --git a/spacy/tokens/token.pxd b/spacy/tokens/token.pxd index bb9f7d070..ec5df3fac 100644 --- a/spacy/tokens/token.pxd +++ b/spacy/tokens/token.pxd @@ -53,6 +53,8 @@ cdef class Token: return token.ent_iob elif feat_name == ENT_TYPE: return token.ent_type + elif feat_name == ENT_KB_ID: + return token.ent_kb_id elif feat_name == SENT_START: return token.sent_start else: @@ -79,5 +81,7 @@ cdef class Token: token.ent_iob = value elif feat_name == ENT_TYPE: token.ent_type = value + elif feat_name == ENT_KB_ID: + token.ent_kb_id = value elif feat_name == SENT_START: token.sent_start = value From bee23cd8af0cfde6027e00fe506033a03c05170a Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 25 Jun 2019 16:09:22 +0200 Subject: [PATCH 093/148] try Tok2Vec instead of SpacyVectors --- examples/pipeline/wikidata_entity_linking.py | 87 ++++++++++++-------- spacy/_ml.py | 25 ++++-- 2 files changed, 69 insertions(+), 43 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 2759da135..9dc2e514f 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -61,22 +61,23 @@ def run_pipeline(): to_create_kb = False # read KB back in from file - to_read_kb = True + to_read_kb = False to_test_kb = False # create training dataset create_wp_training = False 
# train the EL pipe - train_pipe = True - measure_performance = True + train_pipe = False + measure_performance = False # test the EL pipe on a simple example - to_test_pipeline = True + to_test_pipeline = False # write the NLP object, read back in and test again to_write_nlp = False - to_read_nlp = False + to_read_nlp = True + test_from_file = True # STEP 1 : create prior probabilities from WP (run only once) if to_create_prior_probs: @@ -134,21 +135,21 @@ def run_pipeline(): training_output=TRAINING_DIR) # STEP 6: create and train the entity linking pipe - el_pipe = nlp_2.create_pipe(name='entity_linker', config={}) - el_pipe.set_kb(kb_2) - nlp_2.add_pipe(el_pipe, last=True) - - other_pipes = [pipe for pipe in nlp_2.pipe_names if pipe != "entity_linker"] - with nlp_2.disable_pipes(*other_pipes): # only train Entity Linking - optimizer = nlp_2.begin_training() - optimizer.learn_rate = LEARN_RATE - optimizer.L2 = L2 - if train_pipe: + el_pipe = nlp_2.create_pipe(name='entity_linker', config={}) + el_pipe.set_kb(kb_2) + nlp_2.add_pipe(el_pipe, last=True) + + other_pipes = [pipe for pipe in nlp_2.pipe_names if pipe != "entity_linker"] + with nlp_2.disable_pipes(*other_pipes): # only train Entity Linking + optimizer = nlp_2.begin_training() + optimizer.learn_rate = LEARN_RATE + optimizer.L2 = L2 + print("STEP 6: training Entity Linking pipe", datetime.datetime.now()) # define the size (nr of entities) of training and dev set train_limit = 5000 - dev_limit = 5000 + dev_limit = 10000 train_data = training_set_creator.read_training(nlp=nlp_2, training_dir=TRAINING_DIR, @@ -230,40 +231,56 @@ def run_pipeline(): el_pipe.context_weight = 1 el_pipe.prior_weight = 1 - # STEP 8: apply the EL pipe on a toy example - if to_test_pipeline: - print() - print("STEP 8: applying Entity Linking to toy example", datetime.datetime.now()) - print() - run_el_toy_example(nlp=nlp_2) + # STEP 8: apply the EL pipe on a toy example + if to_test_pipeline: + print() + print("STEP 8: applying Entity Linking to toy example", datetime.datetime.now()) + print() + run_el_toy_example(nlp=nlp_2) - # STEP 9: write the NLP pipeline (including entity linker) to file - if to_write_nlp: - print() - print("STEP 9: testing NLP IO", datetime.datetime.now()) - print() - print("writing to", NLP_2_DIR) - nlp_2.to_disk(NLP_2_DIR) - print() + # STEP 9: write the NLP pipeline (including entity linker) to file + if to_write_nlp: + print() + print("STEP 9: testing NLP IO", datetime.datetime.now()) + print() + print("writing to", NLP_2_DIR) + nlp_2.to_disk(NLP_2_DIR) + print() + + # verify that the IO has gone correctly + if to_read_nlp: print("reading from", NLP_2_DIR) nlp_3 = spacy.load(NLP_2_DIR) - # verify that the IO has gone correctly - if to_read_nlp: + if test_from_file: + dev_limit = 5000 + dev_data = training_set_creator.read_training(nlp=nlp_3, + training_dir=TRAINING_DIR, + dev=True, + limit=dev_limit) + + print("Dev testing from file on", len(dev_data), "articles") print() - print("running toy example with NLP 2") + + dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data) + print("dev acc combo avg:", round(dev_acc_combo, 3), + [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()]) + else: + print("running toy example with NLP 3") run_el_toy_example(nlp=nlp_3) print() print("STOP", datetime.datetime.now()) -def _measure_accuracy(data, el_pipe): +def _measure_accuracy(data, el_pipe=None): + # If the docs in the data require further processing with an entity linker, set el_pipe correct_by_label = dict() 
incorrect_by_label = dict() docs = [d for d, g in data if len(d) > 0] - docs = el_pipe.pipe(docs) + if el_pipe is not None: + docs = el_pipe.pipe(docs) golds = [g for d, g in data if len(d) > 0] for doc, gold in zip(docs, golds): diff --git a/spacy/_ml.py b/spacy/_ml.py index 9139152aa..82db0fc05 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -655,23 +655,32 @@ def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False, def build_nel_encoder(in_width, hidden_width, end_width, **cfg): conv_depth = cfg.get("conv_depth", 2) cnn_maxout_pieces = cfg.get("cnn_maxout_pieces", 3) + pretrained_vectors = cfg.get("pretrained_vectors") # self.nlp.vocab.vectors.name + + tok2vec = Tok2Vec(width=hidden_width, embed_size=in_width, pretrained_vectors=pretrained_vectors, + cnn_maxout_pieces=cnn_maxout_pieces, subword_features=False, conv_depth=conv_depth, bilstm_depth=0) with Model.define_operators({">>": chain, "**": clone}): - convolution = Residual((ExtractWindow(nW=1) >> - LN(Maxout(hidden_width, hidden_width * 3, pieces=cnn_maxout_pieces)))) + # convolution = Residual((ExtractWindow(nW=1) >> + # LN(Maxout(hidden_width, hidden_width * 3, pieces=cnn_maxout_pieces)))) - encoder = SpacyVectors \ - >> with_flatten(Affine(hidden_width, in_width))\ - >> with_flatten(LN(Maxout(hidden_width, hidden_width)) >> convolution ** conv_depth, pad=conv_depth) \ - >> flatten_add_lengths \ - >> ParametricAttention(hidden_width) \ - >> Pooling(sum_pool) \ + # encoder = SpacyVectors \ + # >> with_flatten(Affine(hidden_width, in_width)) \ + # >> with_flatten(LN(Maxout(hidden_width, hidden_width)) >> convolution ** conv_depth, pad=conv_depth) \ + # >> flatten_add_lengths \ + # >> ParametricAttention(hidden_width) \ + # >> Pooling(sum_pool) \ + # >> Residual(zero_init(Maxout(hidden_width, hidden_width))) \ + # >> zero_init(Affine(end_width, hidden_width, drop_factor=0.0)) + + encoder = tok2vec >> flatten_add_lengths >> Pooling(mean_pool)\ >> Residual(zero_init(Maxout(hidden_width, hidden_width))) \ >> zero_init(Affine(end_width, hidden_width, drop_factor=0.0)) # TODO: ReLu or LN(Maxout) ? # sum_pool or mean_pool ? 
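# Illustrative aside on the open question above, not part of the patch: with
# flatten_add_lengths >> Pooling(...), sum_pool adds the token vectors of a document while
# mean_pool averages them, so mean_pool keeps the output scale independent of document
# length. A plain NumPy sketch of the two reductions over made-up token vectors:
import numpy as np

token_vectors = np.array([[1.0, 0.0], [3.0, 2.0], [2.0, 4.0]])  # one row per token
summed = token_vectors.sum(axis=0)     # what sum_pool would produce: [6.0, 6.0]
averaged = token_vectors.mean(axis=0)  # what mean_pool would produce: [2.0, 2.0]
print(summed, averaged)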
+ encoder.tok2vec = tok2vec encoder.nO = end_width return encoder From 1de61f68d645f0157f3902713bb69be1cf1421e2 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 26 Jun 2019 13:53:10 +0200 Subject: [PATCH 094/148] improve speed of prediction loop --- examples/pipeline/wikidata_entity_linking.py | 39 ++++++++++++-------- spacy/pipeline/pipes.pyx | 33 ++++++++++------- 2 files changed, 42 insertions(+), 30 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 9dc2e514f..2d300f699 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -76,7 +76,7 @@ def run_pipeline(): # write the NLP object, read back in and test again to_write_nlp = False - to_read_nlp = True + to_read_nlp = False test_from_file = True # STEP 1 : create prior probabilities from WP (run only once) @@ -252,22 +252,27 @@ def run_pipeline(): print("reading from", NLP_2_DIR) nlp_3 = spacy.load(NLP_2_DIR) - if test_from_file: - dev_limit = 5000 - dev_data = training_set_creator.read_training(nlp=nlp_3, - training_dir=TRAINING_DIR, - dev=True, - limit=dev_limit) + print("running toy example with NLP 3") + run_el_toy_example(nlp=nlp_3) - print("Dev testing from file on", len(dev_data), "articles") - print() + # testing performance with an NLP model from file + if test_from_file: + nlp_2 = spacy.load(NLP_1_DIR) + nlp_3 = spacy.load(NLP_2_DIR) + el_pipe = nlp_3.get_pipe("entity_linker") - dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data) - print("dev acc combo avg:", round(dev_acc_combo, 3), - [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()]) - else: - print("running toy example with NLP 3") - run_el_toy_example(nlp=nlp_3) + dev_limit = 10000 + dev_data = training_set_creator.read_training(nlp=nlp_2, + training_dir=TRAINING_DIR, + dev=True, + limit=dev_limit) + + print("Dev testing from file on", len(dev_data), "articles") + print() + + dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data, el_pipe=el_pipe) + print("dev acc combo avg:", round(dev_acc_combo, 3), + [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()]) print() print("STOP", datetime.datetime.now()) @@ -280,7 +285,9 @@ def _measure_accuracy(data, el_pipe=None): docs = [d for d, g in data if len(d) > 0] if el_pipe is not None: - docs = el_pipe.pipe(docs) + print("applying el_pipe", datetime.datetime.now()) + docs = list(el_pipe.pipe(docs, batch_size=10000000000)) + print("done applying el_pipe", datetime.datetime.now()) golds = [g for d, g in data if len(d) > 0] for doc, gold in zip(docs, golds): diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 47ba4dc05..33b3baf8d 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -3,8 +3,6 @@ # coding: utf8 from __future__ import unicode_literals -import numpy as np - import numpy import srsly from collections import OrderedDict @@ -12,6 +10,7 @@ from thinc.api import chain from thinc.v2v import Affine, Maxout, Softmax from thinc.misc import LayerNorm from thinc.neural.util import to_categorical +from thinc.neural.util import get_array_module from ..cli.pretrain import get_cossim_loss from .functions import merge_subtokens @@ -1151,7 +1150,7 @@ class EntityLinker(Pipe): if len(entity_encodings) > 0: context_encodings, bp_context = self.model.begin_update(context_docs, drop=drop) - entity_encodings = np.asarray(entity_encodings, dtype=np.float32) + entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32") loss, 
d_scores = self.get_loss(scores=context_encodings, golds=entity_encodings, docs=None) bp_context(d_scores, sgd=sgd) @@ -1192,24 +1191,30 @@ class EntityLinker(Pipe): if isinstance(docs, Doc): docs = [docs] + context_encodings = self.model(docs) + xp = get_array_module(context_encodings) + for i, doc in enumerate(docs): if len(doc) > 0: - context_encoding = self.model([doc]) - context_enc_t = np.transpose(context_encoding) + context_encoding = context_encodings[i] + context_enc_t = context_encoding.T + norm_1 = xp.linalg.norm(context_enc_t) for ent in doc.ents: candidates = self.kb.get_candidates(ent.text) if candidates: - scores = [] - for c in candidates: - prior_prob = c.prior_prob * self.prior_weight - kb_id = c.entity_ - entity_encoding = c.entity_vector - sim = float(cosine(np.asarray([entity_encoding]), context_enc_t)) * self.context_weight - score = prior_prob + sim - (prior_prob*sim) - scores.append(score) + prior_probs = xp.asarray([c.prior_prob for c in candidates]) + prior_probs *= self.prior_weight + + entity_encodings = xp.asarray([c.entity_vector for c in candidates]) + norm_2 = xp.linalg.norm(entity_encodings, axis=1) + + # cosine similarity + sims = xp.dot(entity_encodings, context_enc_t) / (norm_1 * norm_2) + sims *= self.context_weight + scores = prior_probs + sims - (prior_probs*sims) + best_index = scores.argmax() # TODO: thresholding - best_index = scores.index(max(scores)) best_candidate = candidates[best_index] final_entities.append(ent) final_kb_ids.append(best_candidate.entity_) From dbc53b9870a76840d50c29cd1708e02c02414756 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 26 Jun 2019 15:55:26 +0200 Subject: [PATCH 095/148] rename to KBEntryC --- examples/pipeline/wikidata_entity_linking.py | 18 ++++++++---------- spacy/kb.pxd | 10 +++++----- spacy/kb.pyx | 4 ++-- spacy/structs.pxd | 2 +- 4 files changed, 16 insertions(+), 18 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 2d300f699..9ce3b9559 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -61,23 +61,23 @@ def run_pipeline(): to_create_kb = False # read KB back in from file - to_read_kb = False + to_read_kb = True to_test_kb = False # create training dataset create_wp_training = False # train the EL pipe - train_pipe = False - measure_performance = False + train_pipe = True + measure_performance = True # test the EL pipe on a simple example - to_test_pipeline = False + to_test_pipeline = True # write the NLP object, read back in and test again - to_write_nlp = False + to_write_nlp = True to_read_nlp = False - test_from_file = True + test_from_file = False # STEP 1 : create prior probabilities from WP (run only once) if to_create_prior_probs: @@ -149,7 +149,7 @@ def run_pipeline(): print("STEP 6: training Entity Linking pipe", datetime.datetime.now()) # define the size (nr of entities) of training and dev set train_limit = 5000 - dev_limit = 10000 + dev_limit = 5000 train_data = training_set_creator.read_training(nlp=nlp_2, training_dir=TRAINING_DIR, @@ -285,9 +285,7 @@ def _measure_accuracy(data, el_pipe=None): docs = [d for d, g in data if len(d) > 0] if el_pipe is not None: - print("applying el_pipe", datetime.datetime.now()) - docs = list(el_pipe.pipe(docs, batch_size=10000000000)) - print("done applying el_pipe", datetime.datetime.now()) + docs = list(el_pipe.pipe(docs)) golds = [g for d, g in data if len(d) > 0] for doc, gold in zip(docs, golds): diff --git 
a/spacy/kb.pxd b/spacy/kb.pxd index ccf150cd2..40b22b275 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -9,8 +9,8 @@ from libc.stdio cimport FILE from spacy.vocab cimport Vocab from .typedefs cimport hash_t -from .structs cimport EntryC, AliasC -ctypedef vector[EntryC] entry_vec +from .structs cimport KBEntryC, AliasC +ctypedef vector[KBEntryC] entry_vec ctypedef vector[AliasC] alias_vec ctypedef vector[float] float_vec ctypedef vector[float_vec] float_matrix @@ -32,7 +32,7 @@ cdef class KnowledgeBase: cdef int64_t entity_vector_length # This maps 64bit keys (hash of unique entity string) - # to 64bit values (position of the _EntryC struct in the _entries vector). + # to 64bit values (position of the _KBEntryC struct in the _entries vector). # The PreshMap is pretty space efficient, as it uses open addressing. So # the only overhead is the vacancy rate, which is approximately 30%. cdef PreshMap _entry_index @@ -88,7 +88,7 @@ cdef class KnowledgeBase: cdef int64_t new_index = self._entries.size() # Avoid struct initializer to enable nogil, cf https://github.com/cython/cython/issues/1642 - cdef EntryC entry + cdef KBEntryC entry entry.entity_hash = entity_hash entry.vector_index = vector_index entry.feats_row = feats_row @@ -121,7 +121,7 @@ cdef class KnowledgeBase: cdef int32_t dummy_value = 0 # Avoid struct initializer to enable nogil - cdef EntryC entry + cdef KBEntryC entry entry.entity_hash = dummy_hash entry.vector_index = dummy_value entry.feats_row = dummy_value diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 4d9d2b89b..7c2daa659 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -129,7 +129,7 @@ cdef class KnowledgeBase: self._entries = entry_vec(nr_entities+1) i = 0 - cdef EntryC entry + cdef KBEntryC entry while i < nr_entities: entity_vector = vector_list[i] if len(entity_vector) != self.entity_vector_length: @@ -250,7 +250,7 @@ cdef class KnowledgeBase: cdef int64_t entry_index cdef float prob cdef int32_t vector_index - cdef EntryC entry + cdef KBEntryC entry cdef AliasC alias cdef float vector_element diff --git a/spacy/structs.pxd b/spacy/structs.pxd index 8de4d5f4c..e80b1b4d6 100644 --- a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -79,7 +79,7 @@ cdef struct TokenC: # Internal struct, for storage and disambiguation of entities. 
-cdef struct EntryC: +cdef struct KBEntryC: # The hash of this entry's unique ID/name in the kB hash_t entity_hash From 68a0662019760a20bbc740be43b2ec58aa5a816e Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 28 Jun 2019 08:29:31 +0200 Subject: [PATCH 096/148] context encoder with Tok2Vec + linking model instead of cosine --- bin/wiki_entity_linking/kb_creator.py | 4 +- bin/wiki_entity_linking/train_descriptions.py | 4 +- .../training_set_creator.py | 3 +- examples/pipeline/wikidata_entity_linking.py | 9 +-- spacy/_ml.py | 45 +++++++------ spacy/pipeline/pipes.pyx | 66 ++++++++++++------- 6 files changed, 73 insertions(+), 58 deletions(-) diff --git a/bin/wiki_entity_linking/kb_creator.py b/bin/wiki_entity_linking/kb_creator.py index 6ee139174..e8e081cef 100644 --- a/bin/wiki_entity_linking/kb_creator.py +++ b/bin/wiki_entity_linking/kb_creator.py @@ -33,7 +33,7 @@ def create_kb(nlp, max_entities_per_alias, min_entity_freq, min_occ, else: # read the mappings from file title_to_id = get_entity_to_id(entity_def_output) - id_to_descr = _get_id_to_description(entity_descr_output) + id_to_descr = get_id_to_description(entity_descr_output) print() print(" * _get_entity_frequencies", datetime.datetime.now()) @@ -109,7 +109,7 @@ def get_entity_to_id(entity_def_output): return entity_to_id -def _get_id_to_description(entity_descr_output): +def get_id_to_description(entity_descr_output): id_to_desc = dict() with open(entity_descr_output, 'r', encoding='utf8') as csvfile: csvreader = csv.reader(csvfile, delimiter='|') diff --git a/bin/wiki_entity_linking/train_descriptions.py b/bin/wiki_entity_linking/train_descriptions.py index 948a0e2d1..6a4d046e5 100644 --- a/bin/wiki_entity_linking/train_descriptions.py +++ b/bin/wiki_entity_linking/train_descriptions.py @@ -14,7 +14,7 @@ from thinc.neural._classes.affine import Affine class EntityEncoder: """ Train the embeddings of entity descriptions to fit a fixed-size entity vector (e.g. 64D). - This entity vector will be stored in the KB, and context vectors will be trained to be similar to them. + This entity vector will be stored in the KB, for further downstream use in the entity model. """ DROP = 0 @@ -97,7 +97,7 @@ class EntityEncoder: else: indices[i] = 0 word_vectors = doc.vocab.vectors.data[indices] - doc_vector = np.mean(word_vectors, axis=0) # TODO: min? max? + doc_vector = np.mean(word_vectors, axis=0) return doc_vector def _build_network(self, orig_width, hidden_with): diff --git a/bin/wiki_entity_linking/training_set_creator.py b/bin/wiki_entity_linking/training_set_creator.py index 51105ce09..436154409 100644 --- a/bin/wiki_entity_linking/training_set_creator.py +++ b/bin/wiki_entity_linking/training_set_creator.py @@ -14,8 +14,7 @@ Process Wikipedia interlinks to generate a training dataset for the EL algorithm Gold-standard entities are stored in one file in standoff format (by character offset). 
""" -# ENTITY_FILE = "gold_entities.csv" -ENTITY_FILE = "gold_entities_1000000.csv" # use this file for faster processing +ENTITY_FILE = "gold_entities.csv" def create_training(wikipedia_input, entity_def_input, training_output): diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 9ce3b9559..600436a1d 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -42,9 +42,10 @@ MIN_PAIR_OCC = 5 # model training parameters EPOCHS = 10 -DROPOUT = 0.1 +DROPOUT = 0.2 LEARN_RATE = 0.005 L2 = 1e-6 +CONTEXT_WIDTH=128 def run_pipeline(): @@ -136,7 +137,8 @@ def run_pipeline(): # STEP 6: create and train the entity linking pipe if train_pipe: - el_pipe = nlp_2.create_pipe(name='entity_linker', config={}) + print("STEP 6: training Entity Linking pipe", datetime.datetime.now()) + el_pipe = nlp_2.create_pipe(name='entity_linker', config={"context_width": CONTEXT_WIDTH}) el_pipe.set_kb(kb_2) nlp_2.add_pipe(el_pipe, last=True) @@ -146,9 +148,8 @@ def run_pipeline(): optimizer.learn_rate = LEARN_RATE optimizer.L2 = L2 - print("STEP 6: training Entity Linking pipe", datetime.datetime.now()) # define the size (nr of entities) of training and dev set - train_limit = 5000 + train_limit = 500000 dev_limit = 5000 train_data = training_set_creator.read_training(nlp=nlp_2, diff --git a/spacy/_ml.py b/spacy/_ml.py index 82db0fc05..b00ceda62 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -652,37 +652,36 @@ def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False, return model -def build_nel_encoder(in_width, hidden_width, end_width, **cfg): +def build_nel_encoder(embed_width, hidden_width, **cfg): + # TODO proper error + if "entity_width" not in cfg: + raise ValueError("entity_width not found") + if "context_width" not in cfg: + raise ValueError("context_width not found") + conv_depth = cfg.get("conv_depth", 2) cnn_maxout_pieces = cfg.get("cnn_maxout_pieces", 3) pretrained_vectors = cfg.get("pretrained_vectors") # self.nlp.vocab.vectors.name - - tok2vec = Tok2Vec(width=hidden_width, embed_size=in_width, pretrained_vectors=pretrained_vectors, - cnn_maxout_pieces=cnn_maxout_pieces, subword_features=False, conv_depth=conv_depth, bilstm_depth=0) + context_width = cfg.get("context_width") + entity_width = cfg.get("entity_width") with Model.define_operators({">>": chain, "**": clone}): - # convolution = Residual((ExtractWindow(nW=1) >> - # LN(Maxout(hidden_width, hidden_width * 3, pieces=cnn_maxout_pieces)))) + model = Affine(1, entity_width+context_width+1, drop_factor=0.0)\ + >> logistic - # encoder = SpacyVectors \ - # >> with_flatten(Affine(hidden_width, in_width)) \ - # >> with_flatten(LN(Maxout(hidden_width, hidden_width)) >> convolution ** conv_depth, pad=conv_depth) \ - # >> flatten_add_lengths \ - # >> ParametricAttention(hidden_width) \ - # >> Pooling(sum_pool) \ - # >> Residual(zero_init(Maxout(hidden_width, hidden_width))) \ - # >> zero_init(Affine(end_width, hidden_width, drop_factor=0.0)) + # context encoder + tok2vec = Tok2Vec(width=hidden_width, embed_size=embed_width, pretrained_vectors=pretrained_vectors, + cnn_maxout_pieces=cnn_maxout_pieces, subword_features=False, conv_depth=conv_depth, + bilstm_depth=0) >> flatten_add_lengths >> Pooling(mean_pool)\ + >> Residual(zero_init(Maxout(hidden_width, hidden_width))) \ + >> zero_init(Affine(context_width, hidden_width, drop_factor=0.0)) - encoder = tok2vec >> flatten_add_lengths >> Pooling(mean_pool)\ - >> 
Residual(zero_init(Maxout(hidden_width, hidden_width))) \ - >> zero_init(Affine(end_width, hidden_width, drop_factor=0.0)) + model.tok2vec = tok2vec - # TODO: ReLu or LN(Maxout) ? - # sum_pool or mean_pool ? - - encoder.tok2vec = tok2vec - encoder.nO = end_width - return encoder + model.tok2vec = tok2vec + model.tok2vec.nO = context_width + model.nO = 1 + return model @layerize def flatten(seqs, drop=0.0): diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 33b3baf8d..25df31f70 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -5,6 +5,7 @@ from __future__ import unicode_literals import numpy import srsly +import random from collections import OrderedDict from thinc.api import chain from thinc.v2v import Affine, Maxout, Softmax @@ -229,7 +230,7 @@ class Tensorizer(Pipe): vocab (Vocab): A `Vocab` instance. The model must share the same `Vocab` instance with the `Doc` objects it will process. - model (Model): A `Model` instance or `True` allocate one later. + model (Model): A `Model` instance or `True` to allocate one later. **cfg: Config parameters. EXAMPLE: @@ -386,7 +387,7 @@ class Tagger(Pipe): def predict(self, docs): self.require_model() if not any(len(doc) for doc in docs): - # Handle case where there are no tokens in any docs. + # Handle cases where there are no tokens in any docs. n_labels = len(self.labels) guesses = [self.model.ops.allocate((0, n_labels)) for doc in docs] tokvecs = self.model.ops.allocate((0, self.model.tok2vec.nO)) @@ -1071,22 +1072,20 @@ class EntityLinker(Pipe): @classmethod def Model(cls, **cfg): - if "entity_width" not in cfg: - raise ValueError("entity_width not found") - embed_width = cfg.get("embed_width", 300) hidden_width = cfg.get("hidden_width", 128) - entity_width = cfg.get("entity_width") # this needs to correspond with the KB entity length - model = build_nel_encoder(in_width=embed_width, hidden_width=hidden_width, end_width=entity_width, **cfg) + model = build_nel_encoder(embed_width=embed_width, hidden_width=hidden_width, **cfg) return model def __init__(self, **cfg): self.model = True self.kb = None + self.sgd_context = None self.cfg = dict(cfg) self.context_weight = cfg.get("context_weight", 1) self.prior_weight = cfg.get("prior_weight", 1) + self.context_width = cfg.get("context_width") def set_kb(self, kb): self.kb = kb @@ -1107,6 +1106,7 @@ class EntityLinker(Pipe): if self.model is True: self.model = self.Model(**self.cfg) + self.sgd_context = self.create_optimizer() if sgd is None: sgd = self.create_optimizer() @@ -1132,35 +1132,55 @@ class EntityLinker(Pipe): context_docs = [] entity_encodings = [] + labels = [] for doc, gold in zip(docs, golds): for entity in gold.links: start, end, gold_kb = entity mention = doc.text[start:end] - candidates = self.kb.get_candidates(mention) + random.shuffle(candidates) + nr_neg = 0 for c in candidates: kb_id = c.entity_ - # Currently only training on the positive instances if kb_id == gold_kb: - prior_prob = c.prior_prob entity_encoding = c.entity_vector entity_encodings.append(entity_encoding) context_docs.append(doc) + labels.append([1]) + else: # elif nr_neg < 1: + nr_neg += 1 + entity_encoding = c.entity_vector + entity_encodings.append(entity_encoding) + context_docs.append(doc) + labels.append([0]) if len(entity_encodings) > 0: - context_encodings, bp_context = self.model.begin_update(context_docs, drop=drop) + context_encodings, bp_context = self.model.tok2vec.begin_update(context_docs, drop=drop) entity_encodings = self.model.ops.asarray(entity_encodings, 
dtype="float32") - loss, d_scores = self.get_loss(scores=context_encodings, golds=entity_encodings, docs=None) - bp_context(d_scores, sgd=sgd) + mention_encodings = [list(context_encodings[i]) + list(entity_encodings[i]) for i in range(len(entity_encodings))] + pred, bp_mention = self.model.begin_update(self.model.ops.asarray(mention_encodings, dtype="float32"), drop=drop) + labels = self.model.ops.asarray(labels, dtype="float32") + + loss, d_scores = self.get_loss(prediction=pred, golds=labels, docs=None) + mention_gradient = bp_mention(d_scores, sgd=sgd) + + context_gradients = [list(x[0:self.context_width]) for x in mention_gradient] + bp_context(self.model.ops.asarray(context_gradients, dtype="float32"), sgd=self.sgd_context) if losses is not None: losses[self.name] += loss return loss return 0 - def get_loss(self, docs, golds, scores): + def get_loss(self, docs, golds, prediction): + d_scores = (prediction - golds) + loss = (d_scores ** 2).sum() + loss = loss / len(golds) + return loss, d_scores + + def get_loss_old(self, docs, golds, scores): # this loss function assumes we're only using positive examples loss, gradients = get_cossim_loss(yh=scores, y=golds) loss = loss / len(golds) @@ -1191,30 +1211,26 @@ class EntityLinker(Pipe): if isinstance(docs, Doc): docs = [docs] - context_encodings = self.model(docs) + context_encodings = self.model.tok2vec(docs) xp = get_array_module(context_encodings) for i, doc in enumerate(docs): if len(doc) > 0: context_encoding = context_encodings[i] - context_enc_t = context_encoding.T - norm_1 = xp.linalg.norm(context_enc_t) for ent in doc.ents: candidates = self.kb.get_candidates(ent.text) if candidates: - prior_probs = xp.asarray([c.prior_prob for c in candidates]) + random.shuffle(candidates) + prior_probs = xp.asarray([[c.prior_prob] for c in candidates]) prior_probs *= self.prior_weight entity_encodings = xp.asarray([c.entity_vector for c in candidates]) - norm_2 = xp.linalg.norm(entity_encodings, axis=1) - - # cosine similarity - sims = xp.dot(entity_encodings, context_enc_t) / (norm_1 * norm_2) - sims *= self.context_weight - scores = prior_probs + sims - (prior_probs*sims) - best_index = scores.argmax() + mention_encodings = [list(context_encoding) + list(entity_encodings[i]) for i in range(len(entity_encodings))] + predictions = self.model(self.model.ops.asarray(mention_encodings, dtype="float32")) + scores = (prior_probs + predictions - (xp.dot(prior_probs.T, predictions))) # TODO: thresholding + best_index = scores.argmax() best_candidate = candidates[best_index] final_entities.append(ent) final_kb_ids.append(best_candidate.entity_) From 1c80b852414f61f832ba29a3a7aac7d63c55218b Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 28 Jun 2019 08:59:23 +0200 Subject: [PATCH 097/148] fix tests --- examples/pipeline/dummy_entity_linking.py | 6 +++++- spacy/_ml.py | 2 +- spacy/tests/pipeline/test_entity_linker.py | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/examples/pipeline/dummy_entity_linking.py b/examples/pipeline/dummy_entity_linking.py index 3f1fabdfd..0e59db304 100644 --- a/examples/pipeline/dummy_entity_linking.py +++ b/examples/pipeline/dummy_entity_linking.py @@ -41,8 +41,12 @@ def create_kb(vocab): def add_el(kb, nlp): - el_pipe = nlp.create_pipe(name='entity_linker', config={"kb": kb}) + el_pipe = nlp.create_pipe(name='entity_linker', config={"context_width": 64}) + el_pipe.set_kb(kb) nlp.add_pipe(el_pipe, last=True) + nlp.begin_training() + el_pipe.context_weight = 0 + el_pipe.prior_weight = 1 for alias in 
["Douglas Adams", "Douglas"]: candidates = nlp.linker.kb.get_candidates(alias) diff --git a/spacy/_ml.py b/spacy/_ml.py index b00ceda62..5a5bfa07e 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -666,7 +666,7 @@ def build_nel_encoder(embed_width, hidden_width, **cfg): entity_width = cfg.get("entity_width") with Model.define_operators({">>": chain, "**": clone}): - model = Affine(1, entity_width+context_width+1, drop_factor=0.0)\ + model = Affine(1, entity_width+context_width, drop_factor=0.0)\ >> logistic # context encoder diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 7ea893408..cafc380ba 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -126,7 +126,7 @@ def test_preserving_links_asdoc(nlp): ruler.add_patterns(patterns) nlp.add_pipe(ruler) - el_pipe = nlp.create_pipe(name='entity_linker', config={}) + el_pipe = nlp.create_pipe(name='entity_linker', config={"context_width": 64}) el_pipe.set_kb(mykb) el_pipe.begin_training() el_pipe.context_weight = 0 From c664f58246b5ec2a8233f28f2006dacb60681200 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 28 Jun 2019 16:22:58 +0200 Subject: [PATCH 098/148] adding prior probability as feature in the model --- examples/pipeline/wikidata_entity_linking.py | 10 +++-- spacy/_ml.py | 7 +-- spacy/pipeline/pipes.pyx | 47 +++++++++++++------- 3 files changed, 40 insertions(+), 24 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index 600436a1d..a61af3660 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -45,7 +45,7 @@ EPOCHS = 10 DROPOUT = 0.2 LEARN_RATE = 0.005 L2 = 1e-6 -CONTEXT_WIDTH=128 +CONTEXT_WIDTH = 128 def run_pipeline(): @@ -138,7 +138,9 @@ def run_pipeline(): # STEP 6: create and train the entity linking pipe if train_pipe: print("STEP 6: training Entity Linking pipe", datetime.datetime.now()) - el_pipe = nlp_2.create_pipe(name='entity_linker', config={"context_width": CONTEXT_WIDTH}) + el_pipe = nlp_2.create_pipe(name='entity_linker', + config={"context_width": CONTEXT_WIDTH, + "pretrained_vectors": nlp_2.vocab.vectors.name}) el_pipe.set_kb(kb_2) nlp_2.add_pipe(el_pipe, last=True) @@ -195,11 +197,11 @@ def run_pipeline(): if batchnr > 0: with el_pipe.model.use_params(optimizer.averages): el_pipe.context_weight = 1 - el_pipe.prior_weight = 0 + el_pipe.prior_weight = 1 dev_acc_context, dev_acc_context_dict = _measure_accuracy(dev_data, el_pipe) losses['entity_linker'] = losses['entity_linker'] / batchnr print("Epoch, train loss", itn, round(losses['entity_linker'], 2), - " / dev acc context avg", round(dev_acc_context, 3)) + " / dev acc avg", round(dev_acc_context, 3)) # STEP 7: measure the performance of our trained pipe on an independent dev set if len(dev_data) and measure_performance: diff --git a/spacy/_ml.py b/spacy/_ml.py index 5a5bfa07e..07037f653 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -666,15 +666,16 @@ def build_nel_encoder(embed_width, hidden_width, **cfg): entity_width = cfg.get("entity_width") with Model.define_operators({">>": chain, "**": clone}): - model = Affine(1, entity_width+context_width, drop_factor=0.0)\ + model = Affine(entity_width, entity_width+context_width+1)\ + >> Affine(1, entity_width, drop_factor=0.0)\ >> logistic # context encoder tok2vec = Tok2Vec(width=hidden_width, embed_size=embed_width, pretrained_vectors=pretrained_vectors, - 
cnn_maxout_pieces=cnn_maxout_pieces, subword_features=False, conv_depth=conv_depth, + cnn_maxout_pieces=cnn_maxout_pieces, subword_features=True, conv_depth=conv_depth, bilstm_depth=0) >> flatten_add_lengths >> Pooling(mean_pool)\ >> Residual(zero_init(Maxout(hidden_width, hidden_width))) \ - >> zero_init(Affine(context_width, hidden_width, drop_factor=0.0)) + >> zero_init(Affine(context_width, hidden_width)) model.tok2vec = tok2vec diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 25df31f70..d3f6fa776 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1132,7 +1132,8 @@ class EntityLinker(Pipe): context_docs = [] entity_encodings = [] - labels = [] + cats = [] + priors = [] for doc, gold in zip(docs, golds): for entity in gold.links: @@ -1143,27 +1144,33 @@ class EntityLinker(Pipe): nr_neg = 0 for c in candidates: kb_id = c.entity_ + entity_encoding = c.entity_vector + entity_encodings.append(entity_encoding) + context_docs.append(doc) + + if self.prior_weight > 0: + priors.append([c.prior_prob]) + else: + priors.append([0]) + if kb_id == gold_kb: - entity_encoding = c.entity_vector - entity_encodings.append(entity_encoding) - context_docs.append(doc) - labels.append([1]) - else: # elif nr_neg < 1: + cats.append([1]) + else: nr_neg += 1 - entity_encoding = c.entity_vector - entity_encodings.append(entity_encoding) - context_docs.append(doc) - labels.append([0]) + cats.append([0]) if len(entity_encodings) > 0: + assert len(priors) == len(entity_encodings) == len(context_docs) == len(cats) + context_encodings, bp_context = self.model.tok2vec.begin_update(context_docs, drop=drop) entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32") - mention_encodings = [list(context_encodings[i]) + list(entity_encodings[i]) for i in range(len(entity_encodings))] + mention_encodings = [list(context_encodings[i]) + list(entity_encodings[i]) + priors[i] + for i in range(len(entity_encodings))] pred, bp_mention = self.model.begin_update(self.model.ops.asarray(mention_encodings, dtype="float32"), drop=drop) - labels = self.model.ops.asarray(labels, dtype="float32") + cats = self.model.ops.asarray(cats, dtype="float32") - loss, d_scores = self.get_loss(prediction=pred, golds=labels, docs=None) + loss, d_scores = self.get_loss(prediction=pred, golds=cats, docs=None) mention_gradient = bp_mention(d_scores, sgd=sgd) context_gradients = [list(x[0:self.context_width]) for x in mention_gradient] @@ -1221,13 +1228,19 @@ class EntityLinker(Pipe): candidates = self.kb.get_candidates(ent.text) if candidates: random.shuffle(candidates) + + # this will set the prior probabilities to 0 (just like in training) if their weight is 0 prior_probs = xp.asarray([[c.prior_prob] for c in candidates]) prior_probs *= self.prior_weight + scores = prior_probs - entity_encodings = xp.asarray([c.entity_vector for c in candidates]) - mention_encodings = [list(context_encoding) + list(entity_encodings[i]) for i in range(len(entity_encodings))] - predictions = self.model(self.model.ops.asarray(mention_encodings, dtype="float32")) - scores = (prior_probs + predictions - (xp.dot(prior_probs.T, predictions))) + if self.context_weight > 0: + entity_encodings = xp.asarray([c.entity_vector for c in candidates]) + assert len(entity_encodings) == len(prior_probs) + mention_encodings = [list(context_encoding) + list(entity_encodings[i]) + + list(prior_probs[i]) + for i in range(len(entity_encodings))] + scores = self.model(self.model.ops.asarray(mention_encodings, dtype="float32")) # 
TODO: thresholding best_index = scores.argmax() From 2d2dea99244b520cd62813fb2de62cf78b5f09be Mon Sep 17 00:00:00 2001 From: svlandeg Date: Sat, 29 Jun 2019 14:52:36 +0200 Subject: [PATCH 099/148] experiment with adding NER types to the feature vector --- examples/pipeline/wikidata_entity_linking.py | 27 ++++++++++++-------- spacy/_ml.py | 4 +-- spacy/pipeline/pipes.pyx | 26 ++++++++++++++++--- 3 files changed, 41 insertions(+), 16 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index a61af3660..c0a7e3c66 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -42,7 +42,7 @@ MIN_PAIR_OCC = 5 # model training parameters EPOCHS = 10 -DROPOUT = 0.2 +DROPOUT = 0.5 LEARN_RATE = 0.005 L2 = 1e-6 CONTEXT_WIDTH = 128 @@ -73,10 +73,10 @@ def run_pipeline(): measure_performance = True # test the EL pipe on a simple example - to_test_pipeline = True + to_test_pipeline = False # write the NLP object, read back in and test again - to_write_nlp = True + to_write_nlp = False to_read_nlp = False test_from_file = False @@ -138,9 +138,12 @@ def run_pipeline(): # STEP 6: create and train the entity linking pipe if train_pipe: print("STEP 6: training Entity Linking pipe", datetime.datetime.now()) + type_to_int = {label: i for i, label in enumerate(nlp_2.entity.labels)} + print(" -analysing", len(type_to_int), "different entity types") el_pipe = nlp_2.create_pipe(name='entity_linker', config={"context_width": CONTEXT_WIDTH, - "pretrained_vectors": nlp_2.vocab.vectors.name}) + "pretrained_vectors": nlp_2.vocab.vectors.name, + "type_to_int": type_to_int}) el_pipe.set_kb(kb_2) nlp_2.add_pipe(el_pipe, last=True) @@ -151,8 +154,8 @@ def run_pipeline(): optimizer.L2 = L2 # define the size (nr of entities) of training and dev set - train_limit = 500000 - dev_limit = 5000 + train_limit = 50000 + dev_limit = 50000 train_data = training_set_creator.read_training(nlp=nlp_2, training_dir=TRAINING_DIR, @@ -219,7 +222,7 @@ def run_pipeline(): # measuring combined accuracy (prior + context) el_pipe.context_weight = 1 el_pipe.prior_weight = 1 - dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data, el_pipe) + dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data, el_pipe, error_analysis=False) print("dev acc combo avg:", round(dev_acc_combo, 3), [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()]) @@ -264,7 +267,7 @@ def run_pipeline(): nlp_3 = spacy.load(NLP_2_DIR) el_pipe = nlp_3.get_pipe("entity_linker") - dev_limit = 10000 + dev_limit = 5000 dev_data = training_set_creator.read_training(nlp=nlp_2, training_dir=TRAINING_DIR, dev=True, @@ -273,7 +276,7 @@ def run_pipeline(): print("Dev testing from file on", len(dev_data), "articles") print() - dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data, el_pipe=el_pipe) + dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data, el_pipe=el_pipe, error_analysis=False) print("dev acc combo avg:", round(dev_acc_combo, 3), [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()]) @@ -281,7 +284,7 @@ def run_pipeline(): print("STOP", datetime.datetime.now()) -def _measure_accuracy(data, el_pipe=None): +def _measure_accuracy(data, el_pipe=None, error_analysis=False): # If the docs in the data require further processing with an entity linker, set el_pipe correct_by_label = dict() incorrect_by_label = dict() @@ -312,6 +315,10 @@ def _measure_accuracy(data, el_pipe=None): else: incorrect = incorrect_by_label.get(ent_label, 0) 
incorrect_by_label[ent_label] = incorrect + 1 + if error_analysis: + print(ent.text, "in", doc) + print("Predicted", pred_entity, "should have been", gold_entity) + print() except Exception as e: print("Error assessing accuracy", e) diff --git a/spacy/_ml.py b/spacy/_ml.py index 07037f653..cca324b45 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -652,7 +652,7 @@ def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False, return model -def build_nel_encoder(embed_width, hidden_width, **cfg): +def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg): # TODO proper error if "entity_width" not in cfg: raise ValueError("entity_width not found") @@ -666,7 +666,7 @@ def build_nel_encoder(embed_width, hidden_width, **cfg): entity_width = cfg.get("entity_width") with Model.define_operators({">>": chain, "**": clone}): - model = Affine(entity_width, entity_width+context_width+1)\ + model = Affine(entity_width, entity_width+context_width+1+ner_types)\ >> Affine(1, entity_width, drop_factor=0.0)\ >> logistic diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index d3f6fa776..f1a864fcf 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1074,8 +1074,9 @@ class EntityLinker(Pipe): def Model(cls, **cfg): embed_width = cfg.get("embed_width", 300) hidden_width = cfg.get("hidden_width", 128) + type_to_int = cfg.get("type_to_int", dict()) - model = build_nel_encoder(embed_width=embed_width, hidden_width=hidden_width, **cfg) + model = build_nel_encoder(embed_width=embed_width, hidden_width=hidden_width, ner_types=len(type_to_int), **cfg) return model def __init__(self, **cfg): @@ -1086,6 +1087,7 @@ class EntityLinker(Pipe): self.context_weight = cfg.get("context_weight", 1) self.prior_weight = cfg.get("prior_weight", 1) self.context_width = cfg.get("context_width") + self.type_to_int = cfg.get("type_to_int", dict()) def set_kb(self, kb): self.kb = kb @@ -1134,11 +1136,22 @@ class EntityLinker(Pipe): entity_encodings = [] cats = [] priors = [] + type_vectors = [] for doc, gold in zip(docs, golds): + ents_by_offset = dict() + for ent in doc.ents: + ents_by_offset[str(ent.start_char) + "_" + str(ent.end_char)] = ent for entity in gold.links: start, end, gold_kb = entity mention = doc.text[start:end] + + gold_ent = ents_by_offset[str(ent.start_char) + "_" + str(ent.end_char)] + assert gold_ent is not None + type_vector = [0 for i in range(len(self.type_to_int))] + if len(self.type_to_int) > 0: + type_vector[self.type_to_int[gold_ent.label_]] = 1 + candidates = self.kb.get_candidates(mention) random.shuffle(candidates) nr_neg = 0 @@ -1147,6 +1160,7 @@ class EntityLinker(Pipe): entity_encoding = c.entity_vector entity_encodings.append(entity_encoding) context_docs.append(doc) + type_vectors.append(type_vector) if self.prior_weight > 0: priors.append([c.prior_prob]) @@ -1160,12 +1174,12 @@ class EntityLinker(Pipe): cats.append([0]) if len(entity_encodings) > 0: - assert len(priors) == len(entity_encodings) == len(context_docs) == len(cats) + assert len(priors) == len(entity_encodings) == len(context_docs) == len(cats) == len(type_vectors) context_encodings, bp_context = self.model.tok2vec.begin_update(context_docs, drop=drop) entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32") - mention_encodings = [list(context_encodings[i]) + list(entity_encodings[i]) + priors[i] + mention_encodings = [list(context_encodings[i]) + list(entity_encodings[i]) + priors[i] + type_vectors[i] for i in range(len(entity_encodings))] pred, bp_mention 
= self.model.begin_update(self.model.ops.asarray(mention_encodings, dtype="float32"), drop=drop) cats = self.model.ops.asarray(cats, dtype="float32") @@ -1225,6 +1239,10 @@ class EntityLinker(Pipe): if len(doc) > 0: context_encoding = context_encodings[i] for ent in doc.ents: + type_vector = [0 for i in range(len(self.type_to_int))] + if len(self.type_to_int) > 0: + type_vector[self.type_to_int[ent.label_]] = 1 + candidates = self.kb.get_candidates(ent.text) if candidates: random.shuffle(candidates) @@ -1238,7 +1256,7 @@ class EntityLinker(Pipe): entity_encodings = xp.asarray([c.entity_vector for c in candidates]) assert len(entity_encodings) == len(prior_probs) mention_encodings = [list(context_encoding) + list(entity_encodings[i]) - + list(prior_probs[i]) + + list(prior_probs[i]) + type_vector for i in range(len(entity_encodings))] scores = self.model(self.model.ops.asarray(mention_encodings, dtype="float32")) From 3420cbe49639ab77e36612d7c7ab5abeffe9cd46 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 3 Jul 2019 10:25:51 +0200 Subject: [PATCH 100/148] small fixes --- bin/wiki_entity_linking/training_set_creator.py | 5 ++--- examples/pipeline/wikidata_entity_linking.py | 4 ++-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/bin/wiki_entity_linking/training_set_creator.py b/bin/wiki_entity_linking/training_set_creator.py index 436154409..5d401bb3f 100644 --- a/bin/wiki_entity_linking/training_set_creator.py +++ b/bin/wiki_entity_linking/training_set_creator.py @@ -7,7 +7,7 @@ import bz2 import datetime from spacy.gold import GoldParse -from bin.wiki_entity_linking import kb_creator, wikipedia_processor as wp +from bin.wiki_entity_linking import kb_creator """ Process Wikipedia interlinks to generate a training dataset for the EL algorithm. 
@@ -342,8 +342,7 @@ def read_training(nlp, training_dir, dev, limit): # currently feeding the gold data one entity per sentence at a time gold_start = int(start) - found_ent.sent.start_char gold_end = int(end) - found_ent.sent.start_char - gold_entities = [] - gold_entities.append((gold_start, gold_end, wp_title)) + gold_entities = [(gold_start, gold_end, wp_title)] gold = GoldParse(doc=sent, links=gold_entities) data.append((sent, gold)) total_entities += 1 diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index c0a7e3c66..d914f033c 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -394,10 +394,10 @@ def _measure_baselines(data, kb): print("Error assessing accuracy", e) acc_prior, acc_prior_by_label = calculate_acc(prior_correct_by_label, prior_incorrect_by_label) - acc_random, acc_random_by_label = calculate_acc(random_correct_by_label, random_incorrect_by_label) + acc_rand, acc_rand_by_label = calculate_acc(random_correct_by_label, random_incorrect_by_label) acc_oracle, acc_oracle_by_label = calculate_acc(oracle_correct_by_label, oracle_incorrect_by_label) - return counts_by_label, acc_random, acc_random_by_label, acc_prior, acc_prior_by_label, acc_oracle, acc_oracle_by_label + return counts_by_label, acc_rand, acc_rand_by_label, acc_prior, acc_prior_by_label, acc_oracle, acc_oracle_by_label def calculate_acc(correct_by_label, incorrect_by_label): From 8840d4b1b3ac9aa9e774b576ea405a205b353f64 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 3 Jul 2019 13:35:36 +0200 Subject: [PATCH 101/148] fix for context encoder optimizer --- examples/pipeline/wikidata_entity_linking.py | 13 ++++---- spacy/pipeline/pipes.pyx | 31 ++++++++++---------- 2 files changed, 23 insertions(+), 21 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index d914f033c..b57d9f541 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -73,11 +73,11 @@ def run_pipeline(): measure_performance = True # test the EL pipe on a simple example - to_test_pipeline = False + to_test_pipeline = True # write the NLP object, read back in and test again - to_write_nlp = False - to_read_nlp = False + to_write_nlp = True + to_read_nlp = True test_from_file = False # STEP 1 : create prior probabilities from WP (run only once) @@ -154,8 +154,8 @@ def run_pipeline(): optimizer.L2 = L2 # define the size (nr of entities) of training and dev set - train_limit = 50000 - dev_limit = 50000 + train_limit = 5 + dev_limit = 5 train_data = training_set_creator.read_training(nlp=nlp_2, training_dir=TRAINING_DIR, @@ -250,7 +250,8 @@ def run_pipeline(): print("STEP 9: testing NLP IO", datetime.datetime.now()) print() print("writing to", NLP_2_DIR) - nlp_2.to_disk(NLP_2_DIR) + with el_pipe.model.use_params(optimizer.averages) and el_pipe.model.tok2vec.use_params(el_pipe.sgd_context.averages): + nlp_2.to_disk(NLP_2_DIR) print() # verify that the IO has gone correctly diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index f1a864fcf..91f5e7044 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1082,12 +1082,8 @@ class EntityLinker(Pipe): def __init__(self, **cfg): self.model = True self.kb = None - self.sgd_context = None self.cfg = dict(cfg) - self.context_weight = cfg.get("context_weight", 1) - self.prior_weight = cfg.get("prior_weight", 1) - self.context_width = 
cfg.get("context_width") - self.type_to_int = cfg.get("type_to_int", dict()) + self.sgd_context = None def set_kb(self, kb): self.kb = kb @@ -1112,6 +1108,7 @@ class EntityLinker(Pipe): if sgd is None: sgd = self.create_optimizer() + return sgd def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None): @@ -1138,6 +1135,8 @@ class EntityLinker(Pipe): priors = [] type_vectors = [] + type_to_int = self.cfg.get("type_to_int", dict()) + for doc, gold in zip(docs, golds): ents_by_offset = dict() for ent in doc.ents: @@ -1148,9 +1147,9 @@ class EntityLinker(Pipe): gold_ent = ents_by_offset[str(ent.start_char) + "_" + str(ent.end_char)] assert gold_ent is not None - type_vector = [0 for i in range(len(self.type_to_int))] - if len(self.type_to_int) > 0: - type_vector[self.type_to_int[gold_ent.label_]] = 1 + type_vector = [0 for i in range(len(type_to_int))] + if len(type_to_int) > 0: + type_vector[type_to_int[gold_ent.label_]] = 1 candidates = self.kb.get_candidates(mention) random.shuffle(candidates) @@ -1162,7 +1161,7 @@ class EntityLinker(Pipe): context_docs.append(doc) type_vectors.append(type_vector) - if self.prior_weight > 0: + if self.cfg.get("prior_weight", 1) > 0: priors.append([c.prior_prob]) else: priors.append([0]) @@ -1187,7 +1186,7 @@ class EntityLinker(Pipe): loss, d_scores = self.get_loss(prediction=pred, golds=cats, docs=None) mention_gradient = bp_mention(d_scores, sgd=sgd) - context_gradients = [list(x[0:self.context_width]) for x in mention_gradient] + context_gradients = [list(x[0:self.cfg.get("context_width")]) for x in mention_gradient] bp_context(self.model.ops.asarray(context_gradients, dtype="float32"), sgd=self.sgd_context) if losses is not None: @@ -1235,13 +1234,15 @@ class EntityLinker(Pipe): context_encodings = self.model.tok2vec(docs) xp = get_array_module(context_encodings) + type_to_int = self.cfg.get("type_to_int", dict()) + for i, doc in enumerate(docs): if len(doc) > 0: context_encoding = context_encodings[i] for ent in doc.ents: - type_vector = [0 for i in range(len(self.type_to_int))] - if len(self.type_to_int) > 0: - type_vector[self.type_to_int[ent.label_]] = 1 + type_vector = [0 for i in range(len(type_to_int))] + if len(type_to_int) > 0: + type_vector[type_to_int[ent.label_]] = 1 candidates = self.kb.get_candidates(ent.text) if candidates: @@ -1249,10 +1250,10 @@ class EntityLinker(Pipe): # this will set the prior probabilities to 0 (just like in training) if their weight is 0 prior_probs = xp.asarray([[c.prior_prob] for c in candidates]) - prior_probs *= self.prior_weight + prior_probs *= self.cfg.get("prior_weight", 1) scores = prior_probs - if self.context_weight > 0: + if self.cfg.get("context_weight", 1) > 0: entity_encodings = xp.asarray([c.entity_vector for c in candidates]) assert len(entity_encodings) == len(prior_probs) mention_encodings = [list(context_encoding) + list(entity_encodings[i]) From 668b17ea4a7f5133f68b16586e6f4a1f45279bee Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 3 Jul 2019 15:00:42 +0200 Subject: [PATCH 102/148] deuglify kb deserializer --- spacy/language.py | 9 +-------- spacy/pipeline/pipes.pyx | 12 +++++++++++- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 2225a763e..570630eb3 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -118,7 +118,7 @@ class Language(object): "tagger": lambda nlp, **cfg: Tagger(nlp.vocab, **cfg), "parser": lambda nlp, **cfg: DependencyParser(nlp.vocab, **cfg), "ner": lambda nlp, **cfg: 
EntityRecognizer(nlp.vocab, **cfg), - "entity_linker": lambda nlp, **cfg: EntityLinker(**cfg), + "entity_linker": lambda nlp, **cfg: EntityLinker(nlp.vocab, **cfg), "similarity": lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg), "textcat": lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg), "sentencizer": lambda nlp, **cfg: Sentencizer(**cfg), @@ -811,13 +811,6 @@ class Language(object): exclude = list(exclude) + ["vocab"] util.from_disk(path, deserializers, exclude) - # download the KB for the entity linking component - requires the vocab - for pipe_name, pipe in self.pipeline: - if pipe_name == "entity_linker": - kb = KnowledgeBase(vocab=self.vocab, entity_vector_length=pipe.cfg["entity_width"]) - kb.load_bulk(path / pipe_name / "kb") - pipe.set_kb(kb) - self._path = path return self diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 91f5e7044..f4dc08251 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -13,6 +13,7 @@ from thinc.misc import LayerNorm from thinc.neural.util import to_categorical from thinc.neural.util import get_array_module +from spacy.kb import KnowledgeBase from ..cli.pretrain import get_cossim_loss from .functions import merge_subtokens from ..tokens.doc cimport Doc @@ -1079,7 +1080,8 @@ class EntityLinker(Pipe): model = build_nel_encoder(embed_width=embed_width, hidden_width=hidden_width, ner_types=len(type_to_int), **cfg) return model - def __init__(self, **cfg): + def __init__(self, vocab, **cfg): + self.vocab = vocab self.model = True self.kb = None self.cfg = dict(cfg) @@ -1277,6 +1279,7 @@ class EntityLinker(Pipe): def to_disk(self, path, exclude=tuple(), **kwargs): serialize = OrderedDict() serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg) + serialize["vocab"] = lambda p: self.vocab.to_disk(p) serialize["kb"] = lambda p: self.kb.dump(p) if self.model not in (None, True, False): serialize["model"] = lambda p: p.open("wb").write(self.model.to_bytes()) @@ -1289,8 +1292,15 @@ class EntityLinker(Pipe): self.model = self.Model(**self.cfg) self.model.from_bytes(p.open("rb").read()) + def load_kb(p): + kb = KnowledgeBase(vocab=self.vocab, entity_vector_length=self.cfg["entity_width"]) + kb.load_bulk(p) + self.set_kb(kb) + deserialize = OrderedDict() deserialize["cfg"] = lambda p: self.cfg.update(_load_cfg(p)) + deserialize["vocab"] = lambda p: self.vocab.from_disk(p) + deserialize["kb"] = load_kb deserialize["model"] = load_model exclude = util.get_serialization_exclude(deserialize, exclude, kwargs) util.from_disk(path, deserialize, exclude) From 0ea52c86b89e65b5bdca23dd331985033a9f0c2d Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 3 Jul 2019 15:02:10 +0200 Subject: [PATCH 103/148] remove redundancy --- spacy/language.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 570630eb3..39d95c689 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -11,7 +11,6 @@ from copy import copy, deepcopy from thinc.neural import Model import srsly -from .kb import KnowledgeBase from .tokenizer import Tokenizer from .vocab import Vocab from .lemmatizer import Lemmatizer @@ -810,7 +809,6 @@ class Language(object): # Convert to list here in case exclude is (default) tuple exclude = list(exclude) + ["vocab"] util.from_disk(path, deserializers, exclude) - self._path = path return self From b7a0c9bf60757acdf0586b35ec755ccd8fab5099 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 3 Jul 2019 17:48:09 +0200 Subject: [PATCH 104/148] fixing the context/prior weight settings --- 
examples/pipeline/wikidata_entity_linking.py | 49 +++++++++----------- 1 file changed, 21 insertions(+), 28 deletions(-) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py index b57d9f541..17c2976dd 100644 --- a/examples/pipeline/wikidata_entity_linking.py +++ b/examples/pipeline/wikidata_entity_linking.py @@ -154,8 +154,8 @@ def run_pipeline(): optimizer.L2 = L2 # define the size (nr of entities) of training and dev set - train_limit = 5 - dev_limit = 5 + train_limit = 5000 + dev_limit = 5000 train_data = training_set_creator.read_training(nlp=nlp_2, training_dir=TRAINING_DIR, @@ -198,13 +198,12 @@ def run_pipeline(): print("Error updating batch:", e) if batchnr > 0: - with el_pipe.model.use_params(optimizer.averages): - el_pipe.context_weight = 1 - el_pipe.prior_weight = 1 - dev_acc_context, dev_acc_context_dict = _measure_accuracy(dev_data, el_pipe) - losses['entity_linker'] = losses['entity_linker'] / batchnr - print("Epoch, train loss", itn, round(losses['entity_linker'], 2), - " / dev acc avg", round(dev_acc_context, 3)) + el_pipe.cfg["context_weight"] = 1 + el_pipe.cfg["prior_weight"] = 1 + dev_acc_context, dev_acc_context_dict = _measure_accuracy(dev_data, el_pipe) + losses['entity_linker'] = losses['entity_linker'] / batchnr + print("Epoch, train loss", itn, round(losses['entity_linker'], 2), + " / dev acc avg", round(dev_acc_context, 3)) # STEP 7: measure the performance of our trained pipe on an independent dev set if len(dev_data) and measure_performance: @@ -218,24 +217,19 @@ def run_pipeline(): print("dev acc random:", round(acc_r, 3), [(x, round(y, 3)) for x, y in acc_r_label.items()]) print("dev acc prior:", round(acc_p, 3), [(x, round(y, 3)) for x, y in acc_p_label.items()]) - with el_pipe.model.use_params(optimizer.averages): - # measuring combined accuracy (prior + context) - el_pipe.context_weight = 1 - el_pipe.prior_weight = 1 - dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data, el_pipe, error_analysis=False) - print("dev acc combo avg:", round(dev_acc_combo, 3), - [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()]) + # using only context + el_pipe.cfg["context_weight"] = 1 + el_pipe.cfg["prior_weight"] = 0 + dev_acc_context, dev_acc_context_dict = _measure_accuracy(dev_data, el_pipe) + print("dev acc context avg:", round(dev_acc_context, 3), + [(x, round(y, 3)) for x, y in dev_acc_context_dict.items()]) - # using only context - el_pipe.context_weight = 1 - el_pipe.prior_weight = 0 - dev_acc_context, dev_acc_context_dict = _measure_accuracy(dev_data, el_pipe) - print("dev acc context avg:", round(dev_acc_context, 3), - [(x, round(y, 3)) for x, y in dev_acc_context_dict.items()]) - - # reset for follow-up tests - el_pipe.context_weight = 1 - el_pipe.prior_weight = 1 + # measuring combined accuracy (prior + context) + el_pipe.cfg["context_weight"] = 1 + el_pipe.cfg["prior_weight"] = 1 + dev_acc_combo, dev_acc_combo_dict = _measure_accuracy(dev_data, el_pipe, error_analysis=False) + print("dev acc combo avg:", round(dev_acc_combo, 3), + [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()]) # STEP 8: apply the EL pipe on a toy example if to_test_pipeline: @@ -250,8 +244,7 @@ def run_pipeline(): print("STEP 9: testing NLP IO", datetime.datetime.now()) print() print("writing to", NLP_2_DIR) - with el_pipe.model.use_params(optimizer.averages) and el_pipe.model.tok2vec.use_params(el_pipe.sgd_context.averages): - nlp_2.to_disk(NLP_2_DIR) + nlp_2.to_disk(NLP_2_DIR) print() # verify that the IO has 
gone correctly From 6d577f0b92f6c3d0333b6816ed9b443a39817e3a Mon Sep 17 00:00:00 2001 From: Alejandro Alcalde Date: Tue, 9 Jul 2019 20:54:59 +0200 Subject: [PATCH 105/148] Evaluation of NER model per entity type, closes #3490 (#3911) * Evaluation of NER model per entity type, closes ##3490 Now each ent score is tracked individually in order to have its own Precision, Recall and F1 Score * Keep track of each entity individually using dicts * Improving how to compute the scores for each entity * Fixed bug computing scores for ents * Formatting with black * Added key ents_per_type to the scores function The key `ents_per_type` contains the metrics Precision, Recall and F1-Score for each entity individually --- spacy/scorer.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/spacy/scorer.py b/spacy/scorer.py index 32716b852..c01353520 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -52,6 +52,7 @@ class Scorer(object): self.labelled = PRFScore() self.tags = PRFScore() self.ner = PRFScore() + self.ner_per_ents = dict() self.eval_punct = eval_punct @property @@ -104,6 +105,15 @@ class Scorer(object): "ents_f": self.ents_f, "tags_acc": self.tags_acc, "token_acc": self.token_acc, + "ents_per_type": self.__scores_per_ents(), + } + + def __scores_per_ents(self): + """RETURNS (dict): Scores per NER entity + """ + return { + k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100} + for k, v in self.ner_per_ents.items() } def score(self, doc, gold, verbose=False, punct_labels=("p", "punct")): @@ -149,13 +159,31 @@ class Scorer(object): cand_deps.add((gold_i, gold_head, token.dep_.lower())) if "-" not in [token[-1] for token in gold.orig_annot]: cand_ents = set() + current_ent = {k.label_: set() for k in doc.ents} + current_gold = {k.label_: set() for k in doc.ents} for ent in doc.ents: + if ent.label_ not in self.ner_per_ents: + self.ner_per_ents[ent.label_] = PRFScore() first = gold.cand_to_gold[ent.start] last = gold.cand_to_gold[ent.end - 1] if first is None or last is None: self.ner.fp += 1 + self.ner_per_ents[ent.label_].fp += 1 else: cand_ents.add((ent.label_, first, last)) + current_ent[ent.label_].add( + tuple(x for x in cand_ents if x[0] == ent.label_) + ) + current_gold[ent.label_].add( + tuple(x for x in gold_ents if x[0] == ent.label_) + ) + # Scores per ent + [ + v.score_set(current_ent[k], current_gold[k]) + for k, v in self.ner_per_ents.items() + if k in current_ent + ] + # Score for all ents self.ner.score_set(cand_ents, gold_ents) self.tags.score_set(cand_tags, gold_tags) self.labelled.score_set(cand_deps, gold_deps) From 04982ccc4033ec15864bba659430a8408ca94774 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20B=C3=B6ing?= <33514570+BreakBB@users.noreply.github.com> Date: Tue, 9 Jul 2019 21:48:30 +0200 Subject: [PATCH 106/148] =?UTF-8?q?Update=20pretrain=20to=20prevent=20unin?= =?UTF-8?q?tended=20overwriting=20of=20weight=20fil=E2=80=A6=20(#3902)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update pretrain to prevent unintended overwriting of weight files for #3859 * Add '--epoch-start' to pretrain docs * Add mising pretrain arguments to bash example * Update doc tag for v2.1.5 --- spacy/cli/pretrain.py | 33 +++++++++++++++++++++++++++++++-- website/docs/api/cli.md | 9 +++++---- 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py index 2fe5b247a..678f12be1 100644 --- a/spacy/cli/pretrain.py +++ b/spacy/cli/pretrain.py @@ -5,6 
+5,7 @@ import plac import random import numpy import time +import re from collections import Counter from pathlib import Path from thinc.v2v import Affine, Maxout @@ -65,6 +66,13 @@ from .train import _load_pretrained_tok2vec "t2v", Path, ), + epoch_start=( + "The epoch to start counting at. Only relevant when using '--init-tok2vec' and the given weight file has been " + "renamed. Prevents unintended overwriting of existing weight files.", + "option", + "es", + int + ), ) def pretrain( texts_loc, @@ -83,6 +91,7 @@ def pretrain( seed=0, n_save_every=None, init_tok2vec=None, + epoch_start=None, ): """ Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, @@ -151,9 +160,29 @@ def pretrain( if init_tok2vec is not None: components = _load_pretrained_tok2vec(nlp, init_tok2vec) msg.text("Loaded pretrained tok2vec for: {}".format(components)) + # Parse the epoch number from the given weight file + model_name = re.search(r"model\d+\.bin", str(init_tok2vec)) + if model_name: + # Default weight file name so read epoch_start from it by cutting off 'model' and '.bin' + epoch_start = int(model_name.group(0)[5:][:-4]) + 1 + else: + if not epoch_start: + msg.fail( + "You have to use the '--epoch-start' argument when using a renamed weight file for " + "'--init-tok2vec'", exits=True + ) + elif epoch_start < 0: + msg.fail( + "The argument '--epoch-start' has to be greater or equal to 0. '%d' is invalid" % epoch_start, + exits=True + ) + else: + # Without '--init-tok2vec' the '--epoch-start' argument is ignored + epoch_start = 0 + optimizer = create_default_optimizer(model.ops) tracker = ProgressTracker(frequency=10000) - msg.divider("Pre-training tok2vec layer") + msg.divider("Pre-training tok2vec layer - starting at epoch %d" % epoch_start) row_settings = {"widths": (3, 10, 10, 6, 4), "aligns": ("r", "r", "r", "r", "r")} msg.row(("#", "# Words", "Total Loss", "Loss", "w/s"), **row_settings) @@ -174,7 +203,7 @@ def pretrain( file_.write(srsly.json_dumps(log) + "\n") skip_counter = 0 - for epoch in range(n_iter): + for epoch in range(epoch_start, n_iter + epoch_start): for batch_id, batch in enumerate( util.minibatch_by_words(((text, None) for text in texts), size=batch_size) ): diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index a69e62219..7af134e40 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -284,9 +284,9 @@ same between pretraining and training. The API and errors around this need some improvement. ```bash -$ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir] [--width] -[--depth] [--embed-rows] [--loss_func] [--dropout] [--seed] [--n-iter] [--use-vectors] -[--n-save_every] +$ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir] +[--width] [--depth] [--embed-rows] [--loss_func] [--dropout] [--batch-size] [--max-length] [--min-length] +[--seed] [--n-iter] [--use-vectors] [--n-save_every] [--init-tok2vec] [--epoch-start] ``` | Argument | Type | Description | @@ -306,7 +306,8 @@ $ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir] [--width] | `--n-iter`, `-i` | option | Number of iterations to pretrain. | | `--use-vectors`, `-uv` | flag | Whether to use the static vectors as input features. | | `--n-save-every`, `-se` | option | Save model every X batches. | -| `--init-tok2vec`, `-t2v` 2.1 | option | Path to pretrained weights for the token-to-vector parts of the models. See `spacy pretrain`. 
Experimental.| +| `--init-tok2vec`, `-t2v` 2.1 | option | Path to pretrained weights for the token-to-vector parts of the models. See `spacy pretrain`. Experimental.| +| `--epoch-start`, `-es` 2.1.5 | option | The epoch to start counting at. Only relevant when using `--init-tok2vec` and the given weight file has been renamed. Prevents unintended overwriting of existing weight files.| | **CREATES** | weights | The pre-trained weights that can be used to initialize `spacy train`. | ### JSONL format for raw text {#pretrain-jsonl} From 547464609d8da5230bf2bcbb020b2abfde5dd216 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 9 Jul 2019 21:50:30 +0200 Subject: [PATCH 107/148] Remove merge_subtokens from parser postprocessing for now --- spacy/pipeline/pipes.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 1d4eeadce..1f4dd4253 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -1001,7 +1001,7 @@ cdef class DependencyParser(Parser): @property def postprocesses(self): - return [nonproj.deprojectivize, merge_subtokens] + return [nonproj.deprojectivize] def add_multitask_objective(self, target): if target == "cloze": From 58f06e61800a477f67d13911068fd24892ccfa15 Mon Sep 17 00:00:00 2001 From: cedar101 Date: Wed, 10 Jul 2019 05:23:16 +0900 Subject: [PATCH 108/148] Korean support (#3901) * start lang/ko * add test codes * using natto-py * add test_ko_tokenizer_full_tags() * spaCy contributor agreement * external dependency for ko * collections.namedtuple for python version < 3.5 * case fix * tuple unpacking * add jongseong(final consonant) * apply mecab option * Remove Pipfile for now Co-authored-by: Ines Montani --- .github/contributors/cedar101.md | 106 +++++++++++++++++++ .gitignore | 2 + setup.py | 1 + spacy/lang/ko/__init__.py | 118 ++++++++++++++++++++++ spacy/lang/ko/examples.py | 15 +++ spacy/lang/ko/stop_words.py | 68 +++++++++++++ spacy/lang/ko/tag_map.py | 66 ++++++++++++ spacy/tests/conftest.py | 6 ++ spacy/tests/lang/ko/__init__.py | 0 spacy/tests/lang/ko/test_lemmatization.py | 13 +++ spacy/tests/lang/ko/test_tokenizer.py | 46 +++++++++ website/meta/languages.json | 11 ++ 12 files changed, 452 insertions(+) create mode 100644 .github/contributors/cedar101.md create mode 100644 spacy/lang/ko/__init__.py create mode 100644 spacy/lang/ko/examples.py create mode 100644 spacy/lang/ko/stop_words.py create mode 100644 spacy/lang/ko/tag_map.py create mode 100644 spacy/tests/lang/ko/__init__.py create mode 100644 spacy/tests/lang/ko/test_lemmatization.py create mode 100644 spacy/tests/lang/ko/test_tokenizer.py diff --git a/.github/contributors/cedar101.md b/.github/contributors/cedar101.md new file mode 100644 index 000000000..4d04ebacf --- /dev/null +++ b/.github/contributors/cedar101.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. 
+ +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. 
You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | ------------------------ | +| Name | Kim, Baeg-il | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2019-07-03 | +| GitHub username | cedar101 | +| Website (optional) | | diff --git a/.gitignore b/.gitignore index ef586ac8d..35d431d48 100644 --- a/.gitignore +++ b/.gitignore @@ -56,6 +56,8 @@ parts/ sdist/ var/ *.egg-info/ +pip-wheel-metadata/ +Pipfile.lock .installed.cfg *.egg .eggs diff --git a/setup.py b/setup.py index 33623588c..544188f4a 100755 --- a/setup.py +++ b/setup.py @@ -246,6 +246,7 @@ def setup_package(): "cuda100": ["thinc_gpu_ops>=0.0.1,<0.1.0", "cupy-cuda100>=5.0.0b4"], # Language tokenizers with external dependencies "ja": ["mecab-python3==0.7"], + "ko": ["natto-py==0.9.0"], }, python_requires=">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*", classifiers=[ diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py new file mode 100644 index 000000000..111d01720 --- /dev/null +++ b/spacy/lang/ko/__init__.py @@ -0,0 +1,118 @@ +# encoding: utf8 +from __future__ import unicode_literals, print_function + +import re +import sys + + +from .stop_words import STOP_WORDS +from .tag_map import TAG_MAP, POS +from ...attrs import LANG +from ...language import Language +from ...tokens import Doc +from ...compat import copy_reg +from ...util import DummyTokenizer +from ...compat import is_python3, is_python_pre_3_5 + +is_python_post_3_7 = is_python3 and sys.version_info[1] >= 7 + +# fmt: off +if is_python_pre_3_5: + from collections import namedtuple + Morpheme = namedtuple("Morpheme", "surface lemma tag") +elif is_python_post_3_7: + from dataclasses import dataclass + @dataclass(frozen=True) + class Morpheme: + surface: str + lemma: str + tag: str +else: + from typing import NamedTuple + class Morpheme(NamedTuple): + surface: str + lemma: str + tag: str + + +def try_mecab_import(): + try: + from natto import MeCab + return MeCab + except ImportError: + raise ImportError( + "Korean support requires [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), " + "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), " + "and [natto-py](https://github.com/buruzaemon/natto-py)" + ) +# fmt: on + + +def check_spaces(text, tokens): + token_pattern = re.compile(r"\s?".join(f"({t})" for t in tokens)) + m = token_pattern.match(text) + if m is not None: + for i in range(1, m.lastindex): + yield m.end(i) < m.start(i + 1) + yield False + + +class KoreanTokenizer(DummyTokenizer): + def __init__(self, cls, nlp=None): + self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) + self.Tokenizer = try_mecab_import() + + def __call__(self, text): + dtokens = 
list(self.detailed_tokens(text)) + surfaces = [dt.surface for dt in dtokens] + doc = Doc(self.vocab, words=surfaces, spaces=list(check_spaces(text, surfaces))) + for token, dtoken in zip(doc, dtokens): + first_tag, sep, eomi_tags = dtoken.tag.partition("+") + token.tag_ = first_tag # stem(어간) or pre-final(선어말 어미) + token.lemma_ = dtoken.lemma + doc.user_data["full_tags"] = [dt.tag for dt in dtokens] + return doc + + def detailed_tokens(self, text): + # 품사 태그(POS)[0], 의미 부류(semantic class)[1], 종성 유무(jongseong)[2], 읽기(reading)[3], + # 타입(type)[4], 첫번째 품사(start pos)[5], 마지막 품사(end pos)[6], 표현(expression)[7], * + with self.Tokenizer("-F%f[0],%f[7]") as tokenizer: + for node in tokenizer.parse(text, as_nodes=True): + if node.is_eos(): + break + surface = node.surface + feature = node.feature + tag, _, expr = feature.partition(",") + lemma, _, remainder = expr.partition("/") + if lemma == "*": + lemma = surface + yield Morpheme(surface, lemma, tag) + + +class KoreanDefaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + lex_attr_getters[LANG] = lambda _text: "ko" + stop_words = STOP_WORDS + tag_map = TAG_MAP + writing_system = {"direction": "ltr", "has_case": False, "has_letters": False} + + @classmethod + def create_tokenizer(cls, nlp=None): + return KoreanTokenizer(cls, nlp) + + +class Korean(Language): + lang = "ko" + Defaults = KoreanDefaults + + def make_doc(self, text): + return self.tokenizer(text) + + +def pickle_korean(instance): + return Korean, tuple() + + +copy_reg.pickle(Korean, pickle_korean) + +__all__ = ["Korean"] diff --git a/spacy/lang/ko/examples.py b/spacy/lang/ko/examples.py new file mode 100644 index 000000000..10a6ea9bd --- /dev/null +++ b/spacy/lang/ko/examples.py @@ -0,0 +1,15 @@ +# coding: utf8 +from __future__ import unicode_literals +""" +Example sentences to test spaCy and its language models. + +>>> from spacy.lang.ko.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + +sentences = [ + "애플이 영국의 신생 기업을 10억 달러에 구매를 고려중이다.", + "자동 운전 자동차의 손해 배상 책임에 자동차 메이커에 일정한 부담을 요구하겠다.", + "자동 배달 로봇이 보도를 주행하는 것을 샌프란시스코시가 금지를 검토중이라고 합니다.", + "런던은 영국의 수도이자 가장 큰 도시입니다." 
+] diff --git a/spacy/lang/ko/stop_words.py b/spacy/lang/ko/stop_words.py new file mode 100644 index 000000000..53cf6f29a --- /dev/null +++ b/spacy/lang/ko/stop_words.py @@ -0,0 +1,68 @@ +# coding: utf8 +from __future__ import unicode_literals + +STOP_WORDS = set(""" +이 +있 +하 +것 +들 +그 +되 +수 +이 +보 +않 +없 +나 +주 +아니 +등 +같 +때 +년 +가 +한 +지 +오 +말 +일 +그렇 +위하 +때문 +그것 +두 +말하 +알 +그러나 +받 +못하 +일 +그런 +또 +더 +많 +그리고 +좋 +크 +시키 +그러 +하나 +살 +데 +안 +어떤 +번 +나 +다른 +어떻 +들 +이렇 +점 +싶 +말 +좀 +원 +잘 +놓 +""".split()) diff --git a/spacy/lang/ko/tag_map.py b/spacy/lang/ko/tag_map.py new file mode 100644 index 000000000..ed6b58170 --- /dev/null +++ b/spacy/lang/ko/tag_map.py @@ -0,0 +1,66 @@ +# encoding: utf8 +from __future__ import unicode_literals +from collections import defaultdict + +from ...symbols import (POS, PUNCT, INTJ, X, SYM, + ADJ, AUX, ADP, CONJ, NOUN, PRON, VERB, ADV, PROPN, + NUM, DET) + +# 은전한닢(mecab-ko-dic)의 품사 태그를 universal pos tag로 대응시킴 +# https://docs.google.com/spreadsheets/d/1-9blXKjtjeKZqsf4NzHeYJCrr49-nXeRF6D80udfcwY/edit#gid=589544265 +# https://universaldependencies.org/u/pos/ +TAG_MAP = { + # J.{1,2} 조사 + "JKS": {POS: ADP}, + "JKC": {POS: ADP}, + "JKG": {POS: ADP}, + "JKO": {POS: ADP}, + "JKB": {POS: ADP}, + "JKV": {POS: ADP}, + "JKQ": {POS: ADP}, + "JX": {POS: ADP}, # 보조사 + "JC": {POS: CONJ}, # 접속 조사 + "MAJ": {POS: CONJ}, # 접속 부사 + "MAG": {POS: ADV}, # 일반 부사 + "MM": {POS: DET}, # 관형사 + + "XPN": {POS: X}, # 접두사 + # XS. 접미사 + "XSN": {POS: X}, + "XSV": {POS: X}, + "XSA": {POS: X}, + "XR": {POS: X}, # 어근 + # E.{1,2} 어미 + "EP": {POS: X}, + "EF": {POS: X}, + "EC": {POS: X}, + "ETN": {POS: X}, + "ETM": {POS: X}, + + "IC": {POS: INTJ}, # 감탄사 + + "VV": {POS: VERB}, # 동사 + "VA": {POS: ADJ}, # 형용사 + "VX": {POS: AUX}, # 보조 용언 + "VCP": {POS: ADP}, # 긍정 지정사(이다) + "VCN": {POS: ADJ}, # 부정 지정사(아니다) + + "NNG": {POS: NOUN}, # 일반 명사(general noun) + "NNB": {POS: NOUN}, # 의존 명사 + "NNBC": {POS: NOUN}, # 의존 명사(단위: unit) + "NNP": {POS: PROPN}, # 고유 명사(proper noun) + "NP": {POS: PRON}, # 대명사 + "NR": {POS: NUM}, # 수사(numerals) + "SN": {POS: NUM}, # 숫자 + + # S.{1,2} 부호 + # 문장 부호 + "SF": {POS: PUNCT}, # period or other EOS marker + "SE": {POS: PUNCT}, + "SC": {POS: PUNCT}, # comma, etc. 
+ "SSO": {POS: PUNCT}, # open bracket + "SSC": {POS: PUNCT}, # close bracket + "SY": {POS: SYM}, # 기타 기호 + "SL": {POS: X}, # 외국어 + "SH": {POS: X}, # 한자 +} diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 4bef85a1b..fdd86616d 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -124,6 +124,12 @@ def ja_tokenizer(): return get_lang_class("ja").Defaults.create_tokenizer() +@pytest.fixture(scope="session") +def ko_tokenizer(): + pytest.importorskip("natto") + return get_lang_class("ko").Defaults.create_tokenizer() + + @pytest.fixture(scope="session") def lt_tokenizer(): return get_lang_class("lt").Defaults.create_tokenizer() diff --git a/spacy/tests/lang/ko/__init__.py b/spacy/tests/lang/ko/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/ko/test_lemmatization.py b/spacy/tests/lang/ko/test_lemmatization.py new file mode 100644 index 000000000..67371d4ce --- /dev/null +++ b/spacy/tests/lang/ko/test_lemmatization.py @@ -0,0 +1,13 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + + +@pytest.mark.parametrize( + "word,lemma", + [("새로운", "새롭"), ("빨간", "빨갛"), ("클수록", "크"), ("뭡니까", "뭣"), ("됐다", "되")], +) +def test_ko_lemmatizer_assigns(ko_tokenizer, word, lemma): + test_lemma = ko_tokenizer(word)[0].lemma_ + assert test_lemma == lemma diff --git a/spacy/tests/lang/ko/test_tokenizer.py b/spacy/tests/lang/ko/test_tokenizer.py new file mode 100644 index 000000000..bd1d94aec --- /dev/null +++ b/spacy/tests/lang/ko/test_tokenizer.py @@ -0,0 +1,46 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import pytest + +# fmt: off +TOKENIZER_TESTS = [("서울 타워 근처에 살고 있습니다.", "서울 타워 근처 에 살 고 있 습니다 ."), + ("영등포구에 있는 맛집 좀 알려주세요.", "영등포구 에 있 는 맛집 좀 알려 주 세요 .")] + +TAG_TESTS = [("서울 타워 근처에 살고 있습니다.", + "NNP NNG NNG JKB VV EC VX EF SF"), + ("영등포구에 있는 맛집 좀 알려주세요.", + "NNP JKB VV ETM NNG MAG VV VX EP SF")] + +FULL_TAG_TESTS = [("영등포구에 있는 맛집 좀 알려주세요.", + "NNP JKB VV ETM NNG MAG VV+EC VX EP+EF SF")] + +POS_TESTS = [("서울 타워 근처에 살고 있습니다.", + "PROPN NOUN NOUN ADP VERB X AUX X PUNCT"), + ("영등포구에 있는 맛집 좀 알려주세요.", + "PROPN ADP VERB X NOUN ADV VERB AUX X PUNCT")] +# fmt: on + + +@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS) +def test_ko_tokenizer(ko_tokenizer, text, expected_tokens): + tokens = [token.text for token in ko_tokenizer(text)] + assert tokens == expected_tokens.split() + + +@pytest.mark.parametrize("text,expected_tags", TAG_TESTS) +def test_ko_tokenizer_tags(ko_tokenizer, text, expected_tags): + tags = [token.tag_ for token in ko_tokenizer(text)] + assert tags == expected_tags.split() + + +@pytest.mark.parametrize("text,expected_tags", FULL_TAG_TESTS) +def test_ko_tokenizer_full_tags(ko_tokenizer, text, expected_tags): + tags = ko_tokenizer(text).user_data["full_tags"] + assert tags == expected_tags.split() + + +@pytest.mark.parametrize("text,expected_pos", POS_TESTS) +def test_ko_tokenizer_pos(ko_tokenizer, text, expected_pos): + pos = [token.pos_ for token in ko_tokenizer(text)] + assert pos == expected_pos.split() diff --git a/website/meta/languages.json b/website/meta/languages.json index cfa468d7f..1169a3397 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -153,6 +153,17 @@ "example": "これは文章です。", "has_examples": true }, + { + "code": "ko", + "name": "Korean", + "dependencies": [ + { "name": "mecab-ko", "url": "https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md" }, + { "name": "mecab-ko-dic", "url": 
"https://bitbucket.org/eunjeon/mecab-ko-dic" }, + { "name": "natto-py", "url": "https://github.com/buruzaemon/natto-py"} + ], + "example": "이것은 문장입니다.", + "has_examples": true + }, { "code": "vi", "name": "Vietnamese", From 205c73a58914b3fd9aebdd0708582fb7a80fd625 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20B=C3=B6ing?= Date: Wed, 10 Jul 2019 10:16:48 +0200 Subject: [PATCH 109/148] Update tokenizer and doc init example (#3939) * Fix Doc.to_json hyperlink * Update tokenizer and doc init examples * Change "matchin rules" to "punctuation rules" * Auto-format --- spacy/tokens/doc.pyx | 5 +++-- website/docs/api/doc.md | 2 +- website/docs/api/tokenizer.md | 8 +++++++- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 131c43d37..373771247 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -85,13 +85,14 @@ cdef class Doc: Python-level `Token` and `Span` objects are views of this array, i.e. they don't own the data themselves. - EXAMPLE: Construction 1 + EXAMPLE: + Construction 1 >>> doc = nlp(u'Some text') Construction 2 >>> from spacy.tokens import Doc >>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'], - spaces=[True, False, False]) + >>> spaces=[True, False, False]) DOCS: https://spacy.io/api/doc """ diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index f5a94335f..bf9801564 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -264,7 +264,7 @@ ancestor is found, e.g. if span excludes a necessary ancestor. | ----------- | -------------------------------------- | ----------------------------------------------- | | **RETURNS** | `numpy.ndarray[ndim=2, dtype='int32']` | The lowest common ancestor matrix of the `Doc`. | -## Doc.to_json {#to_json, tag="method" new="2.1"} +## Doc.to_json {#to_json tag="method" new="2.1"} Convert a Doc to JSON. The format it produces will be the new format for the [`spacy train`](/api/cli#train) command (not implemented yet). If custom diff --git a/website/docs/api/tokenizer.md b/website/docs/api/tokenizer.md index 5bc0df625..67e67f5c9 100644 --- a/website/docs/api/tokenizer.md +++ b/website/docs/api/tokenizer.md @@ -9,7 +9,10 @@ Segment text, and create `Doc` objects with the discovered segment boundaries. ## Tokenizer.\_\_init\_\_ {#init tag="method"} -Create a `Tokenizer`, to create `Doc` objects given unicode text. +Create a `Tokenizer`, to create `Doc` objects given unicode text. For examples +of how to construct a custom tokenizer with different tokenization rules, see +the +[usage documentation](https://spacy.io/usage/linguistic-features#native-tokenizers). > #### Example > @@ -18,11 +21,14 @@ Create a `Tokenizer`, to create `Doc` objects given unicode text. 
> from spacy.tokenizer import Tokenizer > from spacy.lang.en import English > nlp = English() +> # Create a blank Tokenizer with just the English vocab > tokenizer = Tokenizer(nlp.vocab) > > # Construction 2 > from spacy.lang.en import English > nlp = English() +> # Create a Tokenizer with the default settings for English +> # including punctuation rules and exceptions > tokenizer = nlp.Defaults.create_tokenizer(nlp) > ``` From 881f5bc401a2c16294a4152d05981ebd8e7691c5 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 10 Jul 2019 10:27:29 +0200 Subject: [PATCH 110/148] Auto-format --- website/docs/usage/training.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 91513588c..773b70f05 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -630,13 +630,13 @@ should be somewhat larger, especially if your documents are long. ### Learning rate, regularization and gradient clipping {#tips-hyperparams} -By default spaCy uses the Adam solver, with default settings (`learn_rate=0.001`, -`beta1=0.9`, `beta2=0.999`). Some researchers have said they found -these settings terrible on their problems – but they've always performed very -well in training spaCy's models, in combination with the rest of our recipe. You -can change these settings directly, by modifying the corresponding attributes on -the `optimizer` object. You can also set environment variables, to adjust the -defaults. +By default spaCy uses the Adam solver, with default settings +(`learn_rate=0.001`, `beta1=0.9`, `beta2=0.999`). Some researchers have said +they found these settings terrible on their problems – but they've always +performed very well in training spaCy's models, in combination with the rest of +our recipe. You can change these settings directly, by modifying the +corresponding attributes on the `optimizer` object. You can also set environment +variables, to adjust the defaults. There are two other key hyper-parameters of the solver: `L2` **regularization**, and **gradient clipping** (`max_grad_norm`). Gradient clipping is a hack that's From ebe58e7fa18af919eb69b9e468d0ec30c9338dcc Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 10 Jul 2019 10:27:33 +0200 Subject: [PATCH 111/148] Document gold.docs_to_json [ci skip] --- website/docs/api/annotation.md | 4 +++- website/docs/api/goldparse.md | 21 +++++++++++++++++++++ website/docs/usage/training.md | 3 +++ 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/website/docs/api/annotation.md b/website/docs/api/annotation.md index a5bb30b6f..ed0e0b3e0 100644 --- a/website/docs/api/annotation.md +++ b/website/docs/api/annotation.md @@ -520,7 +520,9 @@ spaCy takes training data in JSON format. The built-in [`convert`](/api/cli#convert) command helps you convert the `.conllu` format used by the [Universal Dependencies corpora](https://github.com/UniversalDependencies) to -spaCy's training format. +spaCy's training format. To convert one or more existing `Doc` objects to +spaCy's JSON format, you can use the +[`gold.docs_to_json`](/api/goldparse#docs_to_json) helper. > #### Annotating entities > diff --git a/website/docs/api/goldparse.md b/website/docs/api/goldparse.md index ca5b6a811..13f68a85d 100644 --- a/website/docs/api/goldparse.md +++ b/website/docs/api/goldparse.md @@ -55,6 +55,27 @@ Whether the provided syntactic annotations form a projective dependency tree. 
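The training documentation above notes that the Adam settings can be changed by modifying the corresponding attributes on the `optimizer` object returned by `nlp.begin_training()`. A minimal sketch of what that looks like, assuming the attribute names mirror the hyperparameters named in the prose (`learn_rate`, `L2`, `max_grad_norm`); the exact names exposed by the Thinc optimizer may differ between versions:

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe(nlp.create_pipe("ner"))
nlp.get_pipe("ner").add_label("ANIMAL")

# begin_training() returns the optimizer that is later passed to nlp.update()
optimizer = nlp.begin_training()

# Assumed attribute names, mirroring the hyperparameters described above
optimizer.learn_rate = 0.001   # Adam learning rate
optimizer.L2 = 1e-6            # L2 regularization penalty
optimizer.max_grad_norm = 1.0  # gradient clipping threshold
```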
## Utilities {#util} +### gold.docs_to_json {#docs_to_json tag="function"} + +Convert a list of Doc objects into the +[JSON-serializable format](/api/annotation#json-input) used by the +[`spacy train`](/api/cli#train) command. + +> #### Example +> +> ```python +> from spacy.gold import docs_to_json +> +> doc = nlp(u"I like London") +> json_data = docs_to_json([doc]) +> ``` + +| Name | Type | Description | +| ----------- | ---------------- | ------------------------------------------ | +| `docs` | iterable / `Doc` | The `Doc` object(s) to convert. | +| `id` | int | ID to assign to the JSON. Defaults to `0`. | +| **RETURNS** | list | The data in spaCy's JSON format. | + ### gold.biluo_tags_from_offsets {#biluo_tags_from_offsets tag="function"} Encode labelled spans into per-token tags, using the diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 773b70f05..b84bf4e12 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -39,6 +39,9 @@ mkdir models python -m spacy train es models ancora-json/es_ancora-ud-train.json ancora-json/es_ancora-ud-dev.json ``` +You can also use the [`gold.docs_to_json`](/api/goldparse#docs_to_json) helper +to convert a list of `Doc` objects to spaCy's JSON training format. + #### Understanding the training output When you train a model using the [`spacy train`](/api/cli#train) command, you'll From 8721849423e42fe99cdd6905aa98b94af446d82b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 10 Jul 2019 11:19:28 +0200 Subject: [PATCH 112/148] Update Scorer.ents_per_type --- spacy/scorer.py | 19 ++++++++++--------- website/docs/api/scorer.md | 21 +++++++++++---------- 2 files changed, 21 insertions(+), 19 deletions(-) diff --git a/spacy/scorer.py b/spacy/scorer.py index c01353520..b9994e3f2 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -92,6 +92,15 @@ class Scorer(object): """RETURNS (float): Named entity accuracy (F-score).""" return self.ner.fscore * 100 + @property + def ents_per_type(self): + """RETURNS (dict): Scores per entity label. + """ + return { + k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100} + for k, v in self.ner_per_ents.items() + } + @property def scores(self): """RETURNS (dict): All scores with keys `uas`, `las`, `ents_p`, @@ -103,17 +112,9 @@ class Scorer(object): "ents_p": self.ents_p, "ents_r": self.ents_r, "ents_f": self.ents_f, + "ents_per_type": self.ents_per_type, "tags_acc": self.tags_acc, "token_acc": self.token_acc, - "ents_per_type": self.__scores_per_ents(), - } - - def __scores_per_ents(self): - """RETURNS (dict): Scores per NER entity - """ - return { - k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100} - for k, v in self.ner_per_ents.items() } def score(self, doc, gold, verbose=False, punct_labels=("p", "punct")): diff --git a/website/docs/api/scorer.md b/website/docs/api/scorer.md index e6a8595fd..2af4ec0ce 100644 --- a/website/docs/api/scorer.md +++ b/website/docs/api/scorer.md @@ -46,13 +46,14 @@ Update the evaluation scores from a single [`Doc`](/api/doc) / ## Properties -| Name | Type | Description | -| ----------- | ----- | -------------------------------------------------------------------------------------------- | -| `token_acc` | float | Tokenization accuracy. | -| `tags_acc` | float | Part-of-speech tag accuracy (fine grained tags, i.e. `Token.tag`). | -| `uas` | float | Unlabelled dependency score. | -| `las` | float | Labelled dependency score. | -| `ents_p` | float | Named entity accuracy (precision). 
| -| `ents_r` | float | Named entity accuracy (recall). | -| `ents_f` | float | Named entity accuracy (F-score). | -| `scores` | dict | All scores with keys `uas`, `las`, `ents_p`, `ents_r`, `ents_f`, `tags_acc` and `token_acc`. | +| Name | Type | Description | +| ---------------------------------------------- | ----- | ------------------------------------------------------------------------------------------------------------- | +| `token_acc` | float | Tokenization accuracy. | +| `tags_acc` | float | Part-of-speech tag accuracy (fine grained tags, i.e. `Token.tag`). | +| `uas` | float | Unlabelled dependency score. | +| `las` | float | Labelled dependency score. | +| `ents_p` | float | Named entity accuracy (precision). | +| `ents_r` | float | Named entity accuracy (recall). | +| `ents_f` | float | Named entity accuracy (F-score). | +| `ents_per_type` 2.1.5 | dict | Scores per entity label. Keyed by label, mapped to a dict of `p`, `r` and `f` scores. | +| `scores` | dict | All scores with keys `uas`, `las`, `ents_p`, `ents_r`, `ents_f`, `ents_per_type`, `tags_acc` and `token_acc`. | From 4ebb4865fe057192b8649e6a5c4bd33c60d49981 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 10 Jul 2019 11:19:48 +0200 Subject: [PATCH 113/148] Update languages.json --- website/meta/languages.json | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/website/meta/languages.json b/website/meta/languages.json index 1169a3397..ef336ef5f 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -104,6 +104,7 @@ { "code": "ga", "name": "Irish" }, { "code": "bn", "name": "Bengali", "has_examples": true }, { "code": "hi", "name": "Hindi", "example": "यह एक वाक्य है।", "has_examples": true }, + { "code": "mr", "name": "Marathi" }, { "code": "kn", "name": "Kannada" }, { "code": "ta", "name": "Tamil", "has_examples": true }, { @@ -157,9 +158,12 @@ "code": "ko", "name": "Korean", "dependencies": [ - { "name": "mecab-ko", "url": "https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md" }, + { + "name": "mecab-ko", + "url": "https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md" + }, { "name": "mecab-ko-dic", "url": "https://bitbucket.org/eunjeon/mecab-ko-dic" }, - { "name": "natto-py", "url": "https://github.com/buruzaemon/natto-py"} + { "name": "natto-py", "url": "https://github.com/buruzaemon/natto-py" } ], "example": "이것은 문장입니다.", "has_examples": true From ea2050079b61b89a5c6e75951c4565aa504a2510 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 10 Jul 2019 12:03:05 +0200 Subject: [PATCH 114/148] Auto-format --- spacy/pipeline/entityruler.py | 64 +++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 26 deletions(-) diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 4f89e4186..35fefd02c 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -10,7 +10,7 @@ from ..util import ensure_path, to_disk, from_disk from ..tokens import Span from ..matcher import Matcher, PhraseMatcher -DEFAULT_ENT_ID_SEP = '||' +DEFAULT_ENT_ID_SEP = "||" class EntityRuler(object): @@ -53,7 +53,9 @@ class EntityRuler(object): self.matcher = Matcher(nlp.vocab) if phrase_matcher_attr is not None: self.phrase_matcher_attr = phrase_matcher_attr - self.phrase_matcher = PhraseMatcher(nlp.vocab, attr=self.phrase_matcher_attr) + self.phrase_matcher = PhraseMatcher( + nlp.vocab, attr=self.phrase_matcher_attr + ) else: self.phrase_matcher_attr = None self.phrase_matcher = PhraseMatcher(nlp.vocab) @@ -223,13 +225,14 @@ 
class EntityRuler(object): """ cfg = srsly.msgpack_loads(patterns_bytes) if isinstance(cfg, dict): - self.add_patterns(cfg.get('patterns', cfg)) - self.overwrite = cfg.get('overwrite', False) - self.phrase_matcher_attr = cfg.get('phrase_matcher_attr', None) + self.add_patterns(cfg.get("patterns", cfg)) + self.overwrite = cfg.get("overwrite", False) + self.phrase_matcher_attr = cfg.get("phrase_matcher_attr", None) if self.phrase_matcher_attr is not None: - self.phrase_matcher = PhraseMatcher(self.nlp.vocab, - attr=self.phrase_matcher_attr) - self.ent_id_sep = cfg.get('ent_id_sep', DEFAULT_ENT_ID_SEP) + self.phrase_matcher = PhraseMatcher( + self.nlp.vocab, attr=self.phrase_matcher_attr + ) + self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP) else: self.add_patterns(cfg) return self @@ -242,11 +245,14 @@ class EntityRuler(object): DOCS: https://spacy.io/api/entityruler#to_bytes """ - serial = OrderedDict(( - ('overwrite', self.overwrite), - ('ent_id_sep', self.ent_id_sep), - ('phrase_matcher_attr', self.phrase_matcher_attr), - ('patterns', self.patterns))) + serial = OrderedDict( + ( + ("overwrite", self.overwrite), + ("ent_id_sep", self.ent_id_sep), + ("phrase_matcher_attr", self.phrase_matcher_attr), + ("patterns", self.patterns), + ) + ) return srsly.msgpack_dumps(serial) def from_disk(self, path, **kwargs): @@ -266,17 +272,20 @@ class EntityRuler(object): else: cfg = {} deserializers = { - 'patterns': lambda p: self.add_patterns(srsly.read_jsonl(p.with_suffix('.jsonl'))), - 'cfg': lambda p: cfg.update(srsly.read_json(p)) + "patterns": lambda p: self.add_patterns( + srsly.read_jsonl(p.with_suffix(".jsonl")) + ), + "cfg": lambda p: cfg.update(srsly.read_json(p)), } from_disk(path, deserializers, {}) - self.overwrite = cfg.get('overwrite', False) - self.phrase_matcher_attr = cfg.get('phrase_matcher_attr') - self.ent_id_sep = cfg.get('ent_id_sep', DEFAULT_ENT_ID_SEP) + self.overwrite = cfg.get("overwrite", False) + self.phrase_matcher_attr = cfg.get("phrase_matcher_attr") + self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP) if self.phrase_matcher_attr is not None: - self.phrase_matcher = PhraseMatcher(self.nlp.vocab, - attr=self.phrase_matcher_attr) + self.phrase_matcher = PhraseMatcher( + self.nlp.vocab, attr=self.phrase_matcher_attr + ) return self def to_disk(self, path, **kwargs): @@ -289,13 +298,16 @@ class EntityRuler(object): DOCS: https://spacy.io/api/entityruler#to_disk """ - cfg = {'overwrite': self.overwrite, - 'phrase_matcher_attr': self.phrase_matcher_attr, - 'ent_id_sep': self.ent_id_sep} + cfg = { + "overwrite": self.overwrite, + "phrase_matcher_attr": self.phrase_matcher_attr, + "ent_id_sep": self.ent_id_sep, + } serializers = { - 'patterns': lambda p: srsly.write_jsonl(p.with_suffix('.jsonl'), - self.patterns), - 'cfg': lambda p: srsly.write_json(p, cfg) + "patterns": lambda p: srsly.write_jsonl( + p.with_suffix(".jsonl"), self.patterns + ), + "cfg": lambda p: srsly.write_json(p, cfg), } path = ensure_path(path) to_disk(path, serializers, {}) From 874d914a440553f8e4e3964b5677647ca9b2d967 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 10 Jul 2019 12:13:23 +0200 Subject: [PATCH 115/148] Tidy up test --- spacy/tests/regression/test_issue3526.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/spacy/tests/regression/test_issue3526.py b/spacy/tests/regression/test_issue3526.py index 3949c4b1c..62c9d5532 100644 --- a/spacy/tests/regression/test_issue3526.py +++ b/spacy/tests/regression/test_issue3526.py @@ -7,6 +7,7 @@ from 
spacy.language import Language from spacy.pipeline import EntityRuler from spacy import load import srsly + from ..util import make_tempdir @@ -79,8 +80,10 @@ def test_entity_ruler_in_pipeline_from_issue(patterns, en_vocab): nlp.add_pipe(ruler) with make_tempdir() as tmpdir: nlp.to_disk(tmpdir) - assert nlp.pipeline[-1][-1].patterns == [{"label": "ORG", "pattern": "Apple"}] - assert nlp.pipeline[-1][-1].overwrite is True + ruler = nlp.get_pipe("entity_ruler") + assert ruler.patterns == [{"label": "ORG", "pattern": "Apple"}] + assert ruler.overwrite is True nlp2 = load(tmpdir) - assert nlp2.pipeline[-1][-1].patterns == [{"label": "ORG", "pattern": "Apple"}] - assert nlp2.pipeline[-1][-1].overwrite is True + new_ruler = nlp2.get_pipe("entity_ruler") + assert new_ruler.patterns == [{"label": "ORG", "pattern": "Apple"}] + assert new_ruler.overwrite is True From 570ab1f481fabbbc520d965e14494637680a22b9 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 10 Jul 2019 12:14:12 +0200 Subject: [PATCH 116/148] Fix handling of old entity ruler files Expected an `entity_ruler.jsonl` file in the top-level model directory, so the path passed to from_disk by default (model path plus componentn name), but with the suffix ".jsonl". --- spacy/pipeline/entityruler.py | 5 +++-- spacy/tests/regression/test_issue3526.py | 7 +++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 35fefd02c..9bbbb2c48 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -266,8 +266,9 @@ class EntityRuler(object): DOCS: https://spacy.io/api/entityruler#from_disk """ path = ensure_path(path) - if path.is_file(): - patterns = srsly.read_jsonl(path) + depr_patterns_path = path.with_suffix(".jsonl") + if depr_patterns_path.is_file(): + patterns = srsly.read_jsonl(depr_patterns_path) self.add_patterns(patterns) else: cfg = {} diff --git a/spacy/tests/regression/test_issue3526.py b/spacy/tests/regression/test_issue3526.py index 62c9d5532..c6f513730 100644 --- a/spacy/tests/regression/test_issue3526.py +++ b/spacy/tests/regression/test_issue3526.py @@ -62,10 +62,9 @@ def test_entity_ruler_from_disk_old_format_safe(patterns, en_vocab): nlp = Language(vocab=en_vocab) ruler = EntityRuler(nlp, patterns=patterns, overwrite_ents=True) with make_tempdir() as tmpdir: - out_file = tmpdir / "entity_ruler.jsonl" - srsly.write_jsonl(out_file, ruler.patterns) - new_ruler = EntityRuler(nlp) - new_ruler = new_ruler.from_disk(out_file) + out_file = tmpdir / "entity_ruler" + srsly.write_jsonl(out_file.with_suffix(".jsonl"), ruler.patterns) + new_ruler = EntityRuler(nlp).from_disk(out_file) for pattern in ruler.patterns: assert pattern in new_ruler.patterns assert len(new_ruler) == len(ruler) From 40cd03fc358b12568aea95f7d11cc122677ad7dc Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 10 Jul 2019 12:25:45 +0200 Subject: [PATCH 117/148] Improve EntityRuler serialization --- spacy/pipeline/entityruler.py | 9 ++++--- website/docs/api/entityruler.md | 46 +++++++++++++++++++-------------- 2 files changed, 32 insertions(+), 23 deletions(-) diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 9bbbb2c48..35b465ceb 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -293,12 +293,13 @@ class EntityRuler(object): """Save the entity ruler patterns to a directory. The patterns will be saved as newline-delimited JSON (JSONL). - path (unicode / Path): The JSONL file to load. 
+ path (unicode / Path): The JSONL file to save. **kwargs: Other config paramters, mostly for consistency. RETURNS (EntityRuler): The loaded entity ruler. DOCS: https://spacy.io/api/entityruler#to_disk """ + path = ensure_path(path) cfg = { "overwrite": self.overwrite, "phrase_matcher_attr": self.phrase_matcher_attr, @@ -310,5 +311,7 @@ class EntityRuler(object): ), "cfg": lambda p: srsly.write_json(p, cfg), } - path = ensure_path(path) - to_disk(path, serializers, {}) + if path.suffix == ".jsonl": # user wants to save only JSONL + srsly.write_jsonl(path, self.patterns) + else: + to_disk(path, serializers, {}) diff --git a/website/docs/api/entityruler.md b/website/docs/api/entityruler.md index dcbf99da5..5c05450f8 100644 --- a/website/docs/api/entityruler.md +++ b/website/docs/api/entityruler.md @@ -30,14 +30,14 @@ be a token pattern (list) or a phrase pattern (string). For example: > ruler = EntityRuler(nlp, overwrite_ents=True) > ``` -| Name | Type | Description | -| ---------------- | ------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- | -| `nlp` | `Language` | The shared nlp object to pass the vocab to the matchers and process phrase patterns. | -| `patterns` | iterable | Optional patterns to load in. | -| `phrase_matcher_attr` | int / unicode | Optional attr to pass to the internal [`PhraseMatcher`](/api/phtasematcher). defaults to `None` -| `overwrite_ents` | bool | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. | -| `**cfg` | - | Other config parameters. If pipeline component is loaded as part of a model pipeline, this will include all keyword arguments passed to `spacy.load`. | -| **RETURNS** | `EntityRuler` | The newly constructed object. | +| Name | Type | Description | +| --------------------- | ------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- | +| `nlp` | `Language` | The shared nlp object to pass the vocab to the matchers and process phrase patterns. | +| `patterns` | iterable | Optional patterns to load in. | +| `phrase_matcher_attr` | int / unicode | Optional attr to pass to the internal [`PhraseMatcher`](/api/phtasematcher). defaults to `None` | +| `overwrite_ents` | bool | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. | +| `**cfg` | - | Other config parameters. If pipeline component is loaded as part of a model pipeline, this will include all keyword arguments passed to `spacy.load`. | +| **RETURNS** | `EntityRuler` | The newly constructed object. | ## EntityRuler.\_\len\_\_ {#len tag="method"} @@ -123,35 +123,41 @@ of dicts) or a phrase pattern (string). For more details, see the usage guide on ## EntityRuler.to_disk {#to_disk tag="method"} Save the entity ruler patterns to a directory. The patterns will be saved as -newline-delimited JSON (JSONL). +newline-delimited JSON (JSONL). If a file with the suffix `.jsonl` is provided, +only the patterns are saved as JSONL. If a directory name is provided, a +`patterns.jsonl` and `cfg` file with the component configuration is exported. 
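Beyond the single-call examples that follow, the point of the directory format is that the component configuration survives a save/load cycle. A minimal round-trip sketch, using a placeholder path as in the surrounding examples:

```python
from spacy.lang.en import English
from spacy.pipeline import EntityRuler

nlp = English()
ruler = EntityRuler(nlp, phrase_matcher_attr="LOWER", overwrite_ents=True)
ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
ruler.to_disk("/path/to/entity_ruler")  # writes patterns.jsonl and cfg

new_ruler = EntityRuler(nlp).from_disk("/path/to/entity_ruler")
assert {"label": "ORG", "pattern": "Apple"} in new_ruler.patterns
assert new_ruler.overwrite is True               # restored from cfg
assert new_ruler.phrase_matcher_attr == "LOWER"  # restored from cfg
```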
> #### Example > > ```python > ruler = EntityRuler(nlp) -> ruler.to_disk("/path/to/rules.jsonl") +> ruler.to_disk("/path/to/patterns.jsonl") # saves patterns only +> ruler.to_disk("/path/to/entity_ruler") # saves patterns and config > ``` -| Name | Type | Description | -| ------ | ---------------- | ---------------------------------------------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a file, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | +| Name | Type | Description | +| ------ | ---------------- | ----------------------------------------------------------------------------------------------------------------------------------- | +| `path` | unicode / `Path` | A path to a JSONL file or directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | ## EntityRuler.from_disk {#from_disk tag="method"} -Load the entity ruler from a file. Expects a file containing newline-delimited -JSON (JSONL) with one entry per line. +Load the entity ruler from a file. Expects either a file containing +newline-delimited JSON (JSONL) with one entry per line, or a directory +containing a `patterns.jsonl` file and a `cfg` file with the component +configuration. > #### Example > > ```python > ruler = EntityRuler(nlp) -> ruler.from_disk("/path/to/rules.jsonl") +> ruler.from_disk("/path/to/patterns.jsonl") # loads patterns only +> ruler.from_disk("/path/to/entity_ruler") # loads patterns and config > ``` -| Name | Type | Description | -| ----------- | ---------------- | --------------------------------------------------------------------------- | -| `path` | unicode / `Path` | A path to a JSONL file. Paths may be either strings or `Path`-like objects. | -| **RETURNS** | `EntityRuler` | The modified `EntityRuler` object. | +| Name | Type | Description | +| ----------- | ---------------- | ---------------------------------------------------------------------------------------- | +| `path` | unicode / `Path` | A path to a JSONL file or directory. Paths may be either strings or `Path`-like objects. | +| **RETURNS** | `EntityRuler` | The modified `EntityRuler` object. 
| ## EntityRuler.to_bytes {#to_bytes tag="method"} From 82045aac8a948c4c1ae0f5a04314c8111c06d34d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 10 Jul 2019 12:49:18 +0200 Subject: [PATCH 118/148] Merge regression tests --- spacy/tests/regression/test_issue2001-2500.py | 10 + spacy/tests/regression/test_issue3001-3500.py | 334 ++++++++++++++++++ spacy/tests/regression/test_issue3002.py | 11 - spacy/tests/regression/test_issue3009.py | 67 ---- spacy/tests/regression/test_issue3012.py | 31 -- spacy/tests/regression/test_issue3199.py | 15 - spacy/tests/regression/test_issue3209.py | 23 -- spacy/tests/regression/test_issue3248.py | 27 -- spacy/tests/regression/test_issue3277.py | 11 - spacy/tests/regression/test_issue3288.py | 18 - spacy/tests/regression/test_issue3289.py | 15 - spacy/tests/regression/test_issue3328.py | 19 - spacy/tests/regression/test_issue3331.py | 21 -- spacy/tests/regression/test_issue3345.py | 26 -- spacy/tests/regression/test_issue3356.py | 72 ---- spacy/tests/regression/test_issue3410.py | 21 -- spacy/tests/regression/test_issue3447.py | 14 - spacy/tests/regression/test_issue3449.py | 21 -- spacy/tests/regression/test_issue3468.py | 21 -- 19 files changed, 344 insertions(+), 433 deletions(-) create mode 100644 spacy/tests/regression/test_issue3001-3500.py delete mode 100644 spacy/tests/regression/test_issue3002.py delete mode 100644 spacy/tests/regression/test_issue3009.py delete mode 100644 spacy/tests/regression/test_issue3012.py delete mode 100644 spacy/tests/regression/test_issue3199.py delete mode 100644 spacy/tests/regression/test_issue3209.py delete mode 100644 spacy/tests/regression/test_issue3248.py delete mode 100644 spacy/tests/regression/test_issue3277.py delete mode 100644 spacy/tests/regression/test_issue3288.py delete mode 100644 spacy/tests/regression/test_issue3289.py delete mode 100644 spacy/tests/regression/test_issue3328.py delete mode 100644 spacy/tests/regression/test_issue3331.py delete mode 100644 spacy/tests/regression/test_issue3345.py delete mode 100644 spacy/tests/regression/test_issue3356.py delete mode 100644 spacy/tests/regression/test_issue3410.py delete mode 100644 spacy/tests/regression/test_issue3447.py delete mode 100644 spacy/tests/regression/test_issue3449.py delete mode 100644 spacy/tests/regression/test_issue3468.py diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py index 82b3a81a9..4292c8d23 100644 --- a/spacy/tests/regression/test_issue2001-2500.py +++ b/spacy/tests/regression/test_issue2001-2500.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import pytest import numpy from spacy.tokens import Doc +from spacy.matcher import Matcher from spacy.displacy import render from spacy.gold import iob_to_biluo from spacy.lang.it import Italian @@ -123,6 +124,15 @@ def test_issue2396(en_vocab): assert (span.get_lca_matrix() == matrix).all() +def test_issue2464(en_vocab): + """Test problem with successive ?. 
This is the same bug, so putting it here.""" + matcher = Matcher(en_vocab) + doc = Doc(en_vocab, words=["a", "b"]) + matcher.add("4", None, [{"OP": "?"}, {"OP": "?"}]) + matches = matcher(doc) + assert len(matches) == 3 + + def test_issue2482(): """Test we can serialize and deserialize a blank NER or parser model.""" nlp = Italian() diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py new file mode 100644 index 000000000..3b0c2f1ed --- /dev/null +++ b/spacy/tests/regression/test_issue3001-3500.py @@ -0,0 +1,334 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest +from spacy.lang.en import English +from spacy.lang.de import German +from spacy.pipeline import EntityRuler, EntityRecognizer +from spacy.matcher import Matcher, PhraseMatcher +from spacy.tokens import Doc +from spacy.vocab import Vocab +from spacy.attrs import ENT_IOB, ENT_TYPE +from spacy.compat import pickle, is_python2, unescape_unicode +from spacy import displacy +from spacy.util import decaying +import numpy +import re + +from ..util import get_doc + + +def test_issue3002(): + """Test that the tokenizer doesn't hang on a long list of dots""" + nlp = German() + doc = nlp( + "880.794.982.218.444.893.023.439.794.626.120.190.780.624.990.275.671 ist eine lange Zahl" + ) + assert len(doc) == 5 + + +def test_issue3009(en_vocab): + """Test problem with matcher quantifiers""" + patterns = [ + [{"LEMMA": "have"}, {"LOWER": "to"}, {"LOWER": "do"}, {"POS": "ADP"}], + [ + {"LEMMA": "have"}, + {"IS_ASCII": True, "IS_PUNCT": False, "OP": "*"}, + {"LOWER": "to"}, + {"LOWER": "do"}, + {"POS": "ADP"}, + ], + [ + {"LEMMA": "have"}, + {"IS_ASCII": True, "IS_PUNCT": False, "OP": "?"}, + {"LOWER": "to"}, + {"LOWER": "do"}, + {"POS": "ADP"}, + ], + ] + words = ["also", "has", "to", "do", "with"] + tags = ["RB", "VBZ", "TO", "VB", "IN"] + doc = get_doc(en_vocab, words=words, tags=tags) + matcher = Matcher(en_vocab) + for i, pattern in enumerate(patterns): + matcher.add(str(i), None, pattern) + matches = matcher(doc) + assert matches + + +def test_issue3012(en_vocab): + """Test that the is_tagged attribute doesn't get overwritten when we from_array + without tag information.""" + words = ["This", "is", "10", "%", "."] + tags = ["DT", "VBZ", "CD", "NN", "."] + pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"] + ents = [(2, 4, "PERCENT")] + doc = get_doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents) + assert doc.is_tagged + + expected = ("10", "NUM", "CD", "PERCENT") + assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected + + header = [ENT_IOB, ENT_TYPE] + ent_array = doc.to_array(header) + doc.from_array(header, ent_array) + + assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected + + # Serializing then deserializing + doc_bytes = doc.to_bytes() + doc2 = Doc(en_vocab).from_bytes(doc_bytes) + assert (doc2[2].text, doc2[2].pos_, doc2[2].tag_, doc2[2].ent_type_) == expected + + +def test_issue3199(): + """Test that Span.noun_chunks works correctly if no noun chunks iterator + is available. To make this test future-proof, we're constructing a Doc + with a new Vocab here and setting is_parsed to make sure the noun chunks run. 
+ """ + doc = Doc(Vocab(), words=["This", "is", "a", "sentence"]) + doc.is_parsed = True + assert list(doc[0:3].noun_chunks) == [] + + +def test_issue3209(): + """Test issue that occurred in spaCy nightly where NER labels were being + mapped to classes incorrectly after loading the model, when the labels + were added using ner.add_label(). + """ + nlp = English() + ner = nlp.create_pipe("ner") + nlp.add_pipe(ner) + + ner.add_label("ANIMAL") + nlp.begin_training() + move_names = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"] + assert ner.move_names == move_names + nlp2 = English() + nlp2.add_pipe(nlp2.create_pipe("ner")) + nlp2.from_bytes(nlp.to_bytes()) + assert nlp2.get_pipe("ner").move_names == move_names + + +def test_issue3248_1(): + """Test that the PhraseMatcher correctly reports its number of rules, not + total number of patterns.""" + nlp = English() + matcher = PhraseMatcher(nlp.vocab) + matcher.add("TEST1", None, nlp("a"), nlp("b"), nlp("c")) + matcher.add("TEST2", None, nlp("d")) + assert len(matcher) == 2 + + +def test_issue3248_2(): + """Test that the PhraseMatcher can be pickled correctly.""" + nlp = English() + matcher = PhraseMatcher(nlp.vocab) + matcher.add("TEST1", None, nlp("a"), nlp("b"), nlp("c")) + matcher.add("TEST2", None, nlp("d")) + data = pickle.dumps(matcher) + new_matcher = pickle.loads(data) + assert len(new_matcher) == len(matcher) + + +def test_issue3277(es_tokenizer): + """Test that hyphens are split correctly as prefixes.""" + doc = es_tokenizer("—Yo me llamo... –murmuró el niño– Emilio Sánchez Pérez.") + assert len(doc) == 14 + assert doc[0].text == "\u2014" + assert doc[5].text == "\u2013" + assert doc[9].text == "\u2013" + + +def test_issue3288(en_vocab): + """Test that retokenization works correctly via displaCy when punctuation + is merged onto the preceeding token and tensor is resized.""" + words = ["Hello", "World", "!", "When", "is", "this", "breaking", "?"] + heads = [1, 0, -1, 1, 0, 1, -2, -3] + deps = ["intj", "ROOT", "punct", "advmod", "ROOT", "det", "nsubj", "punct"] + doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) + doc.tensor = numpy.zeros((len(words), 96), dtype="float32") + displacy.render(doc) + + +def test_issue3289(): + """Test that Language.to_bytes handles serializing a pipeline component + with an uninitialized model.""" + nlp = English() + nlp.add_pipe(nlp.create_pipe("textcat")) + bytes_data = nlp.to_bytes() + new_nlp = English() + new_nlp.add_pipe(nlp.create_pipe("textcat")) + new_nlp.from_bytes(bytes_data) + + +def test_issue3328(en_vocab): + doc = Doc(en_vocab, words=["Hello", ",", "how", "are", "you", "doing", "?"]) + matcher = Matcher(en_vocab) + patterns = [ + [{"LOWER": {"IN": ["hello", "how"]}}], + [{"LOWER": {"IN": ["you", "doing"]}}], + ] + matcher.add("TEST", None, *patterns) + matches = matcher(doc) + assert len(matches) == 4 + matched_texts = [doc[start:end].text for _, start, end in matches] + assert matched_texts == ["Hello", "how", "you", "doing"] + + +@pytest.mark.xfail +def test_issue3331(en_vocab): + """Test that duplicate patterns for different rules result in multiple + matches, one per rule. 
+ """ + matcher = PhraseMatcher(en_vocab) + matcher.add("A", None, Doc(en_vocab, words=["Barack", "Obama"])) + matcher.add("B", None, Doc(en_vocab, words=["Barack", "Obama"])) + doc = Doc(en_vocab, words=["Barack", "Obama", "lifts", "America"]) + matches = matcher(doc) + assert len(matches) == 2 + match_ids = [en_vocab.strings[matches[0][0]], en_vocab.strings[matches[1][0]]] + assert sorted(match_ids) == ["A", "B"] + + +def test_issue3345(): + """Test case where preset entity crosses sentence boundary.""" + nlp = English() + doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"]) + doc[4].is_sent_start = True + ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}]) + ner = EntityRecognizer(doc.vocab) + # Add the OUT action. I wouldn't have thought this would be necessary... + ner.moves.add_action(5, "") + ner.add_label("GPE") + doc = ruler(doc) + # Get into the state just before "New" + state = ner.moves.init_batch([doc])[0] + ner.moves.apply_transition(state, "O") + ner.moves.apply_transition(state, "O") + ner.moves.apply_transition(state, "O") + # Check that B-GPE is valid. + assert ner.moves.is_valid(state, "B-GPE") + + +if is_python2: + # If we have this test in Python 3, pytest chokes, as it can't print the + # string above in the xpass message. + prefix_search = ( + b"^\xc2\xa7|^%|^=|^\xe2\x80\x94|^\xe2\x80\x93|^\\+(?![0-9])" + b"|^\xe2\x80\xa6|^\xe2\x80\xa6\xe2\x80\xa6|^,|^:|^;|^\\!|^\\?" + b"|^\xc2\xbf|^\xd8\x9f|^\xc2\xa1|^\\(|^\\)|^\\[|^\\]|^\\{|^\\}" + b"|^<|^>|^_|^#|^\\*|^&|^\xe3\x80\x82|^\xef\xbc\x9f|^\xef\xbc\x81|" + b"^\xef\xbc\x8c|^\xe3\x80\x81|^\xef\xbc\x9b|^\xef\xbc\x9a|" + b"^\xef\xbd\x9e|^\xc2\xb7|^\xe0\xa5\xa4|^\xd8\x8c|^\xd8\x9b|" + b"^\xd9\xaa|^\\.\\.+|^\xe2\x80\xa6|^\\'|^\"|^\xe2\x80\x9d|" + b"^\xe2\x80\x9c|^`|^\xe2\x80\x98|^\xc2\xb4|^\xe2\x80\x99|" + b"^\xe2\x80\x9a|^,|^\xe2\x80\x9e|^\xc2\xbb|^\xc2\xab|^\xe3\x80\x8c|" + b"^\xe3\x80\x8d|^\xe3\x80\x8e|^\xe3\x80\x8f|^\xef\xbc\x88|" + b"^\xef\xbc\x89|^\xe3\x80\x94|^\xe3\x80\x95|^\xe3\x80\x90|" + b"^\xe3\x80\x91|^\xe3\x80\x8a|^\xe3\x80\x8b|^\xe3\x80\x88|" + b"^\xe3\x80\x89|^\\$|^\xc2\xa3|^\xe2\x82\xac|^\xc2\xa5|^\xe0\xb8\xbf|" + b"^US\\$|^C\\$|^A\\$|^\xe2\x82\xbd|^\xef\xb7\xbc|^\xe2\x82\xb4|" + b"^[\\u00A6\\u00A9\\u00AE\\u00B0\\u0482\\u058D\\u058E\\u060E\\u060F" + b"\\u06DE\\u06E9\\u06FD\\u06FE\\u07F6\\u09FA\\u0B70\\u0BF3-\\u0BF8" + b"\\u0BFA\\u0C7F\\u0D4F\\u0D79\\u0F01-\\u0F03\\u0F13\\u0F15-\\u0F17" + b"\\u0F1A-\\u0F1F\\u0F34\\u0F36\\u0F38\\u0FBE-\\u0FC5\\u0FC7-\\u0FCC" + b"\\u0FCE\\u0FCF\\u0FD5-\\u0FD8\\u109E\\u109F\\u1390-\\u1399\\u1940" + b"\\u19DE-\\u19FF\\u1B61-\\u1B6A\\u1B74-\\u1B7C\\u2100\\u2101\\u2103" + b"-\\u2106\\u2108\\u2109\\u2114\\u2116\\u2117\\u211E-\\u2123\\u2125" + b"\\u2127\\u2129\\u212E\\u213A\\u213B\\u214A\\u214C\\u214D\\u214F" + b"\\u218A\\u218B\\u2195-\\u2199\\u219C-\\u219F\\u21A1\\u21A2\\u21A4" + b"\\u21A5\\u21A7-\\u21AD\\u21AF-\\u21CD\\u21D0\\u21D1\\u21D3\\u21D5" + b"-\\u21F3\\u2300-\\u2307\\u230C-\\u231F\\u2322-\\u2328\\u232B" + b"-\\u237B\\u237D-\\u239A\\u23B4-\\u23DB\\u23E2-\\u2426\\u2440" + b"-\\u244A\\u249C-\\u24E9\\u2500-\\u25B6\\u25B8-\\u25C0\\u25C2" + b"-\\u25F7\\u2600-\\u266E\\u2670-\\u2767\\u2794-\\u27BF\\u2800" + b"-\\u28FF\\u2B00-\\u2B2F\\u2B45\\u2B46\\u2B4D-\\u2B73\\u2B76" + b"-\\u2B95\\u2B98-\\u2BC8\\u2BCA-\\u2BFE\\u2CE5-\\u2CEA\\u2E80" + b"-\\u2E99\\u2E9B-\\u2EF3\\u2F00-\\u2FD5\\u2FF0-\\u2FFB\\u3004" + b"\\u3012\\u3013\\u3020\\u3036\\u3037\\u303E\\u303F\\u3190\\u3191" + b"\\u3196-\\u319F\\u31C0-\\u31E3\\u3200-\\u321E\\u322A-\\u3247\\u3250" + 
b"\\u3260-\\u327F\\u328A-\\u32B0\\u32C0-\\u32FE\\u3300-\\u33FF\\u4DC0" + b"-\\u4DFF\\uA490-\\uA4C6\\uA828-\\uA82B\\uA836\\uA837\\uA839\\uAA77" + b"-\\uAA79\\uFDFD\\uFFE4\\uFFE8\\uFFED\\uFFEE\\uFFFC\\uFFFD\\U00010137" + b"-\\U0001013F\\U00010179-\\U00010189\\U0001018C-\\U0001018E" + b"\\U00010190-\\U0001019B\\U000101A0\\U000101D0-\\U000101FC\\U00010877" + b"\\U00010878\\U00010AC8\\U0001173F\\U00016B3C-\\U00016B3F\\U00016B45" + b"\\U0001BC9C\\U0001D000-\\U0001D0F5\\U0001D100-\\U0001D126\\U0001D129" + b"-\\U0001D164\\U0001D16A-\\U0001D16C\\U0001D183\\U0001D184\\U0001D18C" + b"-\\U0001D1A9\\U0001D1AE-\\U0001D1E8\\U0001D200-\\U0001D241\\U0001D245" + b"\\U0001D300-\\U0001D356\\U0001D800-\\U0001D9FF\\U0001DA37-\\U0001DA3A" + b"\\U0001DA6D-\\U0001DA74\\U0001DA76-\\U0001DA83\\U0001DA85\\U0001DA86" + b"\\U0001ECAC\\U0001F000-\\U0001F02B\\U0001F030-\\U0001F093\\U0001F0A0" + b"-\\U0001F0AE\\U0001F0B1-\\U0001F0BF\\U0001F0C1-\\U0001F0CF\\U0001F0D1" + b"-\\U0001F0F5\\U0001F110-\\U0001F16B\\U0001F170-\\U0001F1AC\\U0001F1E6" + b"-\\U0001F202\\U0001F210-\\U0001F23B\\U0001F240-\\U0001F248\\U0001F250" + b"\\U0001F251\\U0001F260-\\U0001F265\\U0001F300-\\U0001F3FA\\U0001F400" + b"-\\U0001F6D4\\U0001F6E0-\\U0001F6EC\\U0001F6F0-\\U0001F6F9\\U0001F700" + b"-\\U0001F773\\U0001F780-\\U0001F7D8\\U0001F800-\\U0001F80B\\U0001F810" + b"-\\U0001F847\\U0001F850-\\U0001F859\\U0001F860-\\U0001F887\\U0001F890" + b"-\\U0001F8AD\\U0001F900-\\U0001F90B\\U0001F910-\\U0001F93E\\U0001F940" + b"-\\U0001F970\\U0001F973-\\U0001F976\\U0001F97A\\U0001F97C-\\U0001F9A2" + b"\\U0001F9B0-\\U0001F9B9\\U0001F9C0-\\U0001F9C2\\U0001F9D0-\\U0001F9FF" + b"\\U0001FA60-\\U0001FA6D]" + ) + + def test_issue3356(): + pattern = re.compile(unescape_unicode(prefix_search.decode("utf8"))) + assert not pattern.search("hello") + + +def test_issue3410(): + texts = ["Hello world", "This is a test"] + nlp = English() + matcher = Matcher(nlp.vocab) + phrasematcher = PhraseMatcher(nlp.vocab) + with pytest.deprecated_call(): + docs = list(nlp.pipe(texts, n_threads=4)) + with pytest.deprecated_call(): + docs = list(nlp.tokenizer.pipe(texts, n_threads=4)) + with pytest.deprecated_call(): + list(matcher.pipe(docs, n_threads=4)) + with pytest.deprecated_call(): + list(phrasematcher.pipe(docs, n_threads=4)) + + +def test_issue3447(): + sizes = decaying(10.0, 1.0, 0.5) + size = next(sizes) + assert size == 10.0 + size = next(sizes) + assert size == 10.0 - 0.5 + size = next(sizes) + assert size == 10.0 - 0.5 - 0.5 + + +@pytest.mark.xfail(reason="default suffix rules avoid one upper-case letter before dot") +def test_issue3449(): + nlp = English() + nlp.add_pipe(nlp.create_pipe("sentencizer")) + text1 = "He gave the ball to I. Do you want to go to the movies with I?" + text2 = "He gave the ball to I. Do you want to go to the movies with I?" + text3 = "He gave the ball to I.\nDo you want to go to the movies with I?" 
+ t1 = nlp(text1) + t2 = nlp(text2) + t3 = nlp(text3) + assert t1[5].text == "I" + assert t2[5].text == "I" + assert t3[5].text == "I" + + +def test_issue3468(): + """Test that sentence boundaries are set correctly so Doc.is_sentenced can + be restored after serialization.""" + nlp = English() + nlp.add_pipe(nlp.create_pipe("sentencizer")) + doc = nlp("Hello world") + assert doc[0].is_sent_start + assert doc.is_sentenced + assert len(list(doc.sents)) == 1 + doc_bytes = doc.to_bytes() + new_doc = Doc(nlp.vocab).from_bytes(doc_bytes) + assert new_doc[0].is_sent_start + assert new_doc.is_sentenced + assert len(list(new_doc.sents)) == 1 diff --git a/spacy/tests/regression/test_issue3002.py b/spacy/tests/regression/test_issue3002.py deleted file mode 100644 index 54e661d1f..000000000 --- a/spacy/tests/regression/test_issue3002.py +++ /dev/null @@ -1,11 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -from spacy.lang.de import German - - -def test_issue3002(): - """Test that the tokenizer doesn't hang on a long list of dots""" - nlp = German() - doc = nlp('880.794.982.218.444.893.023.439.794.626.120.190.780.624.990.275.671 ist eine lange Zahl') - assert len(doc) == 5 diff --git a/spacy/tests/regression/test_issue3009.py b/spacy/tests/regression/test_issue3009.py deleted file mode 100644 index 25f208903..000000000 --- a/spacy/tests/regression/test_issue3009.py +++ /dev/null @@ -1,67 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import pytest -from spacy.matcher import Matcher -from spacy.tokens import Doc - - -PATTERNS = [ - ("1", [[{"LEMMA": "have"}, {"LOWER": "to"}, {"LOWER": "do"}, {"POS": "ADP"}]]), - ( - "2", - [ - [ - {"LEMMA": "have"}, - {"IS_ASCII": True, "IS_PUNCT": False, "OP": "*"}, - {"LOWER": "to"}, - {"LOWER": "do"}, - {"POS": "ADP"}, - ] - ], - ), - ( - "3", - [ - [ - {"LEMMA": "have"}, - {"IS_ASCII": True, "IS_PUNCT": False, "OP": "?"}, - {"LOWER": "to"}, - {"LOWER": "do"}, - {"POS": "ADP"}, - ] - ], - ), -] - - -@pytest.fixture -def doc(en_tokenizer): - doc = en_tokenizer("also has to do with") - doc[0].tag_ = "RB" - doc[1].tag_ = "VBZ" - doc[2].tag_ = "TO" - doc[3].tag_ = "VB" - doc[4].tag_ = "IN" - return doc - - -@pytest.fixture -def matcher(en_tokenizer): - return Matcher(en_tokenizer.vocab) - - -@pytest.mark.parametrize("pattern", PATTERNS) -def test_issue3009(doc, matcher, pattern): - """Test problem with matcher quantifiers""" - matcher.add(pattern[0], None, *pattern[1]) - matches = matcher(doc) - assert matches - - -def test_issue2464(matcher): - """Test problem with successive ?. 
This is the same bug, so putting it here.""" - doc = Doc(matcher.vocab, words=["a", "b"]) - matcher.add("4", None, [{"OP": "?"}, {"OP": "?"}]) - matches = matcher(doc) - assert len(matches) == 3 diff --git a/spacy/tests/regression/test_issue3012.py b/spacy/tests/regression/test_issue3012.py deleted file mode 100644 index 8fdc8b318..000000000 --- a/spacy/tests/regression/test_issue3012.py +++ /dev/null @@ -1,31 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -from ...attrs import ENT_IOB, ENT_TYPE -from ...tokens import Doc -from ..util import get_doc - - -def test_issue3012(en_vocab): - """Test that the is_tagged attribute doesn't get overwritten when we from_array - without tag information.""" - words = ["This", "is", "10", "%", "."] - tags = ["DT", "VBZ", "CD", "NN", "."] - pos = ["DET", "VERB", "NUM", "NOUN", "PUNCT"] - ents = [(2, 4, "PERCENT")] - doc = get_doc(en_vocab, words=words, tags=tags, pos=pos, ents=ents) - assert doc.is_tagged - - expected = ("10", "NUM", "CD", "PERCENT") - assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected - - header = [ENT_IOB, ENT_TYPE] - ent_array = doc.to_array(header) - doc.from_array(header, ent_array) - - assert (doc[2].text, doc[2].pos_, doc[2].tag_, doc[2].ent_type_) == expected - - # serializing then deserializing - doc_bytes = doc.to_bytes() - doc2 = Doc(en_vocab).from_bytes(doc_bytes) - assert (doc2[2].text, doc2[2].pos_, doc2[2].tag_, doc2[2].ent_type_) == expected diff --git a/spacy/tests/regression/test_issue3199.py b/spacy/tests/regression/test_issue3199.py deleted file mode 100644 index d80a55330..000000000 --- a/spacy/tests/regression/test_issue3199.py +++ /dev/null @@ -1,15 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -from spacy.tokens import Doc -from spacy.vocab import Vocab - - -def test_issue3199(): - """Test that Span.noun_chunks works correctly if no noun chunks iterator - is available. To make this test future-proof, we're constructing a Doc - with a new Vocab here and setting is_parsed to make sure the noun chunks run. - """ - doc = Doc(Vocab(), words=["This", "is", "a", "sentence"]) - doc.is_parsed = True - assert list(doc[0:3].noun_chunks) == [] diff --git a/spacy/tests/regression/test_issue3209.py b/spacy/tests/regression/test_issue3209.py deleted file mode 100644 index 469e38b8c..000000000 --- a/spacy/tests/regression/test_issue3209.py +++ /dev/null @@ -1,23 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -from spacy.lang.en import English - - -def test_issue3209(): - """Test issue that occurred in spaCy nightly where NER labels were being - mapped to classes incorrectly after loading the model, when the labels - were added using ner.add_label(). 
- """ - nlp = English() - ner = nlp.create_pipe("ner") - nlp.add_pipe(ner) - - ner.add_label("ANIMAL") - nlp.begin_training() - move_names = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"] - assert ner.move_names == move_names - nlp2 = English() - nlp2.add_pipe(nlp2.create_pipe("ner")) - nlp2.from_bytes(nlp.to_bytes()) - assert nlp2.get_pipe("ner").move_names == move_names diff --git a/spacy/tests/regression/test_issue3248.py b/spacy/tests/regression/test_issue3248.py deleted file mode 100644 index c4b592f3c..000000000 --- a/spacy/tests/regression/test_issue3248.py +++ /dev/null @@ -1,27 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from spacy.matcher import PhraseMatcher -from spacy.lang.en import English -from spacy.compat import pickle - - -def test_issue3248_1(): - """Test that the PhraseMatcher correctly reports its number of rules, not - total number of patterns.""" - nlp = English() - matcher = PhraseMatcher(nlp.vocab) - matcher.add("TEST1", None, nlp("a"), nlp("b"), nlp("c")) - matcher.add("TEST2", None, nlp("d")) - assert len(matcher) == 2 - - -def test_issue3248_2(): - """Test that the PhraseMatcher can be pickled correctly.""" - nlp = English() - matcher = PhraseMatcher(nlp.vocab) - matcher.add("TEST1", None, nlp("a"), nlp("b"), nlp("c")) - matcher.add("TEST2", None, nlp("d")) - data = pickle.dumps(matcher) - new_matcher = pickle.loads(data) - assert len(new_matcher) == len(matcher) diff --git a/spacy/tests/regression/test_issue3277.py b/spacy/tests/regression/test_issue3277.py deleted file mode 100644 index 88ea67774..000000000 --- a/spacy/tests/regression/test_issue3277.py +++ /dev/null @@ -1,11 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - - -def test_issue3277(es_tokenizer): - """Test that hyphens are split correctly as prefixes.""" - doc = es_tokenizer("—Yo me llamo... 
–murmuró el niño– Emilio Sánchez Pérez.") - assert len(doc) == 14 - assert doc[0].text == "\u2014" - assert doc[5].text == "\u2013" - assert doc[9].text == "\u2013" diff --git a/spacy/tests/regression/test_issue3288.py b/spacy/tests/regression/test_issue3288.py deleted file mode 100644 index 188bf361c..000000000 --- a/spacy/tests/regression/test_issue3288.py +++ /dev/null @@ -1,18 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import numpy -from spacy import displacy - -from ..util import get_doc - - -def test_issue3288(en_vocab): - """Test that retokenization works correctly via displaCy when punctuation - is merged onto the preceeding token and tensor is resized.""" - words = ["Hello", "World", "!", "When", "is", "this", "breaking", "?"] - heads = [1, 0, -1, 1, 0, 1, -2, -3] - deps = ["intj", "ROOT", "punct", "advmod", "ROOT", "det", "nsubj", "punct"] - doc = get_doc(en_vocab, words=words, heads=heads, deps=deps) - doc.tensor = numpy.zeros((len(words), 96), dtype="float32") - displacy.render(doc) diff --git a/spacy/tests/regression/test_issue3289.py b/spacy/tests/regression/test_issue3289.py deleted file mode 100644 index 0e64f07ce..000000000 --- a/spacy/tests/regression/test_issue3289.py +++ /dev/null @@ -1,15 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from spacy.lang.en import English - - -def test_issue3289(): - """Test that Language.to_bytes handles serializing a pipeline component - with an uninitialized model.""" - nlp = English() - nlp.add_pipe(nlp.create_pipe("textcat")) - bytes_data = nlp.to_bytes() - new_nlp = English() - new_nlp.add_pipe(nlp.create_pipe("textcat")) - new_nlp.from_bytes(bytes_data) diff --git a/spacy/tests/regression/test_issue3328.py b/spacy/tests/regression/test_issue3328.py deleted file mode 100644 index c397feebb..000000000 --- a/spacy/tests/regression/test_issue3328.py +++ /dev/null @@ -1,19 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from spacy.matcher import Matcher -from spacy.tokens import Doc - - -def test_issue3328(en_vocab): - doc = Doc(en_vocab, words=["Hello", ",", "how", "are", "you", "doing", "?"]) - matcher = Matcher(en_vocab) - patterns = [ - [{"LOWER": {"IN": ["hello", "how"]}}], - [{"LOWER": {"IN": ["you", "doing"]}}], - ] - matcher.add("TEST", None, *patterns) - matches = matcher(doc) - assert len(matches) == 4 - matched_texts = [doc[start:end].text for _, start, end in matches] - assert matched_texts == ["Hello", "how", "you", "doing"] diff --git a/spacy/tests/regression/test_issue3331.py b/spacy/tests/regression/test_issue3331.py deleted file mode 100644 index c30712f81..000000000 --- a/spacy/tests/regression/test_issue3331.py +++ /dev/null @@ -1,21 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import pytest -from spacy.matcher import PhraseMatcher -from spacy.tokens import Doc - - -@pytest.mark.xfail -def test_issue3331(en_vocab): - """Test that duplicate patterns for different rules result in multiple - matches, one per rule. 
- """ - matcher = PhraseMatcher(en_vocab) - matcher.add("A", None, Doc(en_vocab, words=["Barack", "Obama"])) - matcher.add("B", None, Doc(en_vocab, words=["Barack", "Obama"])) - doc = Doc(en_vocab, words=["Barack", "Obama", "lifts", "America"]) - matches = matcher(doc) - assert len(matches) == 2 - match_ids = [en_vocab.strings[matches[0][0]], en_vocab.strings[matches[1][0]]] - assert sorted(match_ids) == ["A", "B"] diff --git a/spacy/tests/regression/test_issue3345.py b/spacy/tests/regression/test_issue3345.py deleted file mode 100644 index c358fd7bc..000000000 --- a/spacy/tests/regression/test_issue3345.py +++ /dev/null @@ -1,26 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -from spacy.lang.en import English -from spacy.tokens import Doc -from spacy.pipeline import EntityRuler, EntityRecognizer - - -def test_issue3345(): - """Test case where preset entity crosses sentence boundary.""" - nlp = English() - doc = Doc(nlp.vocab, words=["I", "live", "in", "New", "York"]) - doc[4].is_sent_start = True - ruler = EntityRuler(nlp, patterns=[{"label": "GPE", "pattern": "New York"}]) - ner = EntityRecognizer(doc.vocab) - # Add the OUT action. I wouldn't have thought this would be necessary... - ner.moves.add_action(5, "") - ner.add_label("GPE") - doc = ruler(doc) - # Get into the state just before "New" - state = ner.moves.init_batch([doc])[0] - ner.moves.apply_transition(state, "O") - ner.moves.apply_transition(state, "O") - ner.moves.apply_transition(state, "O") - # Check that B-GPE is valid. - assert ner.moves.is_valid(state, "B-GPE") diff --git a/spacy/tests/regression/test_issue3356.py b/spacy/tests/regression/test_issue3356.py deleted file mode 100644 index f8d16459c..000000000 --- a/spacy/tests/regression/test_issue3356.py +++ /dev/null @@ -1,72 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -import re -from spacy import compat - -prefix_search = ( - b"^\xc2\xa7|^%|^=|^\xe2\x80\x94|^\xe2\x80\x93|^\\+(?![0-9])" - b"|^\xe2\x80\xa6|^\xe2\x80\xa6\xe2\x80\xa6|^,|^:|^;|^\\!|^\\?" 
- b"|^\xc2\xbf|^\xd8\x9f|^\xc2\xa1|^\\(|^\\)|^\\[|^\\]|^\\{|^\\}" - b"|^<|^>|^_|^#|^\\*|^&|^\xe3\x80\x82|^\xef\xbc\x9f|^\xef\xbc\x81|" - b"^\xef\xbc\x8c|^\xe3\x80\x81|^\xef\xbc\x9b|^\xef\xbc\x9a|" - b"^\xef\xbd\x9e|^\xc2\xb7|^\xe0\xa5\xa4|^\xd8\x8c|^\xd8\x9b|" - b"^\xd9\xaa|^\\.\\.+|^\xe2\x80\xa6|^\\'|^\"|^\xe2\x80\x9d|" - b"^\xe2\x80\x9c|^`|^\xe2\x80\x98|^\xc2\xb4|^\xe2\x80\x99|" - b"^\xe2\x80\x9a|^,|^\xe2\x80\x9e|^\xc2\xbb|^\xc2\xab|^\xe3\x80\x8c|" - b"^\xe3\x80\x8d|^\xe3\x80\x8e|^\xe3\x80\x8f|^\xef\xbc\x88|" - b"^\xef\xbc\x89|^\xe3\x80\x94|^\xe3\x80\x95|^\xe3\x80\x90|" - b"^\xe3\x80\x91|^\xe3\x80\x8a|^\xe3\x80\x8b|^\xe3\x80\x88|" - b"^\xe3\x80\x89|^\\$|^\xc2\xa3|^\xe2\x82\xac|^\xc2\xa5|^\xe0\xb8\xbf|" - b"^US\\$|^C\\$|^A\\$|^\xe2\x82\xbd|^\xef\xb7\xbc|^\xe2\x82\xb4|" - b"^[\\u00A6\\u00A9\\u00AE\\u00B0\\u0482\\u058D\\u058E\\u060E\\u060F" - b"\\u06DE\\u06E9\\u06FD\\u06FE\\u07F6\\u09FA\\u0B70\\u0BF3-\\u0BF8" - b"\\u0BFA\\u0C7F\\u0D4F\\u0D79\\u0F01-\\u0F03\\u0F13\\u0F15-\\u0F17" - b"\\u0F1A-\\u0F1F\\u0F34\\u0F36\\u0F38\\u0FBE-\\u0FC5\\u0FC7-\\u0FCC" - b"\\u0FCE\\u0FCF\\u0FD5-\\u0FD8\\u109E\\u109F\\u1390-\\u1399\\u1940" - b"\\u19DE-\\u19FF\\u1B61-\\u1B6A\\u1B74-\\u1B7C\\u2100\\u2101\\u2103" - b"-\\u2106\\u2108\\u2109\\u2114\\u2116\\u2117\\u211E-\\u2123\\u2125" - b"\\u2127\\u2129\\u212E\\u213A\\u213B\\u214A\\u214C\\u214D\\u214F" - b"\\u218A\\u218B\\u2195-\\u2199\\u219C-\\u219F\\u21A1\\u21A2\\u21A4" - b"\\u21A5\\u21A7-\\u21AD\\u21AF-\\u21CD\\u21D0\\u21D1\\u21D3\\u21D5" - b"-\\u21F3\\u2300-\\u2307\\u230C-\\u231F\\u2322-\\u2328\\u232B" - b"-\\u237B\\u237D-\\u239A\\u23B4-\\u23DB\\u23E2-\\u2426\\u2440" - b"-\\u244A\\u249C-\\u24E9\\u2500-\\u25B6\\u25B8-\\u25C0\\u25C2" - b"-\\u25F7\\u2600-\\u266E\\u2670-\\u2767\\u2794-\\u27BF\\u2800" - b"-\\u28FF\\u2B00-\\u2B2F\\u2B45\\u2B46\\u2B4D-\\u2B73\\u2B76" - b"-\\u2B95\\u2B98-\\u2BC8\\u2BCA-\\u2BFE\\u2CE5-\\u2CEA\\u2E80" - b"-\\u2E99\\u2E9B-\\u2EF3\\u2F00-\\u2FD5\\u2FF0-\\u2FFB\\u3004" - b"\\u3012\\u3013\\u3020\\u3036\\u3037\\u303E\\u303F\\u3190\\u3191" - b"\\u3196-\\u319F\\u31C0-\\u31E3\\u3200-\\u321E\\u322A-\\u3247\\u3250" - b"\\u3260-\\u327F\\u328A-\\u32B0\\u32C0-\\u32FE\\u3300-\\u33FF\\u4DC0" - b"-\\u4DFF\\uA490-\\uA4C6\\uA828-\\uA82B\\uA836\\uA837\\uA839\\uAA77" - b"-\\uAA79\\uFDFD\\uFFE4\\uFFE8\\uFFED\\uFFEE\\uFFFC\\uFFFD\\U00010137" - b"-\\U0001013F\\U00010179-\\U00010189\\U0001018C-\\U0001018E" - b"\\U00010190-\\U0001019B\\U000101A0\\U000101D0-\\U000101FC\\U00010877" - b"\\U00010878\\U00010AC8\\U0001173F\\U00016B3C-\\U00016B3F\\U00016B45" - b"\\U0001BC9C\\U0001D000-\\U0001D0F5\\U0001D100-\\U0001D126\\U0001D129" - b"-\\U0001D164\\U0001D16A-\\U0001D16C\\U0001D183\\U0001D184\\U0001D18C" - b"-\\U0001D1A9\\U0001D1AE-\\U0001D1E8\\U0001D200-\\U0001D241\\U0001D245" - b"\\U0001D300-\\U0001D356\\U0001D800-\\U0001D9FF\\U0001DA37-\\U0001DA3A" - b"\\U0001DA6D-\\U0001DA74\\U0001DA76-\\U0001DA83\\U0001DA85\\U0001DA86" - b"\\U0001ECAC\\U0001F000-\\U0001F02B\\U0001F030-\\U0001F093\\U0001F0A0" - b"-\\U0001F0AE\\U0001F0B1-\\U0001F0BF\\U0001F0C1-\\U0001F0CF\\U0001F0D1" - b"-\\U0001F0F5\\U0001F110-\\U0001F16B\\U0001F170-\\U0001F1AC\\U0001F1E6" - b"-\\U0001F202\\U0001F210-\\U0001F23B\\U0001F240-\\U0001F248\\U0001F250" - b"\\U0001F251\\U0001F260-\\U0001F265\\U0001F300-\\U0001F3FA\\U0001F400" - b"-\\U0001F6D4\\U0001F6E0-\\U0001F6EC\\U0001F6F0-\\U0001F6F9\\U0001F700" - b"-\\U0001F773\\U0001F780-\\U0001F7D8\\U0001F800-\\U0001F80B\\U0001F810" - b"-\\U0001F847\\U0001F850-\\U0001F859\\U0001F860-\\U0001F887\\U0001F890" - 
b"-\\U0001F8AD\\U0001F900-\\U0001F90B\\U0001F910-\\U0001F93E\\U0001F940" - b"-\\U0001F970\\U0001F973-\\U0001F976\\U0001F97A\\U0001F97C-\\U0001F9A2" - b"\\U0001F9B0-\\U0001F9B9\\U0001F9C0-\\U0001F9C2\\U0001F9D0-\\U0001F9FF" - b"\\U0001FA60-\\U0001FA6D]" -) - - -if compat.is_python2: - # If we have this test in Python 3, pytest chokes, as it can't print the - # string above in the xpass message. - def test_issue3356(): - pattern = re.compile(compat.unescape_unicode(prefix_search.decode("utf8"))) - assert not pattern.search("hello") diff --git a/spacy/tests/regression/test_issue3410.py b/spacy/tests/regression/test_issue3410.py deleted file mode 100644 index 5d2ac5ba3..000000000 --- a/spacy/tests/regression/test_issue3410.py +++ /dev/null @@ -1,21 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -import pytest -from spacy.lang.en import English -from spacy.matcher import Matcher, PhraseMatcher - - -def test_issue3410(): - texts = ["Hello world", "This is a test"] - nlp = English() - matcher = Matcher(nlp.vocab) - phrasematcher = PhraseMatcher(nlp.vocab) - with pytest.deprecated_call(): - docs = list(nlp.pipe(texts, n_threads=4)) - with pytest.deprecated_call(): - docs = list(nlp.tokenizer.pipe(texts, n_threads=4)) - with pytest.deprecated_call(): - list(matcher.pipe(docs, n_threads=4)) - with pytest.deprecated_call(): - list(phrasematcher.pipe(docs, n_threads=4)) diff --git a/spacy/tests/regression/test_issue3447.py b/spacy/tests/regression/test_issue3447.py deleted file mode 100644 index 0ca1f9e67..000000000 --- a/spacy/tests/regression/test_issue3447.py +++ /dev/null @@ -1,14 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -from spacy.util import decaying - - -def test_issue3447(): - sizes = decaying(10.0, 1.0, 0.5) - size = next(sizes) - assert size == 10.0 - size = next(sizes) - assert size == 10.0 - 0.5 - size = next(sizes) - assert size == 10.0 - 0.5 - 0.5 diff --git a/spacy/tests/regression/test_issue3449.py b/spacy/tests/regression/test_issue3449.py deleted file mode 100644 index deff49fd6..000000000 --- a/spacy/tests/regression/test_issue3449.py +++ /dev/null @@ -1,21 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -import pytest - -from spacy.lang.en import English - - -@pytest.mark.xfail(reason="default suffix rules avoid one upper-case letter before dot") -def test_issue3449(): - nlp = English() - nlp.add_pipe(nlp.create_pipe("sentencizer")) - text1 = "He gave the ball to I. Do you want to go to the movies with I?" - text2 = "He gave the ball to I. Do you want to go to the movies with I?" - text3 = "He gave the ball to I.\nDo you want to go to the movies with I?" 
- t1 = nlp(text1) - t2 = nlp(text2) - t3 = nlp(text3) - assert t1[5].text == "I" - assert t2[5].text == "I" - assert t3[5].text == "I" diff --git a/spacy/tests/regression/test_issue3468.py b/spacy/tests/regression/test_issue3468.py deleted file mode 100644 index ebbed2640..000000000 --- a/spacy/tests/regression/test_issue3468.py +++ /dev/null @@ -1,21 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -from spacy.lang.en import English -from spacy.tokens import Doc - - -def test_issue3468(): - """Test that sentence boundaries are set correctly so Doc.is_sentenced can - be restored after serialization.""" - nlp = English() - nlp.add_pipe(nlp.create_pipe("sentencizer")) - doc = nlp("Hello world") - assert doc[0].is_sent_start - assert doc.is_sentenced - assert len(list(doc.sents)) == 1 - doc_bytes = doc.to_bytes() - new_doc = Doc(nlp.vocab).from_bytes(doc_bytes) - assert new_doc[0].is_sent_start - assert new_doc.is_sentenced - assert len(list(new_doc.sents)) == 1 From 4e04080b760fd3019d74259ae2172a836846317d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 10 Jul 2019 13:00:52 +0200 Subject: [PATCH 119/148] Only compare sorted patterns in test Try to work around flaky tests on Python 3.5 --- spacy/tests/pipeline/test_entity_ruler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py index a371be38b..5ab1a3af0 100644 --- a/spacy/tests/pipeline/test_entity_ruler.py +++ b/spacy/tests/pipeline/test_entity_ruler.py @@ -111,7 +111,7 @@ def test_entity_ruler_serialize_bytes(nlp, patterns): assert len(new_ruler.patterns) == len(ruler.patterns) for pattern in ruler.patterns: assert pattern in new_ruler.patterns - assert new_ruler.labels == ruler.labels + assert sorted(new_ruler.labels) == sorted(ruler.labels) def test_entity_ruler_serialize_phrase_matcher_attr_bytes(nlp, patterns): From 87f7ec34d503c3cde11570ce5b4ebb961dbb37fe Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 10 Jul 2019 13:53:34 +0200 Subject: [PATCH 120/148] Add test for #3880 --- spacy/tests/regression/test_issue3880.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 spacy/tests/regression/test_issue3880.py diff --git a/spacy/tests/regression/test_issue3880.py b/spacy/tests/regression/test_issue3880.py new file mode 100644 index 000000000..2171b5911 --- /dev/null +++ b/spacy/tests/regression/test_issue3880.py @@ -0,0 +1,23 @@ +# coding: utf8 +from __future__ import unicode_literals +from spacy.lang.en import English +import pytest + + +@pytest.mark.xfail +def test_issue3880(): + """Test that `nlp.pipe()` works when an empty string ends the batch. + + Fixed in v7.0.5 of Thinc. 
+ """ + texts = ["hello", "world", "", ""] + nlp = English() + nlp.add_pipe(nlp.create_pipe("parser")) + nlp.add_pipe(nlp.create_pipe("ner")) + nlp.add_pipe(nlp.create_pipe("tagger")) + nlp.get_pipe("parser").add_label("dep") + nlp.get_pipe("ner").add_label("PERSON") + nlp.get_pipe("tagger").add_label("NN") + nlp.begin_training() + for doc in nlp.pipe(texts): + pass From 465456edb9871dd4bcc24e5a6236bfe272e5f137 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 10 Jul 2019 14:01:17 +0200 Subject: [PATCH 121/148] Un-xfail test #3880 --- spacy/tests/regression/test_issue3880.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/tests/regression/test_issue3880.py b/spacy/tests/regression/test_issue3880.py index 2171b5911..ecc12afa3 100644 --- a/spacy/tests/regression/test_issue3880.py +++ b/spacy/tests/regression/test_issue3880.py @@ -4,7 +4,6 @@ from spacy.lang.en import English import pytest -@pytest.mark.xfail def test_issue3880(): """Test that `nlp.pipe()` works when an empty string ends the batch. From 3d18600c052be8dca59e9193310f7fc6041011f8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 10 Jul 2019 19:21:23 +0200 Subject: [PATCH 122/148] Return True from doc.is_... when no ambiguity * Make doc.is_sentenced return True if len(doc) < 2. * Make doc.is_nered return True if len(doc) == 0, for consistency. Closes #3934 --- spacy/tokens/doc.pyx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index a040cdc67..c77e5c44e 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -240,6 +240,8 @@ cdef class Doc: return True if self.is_parsed: return True + if len(self) < 2: + return True for i in range(1, self.length): if self.c[i].sent_start == -1 or self.c[i].sent_start == 1: return True @@ -251,6 +253,8 @@ cdef class Doc: *any* of the tokens has a named entity tag set (even if the others are uknown values). """ + if len(self) == 0: + return True for i in range(self.length): if self.c[i].ent_iob != 0: return True From b94c5443d90c1fe60eb41d3a520bd8fa8d92f860 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 10 Jul 2019 19:37:20 +0200 Subject: [PATCH 123/148] Rename Binder->DocBox, and improve it. --- spacy/tokens/_serialize.py | 73 +++++++++++++++++++++++--------------- 1 file changed, 45 insertions(+), 28 deletions(-) diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index 43ea78242..c4478e080 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -11,29 +11,27 @@ from ..tokens import Doc from ..attrs import SPACY, ORTH -class Binder(object): +class DocBox(object): """Serialize analyses from a collection of doc objects.""" - def __init__(self, attrs=None): - """Create a Binder object, to hold serialized annotations. + def __init__(self, attrs=None, store_user_data=False): + """Create a DocBox object, to hold serialized annotations. attrs (list): List of attributes to serialize. 'orth' and 'spacy' are always serialized, so they're not required. Defaults to None. 
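+
+        A rough usage sketch (assuming a processed `doc` and its shared
+        `vocab` are already available; attribute IDs come from `spacy.attrs`):
+
+            from spacy.attrs import LEMMA, POS
+            box = DocBox(attrs=[LEMMA, POS], store_user_data=True)
+            box.add(doc)                  # collect an analysed Doc
+            data = box.to_bytes()         # serialize the whole collection
+            new_box = DocBox(store_user_data=True).from_bytes(data)
+            docs = list(new_box.get_docs(vocab))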
""" attrs = attrs or [] - self.attrs = list(attrs) # Ensure ORTH is always attrs[0] - if ORTH in self.attrs: - self.attrs.pop(ORTH) - if SPACY in self.attrs: - self.attrs.pop(SPACY) + self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY] self.attrs.insert(0, ORTH) self.tokens = [] self.spaces = [] + self.user_data = [] self.strings = set() + self.store_user_data = store_user_data def add(self, doc): - """Add a doc's annotations to the binder for serialization.""" + """Add a doc's annotations to the DocBox for serialization.""" array = doc.to_array(self.attrs) if len(array.shape) == 1: array = array.reshape((array.shape[0], 1)) @@ -43,27 +41,35 @@ class Binder(object): spaces = spaces.reshape((spaces.shape[0], 1)) self.spaces.append(numpy.asarray(spaces, dtype=bool)) self.strings.update(w.text for w in doc) + if self.store_user_data: + self.user_data.append(srsly.msgpack_dumps(doc.user_data)) def get_docs(self, vocab): """Recover Doc objects from the annotations, using the given vocab.""" for string in self.strings: vocab[string] orth_col = self.attrs.index(ORTH) - for tokens, spaces in zip(self.tokens, self.spaces): + for i in range(len(self.tokens)): + tokens = self.tokens[i] + spaces = self.spaces[i] words = [vocab.strings[orth] for orth in tokens[:, orth_col]] doc = Doc(vocab, words=words, spaces=spaces) doc = doc.from_array(self.attrs, tokens) + if self.store_user_data: + doc.user_data.update(srsly.msgpack_loads(self.user_data[i])) yield doc def merge(self, other): - """Extend the annotations of this binder with the annotations from another.""" + """Extend the annotations of this DocBox with the annotations from another.""" assert self.attrs == other.attrs self.tokens.extend(other.tokens) self.spaces.extend(other.spaces) self.strings.update(other.strings) + if self.store_user_data: + self.user_data.extend(other.user_data) def to_bytes(self): - """Serialize the binder's annotations into a byte string.""" + """Serialize the DocBox's annotations into a byte string.""" for tokens in self.tokens: assert len(tokens.shape) == 2, tokens.shape lengths = [len(tokens) for tokens in self.tokens] @@ -74,10 +80,12 @@ class Binder(object): "lengths": numpy.asarray(lengths, dtype="int32").tobytes("C"), "strings": list(self.strings), } + if self.store_user_data: + msg["user_data"] = self.user_data return gzip.compress(srsly.msgpack_dumps(msg)) def from_bytes(self, string): - """Deserialize the binder's annotations from a byte string.""" + """Deserialize the DocBox's annotations from a byte string.""" msg = srsly.msgpack_loads(gzip.decompress(string)) self.attrs = msg["attrs"] self.strings = set(msg["strings"]) @@ -89,29 +97,38 @@ class Binder(object): flat_spaces = flat_spaces.reshape((flat_spaces.size, 1)) self.tokens = NumpyOps().unflatten(flat_tokens, lengths) self.spaces = NumpyOps().unflatten(flat_spaces, lengths) + if self.store_user_data and "user_data" in msg: + self.user_data = list(msg["user_data"]) for tokens in self.tokens: assert len(tokens.shape) == 2, tokens.shape return self -def merge_bytes(binder_strings): - """Concatenate multiple serialized binders into one byte string.""" - output = None - for byte_string in binder_strings: - binder = Binder().from_bytes(byte_string) - if output is None: - output = binder - else: - output.merge(binder) - return output.to_bytes() +def merge_boxes(boxes): + merged = None + for byte_string in boxes: + if byte_string is not None: + box = DocBox(store_user_data=True).from_bytes(byte_string) + if merged is None: + merged = box + else: + 
merged.merge(box) + if merged is not None: + return merged.to_bytes() + else: + return b'' -def pickle_binder(binder): - return (unpickle_binder, (binder.to_bytes(),)) +def pickle_box(box): + return (unpickle_box, (box.to_bytes(),)) -def unpickle_binder(byte_string): - return Binder().from_bytes(byte_string) +def unpickle_box(byte_string): + return Box().from_bytes(byte_string) -copy_reg.pickle(Binder, pickle_binder, unpickle_binder) +copy_reg.pickle(Box, pickle_box, unpickle_box) +# Compatibility, as we had named it this previously. +Binder = DocBox + +__all__ = ["DocBox"] From c4c21cb4281133890d0b59c4b5a847d1ef9bff30 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 10 Jul 2019 19:39:38 +0200 Subject: [PATCH 124/148] more friendly textcat errors (#3946) * more friendly textcat errors with require_model and require_labels * update thinc version with recent bugfix --- requirements.txt | 2 +- spacy/errors.py | 1 + spacy/pipeline/pipes.pyx | 7 +++++++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 8cc52dfe4..58761b95c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Our libraries cymem>=2.0.2,<2.1.0 preshed>=2.0.1,<2.1.0 -thinc>=7.0.2,<7.1.0 +thinc>=7.0.5,<7.1.0 blis>=0.2.2,<0.3.0 murmurhash>=0.28.0,<1.1.0 wasabi>=0.2.0,<1.1.0 diff --git a/spacy/errors.py b/spacy/errors.py index 8f2eab3a1..347ad1fca 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -403,6 +403,7 @@ class Errors(object): E140 = ("The list of entities, prior probabilities and entity vectors should be of equal length.") E141 = ("Entity vectors should be of length {required} instead of the provided {found}.") E142 = ("Unsupported loss_function '{loss_func}'. Use either 'L2' or 'cosine'") + E143 = ("Labels for component '{name}' not initialized. 
Did you forget to call add_label()?") @add_codes diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index d99a1f73e..891e8d4e3 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -902,6 +902,11 @@ class TextCategorizer(Pipe): def labels(self): return tuple(self.cfg.setdefault("labels", [])) + def require_labels(self): + """Raise an error if the component's model has no labels defined.""" + if not self.labels: + raise ValueError(Errors.E143.format(name=self.name)) + @labels.setter def labels(self, value): self.cfg["labels"] = tuple(value) @@ -931,6 +936,7 @@ class TextCategorizer(Pipe): doc.cats[label] = float(scores[i, j]) def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None): + self.require_model() scores, bp_scores = self.model.begin_update(docs, drop=drop) loss, d_scores = self.get_loss(docs, golds, scores) bp_scores(d_scores, sgd=sgd) @@ -985,6 +991,7 @@ class TextCategorizer(Pipe): def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs): if self.model is True: self.cfg["pretrained_vectors"] = kwargs.get("pretrained_vectors") + self.require_labels() self.model = self.Model(len(self.labels), **self.cfg) link_vectors_to_models(self.vocab) if sgd is None: From c6cb78275888228cc647a950d9adfbf545a60ad6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 10 Jul 2019 22:54:09 +0200 Subject: [PATCH 125/148] Set version to 2.1.5.dev0 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 5e7093606..758809934 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -4,7 +4,7 @@ # fmt: off __title__ = "spacy" -__version__ = "2.1.4" +__version__ = "2.1.5.dev0" __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython" __uri__ = "https://spacy.io" __author__ = "Explosion AI" From a89fecce97c06d7315bb955de1127025fa310b4b Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 11 Jul 2019 00:43:55 +0200 Subject: [PATCH 126/148] failing unit test for issue #3869 --- spacy/tests/regression/test_issue3869.py | 29 ++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 spacy/tests/regression/test_issue3869.py diff --git a/spacy/tests/regression/test_issue3869.py b/spacy/tests/regression/test_issue3869.py new file mode 100644 index 000000000..72a485042 --- /dev/null +++ b/spacy/tests/regression/test_issue3869.py @@ -0,0 +1,29 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest + +from spacy.attrs import IS_ALPHA +from spacy.lang.en import English + + +@pytest.mark.parametrize( + "sentence", + [ + 'The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.', + 'The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale\'s #1.' 
+ ], +) +def test_issue3869(sentence): + """Test that the Doc's count_by function works consistently""" + nlp = English() + + doc = nlp(sentence) + + count = 0 + for token in doc: + count += token.is_alpha + + assert count == doc.count_by(IS_ALPHA).get(1, 0) + + From e0804123854b91bbad5a3e084de867d5fbbff788 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 11 Jul 2019 01:53:06 +0200 Subject: [PATCH 127/148] tracked the bug down to PreshCounter.inc - still unclear what goes wrong --- spacy/tests/regression/test_issue3869.py | 6 ++++- spacy/tokens/doc.pxd | 1 + spacy/tokens/doc.pyx | 33 +++++++++++++++++++++++- 3 files changed, 38 insertions(+), 2 deletions(-) diff --git a/spacy/tests/regression/test_issue3869.py b/spacy/tests/regression/test_issue3869.py index 72a485042..d76da6989 100644 --- a/spacy/tests/regression/test_issue3869.py +++ b/spacy/tests/regression/test_issue3869.py @@ -11,13 +11,17 @@ from spacy.lang.en import English "sentence", [ 'The story was to the effect that a young American student recently called on Professor Christlieb with a letter of introduction.', - 'The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale\'s #1.' + 'The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale\'s #1.', + 'The next month Barry Siddall joined Stoke City on a free transfer, after Chris Pearce had established himself as the Vale\'s number one', + 'Indeed, making the one who remains do all the work has installed him into a position of such insolent tyranny, it will take a month at least to reduce him to his proper proportions.', + "It was a missed assignment, but it shouldn't have resulted in a turnover ..." ], ) def test_issue3869(sentence): """Test that the Doc's count_by function works consistently""" nlp = English() + print() doc = nlp(sentence) count = 0 diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index 7cdc2316a..cc05cb495 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -8,6 +8,7 @@ from ..typedefs cimport attr_t from ..attrs cimport attr_id_t + cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index c77e5c44e..657b9a1d6 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -13,6 +13,7 @@ from libc.math cimport sqrt import numpy import numpy.linalg import struct +from libc.stdint cimport int64_t import srsly from thinc.neural.util import get_array_module, copy_array @@ -710,22 +711,52 @@ cdef class Doc: cdef int i cdef attr_t attr cdef size_t count + cdef int64_t this_value + + print("COUNTING") if counts is None: counts = PreshCounter() output_dict = True + print("counts None") else: output_dict = False # Take this check out of the loop, for a bit of extra speed if exclude is None: + print("exclude None") for i in range(self.length): - counts.inc(get_token_attr(&self.c[i], attr_id), 1) + print() + print("token", self[i]) + this_value = get_token_attr(&self.c[i], attr_id) + print("token attr value", this_value) + print("type attr value", type(this_value)) + + print(i, "key this_value before", counts.c_map.cells[this_value].key) + print(i, "value this_value before", counts.c_map.cells[this_value].value) + counts.inc(this_value, 1) + print(i, "key this_value after", counts.c_map.cells[this_value].key) + print(i, "value this_value after", counts.c_map.cells[this_value].value) + + print(i, "key 0", counts.c_map.cells[0].key) + print(i, 
"value 0", counts.c_map.cells[0].value) + print(i, "key 1", counts.c_map.cells[1].key) + print(i, "value 1", counts.c_map.cells[1].value) else: for i in range(self.length): if not exclude(self[i]): attr = get_token_attr(&self.c[i], attr_id) counts.inc(attr, 1) if output_dict: + print("output_dict") + print(counts.length) + print(counts.total) + print("key 0", counts.c_map.cells[0].key) + print("value 0", counts.c_map.cells[0].value) + print("key 1", counts.c_map.cells[1].key) + print("value 1", counts.c_map.cells[1].value) + print() + print(dict(counts)) + print() return dict(counts) def _realloc(self, new_size): From d5311b3c42554d5288f3fcb9261ae301a21fd9e1 Mon Sep 17 00:00:00 2001 From: yash Date: Thu, 11 Jul 2019 14:53:14 +0530 Subject: [PATCH 128/148] Add test file for issue (#3625) and spacy contributor agreement --- .github/contributors/yashpatadia.md | 106 +++++++++++++++++++++++ spacy/tests/regression/test_issue3625.py | 9 ++ 2 files changed, 115 insertions(+) create mode 100644 .github/contributors/yashpatadia.md create mode 100644 spacy/tests/regression/test_issue3625.py diff --git a/.github/contributors/yashpatadia.md b/.github/contributors/yashpatadia.md new file mode 100644 index 000000000..2dcf9211d --- /dev/null +++ b/.github/contributors/yashpatadia.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. 
This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Yash Patadia | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 11/07/2019 | +| GitHub username | yash1994 | +| Website (optional) | | \ No newline at end of file diff --git a/spacy/tests/regression/test_issue3625.py b/spacy/tests/regression/test_issue3625.py new file mode 100644 index 000000000..f61b834fb --- /dev/null +++ b/spacy/tests/regression/test_issue3625.py @@ -0,0 +1,9 @@ +from __future__ import unicode_literals + +from spacy.lang.hi import Hindi + +def test_issue3625(): + """Test that default punctuation rules applies to hindi unicode characters""" + nlp = Hindi() + doc = nlp(u"hi. how हुए. होटल, होटल") + assert [token.text for token in doc] == ['hi', '.', 'how', 'हुए', '.', 'होटल', ',', 'होटल'] \ No newline at end of file From 815f8d13dd0cfe034201b2c35452012a7adb1e03 Mon Sep 17 00:00:00 2001 From: yash Date: Thu, 11 Jul 2019 15:00:51 +0530 Subject: [PATCH 129/148] Fix default punctuation rules for hindi text (#3625 explosion) --- spacy/lang/char_classes.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py index cb2e817d5..fb320b2ff 100644 --- a/spacy/lang/char_classes.py +++ b/spacy/lang/char_classes.py @@ -9,6 +9,8 @@ _bengali = r"\u0980-\u09FF" _hebrew = r"\u0591-\u05F4\uFB1D-\uFB4F" +_hindi = r"\u0900-\u097F" + # Latin standard _latin_u_standard = r"A-Z" _latin_l_standard = r"a-z" @@ -193,7 +195,7 @@ _ukrainian = r"а-щюяіїєґА-ЩЮЯІЇЄҐ" _upper = LATIN_UPPER + _russian_upper + _tatar_upper + _greek_upper + _ukrainian_upper _lower = LATIN_LOWER + _russian_lower + _tatar_lower + _greek_lower + _ukrainian_lower -_uncased = _bengali + _hebrew + _persian + _sinhala +_uncased = _bengali + _hebrew + _persian + _sinhala + _hindi ALPHA = group_chars(LATIN + _russian + _tatar + _greek + _ukrainian + _uncased) ALPHA_LOWER = group_chars(_lower + _uncased) From bd3c3f342b01cd0b48e1a02bc11bc37c9d9e63a9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 11 Jul 2019 11:48:55 +0200 Subject: [PATCH 130/148] Fix _serialize --- spacy/tokens/_serialize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index c4478e080..57bc98f4b 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -124,10 +124,10 @@ def pickle_box(box): def unpickle_box(byte_string): - return Box().from_bytes(byte_string) + return DocBox().from_bytes(byte_string) -copy_reg.pickle(Box, pickle_box, unpickle_box) +copy_reg.pickle(DocBox, pickle_box, unpickle_box) # Compatibility, as we had named it this previously. 
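 # The old name stays importable, so existing code that refers to Binder keeps working.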
Binder = DocBox From 0491a8e7c83dcbf8a293305681b498d38514541f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 11 Jul 2019 11:49:36 +0200 Subject: [PATCH 131/148] Reformat --- spacy/tokens/_serialize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index 57bc98f4b..41f524839 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -116,7 +116,7 @@ def merge_boxes(boxes): if merged is not None: return merged.to_bytes() else: - return b'' + return b"" def pickle_box(box): From ae2d52e323ea8959caf474d23de857d59b5b6ca8 Mon Sep 17 00:00:00 2001 From: yash Date: Thu, 11 Jul 2019 15:26:27 +0530 Subject: [PATCH 132/148] Add default encoding utf-8 for test file --- spacy/tests/regression/test_issue3625.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/tests/regression/test_issue3625.py b/spacy/tests/regression/test_issue3625.py index f61b834fb..e3e0f25ee 100644 --- a/spacy/tests/regression/test_issue3625.py +++ b/spacy/tests/regression/test_issue3625.py @@ -1,3 +1,4 @@ +# coding: utf8 from __future__ import unicode_literals from spacy.lang.hi import Hindi From 0b8406a05cf497ce40071efb56894fee7f20b4d2 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 11 Jul 2019 12:02:25 +0200 Subject: [PATCH 133/148] Tidy up and auto-format --- spacy/_ml.py | 31 ++++++++---- spacy/lang/ko/__init__.py | 4 +- spacy/lang/ko/tag_map.py | 59 ++++++++++------------- spacy/tests/lang/ko/test_lemmatization.py | 3 +- spacy/tests/lang/ko/test_tokenizer.py | 6 +-- spacy/tests/lang/lt/test_text.py | 48 +++++++++++------- spacy/tests/matcher/test_matcher_api.py | 44 ++++++++++++----- spacy/tests/regression/test_issue3880.py | 2 +- 8 files changed, 118 insertions(+), 79 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index cca324b45..d16e124dc 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -661,21 +661,33 @@ def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg): conv_depth = cfg.get("conv_depth", 2) cnn_maxout_pieces = cfg.get("cnn_maxout_pieces", 3) - pretrained_vectors = cfg.get("pretrained_vectors") # self.nlp.vocab.vectors.name + pretrained_vectors = cfg.get("pretrained_vectors") # self.nlp.vocab.vectors.name context_width = cfg.get("context_width") entity_width = cfg.get("entity_width") with Model.define_operators({">>": chain, "**": clone}): - model = Affine(entity_width, entity_width+context_width+1+ner_types)\ - >> Affine(1, entity_width, drop_factor=0.0)\ - >> logistic + model = ( + Affine(entity_width, entity_width + context_width + 1 + ner_types) + >> Affine(1, entity_width, drop_factor=0.0) + >> logistic + ) # context encoder - tok2vec = Tok2Vec(width=hidden_width, embed_size=embed_width, pretrained_vectors=pretrained_vectors, - cnn_maxout_pieces=cnn_maxout_pieces, subword_features=True, conv_depth=conv_depth, - bilstm_depth=0) >> flatten_add_lengths >> Pooling(mean_pool)\ - >> Residual(zero_init(Maxout(hidden_width, hidden_width))) \ - >> zero_init(Affine(context_width, hidden_width)) + tok2vec = ( + Tok2Vec( + width=hidden_width, + embed_size=embed_width, + pretrained_vectors=pretrained_vectors, + cnn_maxout_pieces=cnn_maxout_pieces, + subword_features=True, + conv_depth=conv_depth, + bilstm_depth=0, + ) + >> flatten_add_lengths + >> Pooling(mean_pool) + >> Residual(zero_init(Maxout(hidden_width, hidden_width))) + >> zero_init(Affine(context_width, hidden_width)) + ) model.tok2vec = tok2vec @@ -684,6 +696,7 @@ def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg): 
model.nO = 1 return model + @layerize def flatten(seqs, drop=0.0): ops = Model.ops diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py index 111d01720..f5dff75f1 100644 --- a/spacy/lang/ko/__init__.py +++ b/spacy/lang/ko/__init__.py @@ -6,7 +6,7 @@ import sys from .stop_words import STOP_WORDS -from .tag_map import TAG_MAP, POS +from .tag_map import TAG_MAP from ...attrs import LANG from ...language import Language from ...tokens import Doc @@ -22,6 +22,7 @@ if is_python_pre_3_5: Morpheme = namedtuple("Morpheme", "surface lemma tag") elif is_python_post_3_7: from dataclasses import dataclass + @dataclass(frozen=True) class Morpheme: surface: str @@ -29,6 +30,7 @@ elif is_python_post_3_7: tag: str else: from typing import NamedTuple + class Morpheme(NamedTuple): surface: str lemma: str diff --git a/spacy/lang/ko/tag_map.py b/spacy/lang/ko/tag_map.py index ed6b58170..57317c969 100644 --- a/spacy/lang/ko/tag_map.py +++ b/spacy/lang/ko/tag_map.py @@ -1,66 +1,59 @@ # encoding: utf8 from __future__ import unicode_literals -from collections import defaultdict -from ...symbols import (POS, PUNCT, INTJ, X, SYM, - ADJ, AUX, ADP, CONJ, NOUN, PRON, VERB, ADV, PROPN, - NUM, DET) - +from ...symbols import POS, PUNCT, INTJ, X, SYM, ADJ, AUX, ADP, CONJ, NOUN, PRON +from ...symbols import VERB, ADV, PROPN, NUM, DET + # 은전한닢(mecab-ko-dic)의 품사 태그를 universal pos tag로 대응시킴 # https://docs.google.com/spreadsheets/d/1-9blXKjtjeKZqsf4NzHeYJCrr49-nXeRF6D80udfcwY/edit#gid=589544265 # https://universaldependencies.org/u/pos/ TAG_MAP = { - # J.{1,2} 조사 - "JKS": {POS: ADP}, + # J.{1,2} 조사 + "JKS": {POS: ADP}, "JKC": {POS: ADP}, "JKG": {POS: ADP}, "JKO": {POS: ADP}, "JKB": {POS: ADP}, "JKV": {POS: ADP}, "JKQ": {POS: ADP}, - "JX": {POS: ADP}, # 보조사 + "JX": {POS: ADP}, # 보조사 "JC": {POS: CONJ}, # 접속 조사 - "MAJ": {POS: CONJ}, # 접속 부사 + "MAJ": {POS: CONJ}, # 접속 부사 "MAG": {POS: ADV}, # 일반 부사 - "MM": {POS: DET}, # 관형사 - + "MM": {POS: DET}, # 관형사 "XPN": {POS: X}, # 접두사 - # XS. 접미사 + # XS. 접미사 "XSN": {POS: X}, "XSV": {POS: X}, "XSA": {POS: X}, - "XR": {POS: X}, # 어근 + "XR": {POS: X}, # 어근 # E.{1,2} 어미 "EP": {POS: X}, "EF": {POS: X}, "EC": {POS: X}, "ETN": {POS: X}, "ETM": {POS: X}, - "IC": {POS: INTJ}, # 감탄사 - "VV": {POS: VERB}, # 동사 - "VA": {POS: ADJ}, # 형용사 - "VX": {POS: AUX}, # 보조 용언 + "VA": {POS: ADJ}, # 형용사 + "VX": {POS: AUX}, # 보조 용언 "VCP": {POS: ADP}, # 긍정 지정사(이다) "VCN": {POS: ADJ}, # 부정 지정사(아니다) - - "NNG": {POS: NOUN}, # 일반 명사(general noun) - "NNB": {POS: NOUN}, # 의존 명사 - "NNBC": {POS: NOUN}, # 의존 명사(단위: unit) - "NNP": {POS: PROPN}, # 고유 명사(proper noun) + "NNG": {POS: NOUN}, # 일반 명사(general noun) + "NNB": {POS: NOUN}, # 의존 명사 + "NNBC": {POS: NOUN}, # 의존 명사(단위: unit) + "NNP": {POS: PROPN}, # 고유 명사(proper noun) "NP": {POS: PRON}, # 대명사 - "NR": {POS: NUM}, # 수사(numerals) - "SN": {POS: NUM}, # 숫자 - + "NR": {POS: NUM}, # 수사(numerals) + "SN": {POS: NUM}, # 숫자 # S.{1,2} 부호 - # 문장 부호 - "SF": {POS: PUNCT}, # period or other EOS marker + # 문장 부호 + "SF": {POS: PUNCT}, # period or other EOS marker "SE": {POS: PUNCT}, - "SC": {POS: PUNCT}, # comma, etc. - "SSO": {POS: PUNCT}, # open bracket - "SSC": {POS: PUNCT}, # close bracket - "SY": {POS: SYM}, # 기타 기호 - "SL": {POS: X}, # 외국어 - "SH": {POS: X}, # 한자 + "SC": {POS: PUNCT}, # comma, etc. 
+ "SSO": {POS: PUNCT}, # open bracket + "SSC": {POS: PUNCT}, # close bracket + "SY": {POS: SYM}, # 기타 기호 + "SL": {POS: X}, # 외국어 + "SH": {POS: X}, # 한자 } diff --git a/spacy/tests/lang/ko/test_lemmatization.py b/spacy/tests/lang/ko/test_lemmatization.py index 67371d4ce..42c306c11 100644 --- a/spacy/tests/lang/ko/test_lemmatization.py +++ b/spacy/tests/lang/ko/test_lemmatization.py @@ -5,8 +5,7 @@ import pytest @pytest.mark.parametrize( - "word,lemma", - [("새로운", "새롭"), ("빨간", "빨갛"), ("클수록", "크"), ("뭡니까", "뭣"), ("됐다", "되")], + "word,lemma", [("새로운", "새롭"), ("빨간", "빨갛"), ("클수록", "크"), ("뭡니까", "뭣"), ("됐다", "되")] ) def test_ko_lemmatizer_assigns(ko_tokenizer, word, lemma): test_lemma = ko_tokenizer(word)[0].lemma_ diff --git a/spacy/tests/lang/ko/test_tokenizer.py b/spacy/tests/lang/ko/test_tokenizer.py index bd1d94aec..cc7b5fd77 100644 --- a/spacy/tests/lang/ko/test_tokenizer.py +++ b/spacy/tests/lang/ko/test_tokenizer.py @@ -7,15 +7,15 @@ import pytest TOKENIZER_TESTS = [("서울 타워 근처에 살고 있습니다.", "서울 타워 근처 에 살 고 있 습니다 ."), ("영등포구에 있는 맛집 좀 알려주세요.", "영등포구 에 있 는 맛집 좀 알려 주 세요 .")] -TAG_TESTS = [("서울 타워 근처에 살고 있습니다.", +TAG_TESTS = [("서울 타워 근처에 살고 있습니다.", "NNP NNG NNG JKB VV EC VX EF SF"), - ("영등포구에 있는 맛집 좀 알려주세요.", + ("영등포구에 있는 맛집 좀 알려주세요.", "NNP JKB VV ETM NNG MAG VV VX EP SF")] FULL_TAG_TESTS = [("영등포구에 있는 맛집 좀 알려주세요.", "NNP JKB VV ETM NNG MAG VV+EC VX EP+EF SF")] -POS_TESTS = [("서울 타워 근처에 살고 있습니다.", +POS_TESTS = [("서울 타워 근처에 살고 있습니다.", "PROPN NOUN NOUN ADP VERB X AUX X PUNCT"), ("영등포구에 있는 맛집 좀 알려주세요.", "PROPN ADP VERB X NOUN ADV VERB AUX X PUNCT")] diff --git a/spacy/tests/lang/lt/test_text.py b/spacy/tests/lang/lt/test_text.py index d2550067b..7afc6d497 100644 --- a/spacy/tests/lang/lt/test_text.py +++ b/spacy/tests/lang/lt/test_text.py @@ -5,16 +5,26 @@ import pytest def test_lt_tokenizer_handles_long_text(lt_tokenizer): - text = """Tokios sausros kriterijus atitinka pirmadienį atlikti skaičiavimai, palyginus faktinį ir žemiausią -vidutinį daugiametį vandens lygį. Nustatyta, kad iš 48 šalies vandens matavimo stočių 28-iose stotyse vandens lygis + text = """Tokios sausros kriterijus atitinka pirmadienį atlikti skaičiavimai, palyginus faktinį ir žemiausią +vidutinį daugiametį vandens lygį. Nustatyta, kad iš 48 šalies vandens matavimo stočių 28-iose stotyse vandens lygis yra žemesnis arba lygus žemiausiam vidutiniam daugiamečiam šiltojo laikotarpio vandens lygiui.""" tokens = lt_tokenizer(text.replace("\n", "")) assert len(tokens) == 42 -@pytest.mark.parametrize('text,length', [ - ("177R Parodų rūmai–Ozo g. nuo vasario 18 d. bus skelbiamas interneto tinklalapyje.", 15), - ("ISM universiteto doc. dr. Ieva Augutytė-Kvedaravičienė pastebi, kad tyrimais nustatyti elgesio pokyčiai.", 16)]) +@pytest.mark.parametrize( + "text,length", + [ + ( + "177R Parodų rūmai–Ozo g. nuo vasario 18 d. bus skelbiamas interneto tinklalapyje.", + 15, + ), + ( + "ISM universiteto doc. dr. 
Ieva Augutytė-Kvedaravičienė pastebi, kad tyrimais nustatyti elgesio pokyčiai.", + 16, + ), + ], +) def test_lt_tokenizer_handles_punct_abbrev(lt_tokenizer, text, length): tokens = lt_tokenizer(text) assert len(tokens) == length @@ -26,18 +36,22 @@ def test_lt_tokenizer_abbrev_exceptions(lt_tokenizer, text): assert len(tokens) == 1 -@pytest.mark.parametrize("text,match", [ - ("10", True), - ("1", True), - ("10,000", True), - ("10,00", True), - ("999.0", True), - ("vienas", True), - ("du", True), - ("milijardas", True), - ("šuo", False), - (",", False), - ("1/2", True)]) +@pytest.mark.parametrize( + "text,match", + [ + ("10", True), + ("1", True), + ("10,000", True), + ("10,00", True), + ("999.0", True), + ("vienas", True), + ("du", True), + ("milijardas", True), + ("šuo", False), + (",", False), + ("1/2", True), + ], +) def test_lt_lex_attrs_like_number(lt_tokenizer, text, match): tokens = lt_tokenizer(text) assert len(tokens) == 1 diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index 54ddd6789..013700d52 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -5,7 +5,6 @@ import pytest import re from spacy.matcher import Matcher, DependencyMatcher from spacy.tokens import Doc, Token -from ..util import get_doc @pytest.fixture @@ -288,24 +287,43 @@ def deps(): def dependency_matcher(en_vocab): def is_brown_yellow(text): return bool(re.compile(r"brown|yellow|over").match(text)) + IS_BROWN_YELLOW = en_vocab.add_flag(is_brown_yellow) pattern1 = [ {"SPEC": {"NODE_NAME": "fox"}, "PATTERN": {"ORTH": "fox"}}, - {"SPEC": {"NODE_NAME": "q", "NBOR_RELOP": ">", "NBOR_NAME": "fox"},"PATTERN": {"ORTH": "quick", "DEP": "amod"}}, - {"SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">", "NBOR_NAME": "fox"}, "PATTERN": {IS_BROWN_YELLOW: True}}, + { + "SPEC": {"NODE_NAME": "q", "NBOR_RELOP": ">", "NBOR_NAME": "fox"}, + "PATTERN": {"ORTH": "quick", "DEP": "amod"}, + }, + { + "SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">", "NBOR_NAME": "fox"}, + "PATTERN": {IS_BROWN_YELLOW: True}, + }, ] pattern2 = [ {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}}, - {"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"}, "PATTERN": {"ORTH": "fox"}}, - {"SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"}, "PATTERN": {"ORTH": "fox"}} + { + "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"}, + "PATTERN": {"ORTH": "fox"}, + }, + { + "SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"}, + "PATTERN": {"ORTH": "fox"}, + }, ] pattern3 = [ {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}}, - {"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"}, "PATTERN": {"ORTH": "fox"}}, - {"SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">>", "NBOR_NAME": "fox"}, "PATTERN": {"ORTH": "brown"}} + { + "SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"}, + "PATTERN": {"ORTH": "fox"}, + }, + { + "SPEC": {"NODE_NAME": "r", "NBOR_RELOP": ">>", "NBOR_NAME": "fox"}, + "PATTERN": {"ORTH": "brown"}, + }, ] matcher = DependencyMatcher(en_vocab) @@ -320,9 +338,9 @@ def test_dependency_matcher_compile(dependency_matcher): assert len(dependency_matcher) == 3 -def test_dependency_matcher(dependency_matcher, text, heads, deps): - doc = get_doc(dependency_matcher.vocab, text.split(), heads=heads, deps=deps) - matches = dependency_matcher(doc) - # assert matches[0][1] == [[3, 1, 2]] - # assert matches[1][1] == [[4, 3, 3]] - # assert matches[2][1] 
== [[4, 3, 2]] +# def test_dependency_matcher(dependency_matcher, text, heads, deps): +# doc = get_doc(dependency_matcher.vocab, text.split(), heads=heads, deps=deps) +# matches = dependency_matcher(doc) +# assert matches[0][1] == [[3, 1, 2]] +# assert matches[1][1] == [[4, 3, 3]] +# assert matches[2][1] == [[4, 3, 2]] diff --git a/spacy/tests/regression/test_issue3880.py b/spacy/tests/regression/test_issue3880.py index ecc12afa3..6de373f11 100644 --- a/spacy/tests/regression/test_issue3880.py +++ b/spacy/tests/regression/test_issue3880.py @@ -1,7 +1,7 @@ # coding: utf8 from __future__ import unicode_literals + from spacy.lang.en import English -import pytest def test_issue3880(): From d16675660775853f6530495f464dc715d052e2a7 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 11 Jul 2019 12:16:43 +0200 Subject: [PATCH 134/148] Fix test --- spacy/tests/lang/lt/test_text.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/spacy/tests/lang/lt/test_text.py b/spacy/tests/lang/lt/test_text.py index 7afc6d497..cac32aa4d 100644 --- a/spacy/tests/lang/lt/test_text.py +++ b/spacy/tests/lang/lt/test_text.py @@ -5,10 +5,8 @@ import pytest def test_lt_tokenizer_handles_long_text(lt_tokenizer): - text = """Tokios sausros kriterijus atitinka pirmadienį atlikti skaičiavimai, palyginus faktinį ir žemiausią -vidutinį daugiametį vandens lygį. Nustatyta, kad iš 48 šalies vandens matavimo stočių 28-iose stotyse vandens lygis -yra žemesnis arba lygus žemiausiam vidutiniam daugiamečiam šiltojo laikotarpio vandens lygiui.""" - tokens = lt_tokenizer(text.replace("\n", "")) + text = """Tokios sausros kriterijus atitinka pirmadienį atlikti skaičiavimai, palyginus faktinį ir žemiausią vidutinį daugiametį vandens lygį. Nustatyta, kad iš 48 šalies vandens matavimo stočių 28-iose stotyse vandens lygis yra žemesnis arba lygus žemiausiam vidutiniam daugiamečiam šiltojo laikotarpio vandens lygiui.""" + tokens = lt_tokenizer(text) assert len(tokens) == 42 From e19f4ee719af1a4ce8391c6934ff3edf4cdb7ca3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 11 Jul 2019 12:32:59 +0200 Subject: [PATCH 135/148] Add warning message re Issue #3853 --- spacy/_ml.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index d16e124dc..abb44e1b7 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -299,7 +299,14 @@ def link_vectors_to_models(vocab): data = ops.asarray(vectors.data) # Set an entry here, so that vectors are accessed by StaticVectors # (unideal, I know) - thinc.extra.load_nlp.VECTORS[(ops.device, vectors.name)] = data + key = (ops.device, vectors.name) + if key in thinc.extra.load_nlp.VECTORS: + if thinc.extra.load_nlp.VECTORS[key].shape != data.shape: + print( + "Warning: Registering vectors data under the same ID as " + "existing vectors, and the new vectors data seems different. " + "This might lead to incorrect results. 
See Issue #3853") + thinc.extra.load_nlp.VECTORS[key] = data def PyTorchBiLSTM(nO, nI, depth, dropout=0.2): From b40b4c2c31c7e43f7cee1f491e57d444bf1fd6d1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 11 Jul 2019 12:55:11 +0200 Subject: [PATCH 136/148] =?UTF-8?q?=F0=9F=92=AB=20Fix=20issue=20#3839:=20I?= =?UTF-8?q?ncorrect=20entity=20IDs=20from=20Matcher=20with=20operators=20(?= =?UTF-8?q?#3949)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add regression test for issue #3541 * Add comment on bugfix * Remove incorrect test * Un-xfail test --- spacy/matcher/matcher.pyx | 8 ++++---- spacy/tests/regression/test_issue3839.py | 1 - 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 2dd8c2940..86658ce99 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -262,13 +262,13 @@ cdef find_matches(TokenPatternC** patterns, int n, Doc doc, extensions=None, cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil: + # There have been a few bugs here. # The code was originally designed to always have pattern[1].attrs.value # be the ent_id when we get to the end of a pattern. However, Issue #2671 # showed this wasn't the case when we had a reject-and-continue before a - # match. I still don't really understand what's going on here, but this - # workaround does resolve the issue. - while pattern.attrs.attr != ID and \ - (pattern.nr_attr > 0 or pattern.nr_extra_attr > 0 or pattern.nr_py > 0): + # match. + # The patch to #2671 was wrong though, which came up in #3839. + while pattern.attrs.attr != ID: pattern += 1 return pattern.attrs.value diff --git a/spacy/tests/regression/test_issue3839.py b/spacy/tests/regression/test_issue3839.py index fa915faf0..34d6bb46e 100644 --- a/spacy/tests/regression/test_issue3839.py +++ b/spacy/tests/regression/test_issue3839.py @@ -6,7 +6,6 @@ from spacy.matcher import Matcher from spacy.tokens import Doc -@pytest.mark.xfail def test_issue3839(en_vocab): """Test that match IDs returned by the matcher are correct, are in the string """ doc = Doc(en_vocab, words=["terrific", "group", "of", "people"]) From 0f0f07318a9bbf37ca3f4e008c35a7c88ded777f Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 11 Jul 2019 13:05:53 +0200 Subject: [PATCH 137/148] counter instead of preshcounter --- bin/train_word_vectors.py | 1 - spacy/tokens/doc.pxd | 1 - spacy/tokens/doc.pyx | 37 +++++-------------------------------- 3 files changed, 5 insertions(+), 34 deletions(-) diff --git a/bin/train_word_vectors.py b/bin/train_word_vectors.py index 624e339a0..663ce060d 100644 --- a/bin/train_word_vectors.py +++ b/bin/train_word_vectors.py @@ -5,7 +5,6 @@ import logging from pathlib import Path from collections import defaultdict from gensim.models import Word2Vec -from preshed.counter import PreshCounter import plac import spacy diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index cc05cb495..4b8578fe0 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -1,6 +1,5 @@ from cymem.cymem cimport Pool cimport numpy as np -from preshed.counter cimport PreshCounter from ..vocab cimport Vocab from ..structs cimport TokenC, LexemeC diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 657b9a1d6..3b0c2425c 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -9,6 +9,7 @@ cimport cython cimport numpy as np from libc.string cimport memcpy, memset from libc.math cimport sqrt +from collections import Counter import numpy 
import numpy.linalg @@ -698,7 +699,7 @@ cdef class Doc: # Handle 1d case return output if len(attr_ids) >= 2 else output.reshape((self.length,)) - def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None): + def count_by(self, attr_id_t attr_id, exclude=None, object counts=None): """Count the frequencies of a given attribute. Produces a dict of `{attribute (int): count (ints)}` frequencies, keyed by the values of the given attribute ID. @@ -713,50 +714,22 @@ cdef class Doc: cdef size_t count cdef int64_t this_value - print("COUNTING") - if counts is None: - counts = PreshCounter() + counts = Counter() output_dict = True - print("counts None") else: output_dict = False # Take this check out of the loop, for a bit of extra speed if exclude is None: - print("exclude None") for i in range(self.length): - print() - print("token", self[i]) this_value = get_token_attr(&self.c[i], attr_id) - print("token attr value", this_value) - print("type attr value", type(this_value)) - - print(i, "key this_value before", counts.c_map.cells[this_value].key) - print(i, "value this_value before", counts.c_map.cells[this_value].value) - counts.inc(this_value, 1) - print(i, "key this_value after", counts.c_map.cells[this_value].key) - print(i, "value this_value after", counts.c_map.cells[this_value].value) - - print(i, "key 0", counts.c_map.cells[0].key) - print(i, "value 0", counts.c_map.cells[0].value) - print(i, "key 1", counts.c_map.cells[1].key) - print(i, "value 1", counts.c_map.cells[1].value) + counts[this_value] += 1 else: for i in range(self.length): if not exclude(self[i]): attr = get_token_attr(&self.c[i], attr_id) - counts.inc(attr, 1) + counts[attr] += 1 if output_dict: - print("output_dict") - print(counts.length) - print(counts.total) - print("key 0", counts.c_map.cells[0].key) - print("value 0", counts.c_map.cells[0].value) - print("key 1", counts.c_map.cells[1].key) - print("value 1", counts.c_map.cells[1].value) - print() - print(dict(counts)) - print() return dict(counts) def _realloc(self, new_size): From 349107daa3b0804c62861dbaa810e9a1488960b1 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Thu, 11 Jul 2019 13:09:22 +0200 Subject: [PATCH 138/148] cleanup --- spacy/tests/regression/test_issue3869.py | 2 -- spacy/tokens/doc.pxd | 1 - spacy/tokens/doc.pyx | 8 ++------ 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/spacy/tests/regression/test_issue3869.py b/spacy/tests/regression/test_issue3869.py index d76da6989..42584b133 100644 --- a/spacy/tests/regression/test_issue3869.py +++ b/spacy/tests/regression/test_issue3869.py @@ -20,8 +20,6 @@ from spacy.lang.en import English def test_issue3869(sentence): """Test that the Doc's count_by function works consistently""" nlp = English() - - print() doc = nlp(sentence) count = 0 diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index 4b8578fe0..62665fcc5 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -7,7 +7,6 @@ from ..typedefs cimport attr_t from ..attrs cimport attr_id_t - cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 3b0c2425c..c1883f9c0 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -14,7 +14,6 @@ from collections import Counter import numpy import numpy.linalg import struct -from libc.stdint cimport int64_t import srsly from thinc.neural.util import get_array_module, copy_array @@ -712,7 +711,6 @@ cdef class Doc: cdef int i cdef attr_t attr cdef size_t count - cdef int64_t 
this_value if counts is None: counts = Counter() @@ -722,13 +720,11 @@ cdef class Doc: # Take this check out of the loop, for a bit of extra speed if exclude is None: for i in range(self.length): - this_value = get_token_attr(&self.c[i], attr_id) - counts[this_value] += 1 + counts[get_token_attr(&self.c[i], attr_id)] += 1 else: for i in range(self.length): if not exclude(self[i]): - attr = get_token_attr(&self.c[i], attr_id) - counts[attr] += 1 + counts[get_token_attr(&self.c[i], attr_id)] += 1 if output_dict: return dict(counts) From 7369949d2e90872c56ec60ba9229ddf4bac92590 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 11 Jul 2019 14:44:32 +0200 Subject: [PATCH 139/148] Add warning for #3853 --- spacy/errors.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/errors.py b/spacy/errors.py index 347ad1fca..ed3d6afb9 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -82,6 +82,8 @@ class Warnings(object): "parallel inference via multiprocessing.") W017 = ("Alias '{alias}' already exists in the Knowledge base.") W018 = ("Entity '{entity}' already exists in the Knowledge base.") + W019 = ("Changing vectors name from {old} to {new}, to avoid clash with " + "previously loaded vectors. See Issue #3853.") @add_codes From 09dc01a4266b0f6b3a557918a4af44eac685f1bb Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 11 Jul 2019 14:46:29 +0200 Subject: [PATCH 140/148] Fix #3853, and add warning --- spacy/_ml.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/spacy/_ml.py b/spacy/_ml.py index abb44e1b7..4d9bb4c2b 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -24,7 +24,7 @@ from thinc.neural._classes.affine import _set_dimensions_if_needed import thinc.extra.load_nlp from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE -from .errors import Errors +from .errors import Errors, user_warning, Warnings from . import util try: @@ -302,10 +302,13 @@ def link_vectors_to_models(vocab): key = (ops.device, vectors.name) if key in thinc.extra.load_nlp.VECTORS: if thinc.extra.load_nlp.VECTORS[key].shape != data.shape: - print( - "Warning: Registering vectors data under the same ID as " - "existing vectors, and the new vectors data seems different. " - "This might lead to incorrect results. See Issue #3853") + # This is a hack to avoid the problem in #3853. Maybe we should + # print a warning as well? 
+ old_name = vectors.name + new_name = vectors.name + "_%d" % data.shape[0] + user_warning(Warnings.W019.format(old=old_name, new=new_name)) + vectors.name = new_name + key = (ops.device, vectors.name) thinc.extra.load_nlp.VECTORS[key] = data From cda9fc3dae6ae430298cc293e378eac5813bf685 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 11 Jul 2019 15:53:13 +0200 Subject: [PATCH 141/148] Update Thinc version pin --- pyproject.toml | 2 +- requirements.txt | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 80bb5905a..35ff96903 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,6 +5,6 @@ requires = ["setuptools", "cymem>=2.0.2,<2.1.0", "preshed>=2.0.1,<2.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc==7.0.0.dev6", + "thinc>=7.0.6,<7.1.0", ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index 58761b95c..99935f335 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Our libraries cymem>=2.0.2,<2.1.0 preshed>=2.0.1,<2.1.0 -thinc>=7.0.5,<7.1.0 +thinc>=7.0.6,<7.1.0 blis>=0.2.2,<0.3.0 murmurhash>=0.28.0,<1.1.0 wasabi>=0.2.0,<1.1.0 diff --git a/setup.py b/setup.py index 544188f4a..4d6416cad 100755 --- a/setup.py +++ b/setup.py @@ -228,7 +228,7 @@ def setup_package(): "murmurhash>=0.28.0,<1.1.0", "cymem>=2.0.2,<2.1.0", "preshed>=2.0.1,<2.1.0", - "thinc>=7.0.2,<7.1.0", + "thinc>=7.0.6,<7.1.0", "blis>=0.2.2,<0.3.0", "plac<1.0.0,>=0.9.6", "requests>=2.13.0,<3.0.0", From 123929b58bcdb6738f0d0ca0254783f6ab8e61fb Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 12 Jul 2019 00:15:35 +0200 Subject: [PATCH 142/148] Update Thinc version pin --- pyproject.toml | 2 +- requirements.txt | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 35ff96903..35f3d9215 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,6 +5,6 @@ requires = ["setuptools", "cymem>=2.0.2,<2.1.0", "preshed>=2.0.1,<2.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc>=7.0.6,<7.1.0", + "thinc>=7.0.8,<7.1.0", ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index 99935f335..5a6870cd3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Our libraries cymem>=2.0.2,<2.1.0 preshed>=2.0.1,<2.1.0 -thinc>=7.0.6,<7.1.0 +thinc>=7.0.8,<7.1.0 blis>=0.2.2,<0.3.0 murmurhash>=0.28.0,<1.1.0 wasabi>=0.2.0,<1.1.0 diff --git a/setup.py b/setup.py index 4d6416cad..b36c48316 100755 --- a/setup.py +++ b/setup.py @@ -228,7 +228,7 @@ def setup_package(): "murmurhash>=0.28.0,<1.1.0", "cymem>=2.0.2,<2.1.0", "preshed>=2.0.1,<2.1.0", - "thinc>=7.0.6,<7.1.0", + "thinc>=7.0.8,<7.1.0", "blis>=0.2.2,<0.3.0", "plac<1.0.0,>=0.9.6", "requests>=2.13.0,<3.0.0", From ed774cb9530d701cdfbad72fa84d86d8ac965706 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Fri, 12 Jul 2019 10:01:35 +0200 Subject: [PATCH 143/148] Fixing ngram bug (#3953) * minimal failing example for Issue #3661 * referenced Issue #3661 instead of Issue #3611 * cleanup --- spacy/tests/regression/test_issue3611.py | 51 ++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 spacy/tests/regression/test_issue3611.py diff --git a/spacy/tests/regression/test_issue3611.py b/spacy/tests/regression/test_issue3611.py new file mode 100644 index 000000000..29aa5370d --- /dev/null +++ b/spacy/tests/regression/test_issue3611.py @@ -0,0 +1,51 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest +import spacy +from 
spacy.util import minibatch, compounding + + +def test_issue3611(): + """ Test whether adding n-grams in the textcat works even when n > token length of some docs """ + unique_classes = ["offensive", "inoffensive"] + x_train = ["This is an offensive text", + "This is the second offensive text", + "inoff"] + y_train = ["offensive", "offensive", "inoffensive"] + + # preparing the data + pos_cats = list() + for train_instance in y_train: + pos_cats.append({label: label == train_instance for label in unique_classes}) + train_data = list(zip(x_train, [{'cats': cats} for cats in pos_cats])) + + # set up the spacy model with a text categorizer component + nlp = spacy.blank('en') + + textcat = nlp.create_pipe( + "textcat", + config={ + "exclusive_classes": True, + "architecture": "bow", + "ngram_size": 2 + } + ) + + for label in unique_classes: + textcat.add_label(label) + nlp.add_pipe(textcat, last=True) + + # training the network + other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat'] + with nlp.disable_pipes(*other_pipes): + optimizer = nlp.begin_training() + for i in range(3): + losses = {} + batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001)) + + for batch in batches: + texts, annotations = zip(*batch) + nlp.update(docs=texts, golds=annotations, sgd=optimizer, drop=0.1, losses=losses) + + From 3bc4d618f920998e76cc5302a1ce79d285cdc5c3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 12 Jul 2019 13:26:12 +0200 Subject: [PATCH 144/148] Set version to v2.1.5 --- spacy/about.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/about.py b/spacy/about.py index 758809934..8fb7d23bc 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -4,13 +4,13 @@ # fmt: off __title__ = "spacy" -__version__ = "2.1.5.dev0" +__version__ = "2.1.5" __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython" __uri__ = "https://spacy.io" __author__ = "Explosion AI" __email__ = "contact@explosion.ai" __license__ = "MIT" -__release__ = False +__release__ = True __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" From 02e12b08527242ae976a62c17ba425a962d36e77 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 12 Jul 2019 13:36:47 +0200 Subject: [PATCH 145/148] Update landing with IRL videos [ci skip] --- website/src/widgets/landing.js | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/website/src/widgets/landing.js b/website/src/widgets/landing.js index f55aa5aa3..e9dec87f4 100644 --- a/website/src/widgets/landing.js +++ b/website/src/widgets/landing.js @@ -152,20 +152,21 @@ const Landing = ({ data }) => { - We're pleased to invite the spaCy community and other folks working on Natural + We were pleased to invite the spaCy community and other folks working on Natural Language Processing to Berlin this summer for a small and intimate event{' '} - July 5-6, 2019. The event includes a hands-on training day for - teams using spaCy in production, followed by a one-track conference. We've - booked a beautiful venue, hand-picked an awesome lineup of speakers and - scheduled plenty of social time to get to know each other and exchange ideas. + July 6, 2019. We booked a beautiful venue, hand-picked an + awesome lineup of speakers and scheduled plenty of social time to get to know + each other and exchange ideas. 
The YouTube playlist includes 12 talks about NLP + research, development and applications, with keynotes by Sebastian Ruder + (DeepMind) and Yoav Goldberg (Allen AI). Date: Fri, 12 Jul 2019 17:40:00 +0200 Subject: [PATCH 146/148] Increment version [ci skip] --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 8fb7d23bc..16e5e9522 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -4,7 +4,7 @@ # fmt: off __title__ = "spacy" -__version__ = "2.1.5" +__version__ = "2.1.6" __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython" __uri__ = "https://spacy.io" __author__ = "Explosion AI" From c345c042b0b2c2b8b1607f8f2a8f8ebbe745aa88 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 12 Jul 2019 17:48:16 +0200 Subject: [PATCH 147/148] Fix symbol alignment --- spacy/symbols.pxd | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd index 4501861a2..5922ee588 100644 --- a/spacy/symbols.pxd +++ b/spacy/symbols.pxd @@ -81,7 +81,6 @@ cdef enum symbol_t: DEP ENT_IOB ENT_TYPE - ENT_KB_ID HEAD SENT_START SPACY @@ -461,3 +460,5 @@ cdef enum symbol_t: xcomp acl + + ENT_KB_ID From ef666656b397b6ec6c5f2693c22afd5a65dea9d3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 12 Jul 2019 17:59:47 +0200 Subject: [PATCH 148/148] Fix attrs alignment --- spacy/attrs.pxd | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd index c5ba8d765..d9aca078c 100644 --- a/spacy/attrs.pxd +++ b/spacy/attrs.pxd @@ -1,4 +1,6 @@ # Reserve 64 values for flag features +from . cimport symbols + cdef enum attr_id_t: NULL_ATTR IS_ALPHA @@ -82,10 +84,10 @@ cdef enum attr_id_t: DEP ENT_IOB ENT_TYPE - ENT_KB_ID HEAD SENT_START SPACY PROB LANG + ENT_KB_ID = symbols.ENT_KB_ID
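
Editor's note on the last two patches ("Fix symbol alignment", "Fix attrs alignment"): they move ENT_KB_ID to the very end of the symbol_t and attr_id_t enums and pin the attribute with `ENT_KB_ID = symbols.ENT_KB_ID`. The commit messages do not say why, but a plausible reading is that inserting a new member in the middle of a C enum shifts the integer value of every member that follows it, which would silently change attribute and symbol IDs that other code (or previously serialized data) already relies on; appending the new member avoids the shift. A minimal, hypothetical Python sketch of that effect — these are illustrative enums, not spaCy's real declarations:

from enum import IntEnum

# Hypothetical "before" layout: ENT_KB_ID inserted mid-enum pushes HEAD (and
# every later member) to a new integer value.
class AttrsBefore(IntEnum):
    ENT_TYPE = 0
    ENT_KB_ID = 1
    HEAD = 2

# Hypothetical "after" layout, mirroring what patches 147/148 do: the new
# member is appended, so every pre-existing attribute keeps its old value.
class AttrsAfter(IntEnum):
    ENT_TYPE = 0
    HEAD = 1
    ENT_KB_ID = 2

assert AttrsAfter.ENT_TYPE == AttrsBefore.ENT_TYPE  # untouched member keeps its ID
assert AttrsBefore.HEAD.value == 2 and AttrsAfter.HEAD.value == 1  # the shift the fix avoids

Pinning the attribute to `symbols.ENT_KB_ID` in attrs.pxd then keeps the two enums in sync without renumbering anything in between.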