From 250a54414b41e365dfe5f45a17ca1f2c245ca3a2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=BB=8E=E8=B0=A2=E9=B9=8F?= Date: Mon, 12 Aug 2019 16:37:48 +0800 Subject: [PATCH 01/12] update lang/zh (#4103) * update lang/zh * update lang/zh --- .github/contributors/phiedulxp.md | 106 +++++++++++++++++++++++++++++ spacy/lang/zh/__init__.py | 5 +- spacy/lang/zh/examples.py | 12 ++-- spacy/lang/zh/lex_attrs.py | 107 ++++++++++++++++++++++++++++++ spacy/lang/zh/tag_map.py | 47 +++++++++++++ 5 files changed, 270 insertions(+), 7 deletions(-) create mode 100644 .github/contributors/phiedulxp.md create mode 100644 spacy/lang/zh/lex_attrs.py create mode 100644 spacy/lang/zh/tag_map.py diff --git a/.github/contributors/phiedulxp.md b/.github/contributors/phiedulxp.md new file mode 100644 index 000000000..fa2666e78 --- /dev/null +++ b/.github/contributors/phiedulxp.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Xiepeng Li | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 20190810 | +| GitHub username | phiedulxp | +| Website (optional) | | diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index 773bbcf38..b1ee5105c 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -6,7 +6,7 @@ from ...language import Language from ...tokens import Doc from ..tokenizer_exceptions import BASE_EXCEPTIONS from .stop_words import STOP_WORDS - +from .tag_map import TAG_MAP class ChineseDefaults(Language.Defaults): lex_attr_getters = dict(Language.Defaults.lex_attr_getters) @@ -14,6 +14,7 @@ class ChineseDefaults(Language.Defaults): use_jieba = True tokenizer_exceptions = BASE_EXCEPTIONS stop_words = STOP_WORDS + tag_map = TAG_MAP writing_system = {"direction": "ltr", "has_case": False, "has_letters": False} @@ -44,4 +45,4 @@ class Chinese(Language): return Doc(self.vocab, words=words, spaces=spaces) -__all__ = ["Chinese"] +__all__ = ["Chinese"] \ No newline at end of file diff --git a/spacy/lang/zh/examples.py b/spacy/lang/zh/examples.py index 3c2e45e80..1553768a8 100644 --- a/spacy/lang/zh/examples.py +++ b/spacy/lang/zh/examples.py @@ -9,10 +9,12 @@ Example sentences to test spaCy and its language models. >>> docs = nlp.pipe(sentences) """ - +# from https://zh.wikipedia.org/wiki/汉语 sentences = [ - "蘋果公司正考量用一億元買下英國的新創公司", - "自駕車將保險責任歸屬轉移至製造商", - "舊金山考慮禁止送貨機器人在人行道上行駛", - "倫敦是英國的大城市", + "作为语言而言,为世界使用人数最多的语言,目前世界有五分之一人口做为母语。", + "汉语有多种分支,当中官话最为流行,为中华人民共和国的国家通用语言(又称为普通话)、以及中华民国的国语。", + "此外,中文还是联合国正式语文,并被上海合作组织等国际组织采用为官方语言。", + "在中国大陆,汉语通称为“汉语”。", + "在联合国、台湾、香港及澳门,通称为“中文”。", + "在新加坡及马来西亚,通称为“华语”。" ] diff --git a/spacy/lang/zh/lex_attrs.py b/spacy/lang/zh/lex_attrs.py new file mode 100644 index 000000000..0a42883f7 --- /dev/null +++ b/spacy/lang/zh/lex_attrs.py @@ -0,0 +1,107 @@ +# coding: utf8 +from __future__ import unicode_literals +import re +from ...attrs import LIKE_NUM + +_single_num_words = [ + "〇", + "一", + "二", + "三", + "四", + "五", + "六", + "七", + "八", + "九", + "十", + "十一", + "十二", + "十三", + "十四", + "十五", + "十六", + "十七", + "十八", + "十九", + "廿", + "卅", + "卌", + "皕", + "零", + "壹", + "贰", + "叁", + "肆", + "伍", + "陆", + "柒", + "捌", + "玖", + "拾", + "拾壹", + "拾贰", + "拾叁", + "拾肆", + "拾伍", + "拾陆", + "拾柒", + "拾捌", + "拾玖" +] + +_count_num_words = [ + "一", + "二", + "三", + "四", + "五", + "六", + "七", + "八", + "九", + "壹", + "贰", + "叁", + "肆", + "伍", + "陆", + "柒", + "捌", + "玖" +] + +_base_num_words = [ + "十", + "百", + "千", + "万", + "亿", + "兆", + "拾", + "佰", + "仟" +] + + +def like_num(text): + if text.startswith(("+", "-", "±", "~")): + text = text[1:] + text = text.replace(",", "").replace( + ".", "").replace(",", "").replace("。", "") + if text.isdigit(): + return True + if text.count("/") == 1: + num, denom = text.split("/") + if num.isdigit() and denom.isdigit(): + return True + if text in _single_num_words: + return True + if re.match('^((' + '|'.join(_count_num_words) + '){1}' + + '(' + '|'.join(_base_num_words) + '){1})+' + + '(' + '|'.join(_count_num_words) + ')?$', text): + return True + return False + + +LEX_ATTRS = {LIKE_NUM: like_num} diff --git a/spacy/lang/zh/tag_map.py b/spacy/lang/zh/tag_map.py new file mode 100644 index 000000000..6aa988a98 --- /dev/null +++ b/spacy/lang/zh/tag_map.py @@ -0,0 +1,47 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ...symbols import POS, PUNCT, SYM, ADJ, CONJ, CCONJ, NUM, DET, 
ADV, ADP, X, VERB +from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX + +# The Chinese part-of-speech tagger uses the OntoNotes 5 version of the Penn Treebank tag set. +# We also map the tags to the simpler Google Universal POS tag set. + +TAG_MAP = { + "AS": {POS: PART}, + "DEC": {POS: PART}, + "DEG": {POS: PART}, + "DER": {POS: PART}, + "DEV": {POS: PART}, + "ETC": {POS: PART}, + "LC": {POS: PART}, + "MSP": {POS: PART}, + "SP": {POS: PART}, + "BA": {POS: X}, + "FW": {POS: X}, + "IJ": {POS: INTJ}, + "LB": {POS: X}, + "ON": {POS: X}, + "SB": {POS: X}, + "X": {POS: X}, + "URL": {POS: X}, + "INF": {POS: X}, + "NN": {POS: NOUN}, + "NR": {POS: NOUN}, + "NT": {POS: NOUN}, + "VA": {POS: VERB}, + "VC": {POS: VERB}, + "VE": {POS: VERB}, + "VV": {POS: VERB}, + "CD": {POS: NUM}, + "M": {POS: NUM}, + "OD": {POS: NUM}, + "DT": {POS: DET}, + "CC": {POS: CCONJ}, + "CS": {POS: CONJ}, + "AD": {POS: ADV}, + "JJ": {POS: ADJ}, + "P": {POS: ADP}, + "PN": {POS: PRON}, + "PU": {POS: PUNCT} +} \ No newline at end of file From 3a39154804faf372c7b0e54f47c61f5e48e1e66e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 12 Aug 2019 17:26:31 +0200 Subject: [PATCH 02/12] Create wip.yaml [ci skip] --- .github/wip.yaml | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .github/wip.yaml diff --git a/.github/wip.yaml b/.github/wip.yaml new file mode 100644 index 000000000..db768c3e3 --- /dev/null +++ b/.github/wip.yaml @@ -0,0 +1,6 @@ +locations: + - title + - label_name +terms: + - wip + - 🔜 From 35c865024b20d534f3050c3a36fcff44a6717756 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 12 Aug 2019 18:39:54 +0200 Subject: [PATCH 03/12] Fix file name [ci skip] --- .github/{wip.yaml => wip.yml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename .github/{wip.yaml => wip.yml} (100%) diff --git a/.github/wip.yaml b/.github/wip.yml similarity index 100% rename from .github/wip.yaml rename to .github/wip.yml From 5196dbd89d291729b34d47f59b712a166bfd8bca Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 13 Aug 2019 13:31:21 +0200 Subject: [PATCH 04/12] Delete wip.yml [ci skip] --- .github/wip.yml | 6 ------ 1 file changed, 6 deletions(-) delete mode 100644 .github/wip.yml diff --git a/.github/wip.yml b/.github/wip.yml deleted file mode 100644 index db768c3e3..000000000 --- a/.github/wip.yml +++ /dev/null @@ -1,6 +0,0 @@ -locations: - - title - - label_name -terms: - - wip - - 🔜 From 0ba1b5eebcf8d7d8a5c1e2859948622481df70d7 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 13 Aug 2019 15:38:59 +0200 Subject: [PATCH 05/12] CLI scripts for entity linking (wikipedia & generic) (#4091) * document token ent_kb_id * document span kb_id * update pipeline documentation * prior and context weights as bool's instead * entitylinker api documentation * drop for both models * finish entitylinker documentation * small fixes * documentation for KB * candidate documentation * links to api pages in code * small fix * frequency examples as counts for consistency * consistent documentation about tensors returned by predict * add entity linking to usage 101 * add entity linking infobox and KB section to 101 * entity-linking in linguistic features * small typo corrections * training example and docs for entity_linker * predefined nlp and kb * revert back to similarity encodings for simplicity (for now) * set prior probabilities to 0 when excluded * code clean up * bugfix: deleting kb ID from tokens when entities were removed * refactor train el example to use either model or vocab * 
pretrain_kb example for example kb generation * add to training docs for KB + EL example scripts * small fixes * error numbering * ensure the language of vocab and nlp stay consistent across serialization * equality with = * avoid conflict in errors file * add error 151 * final adjustements to the train scripts - consistency * update of goldparse documentation * small corrections * push commit * turn kb_creator into CLI script (wip) * proper parameters for training entity vectors * wikidata pipeline split up into two executable scripts * remove context_width * move wikidata scripts in bin directory, remove old dummy script * refine KB script with logs and preprocessing options * small edits * small improvements to logging of EL CLI script --- bin/wiki_entity_linking/kb_creator.py | 62 ++- bin/wiki_entity_linking/train_descriptions.py | 60 +- .../training_set_creator.py | 5 +- .../wikidata_pretrain_kb.py | 139 +++++ bin/wiki_entity_linking/wikidata_processor.py | 52 +- .../wikidata_train_entity_linker.py | 430 +++++++++++++++ .../wikipedia_processor.py | 9 +- examples/pipeline/dummy_entity_linking.py | 75 --- examples/pipeline/wikidata_entity_linking.py | 514 ------------------ examples/training/pretrain_kb.py | 139 +++++ examples/training/train_entity_linker.py | 173 ++++++ spacy/_ml.py | 24 +- spacy/errors.py | 10 +- spacy/kb.pyx | 14 +- spacy/pipeline/pipes.pyx | 105 ++-- spacy/tests/pipeline/test_entity_linker.py | 46 +- spacy/tests/serialize/test_serialize_kb.py | 12 +- spacy/tokens/doc.pyx | 1 + 18 files changed, 1111 insertions(+), 759 deletions(-) create mode 100644 bin/wiki_entity_linking/wikidata_pretrain_kb.py create mode 100644 bin/wiki_entity_linking/wikidata_train_entity_linker.py delete mode 100644 examples/pipeline/dummy_entity_linking.py delete mode 100644 examples/pipeline/wikidata_entity_linking.py create mode 100644 examples/training/pretrain_kb.py create mode 100644 examples/training/train_entity_linker.py diff --git a/bin/wiki_entity_linking/kb_creator.py b/bin/wiki_entity_linking/kb_creator.py index 5b25475b2..d88cf9c7e 100644 --- a/bin/wiki_entity_linking/kb_creator.py +++ b/bin/wiki_entity_linking/kb_creator.py @@ -1,16 +1,14 @@ # coding: utf-8 from __future__ import unicode_literals -from .train_descriptions import EntityEncoder -from . 
import wikidata_processor as wd, wikipedia_processor as wp +from bin.wiki_entity_linking.train_descriptions import EntityEncoder +from bin.wiki_entity_linking import wikidata_processor as wd, wikipedia_processor as wp from spacy.kb import KnowledgeBase import csv import datetime - -INPUT_DIM = 300 # dimension of pre-trained input vectors -DESC_WIDTH = 64 # dimension of output entity vectors +from spacy import Errors def create_kb( @@ -23,17 +21,27 @@ def create_kb( count_input, prior_prob_input, wikidata_input, + entity_vector_length, + limit=None, + read_raw_data=True, ): # Create the knowledge base from Wikidata entries - kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=DESC_WIDTH) + kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=entity_vector_length) + + # check the length of the nlp vectors + if "vectors" in nlp.meta and nlp.vocab.vectors.size: + input_dim = nlp.vocab.vectors_length + print("Loaded pre-trained vectors of size %s" % input_dim) + else: + raise ValueError(Errors.E155) # disable this part of the pipeline when rerunning the KB generation from preprocessed files - read_raw_data = True - if read_raw_data: print() - print(" * _read_wikidata_entities", datetime.datetime.now()) - title_to_id, id_to_descr = wd.read_wikidata_entities_json(wikidata_input) + print(now(), " * read wikidata entities:") + title_to_id, id_to_descr = wd.read_wikidata_entities_json( + wikidata_input, limit=limit + ) # write the title-ID and ID-description mappings to file _write_entity_files( @@ -46,7 +54,7 @@ def create_kb( id_to_descr = get_id_to_description(entity_descr_output) print() - print(" * _get_entity_frequencies", datetime.datetime.now()) + print(now(), " * get entity frequencies:") print() entity_frequencies = wp.get_all_frequencies(count_input=count_input) @@ -65,40 +73,41 @@ def create_kb( filtered_title_to_id[title] = entity print(len(title_to_id.keys()), "original titles") - print("kept", len(filtered_title_to_id.keys()), " with frequency", min_entity_freq) + kept_nr = len(filtered_title_to_id.keys()) + print("kept", kept_nr, "entities with min. 
frequency", min_entity_freq) print() - print(" * train entity encoder", datetime.datetime.now()) + print(now(), " * train entity encoder:") print() - encoder = EntityEncoder(nlp, INPUT_DIM, DESC_WIDTH) + encoder = EntityEncoder(nlp, input_dim, entity_vector_length) encoder.train(description_list=description_list, to_print=True) print() - print(" * get entity embeddings", datetime.datetime.now()) + print(now(), " * get entity embeddings:") print() embeddings = encoder.apply_encoder(description_list) - print() - print(" * adding", len(entity_list), "entities", datetime.datetime.now()) + print(now(), " * adding", len(entity_list), "entities") kb.set_entities( entity_list=entity_list, freq_list=frequency_list, vector_list=embeddings ) - print() - print(" * adding aliases", datetime.datetime.now()) - print() - _add_aliases( + alias_cnt = _add_aliases( kb, title_to_id=filtered_title_to_id, max_entities_per_alias=max_entities_per_alias, min_occ=min_occ, prior_prob_input=prior_prob_input, ) + print() + print(now(), " * adding", alias_cnt, "aliases") + print() print() - print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases()) + print("# of entities in kb:", kb.get_size_entities()) + print("# of aliases in kb:", kb.get_size_aliases()) - print("done with kb", datetime.datetime.now()) + print(now(), "Done with kb") return kb @@ -140,6 +149,7 @@ def get_id_to_description(entity_descr_output): def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_input): wp_titles = title_to_id.keys() + cnt = 0 # adding aliases with prior probabilities # we can read this file sequentially, it's sorted by alias, and then by count @@ -176,6 +186,7 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_in entities=selected_entities, probabilities=prior_probs, ) + cnt += 1 except ValueError as e: print(e) total_count = 0 @@ -190,3 +201,8 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_in previous_alias = new_alias line = prior_file.readline() + return cnt + + +def now(): + return datetime.datetime.now() diff --git a/bin/wiki_entity_linking/train_descriptions.py b/bin/wiki_entity_linking/train_descriptions.py index 6a4d046e5..0663296e4 100644 --- a/bin/wiki_entity_linking/train_descriptions.py +++ b/bin/wiki_entity_linking/train_descriptions.py @@ -18,15 +18,19 @@ class EntityEncoder: """ DROP = 0 - EPOCHS = 5 - STOP_THRESHOLD = 0.04 - BATCH_SIZE = 1000 - def __init__(self, nlp, input_dim, desc_width): + # Set min. 
acceptable loss to avoid a 'mean of empty slice' warning by numpy + MIN_LOSS = 0.01 + + # Reasonable default to stop training when things are not improving + MAX_NO_IMPROVEMENT = 20 + + def __init__(self, nlp, input_dim, desc_width, epochs=5): self.nlp = nlp self.input_dim = input_dim self.desc_width = desc_width + self.epochs = epochs def apply_encoder(self, description_list): if self.encoder is None: @@ -46,32 +50,41 @@ class EntityEncoder: start = start + batch_size stop = min(stop + batch_size, len(description_list)) + print("encoded:", stop, "entities") return encodings def train(self, description_list, to_print=False): processed, loss = self._train_model(description_list) if to_print: - print("Trained on", processed, "entities across", self.EPOCHS, "epochs") + print( + "Trained entity descriptions on", + processed, + "(non-unique) entities across", + self.epochs, + "epochs", + ) print("Final loss:", loss) def _train_model(self, description_list): - # TODO: when loss gets too low, a 'mean of empty slice' warning is thrown by numpy - + best_loss = 1.0 + iter_since_best = 0 self._build_network(self.input_dim, self.desc_width) processed = 0 loss = 1 - descriptions = description_list.copy() # copy this list so that shuffling does not affect other functions + # copy this list so that shuffling does not affect other functions + descriptions = description_list.copy() + to_continue = True - for i in range(self.EPOCHS): + for i in range(self.epochs): shuffle(descriptions) batch_nr = 0 start = 0 stop = min(self.BATCH_SIZE, len(descriptions)) - while loss > self.STOP_THRESHOLD and start < len(descriptions): + while to_continue and start < len(descriptions): batch = [] for descr in descriptions[start:stop]: doc = self.nlp(descr) @@ -79,9 +92,24 @@ class EntityEncoder: batch.append(doc_vector) loss = self._update(batch) - print(i, batch_nr, loss) + if batch_nr % 25 == 0: + print("loss:", loss) processed += len(batch) + # in general, continue training if we haven't reached our ideal min yet + to_continue = loss > self.MIN_LOSS + + # store the best loss and track how long it's been + if loss < best_loss: + best_loss = loss + iter_since_best = 0 + else: + iter_since_best += 1 + + # stop learning if we haven't seen improvement since the last few iterations + if iter_since_best > self.MAX_NO_IMPROVEMENT: + to_continue = False + batch_nr += 1 start = start + self.BATCH_SIZE stop = min(stop + self.BATCH_SIZE, len(descriptions)) @@ -103,14 +131,16 @@ class EntityEncoder: def _build_network(self, orig_width, hidden_with): with Model.define_operators({">>": chain}): # very simple encoder-decoder model - self.encoder = ( - Affine(hidden_with, orig_width) + self.encoder = Affine(hidden_with, orig_width) + self.model = self.encoder >> zero_init( + Affine(orig_width, hidden_with, drop_factor=0.0) ) - self.model = self.encoder >> zero_init(Affine(orig_width, hidden_with, drop_factor=0.0)) self.sgd = create_default_optimizer(self.model.ops) def _update(self, vectors): - predictions, bp_model = self.model.begin_update(np.asarray(vectors), drop=self.DROP) + predictions, bp_model = self.model.begin_update( + np.asarray(vectors), drop=self.DROP + ) loss, d_scores = self._get_loss(scores=predictions, golds=np.asarray(vectors)) bp_model(d_scores, sgd=self.sgd) return loss / len(vectors) diff --git a/bin/wiki_entity_linking/training_set_creator.py b/bin/wiki_entity_linking/training_set_creator.py index b090d7659..7f45d9435 100644 --- a/bin/wiki_entity_linking/training_set_creator.py +++ 
b/bin/wiki_entity_linking/training_set_creator.py @@ -21,9 +21,9 @@ def now(): return datetime.datetime.now() -def create_training(wikipedia_input, entity_def_input, training_output): +def create_training(wikipedia_input, entity_def_input, training_output, limit=None): wp_to_id = kb_creator.get_entity_to_id(entity_def_input) - _process_wikipedia_texts(wikipedia_input, wp_to_id, training_output, limit=None) + _process_wikipedia_texts(wikipedia_input, wp_to_id, training_output, limit=limit) def _process_wikipedia_texts(wikipedia_input, wp_to_id, training_output, limit=None): @@ -128,6 +128,7 @@ def _process_wikipedia_texts(wikipedia_input, wp_to_id, training_output, limit=N line = file.readline() cnt += 1 + print(now(), "processed", cnt, "lines of Wikipedia dump") text_regex = re.compile(r"(?<=).*(?= 0: + el_pipe.cfg["incl_context"] = True + el_pipe.cfg["incl_prior"] = True + dev_acc_context, _ = _measure_acc(dev_data, el_pipe) + losses["entity_linker"] = losses["entity_linker"] / batchnr + print( + "Epoch, train loss", + itn, + round(losses["entity_linker"], 2), + " / dev accuracy avg", + round(dev_acc_context, 3), + ) + + # STEP 6: measure the performance of our trained pipe on an independent dev set + print() + if len(dev_data): + print() + print(now(), "STEP 6: performance measurement of Entity Linking pipe") + print() + + counts, acc_r, acc_r_d, acc_p, acc_p_d, acc_o, acc_o_d = _measure_baselines( + dev_data, kb + ) + print("dev counts:", sorted(counts.items(), key=lambda x: x[0])) + + oracle_by_label = [(x, round(y, 3)) for x, y in acc_o_d.items()] + print("dev accuracy oracle:", round(acc_o, 3), oracle_by_label) + + random_by_label = [(x, round(y, 3)) for x, y in acc_r_d.items()] + print("dev accuracy random:", round(acc_r, 3), random_by_label) + + prior_by_label = [(x, round(y, 3)) for x, y in acc_p_d.items()] + print("dev accuracy prior:", round(acc_p, 3), prior_by_label) + + # using only context + el_pipe.cfg["incl_context"] = True + el_pipe.cfg["incl_prior"] = False + dev_acc_context, dev_acc_cont_d = _measure_acc(dev_data, el_pipe) + context_by_label = [(x, round(y, 3)) for x, y in dev_acc_cont_d.items()] + print("dev accuracy context:", round(dev_acc_context, 3), context_by_label) + + # measuring combined accuracy (prior + context) + el_pipe.cfg["incl_context"] = True + el_pipe.cfg["incl_prior"] = True + dev_acc_combo, dev_acc_combo_d = _measure_acc(dev_data, el_pipe) + combo_by_label = [(x, round(y, 3)) for x, y in dev_acc_combo_d.items()] + print("dev accuracy prior+context:", round(dev_acc_combo, 3), combo_by_label) + + # STEP 7: apply the EL pipe on a toy example + print() + print(now(), "STEP 7: applying Entity Linking to toy example") + print() + run_el_toy_example(nlp=nlp) + + # STEP 8: write the NLP pipeline (including entity linker) to file + if output_dir: + print() + nlp_loc = output_dir / "nlp" + print(now(), "STEP 8: Writing trained NLP to", nlp_loc) + nlp.to_disk(nlp_loc) + print() + + print() + print(now(), "Done!") + + +def _measure_acc(data, el_pipe=None, error_analysis=False): + # If the docs in the data require further processing with an entity linker, set el_pipe + correct_by_label = dict() + incorrect_by_label = dict() + + docs = [d for d, g in data if len(d) > 0] + if el_pipe is not None: + docs = list(el_pipe.pipe(docs)) + golds = [g for d, g in data if len(d) > 0] + + for doc, gold in zip(docs, golds): + try: + correct_entries_per_article = dict() + for entity, kb_dict in gold.links.items(): + start, end = entity + # only evaluating on positive examples + 
for gold_kb, value in kb_dict.items(): + if value: + offset = _offset(start, end) + correct_entries_per_article[offset] = gold_kb + + for ent in doc.ents: + ent_label = ent.label_ + pred_entity = ent.kb_id_ + start = ent.start_char + end = ent.end_char + offset = _offset(start, end) + gold_entity = correct_entries_per_article.get(offset, None) + # the gold annotations are not complete so we can't evaluate missing annotations as 'wrong' + if gold_entity is not None: + if gold_entity == pred_entity: + correct = correct_by_label.get(ent_label, 0) + correct_by_label[ent_label] = correct + 1 + else: + incorrect = incorrect_by_label.get(ent_label, 0) + incorrect_by_label[ent_label] = incorrect + 1 + if error_analysis: + print(ent.text, "in", doc) + print( + "Predicted", + pred_entity, + "should have been", + gold_entity, + ) + print() + + except Exception as e: + print("Error assessing accuracy", e) + + acc, acc_by_label = calculate_acc(correct_by_label, incorrect_by_label) + return acc, acc_by_label + + +def _measure_baselines(data, kb): + # Measure 3 performance baselines: random selection, prior probabilities, and 'oracle' prediction for upper bound + counts_d = dict() + + random_correct_d = dict() + random_incorrect_d = dict() + + oracle_correct_d = dict() + oracle_incorrect_d = dict() + + prior_correct_d = dict() + prior_incorrect_d = dict() + + docs = [d for d, g in data if len(d) > 0] + golds = [g for d, g in data if len(d) > 0] + + for doc, gold in zip(docs, golds): + try: + correct_entries_per_article = dict() + for entity, kb_dict in gold.links.items(): + start, end = entity + for gold_kb, value in kb_dict.items(): + # only evaluating on positive examples + if value: + offset = _offset(start, end) + correct_entries_per_article[offset] = gold_kb + + for ent in doc.ents: + label = ent.label_ + start = ent.start_char + end = ent.end_char + offset = _offset(start, end) + gold_entity = correct_entries_per_article.get(offset, None) + + # the gold annotations are not complete so we can't evaluate missing annotations as 'wrong' + if gold_entity is not None: + counts_d[label] = counts_d.get(label, 0) + 1 + candidates = kb.get_candidates(ent.text) + oracle_candidate = "" + best_candidate = "" + random_candidate = "" + if candidates: + scores = [] + + for c in candidates: + scores.append(c.prior_prob) + if c.entity_ == gold_entity: + oracle_candidate = c.entity_ + + best_index = scores.index(max(scores)) + best_candidate = candidates[best_index].entity_ + random_candidate = random.choice(candidates).entity_ + + if gold_entity == best_candidate: + prior_correct_d[label] = prior_correct_d.get(label, 0) + 1 + else: + prior_incorrect_d[label] = prior_incorrect_d.get(label, 0) + 1 + + if gold_entity == random_candidate: + random_correct_d[label] = random_correct_d.get(label, 0) + 1 + else: + random_incorrect_d[label] = random_incorrect_d.get(label, 0) + 1 + + if gold_entity == oracle_candidate: + oracle_correct_d[label] = oracle_correct_d.get(label, 0) + 1 + else: + oracle_incorrect_d[label] = oracle_incorrect_d.get(label, 0) + 1 + + except Exception as e: + print("Error assessing accuracy", e) + + acc_prior, acc_prior_d = calculate_acc(prior_correct_d, prior_incorrect_d) + acc_rand, acc_rand_d = calculate_acc(random_correct_d, random_incorrect_d) + acc_oracle, acc_oracle_d = calculate_acc(oracle_correct_d, oracle_incorrect_d) + + return ( + counts_d, + acc_rand, + acc_rand_d, + acc_prior, + acc_prior_d, + acc_oracle, + acc_oracle_d, + ) + + +def _offset(start, end): + return "{}_{}".format(start, 
end) + + +def calculate_acc(correct_by_label, incorrect_by_label): + acc_by_label = dict() + total_correct = 0 + total_incorrect = 0 + all_keys = set() + all_keys.update(correct_by_label.keys()) + all_keys.update(incorrect_by_label.keys()) + for label in sorted(all_keys): + correct = correct_by_label.get(label, 0) + incorrect = incorrect_by_label.get(label, 0) + total_correct += correct + total_incorrect += incorrect + if correct == incorrect == 0: + acc_by_label[label] = 0 + else: + acc_by_label[label] = correct / (correct + incorrect) + acc = 0 + if not (total_correct == total_incorrect == 0): + acc = total_correct / (total_correct + total_incorrect) + return acc, acc_by_label + + +def check_kb(kb): + for mention in ("Bush", "Douglas Adams", "Homer", "Brazil", "China"): + candidates = kb.get_candidates(mention) + + print("generating candidates for " + mention + " :") + for c in candidates: + print( + " ", + c.prior_prob, + c.alias_, + "-->", + c.entity_ + " (freq=" + str(c.entity_freq) + ")", + ) + print() + + +def run_el_toy_example(nlp): + text = ( + "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " + "Douglas reminds us to always bring our towel, even in China or Brazil. " + "The main character in Doug's novel is the man Arthur Dent, " + "but Dougledydoug doesn't write about George Washington or Homer Simpson." + ) + doc = nlp(text) + print(text) + for ent in doc.ents: + print(" ent", ent.text, ent.label_, ent.kb_id_) + print() + + +if __name__ == "__main__": + plac.call(main) diff --git a/bin/wiki_entity_linking/wikipedia_processor.py b/bin/wiki_entity_linking/wikipedia_processor.py index 80d75b013..fca600368 100644 --- a/bin/wiki_entity_linking/wikipedia_processor.py +++ b/bin/wiki_entity_linking/wikipedia_processor.py @@ -120,7 +120,7 @@ def now(): return datetime.datetime.now() -def read_prior_probs(wikipedia_input, prior_prob_output): +def read_prior_probs(wikipedia_input, prior_prob_output, limit=None): """ Read the XML wikipedia data and parse out intra-wiki links to estimate prior probabilities. The full file takes about 2h to parse 1100M lines. @@ -129,9 +129,9 @@ def read_prior_probs(wikipedia_input, prior_prob_output): with bz2.open(wikipedia_input, mode="rb") as file: line = file.readline() cnt = 0 - while line: - if cnt % 5000000 == 0: - print(now(), "processed", cnt, "lines of Wikipedia dump") + while line and (not limit or cnt < limit): + if cnt % 25000000 == 0: + print(now(), "processed", cnt, "lines of Wikipedia XML dump") clean_line = line.strip().decode("utf-8") aliases, entities, normalizations = get_wp_links(clean_line) @@ -141,6 +141,7 @@ def read_prior_probs(wikipedia_input, prior_prob_output): line = file.readline() cnt += 1 + print(now(), "processed", cnt, "lines of Wikipedia XML dump") # write all aliases and their entities and count occurrences to file with prior_prob_output.open("w", encoding="utf8") as outputfile: diff --git a/examples/pipeline/dummy_entity_linking.py b/examples/pipeline/dummy_entity_linking.py deleted file mode 100644 index 6dde616b8..000000000 --- a/examples/pipeline/dummy_entity_linking.py +++ /dev/null @@ -1,75 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -"""Demonstrate how to build a simple knowledge base and run an Entity Linking algorithm. 
-Currently still a bit of a dummy algorithm: taking simply the entity with highest probability for a given alias -""" -import spacy -from spacy.kb import KnowledgeBase - - -def create_kb(vocab): - kb = KnowledgeBase(vocab=vocab, entity_vector_length=1) - - # adding entities - entity_0 = "Q1004791_Douglas" - print("adding entity", entity_0) - kb.add_entity(entity=entity_0, freq=0.5, entity_vector=[0]) - - entity_1 = "Q42_Douglas_Adams" - print("adding entity", entity_1) - kb.add_entity(entity=entity_1, freq=0.5, entity_vector=[1]) - - entity_2 = "Q5301561_Douglas_Haig" - print("adding entity", entity_2) - kb.add_entity(entity=entity_2, freq=0.5, entity_vector=[2]) - - # adding aliases - print() - alias_0 = "Douglas" - print("adding alias", alias_0) - kb.add_alias(alias=alias_0, entities=[entity_0, entity_1, entity_2], probabilities=[0.6, 0.1, 0.2]) - - alias_1 = "Douglas Adams" - print("adding alias", alias_1) - kb.add_alias(alias=alias_1, entities=[entity_1], probabilities=[0.9]) - - print() - print("kb size:", len(kb), kb.get_size_entities(), kb.get_size_aliases()) - - return kb - - -def add_el(kb, nlp): - el_pipe = nlp.create_pipe(name='entity_linker', config={"context_width": 64}) - el_pipe.set_kb(kb) - nlp.add_pipe(el_pipe, last=True) - nlp.begin_training() - el_pipe.context_weight = 0 - el_pipe.prior_weight = 1 - - for alias in ["Douglas Adams", "Douglas"]: - candidates = nlp.linker.kb.get_candidates(alias) - print() - print(len(candidates), "candidate(s) for", alias, ":") - for c in candidates: - print(" ", c.entity_, c.prior_prob) - - text = "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " \ - "Douglas reminds us to always bring our towel. " \ - "The main character in Doug's novel is called Arthur Dent." - doc = nlp(text) - - print() - for token in doc: - print("token", token.text, token.ent_type_, token.ent_kb_id_) - - print() - for ent in doc.ents: - print("ent", ent.text, ent.label_, ent.kb_id_) - - -if __name__ == "__main__": - my_nlp = spacy.load('en_core_web_sm') - my_kb = create_kb(my_nlp.vocab) - add_el(my_kb, my_nlp) diff --git a/examples/pipeline/wikidata_entity_linking.py b/examples/pipeline/wikidata_entity_linking.py deleted file mode 100644 index 04e5bce6d..000000000 --- a/examples/pipeline/wikidata_entity_linking.py +++ /dev/null @@ -1,514 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import os -from os import path -import random -import datetime -from pathlib import Path - -from bin.wiki_entity_linking import wikipedia_processor as wp -from bin.wiki_entity_linking import training_set_creator, kb_creator -from bin.wiki_entity_linking.kb_creator import DESC_WIDTH - -import spacy -from spacy.kb import KnowledgeBase -from spacy.util import minibatch, compounding - -""" -Demonstrate how to build a knowledge base from WikiData and run an Entity Linking algorithm. 
-""" - -ROOT_DIR = Path("C:/Users/Sofie/Documents/data/") -OUTPUT_DIR = ROOT_DIR / "wikipedia" -TRAINING_DIR = OUTPUT_DIR / "training_data_nel" - -PRIOR_PROB = OUTPUT_DIR / "prior_prob.csv" -ENTITY_COUNTS = OUTPUT_DIR / "entity_freq.csv" -ENTITY_DEFS = OUTPUT_DIR / "entity_defs.csv" -ENTITY_DESCR = OUTPUT_DIR / "entity_descriptions.csv" - -KB_DIR = OUTPUT_DIR / "kb_1" -KB_FILE = "kb" -NLP_1_DIR = OUTPUT_DIR / "nlp_1" -NLP_2_DIR = OUTPUT_DIR / "nlp_2" - -# get latest-all.json.bz2 from https://dumps.wikimedia.org/wikidatawiki/entities/ -WIKIDATA_JSON = ROOT_DIR / "wikidata" / "wikidata-20190304-all.json.bz2" - -# get enwiki-latest-pages-articles-multistream.xml.bz2 from https://dumps.wikimedia.org/enwiki/latest/ -ENWIKI_DUMP = ( - ROOT_DIR / "wikipedia" / "enwiki-20190320-pages-articles-multistream.xml.bz2" -) - -# KB construction parameters -MAX_CANDIDATES = 10 -MIN_ENTITY_FREQ = 20 -MIN_PAIR_OCC = 5 - -# model training parameters -EPOCHS = 10 -DROPOUT = 0.5 -LEARN_RATE = 0.005 -L2 = 1e-6 -CONTEXT_WIDTH = 128 - - -def now(): - return datetime.datetime.now() - - -def run_pipeline(): - # set the appropriate booleans to define which parts of the pipeline should be re(run) - print("START", now()) - print() - nlp_1 = spacy.load("en_core_web_lg") - nlp_2 = None - kb_2 = None - - # one-time methods to create KB and write to file - to_create_prior_probs = False - to_create_entity_counts = False - to_create_kb = False - - # read KB back in from file - to_read_kb = True - to_test_kb = False - - # create training dataset - create_wp_training = False - - # train the EL pipe - train_pipe = True - measure_performance = True - - # test the EL pipe on a simple example - to_test_pipeline = True - - # write the NLP object, read back in and test again - to_write_nlp = True - to_read_nlp = True - test_from_file = False - - # STEP 1 : create prior probabilities from WP (run only once) - if to_create_prior_probs: - print("STEP 1: to_create_prior_probs", now()) - wp.read_prior_probs(ENWIKI_DUMP, PRIOR_PROB) - print() - - # STEP 2 : deduce entity frequencies from WP (run only once) - if to_create_entity_counts: - print("STEP 2: to_create_entity_counts", now()) - wp.write_entity_counts(PRIOR_PROB, ENTITY_COUNTS, to_print=False) - print() - - # STEP 3 : create KB and write to file (run only once) - if to_create_kb: - print("STEP 3a: to_create_kb", now()) - kb_1 = kb_creator.create_kb( - nlp=nlp_1, - max_entities_per_alias=MAX_CANDIDATES, - min_entity_freq=MIN_ENTITY_FREQ, - min_occ=MIN_PAIR_OCC, - entity_def_output=ENTITY_DEFS, - entity_descr_output=ENTITY_DESCR, - count_input=ENTITY_COUNTS, - prior_prob_input=PRIOR_PROB, - wikidata_input=WIKIDATA_JSON, - ) - print("kb entities:", kb_1.get_size_entities()) - print("kb aliases:", kb_1.get_size_aliases()) - print() - - print("STEP 3b: write KB and NLP", now()) - - if not path.exists(KB_DIR): - os.makedirs(KB_DIR) - kb_1.dump(KB_DIR / KB_FILE) - nlp_1.to_disk(NLP_1_DIR) - print() - - # STEP 4 : read KB back in from file - if to_read_kb: - print("STEP 4: to_read_kb", now()) - nlp_2 = spacy.load(NLP_1_DIR) - kb_2 = KnowledgeBase(vocab=nlp_2.vocab, entity_vector_length=DESC_WIDTH) - kb_2.load_bulk(KB_DIR / KB_FILE) - print("kb entities:", kb_2.get_size_entities()) - print("kb aliases:", kb_2.get_size_aliases()) - print() - - # test KB - if to_test_kb: - check_kb(kb_2) - print() - - # STEP 5: create a training dataset from WP - if create_wp_training: - print("STEP 5: create training dataset", now()) - training_set_creator.create_training( - wikipedia_input=ENWIKI_DUMP, - 
entity_def_input=ENTITY_DEFS, - training_output=TRAINING_DIR, - ) - - # STEP 6: create and train the entity linking pipe - if train_pipe: - print("STEP 6: training Entity Linking pipe", now()) - type_to_int = {label: i for i, label in enumerate(nlp_2.entity.labels)} - print(" -analysing", len(type_to_int), "different entity types") - el_pipe = nlp_2.create_pipe( - name="entity_linker", - config={ - "context_width": CONTEXT_WIDTH, - "pretrained_vectors": nlp_2.vocab.vectors.name, - "type_to_int": type_to_int, - }, - ) - el_pipe.set_kb(kb_2) - nlp_2.add_pipe(el_pipe, last=True) - - other_pipes = [pipe for pipe in nlp_2.pipe_names if pipe != "entity_linker"] - with nlp_2.disable_pipes(*other_pipes): # only train Entity Linking - optimizer = nlp_2.begin_training() - optimizer.learn_rate = LEARN_RATE - optimizer.L2 = L2 - - # define the size (nr of entities) of training and dev set - train_limit = 5000 - dev_limit = 5000 - - # for training, get pos & neg instances that correspond to entries in the kb - train_data = training_set_creator.read_training( - nlp=nlp_2, - training_dir=TRAINING_DIR, - dev=False, - limit=train_limit, - kb=el_pipe.kb, - ) - - print("Training on", len(train_data), "articles") - print() - - # for testing, get all pos instances, whether or not they are in the kb - dev_data = training_set_creator.read_training( - nlp=nlp_2, training_dir=TRAINING_DIR, dev=True, limit=dev_limit, kb=None - ) - - print("Dev testing on", len(dev_data), "articles") - print() - - if not train_data: - print("Did not find any training data") - else: - for itn in range(EPOCHS): - random.shuffle(train_data) - losses = {} - batches = minibatch(train_data, size=compounding(4.0, 128.0, 1.001)) - batchnr = 0 - - with nlp_2.disable_pipes(*other_pipes): - for batch in batches: - try: - docs, golds = zip(*batch) - nlp_2.update( - docs=docs, - golds=golds, - sgd=optimizer, - drop=DROPOUT, - losses=losses, - ) - batchnr += 1 - except Exception as e: - print("Error updating batch:", e) - - if batchnr > 0: - el_pipe.cfg["context_weight"] = 1 - el_pipe.cfg["prior_weight"] = 1 - dev_acc_context, _ = _measure_acc(dev_data, el_pipe) - losses["entity_linker"] = losses["entity_linker"] / batchnr - print( - "Epoch, train loss", - itn, - round(losses["entity_linker"], 2), - " / dev acc avg", - round(dev_acc_context, 3), - ) - - # STEP 7: measure the performance of our trained pipe on an independent dev set - if len(dev_data) and measure_performance: - print() - print("STEP 7: performance measurement of Entity Linking pipe", now()) - print() - - counts, acc_r, acc_r_d, acc_p, acc_p_d, acc_o, acc_o_d = _measure_baselines( - dev_data, kb_2 - ) - print("dev counts:", sorted(counts.items(), key=lambda x: x[0])) - - oracle_by_label = [(x, round(y, 3)) for x, y in acc_o_d.items()] - print("dev acc oracle:", round(acc_o, 3), oracle_by_label) - - random_by_label = [(x, round(y, 3)) for x, y in acc_r_d.items()] - print("dev acc random:", round(acc_r, 3), random_by_label) - - prior_by_label = [(x, round(y, 3)) for x, y in acc_p_d.items()] - print("dev acc prior:", round(acc_p, 3), prior_by_label) - - # using only context - el_pipe.cfg["context_weight"] = 1 - el_pipe.cfg["prior_weight"] = 0 - dev_acc_context, dev_acc_cont_d = _measure_acc(dev_data, el_pipe) - context_by_label = [(x, round(y, 3)) for x, y in dev_acc_cont_d.items()] - print("dev acc context avg:", round(dev_acc_context, 3), context_by_label) - - # measuring combined accuracy (prior + context) - el_pipe.cfg["context_weight"] = 1 - el_pipe.cfg["prior_weight"] = 1 - 
dev_acc_combo, dev_acc_combo_d = _measure_acc(dev_data, el_pipe) - combo_by_label = [(x, round(y, 3)) for x, y in dev_acc_combo_d.items()] - print("dev acc combo avg:", round(dev_acc_combo, 3), combo_by_label) - - # STEP 8: apply the EL pipe on a toy example - if to_test_pipeline: - print() - print("STEP 8: applying Entity Linking to toy example", now()) - print() - run_el_toy_example(nlp=nlp_2) - - # STEP 9: write the NLP pipeline (including entity linker) to file - if to_write_nlp: - print() - print("STEP 9: testing NLP IO", now()) - print() - print("writing to", NLP_2_DIR) - nlp_2.to_disk(NLP_2_DIR) - print() - - # verify that the IO has gone correctly - if to_read_nlp: - print("reading from", NLP_2_DIR) - nlp_3 = spacy.load(NLP_2_DIR) - - print("running toy example with NLP 3") - run_el_toy_example(nlp=nlp_3) - - # testing performance with an NLP model from file - if test_from_file: - nlp_2 = spacy.load(NLP_1_DIR) - nlp_3 = spacy.load(NLP_2_DIR) - el_pipe = nlp_3.get_pipe("entity_linker") - - dev_limit = 5000 - dev_data = training_set_creator.read_training( - nlp=nlp_2, training_dir=TRAINING_DIR, dev=True, limit=dev_limit, kb=None - ) - - print("Dev testing from file on", len(dev_data), "articles") - print() - - dev_acc_combo, dev_acc_combo_dict = _measure_acc(dev_data, el_pipe) - combo_by_label = [(x, round(y, 3)) for x, y in dev_acc_combo_dict.items()] - print("dev acc combo avg:", round(dev_acc_combo, 3), combo_by_label) - - print() - print("STOP", now()) - - -def _measure_acc(data, el_pipe=None, error_analysis=False): - # If the docs in the data require further processing with an entity linker, set el_pipe - correct_by_label = dict() - incorrect_by_label = dict() - - docs = [d for d, g in data if len(d) > 0] - if el_pipe is not None: - docs = list(el_pipe.pipe(docs)) - golds = [g for d, g in data if len(d) > 0] - - for doc, gold in zip(docs, golds): - try: - correct_entries_per_article = dict() - for entity, kb_dict in gold.links.items(): - start, end = entity - # only evaluating on positive examples - for gold_kb, value in kb_dict.items(): - if value: - offset = _offset(start, end) - correct_entries_per_article[offset] = gold_kb - - for ent in doc.ents: - ent_label = ent.label_ - pred_entity = ent.kb_id_ - start = ent.start_char - end = ent.end_char - offset = _offset(start, end) - gold_entity = correct_entries_per_article.get(offset, None) - # the gold annotations are not complete so we can't evaluate missing annotations as 'wrong' - if gold_entity is not None: - if gold_entity == pred_entity: - correct = correct_by_label.get(ent_label, 0) - correct_by_label[ent_label] = correct + 1 - else: - incorrect = incorrect_by_label.get(ent_label, 0) - incorrect_by_label[ent_label] = incorrect + 1 - if error_analysis: - print(ent.text, "in", doc) - print( - "Predicted", - pred_entity, - "should have been", - gold_entity, - ) - print() - - except Exception as e: - print("Error assessing accuracy", e) - - acc, acc_by_label = calculate_acc(correct_by_label, incorrect_by_label) - return acc, acc_by_label - - -def _measure_baselines(data, kb): - # Measure 3 performance baselines: random selection, prior probabilities, and 'oracle' prediction for upper bound - counts_d = dict() - - random_correct_d = dict() - random_incorrect_d = dict() - - oracle_correct_d = dict() - oracle_incorrect_d = dict() - - prior_correct_d = dict() - prior_incorrect_d = dict() - - docs = [d for d, g in data if len(d) > 0] - golds = [g for d, g in data if len(d) > 0] - - for doc, gold in zip(docs, golds): - try: - 
correct_entries_per_article = dict() - for entity, kb_dict in gold.links.items(): - start, end = entity - for gold_kb, value in kb_dict.items(): - # only evaluating on positive examples - if value: - offset = _offset(start, end) - correct_entries_per_article[offset] = gold_kb - - for ent in doc.ents: - label = ent.label_ - start = ent.start_char - end = ent.end_char - offset = _offset(start, end) - gold_entity = correct_entries_per_article.get(offset, None) - - # the gold annotations are not complete so we can't evaluate missing annotations as 'wrong' - if gold_entity is not None: - counts_d[label] = counts_d.get(label, 0) + 1 - candidates = kb.get_candidates(ent.text) - oracle_candidate = "" - best_candidate = "" - random_candidate = "" - if candidates: - scores = [] - - for c in candidates: - scores.append(c.prior_prob) - if c.entity_ == gold_entity: - oracle_candidate = c.entity_ - - best_index = scores.index(max(scores)) - best_candidate = candidates[best_index].entity_ - random_candidate = random.choice(candidates).entity_ - - if gold_entity == best_candidate: - prior_correct_d[label] = prior_correct_d.get(label, 0) + 1 - else: - prior_incorrect_d[label] = prior_incorrect_d.get(label, 0) + 1 - - if gold_entity == random_candidate: - random_correct_d[label] = random_correct_d.get(label, 0) + 1 - else: - random_incorrect_d[label] = random_incorrect_d.get(label, 0) + 1 - - if gold_entity == oracle_candidate: - oracle_correct_d[label] = oracle_correct_d.get(label, 0) + 1 - else: - oracle_incorrect_d[label] = oracle_incorrect_d.get(label, 0) + 1 - - except Exception as e: - print("Error assessing accuracy", e) - - acc_prior, acc_prior_d = calculate_acc(prior_correct_d, prior_incorrect_d) - acc_rand, acc_rand_d = calculate_acc(random_correct_d, random_incorrect_d) - acc_oracle, acc_oracle_d = calculate_acc(oracle_correct_d, oracle_incorrect_d) - - return ( - counts_d, - acc_rand, - acc_rand_d, - acc_prior, - acc_prior_d, - acc_oracle, - acc_oracle_d, - ) - - -def _offset(start, end): - return "{}_{}".format(start, end) - - -def calculate_acc(correct_by_label, incorrect_by_label): - acc_by_label = dict() - total_correct = 0 - total_incorrect = 0 - all_keys = set() - all_keys.update(correct_by_label.keys()) - all_keys.update(incorrect_by_label.keys()) - for label in sorted(all_keys): - correct = correct_by_label.get(label, 0) - incorrect = incorrect_by_label.get(label, 0) - total_correct += correct - total_incorrect += incorrect - if correct == incorrect == 0: - acc_by_label[label] = 0 - else: - acc_by_label[label] = correct / (correct + incorrect) - acc = 0 - if not (total_correct == total_incorrect == 0): - acc = total_correct / (total_correct + total_incorrect) - return acc, acc_by_label - - -def check_kb(kb): - for mention in ("Bush", "Douglas Adams", "Homer", "Brazil", "China"): - candidates = kb.get_candidates(mention) - - print("generating candidates for " + mention + " :") - for c in candidates: - print( - " ", - c.prior_prob, - c.alias_, - "-->", - c.entity_ + " (freq=" + str(c.entity_freq) + ")", - ) - print() - - -def run_el_toy_example(nlp): - text = ( - "In The Hitchhiker's Guide to the Galaxy, written by Douglas Adams, " - "Douglas reminds us to always bring our towel, even in China or Brazil. " - "The main character in Doug's novel is the man Arthur Dent, " - "but Dougledydoug doesn't write about George Washington or Homer Simpson." 
- ) - doc = nlp(text) - print(text) - for ent in doc.ents: - print(" ent", ent.text, ent.label_, ent.kb_id_) - print() - - -if __name__ == "__main__": - run_pipeline() diff --git a/examples/training/pretrain_kb.py b/examples/training/pretrain_kb.py new file mode 100644 index 000000000..67cc1587d --- /dev/null +++ b/examples/training/pretrain_kb.py @@ -0,0 +1,139 @@ +#!/usr/bin/env python +# coding: utf8 + +"""Example of defining and (pre)training spaCy's knowledge base, +which is needed to implement entity linking functionality. + +For more details, see the documentation: +* Knowledge base: https://spacy.io/api/kb +* Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking + +Compatible with: spaCy vX.X +Last tested with: vX.X +""" +from __future__ import unicode_literals, print_function + +import plac +from pathlib import Path + +from spacy.vocab import Vocab + +import spacy +from spacy.kb import KnowledgeBase + +from bin.wiki_entity_linking.train_descriptions import EntityEncoder +from spacy import Errors + + +# Q2146908 (Russ Cochran): American golfer +# Q7381115 (Russ Cochran): publisher +ENTITIES = {"Q2146908": ("American golfer", 342), "Q7381115": ("publisher", 17)} + +INPUT_DIM = 300 # dimension of pre-trained input vectors +DESC_WIDTH = 64 # dimension of output entity vectors + + +@plac.annotations( + vocab_path=("Path to the vocab for the kb", "option", "v", Path), + model=("Model name, should have pretrained word embeddings", "option", "m", str), + output_dir=("Optional output directory", "option", "o", Path), + n_iter=("Number of training iterations", "option", "n", int), +) +def main(vocab_path=None, model=None, output_dir=None, n_iter=50): + """Load the model, create the KB and pretrain the entity encodings. + Either an nlp model or a vocab is needed to provide access to pre-trained word embeddings. + If an output_dir is provided, the KB will be stored there in a file 'kb'. 
+ When providing an nlp model, the updated vocab will also be written to a directory in the output_dir.""" + if model is None and vocab_path is None: + raise ValueError(Errors.E154) + + if model is not None: + nlp = spacy.load(model) # load existing spaCy model + print("Loaded model '%s'" % model) + else: + vocab = Vocab().from_disk(vocab_path) + # create blank Language class with specified vocab + nlp = spacy.blank("en", vocab=vocab) + print("Created blank 'en' model with vocab from '%s'" % vocab_path) + + kb = KnowledgeBase(vocab=nlp.vocab) + + # set up the data + entity_ids = [] + descriptions = [] + freqs = [] + for key, value in ENTITIES.items(): + desc, freq = value + entity_ids.append(key) + descriptions.append(desc) + freqs.append(freq) + + # training entity description encodings + # this part can easily be replaced with a custom entity encoder + encoder = EntityEncoder( + nlp=nlp, + input_dim=INPUT_DIM, + desc_width=DESC_WIDTH, + epochs=n_iter, + threshold=0.001, + ) + encoder.train(description_list=descriptions, to_print=True) + + # get the pretrained entity vectors + embeddings = encoder.apply_encoder(descriptions) + + # set the entities, can also be done by calling `kb.add_entity` for each entity + kb.set_entities(entity_list=entity_ids, freq_list=freqs, vector_list=embeddings) + + # adding aliases, the entities need to be defined in the KB beforehand + kb.add_alias( + alias="Russ Cochran", + entities=["Q2146908", "Q7381115"], + probabilities=[0.24, 0.7], # the sum of these probabilities should not exceed 1 + ) + + # test the trained model + print() + _print_kb(kb) + + # save model to output directory + if output_dir is not None: + output_dir = Path(output_dir) + if not output_dir.exists(): + output_dir.mkdir() + kb_path = str(output_dir / "kb") + kb.dump(kb_path) + print() + print("Saved KB to", kb_path) + + # only storing the vocab if we weren't already reading it from file + if not vocab_path: + vocab_path = output_dir / "vocab" + kb.vocab.to_disk(vocab_path) + print("Saved vocab to", vocab_path) + + print() + + # test the saved model + # always reload a knowledge base with the same vocab instance! + print("Loading vocab from", vocab_path) + print("Loading KB from", kb_path) + vocab2 = Vocab().from_disk(vocab_path) + kb2 = KnowledgeBase(vocab=vocab2) + kb2.load_bulk(kb_path) + _print_kb(kb2) + print() + + +def _print_kb(kb): + print(kb.get_size_entities(), "kb entities:", kb.get_entity_strings()) + print(kb.get_size_aliases(), "kb aliases:", kb.get_alias_strings()) + + +if __name__ == "__main__": + plac.call(main) + + # Expected output: + + # 2 kb entities: ['Q2146908', 'Q7381115'] + # 1 kb aliases: ['Russ Cochran'] diff --git a/examples/training/train_entity_linker.py b/examples/training/train_entity_linker.py new file mode 100644 index 000000000..f925672f1 --- /dev/null +++ b/examples/training/train_entity_linker.py @@ -0,0 +1,173 @@ +#!/usr/bin/env python +# coding: utf8 + +"""Example of training spaCy's entity linker, starting off with an +existing model and a pre-defined knowledge base. 
+ +For more details, see the documentation: +* Training: https://spacy.io/usage/training +* Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking + +Compatible with: spaCy vX.X +Last tested with: vX.X +""" +from __future__ import unicode_literals, print_function + +import plac +import random +from pathlib import Path + +from spacy.symbols import PERSON +from spacy.vocab import Vocab + +import spacy +from spacy.kb import KnowledgeBase + +from spacy import Errors +from spacy.tokens import Span +from spacy.util import minibatch, compounding + + +def sample_train_data(): + train_data = [] + + # Q2146908 (Russ Cochran): American golfer + # Q7381115 (Russ Cochran): publisher + + text_1 = "Russ Cochran his reprints include EC Comics." + dict_1 = {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}} + train_data.append((text_1, {"links": dict_1})) + + text_2 = "Russ Cochran has been publishing comic art." + dict_2 = {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}} + train_data.append((text_2, {"links": dict_2})) + + text_3 = "Russ Cochran captured his first major title with his son as caddie." + dict_3 = {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}} + train_data.append((text_3, {"links": dict_3})) + + text_4 = "Russ Cochran was a member of University of Kentucky's golf team." + dict_4 = {(0, 12): {"Q7381115": 0.0, "Q2146908": 1.0}} + train_data.append((text_4, {"links": dict_4})) + + return train_data + + +# training data +TRAIN_DATA = sample_train_data() + + +@plac.annotations( + kb_path=("Path to the knowledge base", "positional", None, Path), + vocab_path=("Path to the vocab for the kb", "positional", None, Path), + output_dir=("Optional output directory", "option", "o", Path), + n_iter=("Number of training iterations", "option", "n", int), +) +def main(kb_path, vocab_path=None, output_dir=None, n_iter=50): + """Create a blank model with the specified vocab, set up the pipeline and train the entity linker. + The `vocab` should be the one used during creation of the KB.""" + vocab = Vocab().from_disk(vocab_path) + # create blank Language class with correct vocab + nlp = spacy.blank("en", vocab=vocab) + nlp.vocab.vectors.name = "spacy_pretrained_vectors" + print("Created blank 'en' model with vocab from '%s'" % vocab_path) + + # create the built-in pipeline components and add them to the pipeline + # nlp.create_pipe works for built-ins that are registered with spaCy + if "entity_linker" not in nlp.pipe_names: + entity_linker = nlp.create_pipe("entity_linker") + kb = KnowledgeBase(vocab=nlp.vocab) + kb.load_bulk(kb_path) + print("Loaded Knowledge Base from '%s'" % kb_path) + entity_linker.set_kb(kb) + nlp.add_pipe(entity_linker, last=True) + else: + entity_linker = nlp.get_pipe("entity_linker") + kb = entity_linker.kb + + # make sure the annotated examples correspond to known identifiers in the knowlege base + kb_ids = kb.get_entity_strings() + for text, annotation in TRAIN_DATA: + for offset, kb_id_dict in annotation["links"].items(): + new_dict = {} + for kb_id, value in kb_id_dict.items(): + if kb_id in kb_ids: + new_dict[kb_id] = value + else: + print( + "Removed", kb_id, "from training because it is not in the KB." 
+ ) + annotation["links"][offset] = new_dict + + # get names of other pipes to disable them during training + other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"] + with nlp.disable_pipes(*other_pipes): # only train entity linker + # reset and initialize the weights randomly + optimizer = nlp.begin_training() + for itn in range(n_iter): + random.shuffle(TRAIN_DATA) + losses = {} + # batch up the examples using spaCy's minibatch + batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)) + for batch in batches: + texts, annotations = zip(*batch) + nlp.update( + texts, # batch of texts + annotations, # batch of annotations + drop=0.2, # dropout - make it harder to memorise data + losses=losses, + sgd=optimizer, + ) + print(itn, "Losses", losses) + + # test the trained model + _apply_model(nlp) + + # save model to output directory + if output_dir is not None: + output_dir = Path(output_dir) + if not output_dir.exists(): + output_dir.mkdir() + nlp.to_disk(output_dir) + print() + print("Saved model to", output_dir) + + # test the saved model + print("Loading from", output_dir) + nlp2 = spacy.load(output_dir) + _apply_model(nlp2) + + +def _apply_model(nlp): + for text, annotation in TRAIN_DATA: + doc = nlp.tokenizer(text) + + # set entities so the evaluation is independent of the NER step + # all the examples contain 'Russ Cochran' as the first two tokens in the sentence + rc_ent = Span(doc, 0, 2, label=PERSON) + doc.ents = [rc_ent] + + # apply the entity linker which will now make predictions for the 'Russ Cochran' entities + doc = nlp.get_pipe("entity_linker")(doc) + + print() + print("Entities", [(ent.text, ent.label_, ent.kb_id_) for ent in doc.ents]) + print("Tokens", [(t.text, t.ent_type_, t.ent_kb_id_) for t in doc]) + + +if __name__ == "__main__": + plac.call(main) + + # Expected output (can be shuffled): + + # Entities[('Russ Cochran', 'PERSON', 'Q7381115')] + # Tokens[('Russ', 'PERSON', 'Q7381115'), ('Cochran', 'PERSON', 'Q7381115'), ("his", '', ''), ('reprints', '', ''), ('include', '', ''), ('The', '', ''), ('Complete', '', ''), ('EC', '', ''), ('Library', '', ''), ('.', '', '')] + + # Entities[('Russ Cochran', 'PERSON', 'Q7381115')] + # Tokens[('Russ', 'PERSON', 'Q7381115'), ('Cochran', 'PERSON', 'Q7381115'), ('has', '', ''), ('been', '', ''), ('publishing', '', ''), ('comic', '', ''), ('art', '', ''), ('.', '', '')] + + # Entities[('Russ Cochran', 'PERSON', 'Q2146908')] + # Tokens[('Russ', 'PERSON', 'Q2146908'), ('Cochran', 'PERSON', 'Q2146908'), ('captured', '', ''), ('his', '', ''), ('first', '', ''), ('major', '', ''), ('title', '', ''), ('with', '', ''), ('his', '', ''), ('son', '', ''), ('as', '', ''), ('caddie', '', ''), ('.', '', '')] + + # Entities[('Russ Cochran', 'PERSON', 'Q2146908')] + # Tokens[('Russ', 'PERSON', 'Q2146908'), ('Cochran', 'PERSON', 'Q2146908'), ('was', '', ''), ('a', '', ''), ('member', '', ''), ('of', '', ''), ('University', '', ''), ('of', '', ''), ('Kentucky', '', ''), ("'s", '', ''), ('golf', '', ''), ('team', '', ''), ('.', '', '')] diff --git a/spacy/_ml.py b/spacy/_ml.py index dedd1bee5..1e8c0f27b 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -665,25 +665,15 @@ def build_simple_cnn_text_classifier(tok2vec, nr_class, exclusive_classes=False, def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg): if "entity_width" not in cfg: raise ValueError(Errors.E144.format(param="entity_width")) - if "context_width" not in cfg: - raise ValueError(Errors.E144.format(param="context_width")) conv_depth = 
cfg.get("conv_depth", 2) cnn_maxout_pieces = cfg.get("cnn_maxout_pieces", 3) pretrained_vectors = cfg.get("pretrained_vectors", None) - context_width = cfg.get("context_width") - entity_width = cfg.get("entity_width") + context_width = cfg.get("entity_width") with Model.define_operators({">>": chain, "**": clone}): - model = ( - Affine(entity_width, entity_width + context_width + 1 + ner_types) - >> Affine(1, entity_width, drop_factor=0.0) - >> logistic - ) - # context encoder - tok2vec = ( - Tok2Vec( + tok2vec = Tok2Vec( width=hidden_width, embed_size=embed_width, pretrained_vectors=pretrained_vectors, @@ -692,17 +682,17 @@ def build_nel_encoder(embed_width, hidden_width, ner_types, **cfg): conv_depth=conv_depth, bilstm_depth=0, ) + + model = ( + tok2vec >> flatten_add_lengths >> Pooling(mean_pool) >> Residual(zero_init(Maxout(hidden_width, hidden_width))) - >> zero_init(Affine(context_width, hidden_width)) + >> zero_init(Affine(context_width, hidden_width, drop_factor=0.0)) ) model.tok2vec = tok2vec - - model.tok2vec = tok2vec - model.tok2vec.nO = context_width - model.nO = 1 + model.nO = context_width return model diff --git a/spacy/errors.py b/spacy/errors.py index 79b9cbdf4..25a170bdb 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -124,7 +124,8 @@ class Errors(object): E016 = ("MultitaskObjective target should be function or one of: dep, " "tag, ent, dep_tag_offset, ent_tag.") E017 = ("Can only add unicode or bytes. Got type: {value_type}") - E018 = ("Can't retrieve string for hash '{hash_value}'.") + E018 = ("Can't retrieve string for hash '{hash_value}'. This usually refers " + "to an issue with the `Vocab` or `StringStore`.") E019 = ("Can't create transition with unknown action ID: {action}. Action " "IDs are enumerated in spacy/syntax/{src}.pyx.") E020 = ("Could not find a gold-standard action to supervise the " @@ -420,7 +421,12 @@ class Errors(object): E151 = ("Trying to call nlp.update without required annotation types. " "Expected top-level keys: {expected_keys}." " Got: {unexpected_keys}.") - + E152 = ("The `nlp` object should have a pre-trained `ner` component.") + E153 = ("Either provide a path to a preprocessed training directory, " + "or to the original Wikipedia XML dump.") + E154 = ("Either the `nlp` model or the `vocab` should be specified.") + E155 = ("The `nlp` object should have access to pre-trained word vectors, cf. " + "https://spacy.io/usage/models#languages.") @add_codes class TempErrors(object): diff --git a/spacy/kb.pyx b/spacy/kb.pyx index 28e762653..176ac17de 100644 --- a/spacy/kb.pyx +++ b/spacy/kb.pyx @@ -19,6 +19,13 @@ from libcpp.vector cimport vector cdef class Candidate: + """A `Candidate` object refers to a textual mention (`alias`) that may or may not be resolved + to a specific `entity` from a Knowledge Base. This will be used as input for the entity linking + algorithm which will disambiguate the various candidates to the correct one. + Each candidate (alias, entity) pair is assigned to a certain prior probability. + + DOCS: https://spacy.io/api/candidate + """ def __init__(self, KnowledgeBase kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob): self.kb = kb @@ -62,8 +69,13 @@ cdef class Candidate: cdef class KnowledgeBase: + """A `KnowledgeBase` instance stores unique identifiers for entities and their textual aliases, + to support entity linking of named entities to real-world concepts. 
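Each `Candidate` returned for an alias carries a prior probability, and the reworked `EntityLinker.predict` later in this patch combines that prior with the cosine similarity between the candidate's entity vector and the context encoding via `prior + sim - prior * sim`. A toy numeric sketch of that combination, using plain NumPy and made-up values in place of the `thinc` ops used in the actual code:

    import numpy as np

    prior = np.array([0.24, 0.70])            # per-candidate prior probabilities from the KB
    context = np.array([0.5, 0.5, 0.7])       # context encoding produced by the model (toy values)
    entity_vectors = np.array([[1.0, 0.0, 0.0],
                               [0.4, 0.4, 0.8]])

    # cosine similarity between each entity vector and the context encoding
    sims = entity_vectors @ context / (np.linalg.norm(context) * np.linalg.norm(entity_vectors, axis=1))

    # a score that is high when either the prior or the context similarity is high
    scores = prior + sims - prior * sims
    print(scores, scores.argmax())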
- def __init__(self, Vocab vocab, entity_vector_length): + DOCS: https://spacy.io/api/kb + """ + + def __init__(self, Vocab vocab, entity_vector_length=64): self.vocab = vocab self.mem = Pool() self.entity_vector_length = entity_vector_length diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 375a0884b..db45e0faa 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -14,6 +14,8 @@ from thinc.neural.util import to_categorical from thinc.neural.util import get_array_module from spacy.kb import KnowledgeBase + +from spacy.cli.pretrain import get_cossim_loss from .functions import merge_subtokens from ..tokens.doc cimport Doc from ..syntax.nn_parser cimport Parser @@ -1102,7 +1104,7 @@ cdef class EntityRecognizer(Parser): class EntityLinker(Pipe): """Pipeline component for named entity linking. - DOCS: TODO + DOCS: https://spacy.io/api/entitylinker """ name = 'entity_linker' NIL = "NIL" # string used to refer to a non-existing link @@ -1121,9 +1123,6 @@ class EntityLinker(Pipe): self.model = True self.kb = None self.cfg = dict(cfg) - self.sgd_context = None - if not self.cfg.get("context_width"): - self.cfg["context_width"] = 128 def set_kb(self, kb): self.kb = kb @@ -1144,7 +1143,6 @@ class EntityLinker(Pipe): if self.model is True: self.model = self.Model(**self.cfg) - self.sgd_context = self.create_optimizer() if sgd is None: sgd = self.create_optimizer() @@ -1170,12 +1168,6 @@ class EntityLinker(Pipe): golds = [golds] context_docs = [] - entity_encodings = [] - - priors = [] - type_vectors = [] - - type_to_int = self.cfg.get("type_to_int", dict()) for doc, gold in zip(docs, golds): ents_by_offset = dict() @@ -1184,49 +1176,38 @@ class EntityLinker(Pipe): for entity, kb_dict in gold.links.items(): start, end = entity mention = doc.text[start:end] + for kb_id, value in kb_dict.items(): - entity_encoding = self.kb.get_vector(kb_id) - prior_prob = self.kb.get_prior_prob(kb_id, mention) + # Currently only training on the positive instances + if value: + context_docs.append(doc) - gold_ent = ents_by_offset["{}_{}".format(start, end)] - if gold_ent is None: - raise RuntimeError(Errors.E147.format(method="update", msg="gold entity not found")) + context_encodings, bp_context = self.model.begin_update(context_docs, drop=drop) + loss, d_scores = self.get_similarity_loss(scores=context_encodings, golds=golds, docs=None) + bp_context(d_scores, sgd=sgd) - type_vector = [0 for i in range(len(type_to_int))] - if len(type_to_int) > 0: - type_vector[type_to_int[gold_ent.label_]] = 1 + if losses is not None: + losses[self.name] += loss + return loss - # store data - entity_encodings.append(entity_encoding) - context_docs.append(doc) - type_vectors.append(type_vector) + def get_similarity_loss(self, docs, golds, scores): + entity_encodings = [] + for gold in golds: + for entity, kb_dict in gold.links.items(): + for kb_id, value in kb_dict.items(): + # this loss function assumes we're only using positive examples + if value: + entity_encoding = self.kb.get_vector(kb_id) + entity_encodings.append(entity_encoding) - if self.cfg.get("prior_weight", 1) > 0: - priors.append([prior_prob]) - else: - priors.append([0]) + entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32") - if len(entity_encodings) > 0: - if not (len(priors) == len(entity_encodings) == len(context_docs) == len(type_vectors)): - raise RuntimeError(Errors.E147.format(method="update", msg="vector lengths not equal")) + if scores.shape != entity_encodings.shape: + raise 
RuntimeError(Errors.E147.format(method="get_loss", msg="gold entities do not match up")) - entity_encodings = self.model.ops.asarray(entity_encodings, dtype="float32") - - context_encodings, bp_context = self.model.tok2vec.begin_update(context_docs, drop=drop) - mention_encodings = [list(context_encodings[i]) + list(entity_encodings[i]) + priors[i] + type_vectors[i] - for i in range(len(entity_encodings))] - pred, bp_mention = self.model.begin_update(self.model.ops.asarray(mention_encodings, dtype="float32"), drop=drop) - - loss, d_scores = self.get_loss(scores=pred, golds=golds, docs=docs) - mention_gradient = bp_mention(d_scores, sgd=sgd) - - context_gradients = [list(x[0:self.cfg.get("context_width")]) for x in mention_gradient] - bp_context(self.model.ops.asarray(context_gradients, dtype="float32"), sgd=self.sgd_context) - - if losses is not None: - losses[self.name] += loss - return loss - return 0 + loss, gradients = get_cossim_loss(yh=scores, y=entity_encodings) + loss = loss / len(entity_encodings) + return loss, gradients def get_loss(self, docs, golds, scores): cats = [] @@ -1271,20 +1252,17 @@ class EntityLinker(Pipe): if isinstance(docs, Doc): docs = [docs] - context_encodings = self.model.tok2vec(docs) + context_encodings = self.model(docs) xp = get_array_module(context_encodings) - type_to_int = self.cfg.get("type_to_int", dict()) - for i, doc in enumerate(docs): if len(doc) > 0: # currently, the context is the same for each entity in a sentence (should be refined) context_encoding = context_encodings[i] + context_enc_t = context_encoding.T + norm_1 = xp.linalg.norm(context_enc_t) for ent in doc.ents: entity_count += 1 - type_vector = [0 for i in range(len(type_to_int))] - if len(type_to_int) > 0: - type_vector[type_to_int[ent.label_]] = 1 candidates = self.kb.get_candidates(ent.text) if not candidates: @@ -1293,20 +1271,23 @@ class EntityLinker(Pipe): else: random.shuffle(candidates) - # this will set the prior probabilities to 0 (just like in training) if their weight is 0 - prior_probs = xp.asarray([[c.prior_prob] for c in candidates]) - prior_probs *= self.cfg.get("prior_weight", 1) + # this will set all prior probabilities to 0 if they should be excluded from the model + prior_probs = xp.asarray([c.prior_prob for c in candidates]) + if not self.cfg.get("incl_prior", True): + prior_probs = xp.asarray([[0.0] for c in candidates]) scores = prior_probs - if self.cfg.get("context_weight", 1) > 0: + # add in similarity from the context + if self.cfg.get("incl_context", True): entity_encodings = xp.asarray([c.entity_vector for c in candidates]) + norm_2 = xp.linalg.norm(entity_encodings, axis=1) + if len(entity_encodings) != len(prior_probs): raise RuntimeError(Errors.E147.format(method="predict", msg="vectors not of equal length")) - mention_encodings = [list(context_encoding) + list(entity_encodings[i]) - + list(prior_probs[i]) + type_vector - for i in range(len(entity_encodings))] - scores = self.model(self.model.ops.asarray(mention_encodings, dtype="float32")) + # cosine similarity + sims = xp.dot(entity_encodings, context_enc_t) / (norm_1 * norm_2) + scores = prior_probs + sims - (prior_probs*sims) # TODO: thresholding best_index = scores.argmax() @@ -1346,7 +1327,7 @@ class EntityLinker(Pipe): def load_model(p): if self.model is True: self.model = self.Model(**self.cfg) - try: + try: self.model.from_bytes(p.open("rb").read()) except AttributeError: raise ValueError(Errors.E149) diff --git a/spacy/tests/pipeline/test_entity_linker.py 
b/spacy/tests/pipeline/test_entity_linker.py index ca6bf2b6c..b10d55267 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -23,9 +23,9 @@ def test_kb_valid_entities(nlp): mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3) # adding entities - mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[8, 4, 3]) - mykb.add_entity(entity="Q2", freq=0.5, entity_vector=[2, 1, 0]) - mykb.add_entity(entity="Q3", freq=0.5, entity_vector=[-1, -6, 5]) + mykb.add_entity(entity="Q1", freq=19, entity_vector=[8, 4, 3]) + mykb.add_entity(entity="Q2", freq=5, entity_vector=[2, 1, 0]) + mykb.add_entity(entity="Q3", freq=25, entity_vector=[-1, -6, 5]) # adding aliases mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.2]) @@ -52,9 +52,9 @@ def test_kb_invalid_entities(nlp): mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities - mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[1]) - mykb.add_entity(entity="Q2", freq=0.2, entity_vector=[2]) - mykb.add_entity(entity="Q3", freq=0.5, entity_vector=[3]) + mykb.add_entity(entity="Q1", freq=19, entity_vector=[1]) + mykb.add_entity(entity="Q2", freq=5, entity_vector=[2]) + mykb.add_entity(entity="Q3", freq=25, entity_vector=[3]) # adding aliases - should fail because one of the given IDs is not valid with pytest.raises(ValueError): @@ -68,9 +68,9 @@ def test_kb_invalid_probabilities(nlp): mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities - mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[1]) - mykb.add_entity(entity="Q2", freq=0.2, entity_vector=[2]) - mykb.add_entity(entity="Q3", freq=0.5, entity_vector=[3]) + mykb.add_entity(entity="Q1", freq=19, entity_vector=[1]) + mykb.add_entity(entity="Q2", freq=5, entity_vector=[2]) + mykb.add_entity(entity="Q3", freq=25, entity_vector=[3]) # adding aliases - should fail because the sum of the probabilities exceeds 1 with pytest.raises(ValueError): @@ -82,9 +82,9 @@ def test_kb_invalid_combination(nlp): mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities - mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[1]) - mykb.add_entity(entity="Q2", freq=0.2, entity_vector=[2]) - mykb.add_entity(entity="Q3", freq=0.5, entity_vector=[3]) + mykb.add_entity(entity="Q1", freq=19, entity_vector=[1]) + mykb.add_entity(entity="Q2", freq=5, entity_vector=[2]) + mykb.add_entity(entity="Q3", freq=25, entity_vector=[3]) # adding aliases - should fail because the entities and probabilities vectors are not of equal length with pytest.raises(ValueError): @@ -98,11 +98,11 @@ def test_kb_invalid_entity_vector(nlp): mykb = KnowledgeBase(nlp.vocab, entity_vector_length=3) # adding entities - mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[1, 2, 3]) + mykb.add_entity(entity="Q1", freq=19, entity_vector=[1, 2, 3]) # this should fail because the kb's expected entity vector length is 3 with pytest.raises(ValueError): - mykb.add_entity(entity="Q2", freq=0.2, entity_vector=[2]) + mykb.add_entity(entity="Q2", freq=5, entity_vector=[2]) def test_candidate_generation(nlp): @@ -110,9 +110,9 @@ def test_candidate_generation(nlp): mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities - mykb.add_entity(entity="Q1", freq=0.7, entity_vector=[1]) - mykb.add_entity(entity="Q2", freq=0.2, entity_vector=[2]) - mykb.add_entity(entity="Q3", freq=0.5, entity_vector=[3]) + mykb.add_entity(entity="Q1", freq=27, entity_vector=[1]) + mykb.add_entity(entity="Q2", freq=12, entity_vector=[2]) + 
mykb.add_entity(entity="Q3", freq=5, entity_vector=[3]) # adding aliases mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.1]) @@ -126,7 +126,7 @@ def test_candidate_generation(nlp): # test the content of the candidates assert mykb.get_candidates("adam")[0].entity_ == "Q2" assert mykb.get_candidates("adam")[0].alias_ == "adam" - assert_almost_equal(mykb.get_candidates("adam")[0].entity_freq, 0.2) + assert_almost_equal(mykb.get_candidates("adam")[0].entity_freq, 12) assert_almost_equal(mykb.get_candidates("adam")[0].prior_prob, 0.9) @@ -135,8 +135,8 @@ def test_preserving_links_asdoc(nlp): mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1) # adding entities - mykb.add_entity(entity="Q1", freq=0.9, entity_vector=[1]) - mykb.add_entity(entity="Q2", freq=0.8, entity_vector=[1]) + mykb.add_entity(entity="Q1", freq=19, entity_vector=[1]) + mykb.add_entity(entity="Q2", freq=8, entity_vector=[1]) # adding aliases mykb.add_alias(alias="Boston", entities=["Q1"], probabilities=[0.7]) @@ -154,11 +154,11 @@ def test_preserving_links_asdoc(nlp): ruler.add_patterns(patterns) nlp.add_pipe(ruler) - el_pipe = nlp.create_pipe(name="entity_linker", config={"context_width": 64}) + el_pipe = nlp.create_pipe(name="entity_linker") el_pipe.set_kb(mykb) el_pipe.begin_training() - el_pipe.context_weight = 0 - el_pipe.prior_weight = 1 + el_pipe.incl_context = False + el_pipe.incl_prior = True nlp.add_pipe(el_pipe, last=True) # test whether the entity links are preserved by the `as_doc()` function diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py index 1752abda2..e817e8e12 100644 --- a/spacy/tests/serialize/test_serialize_kb.py +++ b/spacy/tests/serialize/test_serialize_kb.py @@ -30,10 +30,10 @@ def test_serialize_kb_disk(en_vocab): def _get_dummy_kb(vocab): kb = KnowledgeBase(vocab=vocab, entity_vector_length=3) - kb.add_entity(entity='Q53', freq=0.33, entity_vector=[0, 5, 3]) - kb.add_entity(entity='Q17', freq=0.2, entity_vector=[7, 1, 0]) - kb.add_entity(entity='Q007', freq=0.7, entity_vector=[0, 0, 7]) - kb.add_entity(entity='Q44', freq=0.4, entity_vector=[4, 4, 4]) + kb.add_entity(entity='Q53', freq=33, entity_vector=[0, 5, 3]) + kb.add_entity(entity='Q17', freq=2, entity_vector=[7, 1, 0]) + kb.add_entity(entity='Q007', freq=7, entity_vector=[0, 0, 7]) + kb.add_entity(entity='Q44', freq=342, entity_vector=[4, 4, 4]) kb.add_alias(alias='double07', entities=['Q17', 'Q007'], probabilities=[0.1, 0.9]) kb.add_alias(alias='guy', entities=['Q53', 'Q007', 'Q17', 'Q44'], probabilities=[0.3, 0.3, 0.2, 0.1]) @@ -62,13 +62,13 @@ def _check_kb(kb): assert len(candidates) == 2 assert candidates[0].entity_ == 'Q007' - assert 0.6999 < candidates[0].entity_freq < 0.701 + assert 6.999 < candidates[0].entity_freq < 7.01 assert candidates[0].entity_vector == [0, 0, 7] assert candidates[0].alias_ == 'double07' assert 0.899 < candidates[0].prior_prob < 0.901 assert candidates[1].entity_ == 'Q17' - assert 0.199 < candidates[1].entity_freq < 0.201 + assert 1.99 < candidates[1].entity_freq < 2.01 assert candidates[1].entity_vector == [7, 1, 0] assert candidates[1].alias_ == 'double07' assert 0.099 < candidates[1].prior_prob < 0.101 diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index e9c5a0f1d..21f29f304 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -546,6 +546,7 @@ cdef class Doc: cdef int i for i in range(self.length): self.c[i].ent_type = 0 + self.c[i].ent_kb_id = 0 self.c[i].ent_iob = 0 # Means missing. 
cdef attr_t ent_type cdef int start, end From 1711b5eb62aa0e19d4b4f88a902189454306caec Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 13 Aug 2019 15:59:55 +0200 Subject: [PATCH 06/12] =?UTF-8?q?=F0=9F=92=AB=20Support=20displaCy=20user?= =?UTF-8?q?=20colors=20via=20entry=20point=20(#4113)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- spacy/displacy/render.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py index 86b933eef..c2a903a56 100644 --- a/spacy/displacy/render.py +++ b/spacy/displacy/render.py @@ -5,7 +5,7 @@ import uuid from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_ARCS, TPL_ENTS from .templates import TPL_ENT, TPL_ENT_RTL, TPL_FIGURE, TPL_TITLE, TPL_PAGE -from ..util import minify_html, escape_html +from ..util import minify_html, escape_html, get_entry_points DEFAULT_LANG = "en" DEFAULT_DIR = "ltr" @@ -237,6 +237,9 @@ class EntityRenderer(object): "CARDINAL": "#e4e7d2", "PERCENT": "#e4e7d2", } + user_colors = get_entry_points("spacy_displacy_colors") + for user_color in user_colors.values(): + colors.update(user_color) colors.update(options.get("colors", {})) self.default_color = "#ddd" self.colors = colors From 2f3648700c831fd6b9e10bab3eb9d459a2b3c4b8 Mon Sep 17 00:00:00 2001 From: AJ Rader Date: Thu, 15 Aug 2019 05:39:10 -0400 Subject: [PATCH 07/12] Correction of default lemmatizer lookup in English (Issue # 4104) (#4110) * pytest file for issue4104 established * edited default lookup english lemmatizer for spun; fixes issue 4102 * eliminated parameterization and sorted dictionary dependnency in issue 4104 test * added contributor agreement --- .github/contributors/ajrader.md | 106 +++++++++++++++++++++++ spacy/lang/en/lemmatizer/lookup.py | 5 +- spacy/tests/regression/test_issue4104.py | 14 +++ 3 files changed, 123 insertions(+), 2 deletions(-) create mode 100644 .github/contributors/ajrader.md create mode 100644 spacy/tests/regression/test_issue4104.py diff --git a/.github/contributors/ajrader.md b/.github/contributors/ajrader.md new file mode 100644 index 000000000..646d5b537 --- /dev/null +++ b/.github/contributors/ajrader.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI GmbH](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. 
With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Andrew J Rader | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | August 14, 2019 | +| GitHub username | ajrader | +| Website (optional) | | diff --git a/spacy/lang/en/lemmatizer/lookup.py b/spacy/lang/en/lemmatizer/lookup.py index 66ab2b70b..08bfce5a9 100644 --- a/spacy/lang/en/lemmatizer/lookup.py +++ b/spacy/lang/en/lemmatizer/lookup.py @@ -11558,7 +11558,7 @@ LOOKUP = { "drunker": "drunk", "drunkest": "drunk", "drunks": "drunk", - "dry": "spin-dry", + "dry": "dry", "dry-cleaned": "dry-clean", "dry-cleaners": "dry-cleaner", "dry-cleaning": "dry-clean", @@ -35294,7 +35294,8 @@ LOOKUP = { "spryer": "spry", "spryest": "spry", "spuds": "spud", - "spun": "spin-dry", + "spun": "spin", + "spun-dry": "spin-dry", "spunkier": "spunky", "spunkiest": "spunky", "spunks": "spunk", diff --git a/spacy/tests/regression/test_issue4104.py b/spacy/tests/regression/test_issue4104.py new file mode 100644 index 000000000..b7c6af773 --- /dev/null +++ b/spacy/tests/regression/test_issue4104.py @@ -0,0 +1,14 @@ +# coding: utf8 +from __future__ import unicode_literals + +from ..util import get_doc + +def test_issue4104(en_vocab): + """Test that English lookup lemmatization of spun & dry are correct + expected mapping = {'dry': 'dry', 'spun': 'spin', 'spun-dry': 'spin-dry'} + """ + text = 'dry spun spun-dry' + doc = get_doc(en_vocab, [t for t in text.split(" ")]) + # using a simple list to preserve order + expected = ['dry', 'spin', 'spin-dry'] + assert [token.lemma_ for token in doc] == expected From 2f9b28c21847318f5d14e361505216b1e911ba17 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Thu, 15 Aug 2019 18:08:28 +0200 Subject: [PATCH 08/12] Provide more info in cycle error message E069 (#4123) Provide the tokens in the cycle and the first 50 tokens from document in the error message so it's easier to track down the location of the cycle in the data. Addresses feature request in #3698. --- spacy/errors.py | 3 ++- spacy/gold.pyx | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 25a170bdb..0a4875d96 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -243,7 +243,8 @@ class Errors(object): "Tag sequence:\n{tags}") E068 = ("Invalid BILUO tag: '{tag}'.") E069 = ("Invalid gold-standard parse tree. Found cycle between word " - "IDs: {cycle}") + "IDs: {cycle} (tokens: {cycle_tokens}) in the document starting " + "with tokens: {doc_tokens}.") E070 = ("Invalid gold-standard data. Number of documents ({n_docs}) " "does not align with number of annotations ({n_annots}).") E071 = ("Error creating lexeme: specified orth ID ({orth}) does not " diff --git a/spacy/gold.pyx b/spacy/gold.pyx index ce1648ccd..dab65f48e 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -590,7 +590,7 @@ cdef class GoldParse: cycle = nonproj.contains_cycle(self.heads) if cycle is not None: - raise ValueError(Errors.E069.format(cycle=cycle)) + raise ValueError(Errors.E069.format(cycle=cycle, cycle_tokens=" ".join(["'{}'".format(self.words[tok_id]) for tok_id in cycle]), doc_tokens=" ".join(words[:50]))) def __len__(self): """Get the number of gold-standard tokens. 
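For reference, the cycle condition that E069 now reports in more detail can be reproduced with a toy list of head indices; the exact container returned by `contains_cycle` is an implementation detail and is only iterated over to build the message:

    from spacy.syntax import nonproj

    # a well-formed tree: token 1 is the root (its head index is itself)
    heads_ok = [1, 1, 1, 2]
    print(nonproj.contains_cycle(heads_ok))   # None

    # tokens 2 and 3 point at each other, so the annotation contains a cycle
    heads_bad = [1, 1, 3, 2]
    print(nonproj.contains_cycle(heads_bad))  # some collection containing 2 and 3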
From eea7d4f4a8b4e154ca6897f5dd01e57e97476e50 Mon Sep 17 00:00:00 2001 From: Ziming He Date: Thu, 15 Aug 2019 12:13:32 -0400 Subject: [PATCH 09/12] biluo_tags_from_offsets throw exception for overlapping entities (#4021) * Check whether two entities overlap - biluo_gold_biluo_overlap now throw exception when entities passed in have overlaps - added unit test * SCA agreement --- .github/contributors/ryanzhe.md | 106 ++++++++++++++++++++++++++++++++ spacy/gold.pyx | 12 ++++ spacy/tests/test_gold.py | 10 ++- 3 files changed, 127 insertions(+), 1 deletion(-) create mode 100644 .github/contributors/ryanzhe.md diff --git a/.github/contributors/ryanzhe.md b/.github/contributors/ryanzhe.md new file mode 100644 index 000000000..afb250a4c --- /dev/null +++ b/.github/contributors/ryanzhe.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [ ] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect my + contributions. + + * [x] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. 
+ +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Ziming He | +| Company name (if applicable) | Georgia Tech | +| Title or role (if applicable) | Student | +| Date | 2019-07-24 | +| GitHub username | RyanZHe | +| Website (optional) | www.papermachine.me | diff --git a/spacy/gold.pyx b/spacy/gold.pyx index dab65f48e..64c2d9772 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -678,11 +678,23 @@ def biluo_tags_from_offsets(doc, entities, missing="O"): >>> tags = biluo_tags_from_offsets(doc, entities) >>> assert tags == ["O", "O", 'U-LOC', "O"] """ + # Ensure no overlapping entity labels exist + tokens_in_ents = {} + starts = {token.idx: token.i for token in doc} ends = {token.idx + len(token): token.i for token in doc} biluo = ["-" for _ in doc] # Handle entity cases for start_char, end_char, label in entities: + for token_index in range(start_char, end_char): + if token_index in tokens_in_ents.keys(): + raise ValueError(Errors.E103.format( + span1=(tokens_in_ents[token_index][0], + tokens_in_ents[token_index][1], + tokens_in_ents[token_index][2]), + span2=(start_char, end_char, label))) + tokens_in_ents[token_index] = (start_char, end_char, label) + start_token = starts.get(start_char) end_token = ends.get(end_char) # Only interested if the tokenization is correct diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py index d370eac53..ac08716b7 100644 --- a/spacy/tests/test_gold.py +++ b/spacy/tests/test_gold.py @@ -4,7 +4,7 @@ from __future__ import unicode_literals from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags from spacy.gold import spans_from_biluo_tags, GoldParse from spacy.tokens import Doc - +import pytest def test_gold_biluo_U(en_vocab): words = ["I", "flew", "to", "London", "."] @@ -32,6 +32,14 @@ def test_gold_biluo_BIL(en_vocab): tags = biluo_tags_from_offsets(doc, entities) assert tags == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"] +def test_gold_biluo_overlap(en_vocab): + words = ["I", "flew", "to", "San", "Francisco", "Valley", "."] + spaces = [True, True, True, True, True, False, True] + doc = Doc(en_vocab, words=words, spaces=spaces) + entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC"), + (len("I flew to "), len("I flew to San Francisco"), "LOC")] + with pytest.raises(ValueError): + tags = biluo_tags_from_offsets(doc, entities) def test_gold_biluo_misalign(en_vocab): words = ["I", "flew", "to", "San", "Francisco", "Valley."] From a58cb023d7312d218f45c566f02bf59e708ec4fb Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Fri, 16 Aug 2019 10:52:46 +0200 Subject: [PATCH 10/12] WIP: Extending debug-data (#4114) * Extending debug-data with dependency checks, etc. 
* Modify debug-data to load with GoldCorpus to iterate over .json/.jsonl files within directories * Add GoldCorpus iterator train_docs_without_preprocessing to load original train docs without shuffling and projectivizing * Report number of misaligned tokens * Add more dependency checks and messages * Update spacy/cli/debug_data.py Co-Authored-By: Ines Montani * Fixed conflict * Move counts to _compile_gold() * Move all dependency nonproj/sent/head/cycle counting to _compile_gold() * Unclobber previous merges * Update variable names * Update more variable names, fix misspelling * Don't clobber loading error messages * Only warn about misaligned tokens if present --- spacy/cli/debug_data.py | 223 ++++++++++++++++++++++++++++++++++------ spacy/gold.pyx | 4 + 2 files changed, 194 insertions(+), 33 deletions(-) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index afedb933e..290131c76 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -9,11 +9,14 @@ import srsly from wasabi import Printer, MESSAGES from ..gold import GoldCorpus, read_json_object +from ..syntax import nonproj from ..util import load_model, get_lang_class -# Minimum number of expected occurences of label in data to train new label +# Minimum number of expected occurrences of NER label in data to train new label NEW_LABEL_THRESHOLD = 50 +# Minimum number of expected occurrences of dependency labels +DEP_LABEL_THRESHOLD = 20 # Minimum number of expected examples to train a blank model BLANK_MODEL_MIN_THRESHOLD = 100 BLANK_MODEL_THRESHOLD = 2000 @@ -68,12 +71,10 @@ def debug_data( nlp = lang_cls() msg.divider("Data format validation") - # Load the data in one – might take a while but okay in this case - train_data = _load_file(train_path, msg) - dev_data = _load_file(dev_path, msg) # Validate data format using the JSON schema # TODO: update once the new format is ready + # TODO: move validation to GoldCorpus in order to be able to load from dir train_data_errors = [] # TODO: validate_json dev_data_errors = [] # TODO: validate_json if not train_data_errors: @@ -88,18 +89,34 @@ def debug_data( sys.exit(1) # Create the gold corpus to be able to better analyze data - with msg.loading("Analyzing corpus..."): - train_data = read_json_object(train_data) - dev_data = read_json_object(dev_data) - corpus = GoldCorpus(train_data, dev_data) - train_docs = list(corpus.train_docs(nlp)) - dev_docs = list(corpus.dev_docs(nlp)) + loading_train_error_message = "" + loading_dev_error_message = "" + with msg.loading("Loading corpus..."): + corpus = GoldCorpus(train_path, dev_path) + try: + train_docs = list(corpus.train_docs(nlp)) + train_docs_unpreprocessed = list(corpus.train_docs_without_preprocessing(nlp)) + except ValueError as e: + loading_train_error_message = "Training data cannot be loaded: {}".format(str(e)) + try: + dev_docs = list(corpus.dev_docs(nlp)) + except ValueError as e: + loading_dev_error_message = "Development data cannot be loaded: {}".format(str(e)) + if loading_train_error_message or loading_dev_error_message: + if loading_train_error_message: + msg.fail(loading_train_error_message) + if loading_dev_error_message: + msg.fail(loading_dev_error_message) + sys.exit(1) msg.good("Corpus is loadable") # Create all gold data here to avoid iterating over the train_docs constantly - gold_data = _compile_gold(train_docs, pipeline) - train_texts = gold_data["texts"] - dev_texts = set([doc.text for doc, gold in dev_docs]) + gold_train_data = _compile_gold(train_docs, pipeline) + 
gold_train_unpreprocessed_data = _compile_gold(train_docs_unpreprocessed, pipeline) + gold_dev_data = _compile_gold(dev_docs, pipeline) + + train_texts = gold_train_data["texts"] + dev_texts = gold_dev_data["texts"] msg.divider("Training stats") msg.text("Training pipeline: {}".format(", ".join(pipeline))) @@ -133,13 +150,21 @@ def debug_data( ) msg.divider("Vocab & Vectors") - n_words = gold_data["n_words"] + n_words = gold_train_data["n_words"] msg.info( "{} total {} in the data ({} unique)".format( - n_words, "word" if n_words == 1 else "words", len(gold_data["words"]) + n_words, "word" if n_words == 1 else "words", len(gold_train_data["words"]) ) ) - most_common_words = gold_data["words"].most_common(10) + if gold_train_data["n_misaligned_words"] > 0: + msg.warn( + "{} misaligned tokens in the training data".format(gold_train_data["n_misaligned_words"]) + ) + if gold_dev_data["n_misaligned_words"] > 0: + msg.warn( + "{} misaligned tokens in the dev data".format(gold_dev_data["n_misaligned_words"]) + ) + most_common_words = gold_train_data["words"].most_common(10) msg.text( "10 most common words: {}".format( _format_labels(most_common_words, counts=True) @@ -159,8 +184,8 @@ def debug_data( if "ner" in pipeline: # Get all unique NER labels present in the data - labels = set(label for label in gold_data["ner"] if label not in ("O", "-")) - label_counts = gold_data["ner"] + labels = set(label for label in gold_train_data["ner"] if label not in ("O", "-")) + label_counts = gold_train_data["ner"] model_labels = _get_labels_from_model(nlp, "ner") new_labels = [l for l in labels if l not in model_labels] existing_labels = [l for l in labels if l in model_labels] @@ -196,8 +221,8 @@ def debug_data( "Existing: {}".format(_format_labels(existing_labels)), show=verbose ) - if gold_data["ws_ents"]: - msg.fail("{} invalid whitespace entity spans".format(gold_data["ws_ents"])) + if gold_train_data["ws_ents"]: + msg.fail("{} invalid whitespace entity spans".format(gold_train_data["ws_ents"])) has_ws_ents_error = True for label in new_labels: @@ -227,7 +252,7 @@ def debug_data( if has_low_data_warning: msg.text( "To train a new entity type, your data should include at " - "least {} insteances of the new label".format(NEW_LABEL_THRESHOLD), + "least {} instances of the new label".format(NEW_LABEL_THRESHOLD), show=verbose, ) if has_no_neg_warning: @@ -245,7 +270,7 @@ def debug_data( if "textcat" in pipeline: msg.divider("Text Classification") - labels = [label for label in gold_data["textcat"]] + labels = [label for label in gold_train_data["textcat"]] model_labels = _get_labels_from_model(nlp, "textcat") new_labels = [l for l in labels if l not in model_labels] existing_labels = [l for l in labels if l in model_labels] @@ -256,7 +281,7 @@ def debug_data( ) if new_labels: labels_with_counts = _format_labels( - gold_data["textcat"].most_common(), counts=True + gold_train_data["textcat"].most_common(), counts=True ) msg.text("New: {}".format(labels_with_counts), show=verbose) if existing_labels: @@ -266,7 +291,7 @@ def debug_data( if "tagger" in pipeline: msg.divider("Part-of-speech Tagging") - labels = [label for label in gold_data["tags"]] + labels = [label for label in gold_train_data["tags"]] tag_map = nlp.Defaults.tag_map msg.info( "{} {} in data ({} {} in tag map)".format( @@ -277,7 +302,7 @@ def debug_data( ) ) labels_with_counts = _format_labels( - gold_data["tags"].most_common(), counts=True + gold_train_data["tags"].most_common(), counts=True ) msg.text(labels_with_counts, show=verbose) non_tagmap 
= [l for l in labels if l not in tag_map] @@ -292,17 +317,132 @@ def debug_data( if "parser" in pipeline: msg.divider("Dependency Parsing") - labels = [label for label in gold_data["deps"]] + + # profile sentence length msg.info( - "{} {} in data".format( - len(labels), "label" if len(labels) == 1 else "labels" + "Found {} sentence{} with an average length of {:.1f} words.".format( + gold_train_data["n_sents"], + "s" if len(train_docs) > 1 else "", + gold_train_data["n_words"] / gold_train_data["n_sents"] ) ) + + # profile labels + labels_train = [label for label in gold_train_data["deps"]] + labels_train_unpreprocessed = [label for label in gold_train_unpreprocessed_data["deps"]] + labels_dev = [label for label in gold_dev_data["deps"]] + + if gold_train_unpreprocessed_data["n_nonproj"] > 0: + msg.info( + "Found {} nonprojective train sentence{}".format( + gold_train_unpreprocessed_data["n_nonproj"], + "s" if gold_train_unpreprocessed_data["n_nonproj"] > 1 else "" + ) + ) + if gold_dev_data["n_nonproj"] > 0: + msg.info( + "Found {} nonprojective dev sentence{}".format( + gold_dev_data["n_nonproj"], + "s" if gold_dev_data["n_nonproj"] > 1 else "" + ) + ) + + msg.info( + "{} {} in train data".format( + len(labels_train_unpreprocessed), "label" if len(labels_train) == 1 else "labels" + ) + ) + msg.info( + "{} {} in projectivized train data".format( + len(labels_train), "label" if len(labels_train) == 1 else "labels" + ) + ) + labels_with_counts = _format_labels( - gold_data["deps"].most_common(), counts=True + gold_train_unpreprocessed_data["deps"].most_common(), counts=True ) msg.text(labels_with_counts, show=verbose) + # rare labels in train + for label in gold_train_unpreprocessed_data["deps"]: + if gold_train_unpreprocessed_data["deps"][label] <= DEP_LABEL_THRESHOLD: + msg.warn( + "Low number of examples for label '{}' ({})".format( + label, gold_train_unpreprocessed_data["deps"][label] + ) + ) + has_low_data_warning = True + + + # rare labels in projectivized train + rare_projectivized_labels = [] + for label in gold_train_data["deps"]: + if gold_train_data["deps"][label] <= DEP_LABEL_THRESHOLD and "||" in label: + rare_projectivized_labels.append("{}: {}".format(label, str(gold_train_data["deps"][label]))) + + if len(rare_projectivized_labels) > 0: + msg.warn( + "Low number of examples for {} label{} in the " + "projectivized dependency trees used for training. 
You may " + "want to projectivize labels such as punct before " + "training in order to improve parser performance.".format( + len(rare_projectivized_labels), + "s" if len(rare_projectivized_labels) > 1 else "") + ) + msg.warn( + "Projectivized labels with low numbers of examples: " + "{}".format("\n".join(rare_projectivized_labels)), + show=verbose + ) + has_low_data_warning = True + + # labels only in train + if set(labels_train) - set(labels_dev): + msg.warn( + "The following labels were found only in the train data: " + "{}".format(", ".join(set(labels_train) - set(labels_dev))), + show=verbose + ) + + # labels only in dev + if set(labels_dev) - set(labels_train): + msg.warn( + "The following labels were found only in the dev data: " + + ", ".join(set(labels_dev) - set(labels_train)), + show=verbose + ) + + if has_low_data_warning: + msg.text( + "To train a parser, your data should include at " + "least {} instances of each label.".format(DEP_LABEL_THRESHOLD), + show=verbose, + ) + + # multiple root labels + if len(gold_train_unpreprocessed_data["roots"]) > 1: + msg.warn( + "Multiple root labels ({}) ".format(", ".join(gold_train_unpreprocessed_data["roots"])) + + "found in training data. spaCy's parser uses a single root " + "label ROOT so this distinction will not be available." + ) + + # these should not happen, but just in case + if gold_train_data["n_nonproj"] > 0: + msg.fail( + "Found {} nonprojective projectivized train sentence{}".format( + gold_train_data["n_nonproj"], + "s" if gold_train_data["n_nonproj"] > 1 else "" + ) + ) + if gold_train_data["n_cycles"] > 0: + msg.fail( + "Found {} projectivized train sentence{} with cycles".format( + gold_train_data["n_cycles"], + "s" if gold_train_data["n_cycles"] > 1 else "" + ) + ) + msg.divider("Summary") good_counts = msg.counts[MESSAGES.GOOD] warn_counts = msg.counts[MESSAGES.WARN] @@ -350,16 +490,25 @@ def _compile_gold(train_docs, pipeline): "tags": Counter(), "deps": Counter(), "words": Counter(), + "roots": Counter(), "ws_ents": 0, "n_words": 0, + "n_misaligned_words": 0, + "n_sents": 0, + "n_nonproj": 0, + "n_cycles": 0, "texts": set(), } for doc, gold in train_docs: - data["words"].update(gold.words) - data["n_words"] += len(gold.words) + valid_words = [x for x in gold.words if x is not None] + data["words"].update(valid_words) + data["n_words"] += len(valid_words) + data["n_misaligned_words"] += len(gold.words) - len(valid_words) data["texts"].add(doc.text) if "ner" in pipeline: for i, label in enumerate(gold.ner): + if label is None: + continue if label.startswith(("B-", "U-", "L-")) and doc[i].is_space: # "Illegal" whitespace entity data["ws_ents"] += 1 @@ -371,9 +520,17 @@ def _compile_gold(train_docs, pipeline): if "textcat" in pipeline: data["cats"].update(gold.cats) if "tagger" in pipeline: - data["tags"].update(gold.tags) + data["tags"].update([x for x in gold.tags if x is not None]) if "parser" in pipeline: - data["deps"].update(gold.labels) + data["deps"].update([x for x in gold.labels if x is not None]) + for i, (dep, head) in enumerate(zip(gold.labels, gold.heads)): + if head == i: + data["roots"].update([dep]) + data["n_sents"] += 1 + if nonproj.is_nonproj_tree(gold.heads): + data["n_nonproj"] += 1 + if nonproj.contains_cycle(gold.heads): + data["n_cycles"] += 1 return data diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 64c2d9772..f6ec8d3fa 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -216,6 +216,10 @@ class GoldCorpus(object): make_projective=True) yield from gold_docs + def 
train_docs_without_preprocessing(self, nlp, gold_preproc=False): + gold_docs = self.iter_gold_docs(nlp, self.train_tuples, gold_preproc=gold_preproc) + yield from gold_docs + def dev_docs(self, nlp, gold_preproc=False): gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc=gold_preproc) yield from gold_docs From e5c7e19e82d3a9dec5ad34582b2cd3adc5396b1e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 16 Aug 2019 10:53:38 +0200 Subject: [PATCH 11/12] Fix typo and auto-format [ci skip] --- spacy/cli/debug_data.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 290131c76..656cd640a 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -245,7 +245,7 @@ def debug_data( if not has_low_data_warning: msg.good("Good amount of examples for all labels") if not has_no_neg_warning: - msg.good("Examples without occurences available for all labels") + msg.good("Examples without occurrences available for all labels") if not has_ws_ents_error: msg.good("No entities consisting of or starting/ending with whitespace") @@ -321,8 +321,8 @@ def debug_data( # profile sentence length msg.info( "Found {} sentence{} with an average length of {:.1f} words.".format( - gold_train_data["n_sents"], - "s" if len(train_docs) > 1 else "", + gold_train_data["n_sents"], + "s" if len(train_docs) > 1 else "", gold_train_data["n_words"] / gold_train_data["n_sents"] ) ) From 91441f169c913f1fb8fe2ad08538370008daf20b Mon Sep 17 00:00:00 2001 From: Jeno Date: Fri, 16 Aug 2019 11:48:17 -0400 Subject: [PATCH 12/12] Update universe.json to include negspacy (#4132) --- website/meta/universe.json | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index d5b6001ee..ee2fcf50f 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -1600,6 +1600,36 @@ "github": "explosion", "website": "https://explosion.ai" } + }, + { + "id": "negspacy", + "title": "negspaCy", + "slogan": "spaCy pipeline object for negating concepts in text based on the NegEx algorithm.", + "github": "jenojp/negspacy", + "url": "https://github.com/jenojp/negspacy", + "description": "negspacy is a spaCy pipeline component that evaluates whether Named Entities are negated in text. It adds an extension to 'Span' objects.", + "pip": "negspacy", + "category": ["pipeline", "scientific"], + "tags": ["negation", "text-processing"], + "thumb":"https://github.com/jenojp/negspacy/blob/master/docs/thumb.png?raw=true", + "image":"https://github.com/jenojp/negspacy/blob/master/docs/icon.png?raw=true", + "code_example": [ + "import spacy", + "from negspacy.negation import Negex", + "", + "nlp = spacy.load(\"en_core_web_sm\")", + "negex = Negex(nlp, ent_types=[\"PERSON','ORG\"])", + "nlp.add_pipe(negex, last=True)", + "", + "doc = nlp(\"She does not like Steve Jobs but likes Apple products.\")", + "for e in doc.ents:", + " print(e.text, e._.negex)" + ], + "author": "Jeno Pizarro", + "author_links": { + "github": "jenojp", + "twitter": "jenojp" + } } ],
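Rendered as plain Python, with the `ent_types` value assumed to be intended as a two-element list (the mixed quotes in the JSON string above obscure this), the negspacy snippet reads:

    import spacy
    from negspacy.negation import Negex

    nlp = spacy.load("en_core_web_sm")
    negex = Negex(nlp, ent_types=["PERSON", "ORG"])
    nlp.add_pipe(negex, last=True)

    doc = nlp("She does not like Steve Jobs but likes Apple products.")
    for e in doc.ents:
        print(e.text, e._.negex)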