diff --git a/examples/training/pretrain_kb.py b/examples/training/pretrain_kb.py
index a69e97e14..db6442ad4 100644
--- a/examples/training/pretrain_kb.py
+++ b/examples/training/pretrain_kb.py
@@ -8,8 +8,8 @@ For more details, see the documentation:
 * Knowledge base: https://spacy.io/api/kb
 * Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking
 
-Compatible with: spaCy v2.2
-Last tested with: v2.2
+Compatible with: spaCy v2.2.3
+Last tested with: v2.2.3
 """
 from __future__ import unicode_literals, print_function
diff --git a/examples/training/train_entity_linker.py b/examples/training/train_entity_linker.py
index d2b2c2417..df8b59db1 100644
--- a/examples/training/train_entity_linker.py
+++ b/examples/training/train_entity_linker.py
@@ -8,8 +8,8 @@ For more details, see the documentation:
 * Training: https://spacy.io/usage/training
 * Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking
 
-Compatible with: spaCy v2.2
-Last tested with: v2.2
+Compatible with: spaCy v2.2.3
+Last tested with: v2.2.3
 """
 from __future__ import unicode_literals, print_function
@@ -22,6 +22,7 @@ from spacy.vocab import Vocab
 import spacy
 from spacy.kb import KnowledgeBase
+from spacy.pipeline import EntityRuler
 from spacy.tokens import Span
 from spacy.util import minibatch, compounding
@@ -70,22 +71,35 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
         nlp.vocab.vectors.name = "spacy_pretrained_vectors"
         print("Created blank 'en' model with vocab from '%s'" % vocab_path)
 
-    # create the built-in pipeline components and add them to the pipeline
-    # nlp.create_pipe works for built-ins that are registered with spaCy
+    # Add a sentencizer component. Alternatively, add a dependency parser for higher accuracy.
+    nlp.add_pipe(nlp.create_pipe("sentencizer"))
+
+    # Add a custom component to recognize "Russ Cochran" as an entity for the example training data.
+    # Note that in a realistic application, an actual NER algorithm should be used instead.
+    ruler = EntityRuler(nlp)
+    patterns = [{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}]
+    ruler.add_patterns(patterns)
+    nlp.add_pipe(ruler)
+
+    # Create the Entity Linker component and add it to the pipeline.
     if "entity_linker" not in nlp.pipe_names:
-        entity_linker = nlp.create_pipe("entity_linker")
+        # use only the predicted EL score and not the prior probability (for demo purposes)
+        cfg = {"incl_prior": False}
+        entity_linker = nlp.create_pipe("entity_linker", cfg)
         kb = KnowledgeBase(vocab=nlp.vocab)
         kb.load_bulk(kb_path)
         print("Loaded Knowledge Base from '%s'" % kb_path)
         entity_linker.set_kb(kb)
         nlp.add_pipe(entity_linker, last=True)
-    else:
-        entity_linker = nlp.get_pipe("entity_linker")
-        kb = entity_linker.kb
 
-    # make sure the annotated examples correspond to known identifiers in the knowlege base
-    kb_ids = kb.get_entity_strings()
+    # Convert the texts to docs to make sure we have doc.ents set for the training examples.
+    # Also ensure that the annotated examples correspond to known identifiers in the knowledge base.
+    kb_ids = nlp.get_pipe("entity_linker").kb.get_entity_strings()
+    TRAIN_DOCS = []
     for text, annotation in TRAIN_DATA:
+        with nlp.disable_pipes("entity_linker"):
+            doc = nlp(text)
+        annotation_clean = annotation
         for offset, kb_id_dict in annotation["links"].items():
             new_dict = {}
             for kb_id, value in kb_id_dict.items():
@@ -95,7 +109,8 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
                     print(
                         "Removed", kb_id, "from training because it is not in the KB."
                     )
-            annotation["links"][offset] = new_dict
+            annotation_clean["links"][offset] = new_dict
+        TRAIN_DOCS.append((doc, annotation_clean))
 
     # get names of other pipes to disable them during training
     other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"]
@@ -103,10 +118,10 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
         # reset and initialize the weights randomly
         optimizer = nlp.begin_training()
         for itn in range(n_iter):
-            random.shuffle(TRAIN_DATA)
+            random.shuffle(TRAIN_DOCS)
             losses = {}
             # batch up the examples using spaCy's minibatch
-            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
+            batches = minibatch(TRAIN_DOCS, size=compounding(4.0, 32.0, 1.001))
             for batch in batches:
                 texts, annotations = zip(*batch)
                 nlp.update(
@@ -138,16 +153,8 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
 
 def _apply_model(nlp):
     for text, annotation in TRAIN_DATA:
-        doc = nlp.tokenizer(text)
-
-        # set entities so the evaluation is independent of the NER step
-        # all the examples contain 'Russ Cochran' as the first two tokens in the sentence
-        rc_ent = Span(doc, 0, 2, label=PERSON)
-        doc.ents = [rc_ent]
-
         # apply the entity linker which will now make predictions for the 'Russ Cochran' entities
-        doc = nlp.get_pipe("entity_linker")(doc)
-
+        doc = nlp(text)
         print()
         print("Entities", [(ent.text, ent.label_, ent.kb_id_) for ent in doc.ents])
         print("Tokens", [(t.text, t.ent_type_, t.ent_kb_id_) for t in doc])
diff --git a/requirements.txt b/requirements.txt
index 12f19bb88..f208a2772 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=7.3.0,<7.4.0
+thinc==7.4.0.dev0
 blis>=0.4.0,<0.5.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.4.0,<1.1.0
diff --git a/setup.cfg b/setup.cfg
index 3101209e7..a0103c5a2 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -38,13 +38,13 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=7.3.0,<7.4.0
+    thinc==7.4.0.dev0
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=7.3.0,<7.4.0
+    thinc==7.4.0.dev0
     blis>=0.4.0,<0.5.0
     wasabi>=0.4.0,<1.1.0
     srsly>=0.1.0,<1.1.0
diff --git a/spacy/errors.py b/spacy/errors.py
index 3aeddec09..4dcdcae1a 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -81,7 +81,8 @@ class Warnings(object):
             "Future versions may introduce a `n_process` argument for "
             "parallel inference via multiprocessing.")
     W017 = ("Alias '{alias}' already exists in the Knowledge Base.")
-    W018 = ("Entity '{entity}' already exists in the Knowledge Base.")
+    W018 = ("Entity '{entity}' already exists in the Knowledge Base - "
+            "ignoring the duplicate entry.")
     W019 = ("Changing vectors name from {old} to {new}, to avoid clash with "
             "previously loaded vectors. See Issue #3853.")
     W020 = ("Unnamed vectors. This won't allow multiple vectors models to be "
@@ -531,6 +532,9 @@ class Errors(object):
             "{obj}.{attr}\nAttribute '{attr}' does not exist on {obj}.")
     E186 = ("'{tok_a}' and '{tok_b}' are different texts.")
     E187 = ("Only unicode strings are supported as labels.")
+    E188 = ("Could not match the gold entity links to entities in the doc - "
+            "make sure the gold EL data refers to valid results of the "
+            "named entity recognizer in the `nlp` pipeline.")
 
 
 @add_codes
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 31fd1706e..63eb41b42 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -136,29 +136,34 @@ cdef class KnowledgeBase:
         if len(entity_list) != len(freq_list) or len(entity_list) != len(vector_list):
             raise ValueError(Errors.E140)
 
-        nr_entities = len(entity_list)
+        nr_entities = len(set(entity_list))
        self._entry_index = PreshMap(nr_entities+1)
         self._entries = entry_vec(nr_entities+1)
 
         i = 0
         cdef KBEntryC entry
         cdef hash_t entity_hash
-        while i < nr_entities:
-            entity_vector = vector_list[i]
-            if len(entity_vector) != self.entity_vector_length:
-                raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length))
-
+        while i < len(entity_list):
+            # only process this entity if its unique ID hasn't been added before
             entity_hash = self.vocab.strings.add(entity_list[i])
-            entry.entity_hash = entity_hash
-            entry.freq = freq_list[i]
+            if entity_hash in self._entry_index:
+                user_warning(Warnings.W018.format(entity=entity_list[i]))
 
-            vector_index = self.c_add_vector(entity_vector=vector_list[i])
-            entry.vector_index = vector_index
+            else:
+                entity_vector = vector_list[i]
+                if len(entity_vector) != self.entity_vector_length:
+                    raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length))
 
-            entry.feats_row = -1  # Features table currently not implemented
+                entry.entity_hash = entity_hash
+                entry.freq = freq_list[i]
 
-            self._entries[i+1] = entry
-            self._entry_index[entity_hash] = i+1
+                vector_index = self.c_add_vector(entity_vector=vector_list[i])
+                entry.vector_index = vector_index
+
+                entry.feats_row = -1  # Features table currently not implemented
+
+                self._entries[i+1] = entry
+                self._entry_index[entity_hash] = i+1
 
             i += 1
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 6f6848102..30ef3dd36 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -677,7 +677,9 @@ def _get_attr_values(spec, string_store):
             value = string_store.add(value)
         elif isinstance(value, bool):
             value = int(value)
-        elif isinstance(value, (dict, int)):
+        elif isinstance(value, int):
+            pass
+        elif isinstance(value, dict):
             continue
         else:
             raise ValueError(Errors.E153.format(vtype=type(value).__name__))
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 82f9b665f..f57ea59d2 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -1142,7 +1142,7 @@ cdef class EntityRecognizer(Parser):
 
 @component(
     "entity_linker",
-    requires=["doc.ents", "token.ent_iob", "token.ent_type"],
+    requires=["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"],
     assigns=["token.ent_kb_id"]
 )
 class EntityLinker(Pipe):
@@ -1220,13 +1220,20 @@ class EntityLinker(Pipe):
             for entity, kb_dict in gold.links.items():
                 start, end = entity
                 mention = doc.text[start:end]
+                # the gold annotations should link to proper entities - if this fails, the dataset is likely corrupt
+                if (start, end) not in ents_by_offset:
+                    raise RuntimeError(Errors.E188)
                 ent = ents_by_offset[(start, end)]
                 for kb_id, value in kb_dict.items():
                     # Currently only training on the positive instances
                     if value:
-                        sentence_docs.append(ent.sent.as_doc())
+                        try:
+                            sentence_docs.append(ent.sent.as_doc())
+                        except AttributeError:
+                            # Catch the exception when ent.sent is None and provide a user-friendly error
+                            raise RuntimeError(Errors.E030)
 
         sentence_encodings, bp_context = self.model.begin_update(sentence_docs, drop=drop)
         loss, d_scores = self.get_similarity_loss(scores=sentence_encodings, golds=golds, docs=None)
diff --git a/spacy/syntax/_beam_utils.pyx b/spacy/syntax/_beam_utils.pyx
index dc482f278..b1085c762 100644
--- a/spacy/syntax/_beam_utils.pyx
+++ b/spacy/syntax/_beam_utils.pyx
@@ -69,7 +69,8 @@ cdef class ParserBeam(object):
         cdef StateC* st
         for state in states:
             beam = Beam(self.moves.n_moves, width, min_density=density)
-            beam.initialize(self.moves.init_beam_state, state.c.length,
+            beam.initialize(self.moves.init_beam_state,
+                            self.moves.del_beam_state, state.c.length,
                             state.c._sent)
             for i in range(beam.width):
                 st = <StateC*>beam.at(i)
diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx
index eb39124ce..efe8573c1 100644
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@@ -324,10 +324,16 @@ cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
     return st
 
 
+cdef int _del_state(Pool mem, void* state, void* x) except -1:
+    cdef StateC* st = <StateC*>state
+    del st
+
+
 cdef class ArcEager(TransitionSystem):
     def __init__(self, *args, **kwargs):
         TransitionSystem.__init__(self, *args, **kwargs)
         self.init_beam_state = _init_state
+        self.del_beam_state = _del_state
 
     @classmethod
     def get_actions(cls, **kwargs):
diff --git a/spacy/syntax/transition_system.pxd b/spacy/syntax/transition_system.pxd
index 45d9a787f..a5fe55918 100644
--- a/spacy/syntax/transition_system.pxd
+++ b/spacy/syntax/transition_system.pxd
@@ -33,6 +33,8 @@ ctypedef int (*do_func_t)(StateC* state, attr_t label) nogil
 
 ctypedef void* (*init_state_t)(Pool mem, int length, void* tokens) except NULL
 
+ctypedef int (*del_state_t)(Pool mem, void* state, void* extra_args) except -1
+
 cdef class TransitionSystem:
     cdef Pool mem
     cdef StringStore strings
@@ -42,6 +44,7 @@ cdef class TransitionSystem:
     cdef public attr_t root_label
     cdef public freqs
     cdef init_state_t init_beam_state
+    cdef del_state_t del_beam_state
     cdef public object labels
 
     cdef int initialize_state(self, StateC* state) nogil
diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx
index 7876813e0..65097f114 100644
--- a/spacy/syntax/transition_system.pyx
+++ b/spacy/syntax/transition_system.pyx
@@ -30,6 +30,11 @@ cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
     return st
 
 
+cdef int _del_state(Pool mem, void* state, void* x) except -1:
+    cdef StateC* st = <StateC*>state
+    del st
+
+
 cdef class TransitionSystem:
     def __init__(self, StringStore string_table, labels_by_action=None, min_freq=None):
         self.mem = Pool()
@@ -44,6 +49,7 @@ cdef class TransitionSystem:
         self.initialize_actions(labels_by_action, min_freq=min_freq)
         self.root_label = self.strings.add('ROOT')
         self.init_beam_state = _init_state
+        self.del_beam_state = _del_state
 
     def __reduce__(self):
         return (self.__class__, (self.strings, self.labels), None, None)
@@ -72,7 +78,8 @@ cdef class TransitionSystem:
         for doc in docs:
             beam = Beam(self.n_moves, beam_width, min_density=beam_density)
-            beam.initialize(self.init_beam_state, doc.length, doc.c)
+            beam.initialize(self.init_beam_state, self.del_beam_state,
+                            doc.length, doc.c)
             for i in range(beam.width):
                 state = <StateC*>beam.at(i)
                 state.offset = offset
diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index f813a9743..01bb93c50 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -32,6 +32,24 @@ def doc_not_parsed(en_tokenizer):
     return doc
 
 
+@pytest.mark.parametrize(
+    "i_sent,i,j,text",
+    [
+        (0, 0, len("This is a"), "This is a"),
+        (1, 0, len("This is another"), "This is another"),
+        (2, len("And "), len("And ") + len("a third"), "a third"),
+        (0, 1, 2, None),
+    ],
+)
+def test_char_span(doc, i_sent, i, j, text):
+    sents = list(doc.sents)
+    span = sents[i_sent].char_span(i, j)
+    if not text:
+        assert not span
+    else:
+        assert span.text == text
+
+
 def test_spans_sent_spans(doc):
     sents = list(doc.sents)
     assert sents[0].start == 0
diff --git a/spacy/tests/regression/test_issue4674.py b/spacy/tests/regression/test_issue4674.py
new file mode 100644
index 000000000..36e9f02c1
--- /dev/null
+++ b/spacy/tests/regression/test_issue4674.py
@@ -0,0 +1,34 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from spacy.kb import KnowledgeBase
+from spacy.util import ensure_path
+
+from spacy.lang.en import English
+from spacy.tests.util import make_tempdir
+
+
+def test_issue4674():
+    """Test that setting entities with overlapping identifiers does not mess up IO"""
+    nlp = English()
+    kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
+
+    vector1 = [0.9, 1.1, 1.01]
+    vector2 = [1.8, 2.25, 2.01]
+    kb.set_entities(entity_list=["Q1", "Q1"], freq_list=[32, 111], vector_list=[vector1, vector2])
+
+    assert kb.get_size_entities() == 1
+
+    # dumping to file & loading back in
+    with make_tempdir() as d:
+        dir_path = ensure_path(d)
+        if not dir_path.exists():
+            dir_path.mkdir()
+        file_path = dir_path / "kb"
+        kb.dump(str(file_path))
+
+        kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
+        kb2.load_bulk(str(file_path))
+
+    assert kb2.get_size_entities() == 1
+
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 9e99392a9..957e853ca 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -584,6 +584,22 @@ cdef class Span:
         else:
             return self.doc[root]
 
+    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None):
+        """Create a `Span` object from the slice `span.text[start : end]`.
+
+        start (int): The index of the first character of the span.
+        end (int): The index of the first character after the span.
+        label (uint64 or string): A label to attach to the Span, e.g. for
+            named entities.
+        kb_id (uint64 or string): An ID from a KB to capture the meaning of a named entity.
+        vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
+            the span.
+        RETURNS (Span): The newly constructed object.
+        """
+        start_idx += self.start_char
+        end_idx += self.start_char
+        return self.doc.char_span(start_idx, end_idx)
+
     @property
     def conjuncts(self):
         """Tokens that are conjoined to the span's root.
diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md
index ad684f51e..4f948e425 100644
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@@ -7,7 +7,7 @@ source: spacy/tokens/doc.pyx
 
 A `Doc` is a sequence of [`Token`](/api/token) objects. Access sentences and
 named entities, export annotations to numpy arrays, losslessly serialize to
-compressed binary strings. The `Doc` object holds an array of `TokenC]` structs.
+compressed binary strings. The `Doc` object holds an array of [`TokenC`](/api/cython-structs#tokenc) structs.
 The Python-level `Token` and [`Span`](/api/span) objects are views of this
 array, i.e. they don't own the data themselves.
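Side note on the Span.char_span method added above: the character offsets are
relative to the span itself, not to the parent Doc, and misaligned offsets fall
through to Doc.char_span's behavior of returning None. A minimal sketch of the
intended usage, assuming a blank English pipeline with a sentencizer (the
example text is illustrative, mirroring the parametrized test above):

    from spacy.lang.en import English

    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    doc = nlp("This is a sentence. This is another sentence.")

    sents = list(doc.sents)
    second = sents[1]  # "This is another sentence."

    # Offsets are relative to the sentence span, not the whole doc:
    span = second.char_span(0, len("This is another"))
    print(span.text)  # "This is another"

    # Offsets that don't align with token boundaries return None,
    # matching Doc.char_span:
    assert second.char_span(1, 2) is None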
diff --git a/website/meta/universe.json b/website/meta/universe.json
index 98a7807ca..67da8c828 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -1261,6 +1261,21 @@
             },
             "category": ["podcasts"]
         },
+        {
+            "type": "education",
+            "id": "practical-ai-podcast",
+            "title": "Practical AI: Modern NLP with spaCy",
+            "slogan": "December 2019",
+            "description": "\"SpaCy is awesome for NLP! It’s easy to use, has widespread adoption, is open source, and integrates the latest language models. Ines Montani and Matthew Honnibal (core developers of spaCy and co-founders of Explosion) join us to discuss the history of the project, its capabilities, and the latest trends in NLP. We also dig into the practicalities of taking NLP workflows to production. You don’t want to miss this episode!\"",
+            "thumb": "https://i.imgur.com/jn8Bcdw.png",
+            "url": "https://changelog.com/practicalai/68",
+            "author": "Daniel Whitenack & Chris Benson",
+            "author_links": {
+                "website": "https://changelog.com/practicalai",
+                "twitter": "https://twitter.com/PracticalAIFM"
+            },
+            "category": ["podcasts"]
+        },
         {
             "id": "adam_qas",
             "title": "ADAM: Question Answering System",
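For reference, a quick sketch of the KnowledgeBase deduplication behavior that
the kb.pyx change and the new regression test pin down (the entity IDs,
frequencies and vectors are the illustrative values from test_issue4674): a
duplicated entity ID now emits warning W018 and is skipped, rather than
corrupting the entry index and breaking serialization.

    from spacy.kb import KnowledgeBase
    from spacy.lang.en import English

    nlp = English()
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)

    # The second "Q1" triggers W018 and is ignored; only the first entry
    # is kept, so the KB dumps and loads back cleanly.
    kb.set_entities(
        entity_list=["Q1", "Q1"],
        freq_list=[32, 111],
        vector_list=[[0.9, 1.1, 1.01], [1.8, 2.25, 2.01]],
    )
    assert kb.get_size_entities() == 1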