Merge branch 'master' into spacy.io

Ines Montani 2019-12-13 15:57:49 +01:00
commit ae9fac2d87
17 changed files with 172 additions and 47 deletions

View File

@@ -8,8 +8,8 @@ For more details, see the documentation:
 * Knowledge base: https://spacy.io/api/kb
 * Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking

-Compatible with: spaCy v2.2
-Last tested with: v2.2
+Compatible with: spaCy v2.2.3
+Last tested with: v2.2.3
 """
 from __future__ import unicode_literals, print_function

View File

@@ -8,8 +8,8 @@ For more details, see the documentation:
 * Training: https://spacy.io/usage/training
 * Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking

-Compatible with: spaCy v2.2
-Last tested with: v2.2
+Compatible with: spaCy v2.2.3
+Last tested with: v2.2.3
 """
 from __future__ import unicode_literals, print_function
@@ -22,6 +22,7 @@ from spacy.vocab import Vocab
 import spacy
 from spacy.kb import KnowledgeBase
+from spacy.pipeline import EntityRuler
 from spacy.tokens import Span
 from spacy.util import minibatch, compounding
@@ -70,22 +71,35 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
         nlp.vocab.vectors.name = "spacy_pretrained_vectors"
         print("Created blank 'en' model with vocab from '%s'" % vocab_path)

-    # create the built-in pipeline components and add them to the pipeline
-    # nlp.create_pipe works for built-ins that are registered with spaCy
+    # Add a sentencizer component. Alternatively, add a dependency parser for higher accuracy.
+    nlp.add_pipe(nlp.create_pipe('sentencizer'))
+
+    # Add a custom component to recognize "Russ Cochran" as an entity for the example training data.
+    # Note that in a realistic application, an actual NER algorithm should be used instead.
+    ruler = EntityRuler(nlp)
+    patterns = [{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}]
+    ruler.add_patterns(patterns)
+    nlp.add_pipe(ruler)
+
+    # Create the Entity Linker component and add it to the pipeline.
     if "entity_linker" not in nlp.pipe_names:
-        entity_linker = nlp.create_pipe("entity_linker")
+        # use only the predicted EL score and not the prior probability (for demo purposes)
+        cfg = {"incl_prior": False}
+        entity_linker = nlp.create_pipe("entity_linker", cfg)
         kb = KnowledgeBase(vocab=nlp.vocab)
         kb.load_bulk(kb_path)
         print("Loaded Knowledge Base from '%s'" % kb_path)
         entity_linker.set_kb(kb)
         nlp.add_pipe(entity_linker, last=True)
-    else:
-        entity_linker = nlp.get_pipe("entity_linker")
-        kb = entity_linker.kb

-    # make sure the annotated examples correspond to known identifiers in the knowlege base
-    kb_ids = kb.get_entity_strings()
+    # Convert the texts to docs to make sure we have doc.ents set for the training examples.
+    # Also ensure that the annotated examples correspond to known identifiers in the knowlege base.
+    kb_ids = nlp.get_pipe("entity_linker").kb.get_entity_strings()
+    TRAIN_DOCS = []
     for text, annotation in TRAIN_DATA:
+        with nlp.disable_pipes("entity_linker"):
+            doc = nlp(text)
+        annotation_clean = annotation
         for offset, kb_id_dict in annotation["links"].items():
             new_dict = {}
             for kb_id, value in kb_id_dict.items():
@@ -95,7 +109,8 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
                     print(
                         "Removed", kb_id, "from training because it is not in the KB."
                     )
-            annotation["links"][offset] = new_dict
+            annotation_clean["links"][offset] = new_dict
+        TRAIN_DOCS.append((doc, annotation_clean))

     # get names of other pipes to disable them during training
     other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"]
@@ -103,10 +118,10 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
         # reset and initialize the weights randomly
         optimizer = nlp.begin_training()
         for itn in range(n_iter):
-            random.shuffle(TRAIN_DATA)
+            random.shuffle(TRAIN_DOCS)
             losses = {}
             # batch up the examples using spaCy's minibatch
-            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
+            batches = minibatch(TRAIN_DOCS, size=compounding(4.0, 32.0, 1.001))
             for batch in batches:
                 texts, annotations = zip(*batch)
                 nlp.update(
@@ -138,16 +153,8 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):

 def _apply_model(nlp):
     for text, annotation in TRAIN_DATA:
-        doc = nlp.tokenizer(text)
-
-        # set entities so the evaluation is independent of the NER step
-        # all the examples contain 'Russ Cochran' as the first two tokens in the sentence
-        rc_ent = Span(doc, 0, 2, label=PERSON)
-        doc.ents = [rc_ent]
-
         # apply the entity linker which will now make predictions for the 'Russ Cochran' entities
-        doc = nlp.get_pipe("entity_linker")(doc)
+        doc = nlp(text)
         print()
         print("Entities", [(ent.text, ent.label_, ent.kb_id_) for ent in doc.ents])
         print("Tokens", [(t.text, t.ent_type_, t.ent_kb_id_) for t in doc])

View File

@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=7.3.0,<7.4.0
+thinc==7.4.0.dev0
 blis>=0.4.0,<0.5.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.4.0,<1.1.0

View File

@@ -38,13 +38,13 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=7.3.0,<7.4.0
+    thinc==7.4.0.dev0
 install_requires =
     # Our libraries
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=7.3.0,<7.4.0
+    thinc==7.4.0.dev0
     blis>=0.4.0,<0.5.0
     wasabi>=0.4.0,<1.1.0
     srsly>=0.1.0,<1.1.0

View File

@@ -81,7 +81,8 @@ class Warnings(object):
             "Future versions may introduce a `n_process` argument for "
             "parallel inference via multiprocessing.")
     W017 = ("Alias '{alias}' already exists in the Knowledge Base.")
-    W018 = ("Entity '{entity}' already exists in the Knowledge Base.")
+    W018 = ("Entity '{entity}' already exists in the Knowledge Base - "
+            "ignoring the duplicate entry.")
     W019 = ("Changing vectors name from {old} to {new}, to avoid clash with "
             "previously loaded vectors. See Issue #3853.")
     W020 = ("Unnamed vectors. This won't allow multiple vectors models to be "

@@ -531,6 +532,9 @@ class Errors(object):
             "{obj}.{attr}\nAttribute '{attr}' does not exist on {obj}.")
     E186 = ("'{tok_a}' and '{tok_b}' are different texts.")
     E187 = ("Only unicode strings are supported as labels.")
+    E188 = ("Could not match the gold entity links to entities in the doc - "
+            "make sure the gold EL data refers to valid results of the "
+            "named entity recognizer in the `nlp` pipeline.")


 @add_codes

View File

@@ -136,29 +136,34 @@ cdef class KnowledgeBase:
         if len(entity_list) != len(freq_list) or len(entity_list) != len(vector_list):
             raise ValueError(Errors.E140)

-        nr_entities = len(entity_list)
+        nr_entities = len(set(entity_list))
         self._entry_index = PreshMap(nr_entities+1)
         self._entries = entry_vec(nr_entities+1)

         i = 0
         cdef KBEntryC entry
         cdef hash_t entity_hash
-        while i < nr_entities:
-            entity_vector = vector_list[i]
-            if len(entity_vector) != self.entity_vector_length:
-                raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length))
+        while i < len(entity_list):
+            # only process this entity if its unique ID hadn't been added before
             entity_hash = self.vocab.strings.add(entity_list[i])
-            entry.entity_hash = entity_hash
-            entry.freq = freq_list[i]
-            vector_index = self.c_add_vector(entity_vector=vector_list[i])
-            entry.vector_index = vector_index
-            entry.feats_row = -1  # Features table currently not implemented
-            self._entries[i+1] = entry
-            self._entry_index[entity_hash] = i+1
+            if entity_hash in self._entry_index:
+                user_warning(Warnings.W018.format(entity=entity_list[i]))
+            else:
+                entity_vector = vector_list[i]
+                if len(entity_vector) != self.entity_vector_length:
+                    raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length))
+                entry.entity_hash = entity_hash
+                entry.freq = freq_list[i]
+                vector_index = self.c_add_vector(entity_vector=vector_list[i])
+                entry.vector_index = vector_index
+                entry.feats_row = -1  # Features table currently not implemented
+                self._entries[i+1] = entry
+                self._entry_index[entity_hash] = i+1

             i += 1
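
The practical effect of this change is easiest to see from Python: passing the same entity identifier twice now emits warning W018 and keeps a single entry instead of corrupting the index. A small sketch, with values that mirror the regression test added further down in this commit:

from spacy.kb import KnowledgeBase
from spacy.lang.en import English

nlp = English()
kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)

# "Q1" appears twice: the second occurrence should trigger W018 and be skipped
kb.set_entities(
    entity_list=["Q1", "Q1"],
    freq_list=[32, 111],
    vector_list=[[0.9, 1.1, 1.01], [1.8, 2.25, 2.01]],
)
assert kb.get_size_entities() == 1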

View File

@@ -677,7 +677,9 @@ def _get_attr_values(spec, string_store):
             value = string_store.add(value)
         elif isinstance(value, bool):
             value = int(value)
-        elif isinstance(value, (dict, int)):
+        elif isinstance(value, int):
+            pass
+        elif isinstance(value, dict):
             continue
         else:
             raise ValueError(Errors.E153.format(vtype=type(value).__name__))
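
For context, `_get_attr_values` has to distinguish plain integer attribute values, which are compared directly against the token attribute, from dict values, which are rich-comparison predicates handled elsewhere; the old `(dict, int)` branch skipped both. A hedged sketch of the two pattern styles involved, using the Matcher API as of spaCy v2.2:

from spacy.lang.en import English
from spacy.matcher import Matcher

nlp = English()
matcher = Matcher(nlp.vocab)
# integer value: matched directly against the token's LENGTH attribute
matcher.add("FOUR_CHARS", None, [{"LENGTH": 4}])
# dict value: a predicate, processed by the extension/predicate machinery instead
matcher.add("LONG_TOKEN", None, [{"LENGTH": {">=": 10}}])

doc = nlp("This sentence has extraordinarily long words")
matches = matcher(doc)  # both patterns should produce matches here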

View File

@@ -1142,7 +1142,7 @@ cdef class EntityRecognizer(Parser):
 @component(
     "entity_linker",
-    requires=["doc.ents", "token.ent_iob", "token.ent_type"],
+    requires=["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"],
     assigns=["token.ent_kb_id"]
 )
 class EntityLinker(Pipe):

@@ -1220,13 +1220,20 @@ class EntityLinker(Pipe):
                 for entity, kb_dict in gold.links.items():
                     start, end = entity
                     mention = doc.text[start:end]
                     # the gold annotations should link to proper entities - if this fails, the dataset is likely corrupt
+                    if not (start, end) in ents_by_offset:
+                        raise RuntimeError(Errors.E188)
                     ent = ents_by_offset[(start, end)]
                     for kb_id, value in kb_dict.items():
                         # Currently only training on the positive instances
                         if value:
-                            sentence_docs.append(ent.sent.as_doc())
+                            try:
+                                sentence_docs.append(ent.sent.as_doc())
+                            except AttributeError:
+                                # Catch the exception when ent.sent is None and provide a user-friendly warning
+                                raise RuntimeError(Errors.E030)

         sentence_encodings, bp_context = self.model.begin_update(sentence_docs, drop=drop)
         loss, d_scores = self.get_similarity_loss(scores=sentence_encodings, golds=golds, docs=None)
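
In other words, E188 fires when a gold link offset has no corresponding entity on the processed Doc. A rough illustration of the precondition the update step now enforces, assuming an `nlp` pipeline whose NER or EntityRuler actually produces the annotated mention, and a sentencizer or parser so `ent.sent` is defined; the text, offsets and QID below are illustrative:

text = "Russ Cochran his reprints include EC Comics."
gold_links = {(0, 12): {"Q7381115": 1.0}}

doc = nlp(text)
ents_by_offset = {(ent.start_char, ent.end_char): ent for ent in doc.ents}

for (start, end), kb_dict in gold_links.items():
    # mirrors the check added above: every gold offset must line up with a predicted entity
    if (start, end) not in ents_by_offset:
        raise RuntimeError("gold EL annotation does not match any entity in doc.ents")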

View File

@@ -69,7 +69,8 @@ cdef class ParserBeam(object):
         cdef StateC* st
         for state in states:
             beam = Beam(self.moves.n_moves, width, min_density=density)
-            beam.initialize(self.moves.init_beam_state, state.c.length,
+            beam.initialize(self.moves.init_beam_state,
+                            self.moves.del_beam_state, state.c.length,
                             state.c._sent)
             for i in range(beam.width):
                 st = <StateC*>beam.at(i)

View File

@@ -324,10 +324,16 @@ cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
     return <void*>st


+cdef int _del_state(Pool mem, void* state, void* x) except -1:
+    cdef StateC* st = <StateC*>state
+    del st
+
+
 cdef class ArcEager(TransitionSystem):
     def __init__(self, *args, **kwargs):
         TransitionSystem.__init__(self, *args, **kwargs)
         self.init_beam_state = _init_state
+        self.del_beam_state = _del_state

     @classmethod
     def get_actions(cls, **kwargs):

View File

@@ -33,6 +33,8 @@ ctypedef int (*do_func_t)(StateC* state, attr_t label) nogil

 ctypedef void* (*init_state_t)(Pool mem, int length, void* tokens) except NULL

+ctypedef int (*del_state_t)(Pool mem, void* state, void* extra_args) except -1
+
 cdef class TransitionSystem:
     cdef Pool mem
     cdef StringStore strings

@@ -42,6 +44,7 @@ cdef class TransitionSystem:
     cdef public attr_t root_label
     cdef public freqs
     cdef init_state_t init_beam_state
+    cdef del_state_t del_beam_state
     cdef public object labels

     cdef int initialize_state(self, StateC* state) nogil

View File

@@ -30,6 +30,11 @@ cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
     return <void*>st


+cdef int _del_state(Pool mem, void* state, void* x) except -1:
+    cdef StateC* st = <StateC*>state
+    del st
+
+
 cdef class TransitionSystem:
     def __init__(self, StringStore string_table, labels_by_action=None, min_freq=None):
         self.mem = Pool()

@@ -44,6 +49,7 @@ cdef class TransitionSystem:
         self.initialize_actions(labels_by_action, min_freq=min_freq)
         self.root_label = self.strings.add('ROOT')
         self.init_beam_state = _init_state
+        self.del_beam_state = _del_state

     def __reduce__(self):
         return (self.__class__, (self.strings, self.labels), None, None)

@@ -72,7 +78,8 @@ cdef class TransitionSystem:
         for doc in docs:
             beam = Beam(self.n_moves, beam_width, min_density=beam_density)
-            beam.initialize(self.init_beam_state, doc.length, doc.c)
+            beam.initialize(self.init_beam_state, self.del_beam_state,
+                            doc.length, doc.c)
             for i in range(beam.width):
                 state = <StateC*>beam.at(i)
                 state.offset = offset

View File

@@ -32,6 +32,24 @@ def doc_not_parsed(en_tokenizer):
     return doc


+@pytest.mark.parametrize(
+    "i_sent,i,j,text",
+    [
+        (0, 0, len("This is a"), "This is a"),
+        (1, 0, len("This is another"), "This is another"),
+        (2, len("And "), len("And ") + len("a third"), "a third"),
+        (0, 1, 2, None),
+    ],
+)
+def test_char_span(doc, i_sent, i, j, text):
+    sents = list(doc.sents)
+    span = sents[i_sent].char_span(i, j)
+    if not text:
+        assert not span
+    else:
+        assert span.text == text
+
+
 def test_spans_sent_spans(doc):
     sents = list(doc.sents)
     assert sents[0].start == 0

View File

@@ -0,0 +1,34 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from spacy.kb import KnowledgeBase
+from spacy.util import ensure_path
+from spacy.lang.en import English
+
+from spacy.tests.util import make_tempdir
+
+
+def test_issue4674():
+    """Test that setting entities with overlapping identifiers does not mess up IO"""
+    nlp = English()
+    kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
+
+    vector1 = [0.9, 1.1, 1.01]
+    vector2 = [1.8, 2.25, 2.01]
+    kb.set_entities(entity_list=["Q1", "Q1"], freq_list=[32, 111], vector_list=[vector1, vector2])
+
+    assert kb.get_size_entities() == 1
+
+    # dumping to file & loading back in
+    with make_tempdir() as d:
+        dir_path = ensure_path(d)
+        if not dir_path.exists():
+            dir_path.mkdir()
+        file_path = dir_path / "kb"
+        kb.dump(str(file_path))
+
+        kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
+        kb2.load_bulk(str(file_path))
+
+        assert kb2.get_size_entities() == 1

View File

@@ -584,6 +584,22 @@ cdef class Span:
         else:
             return self.doc[root]

+    def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None):
+        """Create a `Span` object from the slice `span.text[start : end]`.
+
+        start (int): The index of the first character of the span.
+        end (int): The index of the first character after the span.
+        label (uint64 or string): A label to attach to the Span, e.g. for
+            named entities.
+        kb_id (uint64 or string): An ID from a KB to capture the meaning of a named entity.
+        vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
+            the span.
+        RETURNS (Span): The newly constructed object.
+        """
+        start_idx += self.start_char
+        end_idx += self.start_char
+        return self.doc.char_span(start_idx, end_idx)
+
     @property
     def conjuncts(self):
         """Tokens that are conjoined to the span's root.

View File

@@ -7,7 +7,7 @@ source: spacy/tokens/doc.pyx
 A `Doc` is a sequence of [`Token`](/api/token) objects. Access sentences and
 named entities, export annotations to numpy arrays, losslessly serialize to
-compressed binary strings. The `Doc` object holds an array of `TokenC]` structs.
+compressed binary strings. The `Doc` object holds an array of [`TokenC`](/api/cython-structs#tokenc) structs.
 The Python-level `Token` and [`Span`](/api/span) objects are views of this
 array, i.e. they don't own the data themselves.

View File

@@ -1261,6 +1261,21 @@
         },
         "category": ["podcasts"]
     },
+    {
+        "type": "education",
+        "id": "practical-ai-podcast",
+        "title": "Practical AI: Modern NLP with spaCy",
+        "slogan": "December 2019",
+        "description": "\"SpaCy is awesome for NLP! It's easy to use, has widespread adoption, is open source, and integrates the latest language models. Ines Montani and Matthew Honnibal (core developers of spaCy and co-founders of Explosion) join us to discuss the history of the project, its capabilities, and the latest trends in NLP. We also dig into the practicalities of taking NLP workflows to production. You don't want to miss this episode!\"",
+        "thumb": "https://i.imgur.com/jn8Bcdw.png",
+        "url": "https://changelog.com/practicalai/68",
+        "author": "Daniel Whitenack & Chris Benson",
+        "author_links": {
+            "website": "https://changelog.com/practicalai",
+            "twitter": "https://twitter.com/PracticalAIFM"
+        },
+        "category": ["podcasts"]
+    },
     {
         "id": "adam_qas",
         "title": "ADAM: Question Answering System",