Merge branch 'master' into spacy.io

Ines Montani 2019-12-13 15:57:49 +01:00
commit ae9fac2d87
17 changed files with 172 additions and 47 deletions

View File

@@ -8,8 +8,8 @@ For more details, see the documentation:
* Knowledge base: https://spacy.io/api/kb
* Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking
Compatible with: spaCy v2.2
Last tested with: v2.2
Compatible with: spaCy v2.2.3
Last tested with: v2.2.3
"""
from __future__ import unicode_literals, print_function
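As background for the two example scripts in this commit, a minimal sketch of the `KnowledgeBase` API they rely on; the entity ID, frequency, vector and alias values below are placeholders, not data from this commit:

from spacy.kb import KnowledgeBase
from spacy.lang.en import English

nlp = English()
# every entity vector stored in a KB must have the same, fixed length
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
kb.add_entity(entity="Q2146908", freq=12, entity_vector=[6.0, -4.0, 3.0])
kb.add_alias(alias="Russ Cochran", entities=["Q2146908"], probabilities=[0.8])
kb.dump("my_kb")  # the training script below loads this again with kb.load_bulk("my_kb")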

View File

@@ -8,8 +8,8 @@ For more details, see the documentation:
* Training: https://spacy.io/usage/training
* Entity Linking: https://spacy.io/usage/linguistic-features#entity-linking
Compatible with: spaCy v2.2
Last tested with: v2.2
Compatible with: spaCy v2.2.3
Last tested with: v2.2.3
"""
from __future__ import unicode_literals, print_function
@@ -22,6 +22,7 @@ from spacy.vocab import Vocab
import spacy
from spacy.kb import KnowledgeBase
from spacy.pipeline import EntityRuler
from spacy.tokens import Span
from spacy.util import minibatch, compounding
@@ -70,22 +71,35 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
nlp.vocab.vectors.name = "spacy_pretrained_vectors"
print("Created blank 'en' model with vocab from '%s'" % vocab_path)
# create the built-in pipeline components and add them to the pipeline
# nlp.create_pipe works for built-ins that are registered with spaCy
# Add a sentencizer component. Alternatively, add a dependency parser for higher accuracy.
nlp.add_pipe(nlp.create_pipe('sentencizer'))
# Add a custom component to recognize "Russ Cochran" as an entity for the example training data.
# Note that in a realistic application, an actual NER algorithm should be used instead.
ruler = EntityRuler(nlp)
patterns = [{"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]}]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)
# Create the Entity Linker component and add it to the pipeline.
if "entity_linker" not in nlp.pipe_names:
entity_linker = nlp.create_pipe("entity_linker")
# use only the predicted EL score and not the prior probability (for demo purposes)
cfg = {"incl_prior": False}
entity_linker = nlp.create_pipe("entity_linker", cfg)
kb = KnowledgeBase(vocab=nlp.vocab)
kb.load_bulk(kb_path)
print("Loaded Knowledge Base from '%s'" % kb_path)
entity_linker.set_kb(kb)
nlp.add_pipe(entity_linker, last=True)
else:
entity_linker = nlp.get_pipe("entity_linker")
kb = entity_linker.kb
# make sure the annotated examples correspond to known identifiers in the knowledge base
kb_ids = kb.get_entity_strings()
# Convert the texts to docs to make sure we have doc.ents set for the training examples.
# Also ensure that the annotated examples correspond to known identifiers in the knowledge base.
kb_ids = nlp.get_pipe("entity_linker").kb.get_entity_strings()
TRAIN_DOCS = []
for text, annotation in TRAIN_DATA:
with nlp.disable_pipes("entity_linker"):
doc = nlp(text)
annotation_clean = annotation
for offset, kb_id_dict in annotation["links"].items():
new_dict = {}
for kb_id, value in kb_id_dict.items():
@@ -95,7 +109,8 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
print(
"Removed", kb_id, "from training because it is not in the KB."
)
annotation["links"][offset] = new_dict
annotation_clean["links"][offset] = new_dict
TRAIN_DOCS.append((doc, annotation_clean))
# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "entity_linker"]
@@ -103,10 +118,10 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
# reset and initialize the weights randomly
optimizer = nlp.begin_training()
for itn in range(n_iter):
random.shuffle(TRAIN_DATA)
random.shuffle(TRAIN_DOCS)
losses = {}
# batch up the examples using spaCy's minibatch
batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
batches = minibatch(TRAIN_DOCS, size=compounding(4.0, 32.0, 1.001))
for batch in batches:
texts, annotations = zip(*batch)
nlp.update(
@@ -138,16 +153,8 @@ def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
def _apply_model(nlp):
for text, annotation in TRAIN_DATA:
doc = nlp.tokenizer(text)
# set entities so the evaluation is independent of the NER step
# all the examples contain 'Russ Cochran' as the first two tokens in the sentence
rc_ent = Span(doc, 0, 2, label=PERSON)
doc.ents = [rc_ent]
# apply the entity linker which will now make predictions for the 'Russ Cochran' entities
doc = nlp.get_pipe("entity_linker")(doc)
doc = nlp(text)
print()
print("Entities", [(ent.text, ent.label_, ent.kb_id_) for ent in doc.ents])
print("Tokens", [(t.text, t.ent_type_, t.ent_kb_id_) for t in doc])

View File

@@ -1,7 +1,7 @@
# Our libraries
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=7.3.0,<7.4.0
thinc==7.4.0.dev0
blis>=0.4.0,<0.5.0
murmurhash>=0.28.0,<1.1.0
wasabi>=0.4.0,<1.1.0

View File

@@ -38,13 +38,13 @@ setup_requires =
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0
thinc>=7.3.0,<7.4.0
thinc==7.4.0.dev0
install_requires =
# Our libraries
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=7.3.0,<7.4.0
thinc==7.4.0.dev0
blis>=0.4.0,<0.5.0
wasabi>=0.4.0,<1.1.0
srsly>=0.1.0,<1.1.0

View File

@@ -81,7 +81,8 @@ class Warnings(object):
"Future versions may introduce a `n_process` argument for "
"parallel inference via multiprocessing.")
W017 = ("Alias '{alias}' already exists in the Knowledge Base.")
W018 = ("Entity '{entity}' already exists in the Knowledge Base.")
W018 = ("Entity '{entity}' already exists in the Knowledge Base - "
"ignoring the duplicate entry.")
W019 = ("Changing vectors name from {old} to {new}, to avoid clash with "
"previously loaded vectors. See Issue #3853.")
W020 = ("Unnamed vectors. This won't allow multiple vectors models to be "
@@ -531,6 +532,9 @@ class Errors(object):
"{obj}.{attr}\nAttribute '{attr}' does not exist on {obj}.")
E186 = ("'{tok_a}' and '{tok_b}' are different texts.")
E187 = ("Only unicode strings are supported as labels.")
E188 = ("Could not match the gold entity links to entities in the doc - "
"make sure the gold EL data refers to valid results of the "
"named entity recognizer in the `nlp` pipeline.")
@add_codes

View File

@@ -136,29 +136,34 @@ cdef class KnowledgeBase:
if len(entity_list) != len(freq_list) or len(entity_list) != len(vector_list):
raise ValueError(Errors.E140)
nr_entities = len(entity_list)
nr_entities = len(set(entity_list))
self._entry_index = PreshMap(nr_entities+1)
self._entries = entry_vec(nr_entities+1)
i = 0
cdef KBEntryC entry
cdef hash_t entity_hash
while i < nr_entities:
entity_vector = vector_list[i]
if len(entity_vector) != self.entity_vector_length:
raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length))
while i < len(entity_list):
# only process this entity if its unique ID hasn't been added before
entity_hash = self.vocab.strings.add(entity_list[i])
entry.entity_hash = entity_hash
entry.freq = freq_list[i]
if entity_hash in self._entry_index:
user_warning(Warnings.W018.format(entity=entity_list[i]))
vector_index = self.c_add_vector(entity_vector=vector_list[i])
entry.vector_index = vector_index
else:
entity_vector = vector_list[i]
if len(entity_vector) != self.entity_vector_length:
raise ValueError(Errors.E141.format(found=len(entity_vector), required=self.entity_vector_length))
entry.feats_row = -1 # Features table currently not implemented
entry.entity_hash = entity_hash
entry.freq = freq_list[i]
self._entries[i+1] = entry
self._entry_index[entity_hash] = i+1
vector_index = self.c_add_vector(entity_vector=vector_list[i])
entry.vector_index = vector_index
entry.feats_row = -1 # Features table currently not implemented
self._entries[i+1] = entry
self._entry_index[entity_hash] = i+1
i += 1

View File

@@ -677,7 +677,9 @@ def _get_attr_values(spec, string_store):
value = string_store.add(value)
elif isinstance(value, bool):
value = int(value)
elif isinstance(value, (dict, int)):
elif isinstance(value, int):
pass
elif isinstance(value, dict):
continue
else:
raise ValueError(Errors.E153.format(vtype=type(value).__name__))

View File

@@ -1142,7 +1142,7 @@ cdef class EntityRecognizer(Parser):
@component(
"entity_linker",
requires=["doc.ents", "token.ent_iob", "token.ent_type"],
requires=["doc.ents", "doc.sents", "token.ent_iob", "token.ent_type"],
assigns=["token.ent_kb_id"]
)
class EntityLinker(Pipe):
@@ -1220,13 +1220,20 @@ class EntityLinker(Pipe):
for entity, kb_dict in gold.links.items():
start, end = entity
mention = doc.text[start:end]
# the gold annotations should link to proper entities - if this fails, the dataset is likely corrupt
if not (start, end) in ents_by_offset:
raise RuntimeError(Errors.E188)
ent = ents_by_offset[(start, end)]
for kb_id, value in kb_dict.items():
# Currently only training on the positive instances
if value:
sentence_docs.append(ent.sent.as_doc())
try:
sentence_docs.append(ent.sent.as_doc())
except AttributeError:
# Catch the exception when ent.sent is None and raise a more informative error
raise RuntimeError(Errors.E030)
sentence_encodings, bp_context = self.model.begin_update(sentence_docs, drop=drop)
loss, d_scores = self.get_similarity_loss(scores=sentence_encodings, golds=golds, docs=None)

View File

@@ -69,7 +69,8 @@ cdef class ParserBeam(object):
cdef StateC* st
for state in states:
beam = Beam(self.moves.n_moves, width, min_density=density)
beam.initialize(self.moves.init_beam_state, state.c.length,
beam.initialize(self.moves.init_beam_state,
self.moves.del_beam_state, state.c.length,
state.c._sent)
for i in range(beam.width):
st = <StateC*>beam.at(i)

View File

@@ -324,10 +324,16 @@ cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
return <void*>st
cdef int _del_state(Pool mem, void* state, void* x) except -1:
cdef StateC* st = <StateC*>state
del st
cdef class ArcEager(TransitionSystem):
def __init__(self, *args, **kwargs):
TransitionSystem.__init__(self, *args, **kwargs)
self.init_beam_state = _init_state
self.del_beam_state = _del_state
@classmethod
def get_actions(cls, **kwargs):

View File

@@ -33,6 +33,8 @@ ctypedef int (*do_func_t)(StateC* state, attr_t label) nogil
ctypedef void* (*init_state_t)(Pool mem, int length, void* tokens) except NULL
ctypedef int (*del_state_t)(Pool mem, void* state, void* extra_args) except -1
cdef class TransitionSystem:
cdef Pool mem
cdef StringStore strings
@@ -42,6 +44,7 @@ cdef class TransitionSystem:
cdef public attr_t root_label
cdef public freqs
cdef init_state_t init_beam_state
cdef del_state_t del_beam_state
cdef public object labels
cdef int initialize_state(self, StateC* state) nogil

View File

@@ -30,6 +30,11 @@ cdef void* _init_state(Pool mem, int length, void* tokens) except NULL:
return <void*>st
cdef int _del_state(Pool mem, void* state, void* x) except -1:
cdef StateC* st = <StateC*>state
del st
cdef class TransitionSystem:
def __init__(self, StringStore string_table, labels_by_action=None, min_freq=None):
self.mem = Pool()
@@ -44,6 +49,7 @@ cdef class TransitionSystem:
self.initialize_actions(labels_by_action, min_freq=min_freq)
self.root_label = self.strings.add('ROOT')
self.init_beam_state = _init_state
self.del_beam_state = _del_state
def __reduce__(self):
return (self.__class__, (self.strings, self.labels), None, None)
@@ -72,7 +78,8 @@ cdef class TransitionSystem:
for doc in docs:
beam = Beam(self.n_moves, beam_width, min_density=beam_density)
beam.initialize(self.init_beam_state, doc.length, doc.c)
beam.initialize(self.init_beam_state, self.del_beam_state,
doc.length, doc.c)
for i in range(beam.width):
state = <StateC*>beam.at(i)
state.offset = offset

View File

@@ -32,6 +32,24 @@ def doc_not_parsed(en_tokenizer):
return doc
@pytest.mark.parametrize(
"i_sent,i,j,text",
[
(0, 0, len("This is a"), "This is a"),
(1, 0, len("This is another"), "This is another"),
(2, len("And "), len("And ") + len("a third"), "a third"),
(0, 1, 2, None),
],
)
def test_char_span(doc, i_sent, i, j, text):
sents = list(doc.sents)
span = sents[i_sent].char_span(i, j)
if not text:
assert not span
else:
assert span.text == text
def test_spans_sent_spans(doc):
sents = list(doc.sents)
assert sents[0].start == 0

View File

@@ -0,0 +1,34 @@
# coding: utf-8
from __future__ import unicode_literals
from spacy.kb import KnowledgeBase
from spacy.util import ensure_path
from spacy.lang.en import English
from spacy.tests.util import make_tempdir
def test_issue4674():
"""Test that setting entities with overlapping identifiers does not mess up IO"""
nlp = English()
kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
vector1 = [0.9, 1.1, 1.01]
vector2 = [1.8, 2.25, 2.01]
kb.set_entities(entity_list=["Q1", "Q1"], freq_list=[32, 111], vector_list=[vector1, vector2])
assert kb.get_size_entities() == 1
# dumping to file & loading back in
with make_tempdir() as d:
dir_path = ensure_path(d)
if not dir_path.exists():
dir_path.mkdir()
file_path = dir_path / "kb"
kb.dump(str(file_path))
kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)
kb2.load_bulk(str(file_path))
assert kb2.get_size_entities() == 1

View File

@@ -584,6 +584,22 @@ cdef class Span:
else:
return self.doc[root]
def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None):
"""Create a `Span` object from the slice `span.text[start : end]`.
start (int): The index of the first character of the span.
end (int): The index of the first character after the span.
label (uint64 or string): A label to attach to the Span, e.g. for
named entities.
kb_id (uint64 or string): An ID from a KB to capture the meaning of a named entity.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
the span.
RETURNS (Span): The newly constructed object.
"""
start_idx += self.start_char
end_idx += self.start_char
return self.doc.char_span(start_idx, end_idx)
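A brief usage sketch of the new method on made-up text; note that start_idx and end_idx are interpreted relative to the span's own text, since they are shifted by self.start_char before being handed to doc.char_span:

from spacy.lang.en import English

nlp = English()
doc = nlp("I like New York in Autumn.")
span = doc[2:6]                           # "New York in Autumn"
sub = span.char_span(0, len("New York"))  # offsets are relative to span.text
assert sub.text == "New York"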
@property
def conjuncts(self):
"""Tokens that are conjoined to the span's root.

View File

@@ -7,7 +7,7 @@ source: spacy/tokens/doc.pyx
A `Doc` is a sequence of [`Token`](/api/token) objects. Access sentences and
named entities, export annotations to numpy arrays, losslessly serialize to
compressed binary strings. The `Doc` object holds an array of `TokenC]` structs.
compressed binary strings. The `Doc` object holds an array of [`TokenC`](/api/cython-structs#tokenc) structs.
The Python-level `Token` and [`Span`](/api/span) objects are views of this
array, i.e. they don't own the data themselves.
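For example, a short sketch of those operations (assuming the en_core_web_sm model is installed; the sentence is made up):

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Ada Lovelace was born in London.")
print([(ent.text, ent.label_) for ent in doc.ents])  # named entities
print([sent.text for sent in doc.sents])             # sentences
data = doc.to_bytes()                                 # lossless binary serialization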

View File

@@ -1261,6 +1261,21 @@
},
"category": ["podcasts"]
},
{
"type": "education",
"id": "practical-ai-podcast",
"title": "Practical AI: Modern NLP with spaCy",
"slogan": "December 2019",
"description": "\"SpaCy is awesome for NLP! Its easy to use, has widespread adoption, is open source, and integrates the latest language models. Ines Montani and Matthew Honnibal (core developers of spaCy and co-founders of Explosion) join us to discuss the history of the project, its capabilities, and the latest trends in NLP. We also dig into the practicalities of taking NLP workflows to production. You dont want to miss this episode!\"",
"thumb": "https://i.imgur.com/jn8Bcdw.png",
"url": "https://changelog.com/practicalai/68",
"author": "Daniel Whitenack & Chris Benson",
"author_links": {
"website": "https://changelog.com/practicalai",
"twitter": "https://twitter.com/PracticalAIFM"
},
"category": ["podcasts"]
},
{
"id": "adam_qas",
"title": "ADAM: Question Answering System",