mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-11 17:56:30 +03:00
💫 Break up large pipeline.pyx (#3246)
* Break up large pipeline.pyx * Merge some components back together * Fix typo
This commit is contained in:
parent
e7593b791e
commit
a9f8d17632
2
setup.py
2
setup.py
|
@ -41,7 +41,7 @@ MOD_NAMES = [
|
||||||
"spacy.vocab",
|
"spacy.vocab",
|
||||||
"spacy.attrs",
|
"spacy.attrs",
|
||||||
"spacy.morphology",
|
"spacy.morphology",
|
||||||
"spacy.pipeline",
|
"spacy.pipeline.pipes",
|
||||||
"spacy.syntax.stateclass",
|
"spacy.syntax.stateclass",
|
||||||
"spacy.syntax._state",
|
"spacy.syntax._state",
|
||||||
"spacy.tokenizer",
|
"spacy.tokenizer",
|
||||||
|
|
8
spacy/pipeline/__init__.py
Normal file
8
spacy/pipeline/__init__.py
Normal file
|
@ -0,0 +1,8 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from .pipes import Tagger, DependencyParser, EntityRecognizer # noqa
|
||||||
|
from .pipes import TextCategorizer, Tensorizer, Pipe # noqa
|
||||||
|
from .entityruler import EntityRuler # noqa
|
||||||
|
from .hooks import SentenceSegmenter, SimilarityHook # noqa
|
||||||
|
from .functions import merge_entities, merge_noun_chunks, merge_subtokens # noqa
|
170
spacy/pipeline/entityruler.py
Normal file
170
spacy/pipeline/entityruler.py
Normal file
|
@ -0,0 +1,170 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from collections import defaultdict
|
||||||
|
import srsly
|
||||||
|
|
||||||
|
from ..errors import Errors
|
||||||
|
from ..compat import basestring_
|
||||||
|
from ..util import ensure_path
|
||||||
|
from ..tokens import Span
|
||||||
|
from ..matcher import Matcher, PhraseMatcher
|
||||||
|
|
||||||
|
|
||||||
|
class EntityRuler(object):
|
||||||
|
name = "entity_ruler"
|
||||||
|
|
||||||
|
def __init__(self, nlp, **cfg):
|
||||||
|
"""Initialise the entitiy ruler. If patterns are supplied here, they
|
||||||
|
need to be a list of dictionaries with a `"label"` and `"pattern"`
|
||||||
|
key. A pattern can either be a token pattern (list) or a phrase pattern
|
||||||
|
(string). For example: `{'label': 'ORG', 'pattern': 'Apple'}`.
|
||||||
|
|
||||||
|
nlp (Language): The shared nlp object to pass the vocab to the matchers
|
||||||
|
and process phrase patterns.
|
||||||
|
patterns (iterable): Optional patterns to load in.
|
||||||
|
overwrite_ents (bool): If existing entities are present, e.g. entities
|
||||||
|
added by the model, overwrite them by matches if necessary.
|
||||||
|
**cfg: Other config parameters. If pipeline component is loaded as part
|
||||||
|
of a model pipeline, this will include all keyword arguments passed
|
||||||
|
to `spacy.load`.
|
||||||
|
RETURNS (EntityRuler): The newly constructed object.
|
||||||
|
"""
|
||||||
|
self.nlp = nlp
|
||||||
|
self.overwrite = cfg.get("overwrite_ents", False)
|
||||||
|
self.token_patterns = defaultdict(list)
|
||||||
|
self.phrase_patterns = defaultdict(list)
|
||||||
|
self.matcher = Matcher(nlp.vocab)
|
||||||
|
self.phrase_matcher = PhraseMatcher(nlp.vocab)
|
||||||
|
patterns = cfg.get("patterns")
|
||||||
|
if patterns is not None:
|
||||||
|
self.add_patterns(patterns)
|
||||||
|
|
||||||
|
def __len__(self):
|
||||||
|
"""The number of all patterns added to the entity ruler."""
|
||||||
|
n_token_patterns = sum(len(p) for p in self.token_patterns.values())
|
||||||
|
n_phrase_patterns = sum(len(p) for p in self.phrase_patterns.values())
|
||||||
|
return n_token_patterns + n_phrase_patterns
|
||||||
|
|
||||||
|
def __contains__(self, label):
|
||||||
|
"""Whether a label is present in the patterns."""
|
||||||
|
return label in self.token_patterns or label in self.phrase_patterns
|
||||||
|
|
||||||
|
def __call__(self, doc):
|
||||||
|
"""Find matches in document and add them as entities.
|
||||||
|
|
||||||
|
doc (Doc): The Doc object in the pipeline.
|
||||||
|
RETURNS (Doc): The Doc with added entities, if available.
|
||||||
|
"""
|
||||||
|
matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
|
||||||
|
matches = set(
|
||||||
|
[(m_id, start, end) for m_id, start, end in matches if start != end]
|
||||||
|
)
|
||||||
|
get_sort_key = lambda m: (m[2] - m[1], m[1])
|
||||||
|
matches = sorted(matches, key=get_sort_key, reverse=True)
|
||||||
|
entities = list(doc.ents)
|
||||||
|
new_entities = []
|
||||||
|
seen_tokens = set()
|
||||||
|
for match_id, start, end in matches:
|
||||||
|
if any(t.ent_type for t in doc[start:end]) and not self.overwrite:
|
||||||
|
continue
|
||||||
|
# check for end - 1 here because boundaries are inclusive
|
||||||
|
if start not in seen_tokens and end - 1 not in seen_tokens:
|
||||||
|
new_entities.append(Span(doc, start, end, label=match_id))
|
||||||
|
entities = [
|
||||||
|
e for e in entities if not (e.start < end and e.end > start)
|
||||||
|
]
|
||||||
|
seen_tokens.update(range(start, end))
|
||||||
|
doc.ents = entities + new_entities
|
||||||
|
return doc
|
||||||
|
|
||||||
|
@property
|
||||||
|
def labels(self):
|
||||||
|
"""All labels present in the match patterns.
|
||||||
|
|
||||||
|
RETURNS (set): The string labels.
|
||||||
|
"""
|
||||||
|
all_labels = set(self.token_patterns.keys())
|
||||||
|
all_labels.update(self.phrase_patterns.keys())
|
||||||
|
return all_labels
|
||||||
|
|
||||||
|
@property
|
||||||
|
def patterns(self):
|
||||||
|
"""Get all patterns that were added to the entity ruler.
|
||||||
|
|
||||||
|
RETURNS (list): The original patterns, one dictionary per pattern.
|
||||||
|
"""
|
||||||
|
all_patterns = []
|
||||||
|
for label, patterns in self.token_patterns.items():
|
||||||
|
for pattern in patterns:
|
||||||
|
all_patterns.append({"label": label, "pattern": pattern})
|
||||||
|
for label, patterns in self.phrase_patterns.items():
|
||||||
|
for pattern in patterns:
|
||||||
|
all_patterns.append({"label": label, "pattern": pattern.text})
|
||||||
|
return all_patterns
|
||||||
|
|
||||||
|
def add_patterns(self, patterns):
|
||||||
|
"""Add patterns to the entitiy ruler. A pattern can either be a token
|
||||||
|
pattern (list of dicts) or a phrase pattern (string). For example:
|
||||||
|
{'label': 'ORG', 'pattern': 'Apple'}
|
||||||
|
{'label': 'GPE', 'pattern': [{'lower': 'san'}, {'lower': 'francisco'}]}
|
||||||
|
|
||||||
|
patterns (list): The patterns to add.
|
||||||
|
"""
|
||||||
|
for entry in patterns:
|
||||||
|
label = entry["label"]
|
||||||
|
pattern = entry["pattern"]
|
||||||
|
if isinstance(pattern, basestring_):
|
||||||
|
self.phrase_patterns[label].append(self.nlp(pattern))
|
||||||
|
elif isinstance(pattern, list):
|
||||||
|
self.token_patterns[label].append(pattern)
|
||||||
|
else:
|
||||||
|
raise ValueError(Errors.E097.format(pattern=pattern))
|
||||||
|
for label, patterns in self.token_patterns.items():
|
||||||
|
self.matcher.add(label, None, *patterns)
|
||||||
|
for label, patterns in self.phrase_patterns.items():
|
||||||
|
self.phrase_matcher.add(label, None, *patterns)
|
||||||
|
|
||||||
|
def from_bytes(self, patterns_bytes, **kwargs):
|
||||||
|
"""Load the entity ruler from a bytestring.
|
||||||
|
|
||||||
|
patterns_bytes (bytes): The bytestring to load.
|
||||||
|
**kwargs: Other config paramters, mostly for consistency.
|
||||||
|
RETURNS (EntityRuler): The loaded entity ruler.
|
||||||
|
"""
|
||||||
|
patterns = srsly.msgpack_loads(patterns_bytes)
|
||||||
|
self.add_patterns(patterns)
|
||||||
|
return self
|
||||||
|
|
||||||
|
def to_bytes(self, **kwargs):
|
||||||
|
"""Serialize the entity ruler patterns to a bytestring.
|
||||||
|
|
||||||
|
RETURNS (bytes): The serialized patterns.
|
||||||
|
"""
|
||||||
|
return srsly.msgpack_dumps(self.patterns)
|
||||||
|
|
||||||
|
def from_disk(self, path, **kwargs):
|
||||||
|
"""Load the entity ruler from a file. Expects a file containing
|
||||||
|
newline-delimited JSON (JSONL) with one entry per line.
|
||||||
|
|
||||||
|
path (unicode / Path): The JSONL file to load.
|
||||||
|
**kwargs: Other config paramters, mostly for consistency.
|
||||||
|
RETURNS (EntityRuler): The loaded entity ruler.
|
||||||
|
"""
|
||||||
|
path = ensure_path(path)
|
||||||
|
path = path.with_suffix(".jsonl")
|
||||||
|
patterns = srsly.read_jsonl(path)
|
||||||
|
self.add_patterns(patterns)
|
||||||
|
return self
|
||||||
|
|
||||||
|
def to_disk(self, path, **kwargs):
|
||||||
|
"""Save the entity ruler patterns to a directory. The patterns will be
|
||||||
|
saved as newline-delimited JSON (JSONL).
|
||||||
|
|
||||||
|
path (unicode / Path): The JSONL file to load.
|
||||||
|
**kwargs: Other config paramters, mostly for consistency.
|
||||||
|
RETURNS (EntityRuler): The loaded entity ruler.
|
||||||
|
"""
|
||||||
|
path = ensure_path(path)
|
||||||
|
path = path.with_suffix(".jsonl")
|
||||||
|
srsly.write_jsonl(path, self.patterns)
|
48
spacy/pipeline/functions.py
Normal file
48
spacy/pipeline/functions.py
Normal file
|
@ -0,0 +1,48 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from ..matcher import Matcher
|
||||||
|
|
||||||
|
|
||||||
|
# TODO: replace doc.merge with doc.retokenize
|
||||||
|
|
||||||
|
|
||||||
|
def merge_noun_chunks(doc):
|
||||||
|
"""Merge noun chunks into a single token.
|
||||||
|
|
||||||
|
doc (Doc): The Doc object.
|
||||||
|
RETURNS (Doc): The Doc object with merged noun chunks.
|
||||||
|
"""
|
||||||
|
if not doc.is_parsed:
|
||||||
|
return doc
|
||||||
|
spans = [
|
||||||
|
(np.start_char, np.end_char, np.root.tag, np.root.dep) for np in doc.noun_chunks
|
||||||
|
]
|
||||||
|
for start, end, tag, dep in spans:
|
||||||
|
doc.merge(start, end, tag=tag, dep=dep)
|
||||||
|
return doc
|
||||||
|
|
||||||
|
|
||||||
|
def merge_entities(doc):
|
||||||
|
"""Merge entities into a single token.
|
||||||
|
|
||||||
|
doc (Doc): The Doc object.
|
||||||
|
RETURNS (Doc): The Doc object with merged noun entities.
|
||||||
|
"""
|
||||||
|
spans = [
|
||||||
|
(e.start_char, e.end_char, e.root.tag, e.root.dep, e.label) for e in doc.ents
|
||||||
|
]
|
||||||
|
for start, end, tag, dep, ent_type in spans:
|
||||||
|
doc.merge(start, end, tag=tag, dep=dep, ent_type=ent_type)
|
||||||
|
return doc
|
||||||
|
|
||||||
|
|
||||||
|
def merge_subtokens(doc, label="subtok"):
|
||||||
|
merger = Matcher(doc.vocab)
|
||||||
|
merger.add("SUBTOK", None, [{"DEP": label, "op": "+"}])
|
||||||
|
matches = merger(doc)
|
||||||
|
spans = [doc[start : end + 1] for _, start, end in matches]
|
||||||
|
offsets = [(span.start_char, span.end_char) for span in spans]
|
||||||
|
for start_char, end_char in offsets:
|
||||||
|
doc.merge(start_char, end_char)
|
||||||
|
return doc
|
100
spacy/pipeline/hooks.py
Normal file
100
spacy/pipeline/hooks.py
Normal file
|
@ -0,0 +1,100 @@
|
||||||
|
# coding: utf8
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from thinc.t2v import Pooling, max_pool, mean_pool
|
||||||
|
from thinc.neural._classes.difference import Siamese, CauchySimilarity
|
||||||
|
|
||||||
|
from .pipes import Pipe
|
||||||
|
from .._ml import link_vectors_to_models
|
||||||
|
|
||||||
|
|
||||||
|
class SentenceSegmenter(object):
|
||||||
|
"""A simple spaCy hook, to allow custom sentence boundary detection logic
|
||||||
|
(that doesn't require the dependency parse). To change the sentence
|
||||||
|
boundary detection strategy, pass a generator function `strategy` on
|
||||||
|
initialization, or assign a new strategy to the .strategy attribute.
|
||||||
|
Sentence detection strategies should be generators that take `Doc` objects
|
||||||
|
and yield `Span` objects for each sentence.
|
||||||
|
"""
|
||||||
|
|
||||||
|
name = "sentencizer"
|
||||||
|
|
||||||
|
def __init__(self, vocab, strategy=None):
|
||||||
|
self.vocab = vocab
|
||||||
|
if strategy is None or strategy == "on_punct":
|
||||||
|
strategy = self.split_on_punct
|
||||||
|
self.strategy = strategy
|
||||||
|
|
||||||
|
def __call__(self, doc):
|
||||||
|
doc.user_hooks["sents"] = self.strategy
|
||||||
|
return doc
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def split_on_punct(doc):
|
||||||
|
start = 0
|
||||||
|
seen_period = False
|
||||||
|
for i, word in enumerate(doc):
|
||||||
|
if seen_period and not word.is_punct:
|
||||||
|
yield doc[start : word.i]
|
||||||
|
start = word.i
|
||||||
|
seen_period = False
|
||||||
|
elif word.text in [".", "!", "?"]:
|
||||||
|
seen_period = True
|
||||||
|
if start < len(doc):
|
||||||
|
yield doc[start : len(doc)]
|
||||||
|
|
||||||
|
|
||||||
|
class SimilarityHook(Pipe):
|
||||||
|
"""
|
||||||
|
Experimental: A pipeline component to install a hook for supervised
|
||||||
|
similarity into `Doc` objects. Requires a `Tensorizer` to pre-process
|
||||||
|
documents. The similarity model can be any object obeying the Thinc `Model`
|
||||||
|
interface. By default, the model concatenates the elementwise mean and
|
||||||
|
elementwise max of the two tensors, and compares them using the
|
||||||
|
Cauchy-like similarity function from Chen (2013):
|
||||||
|
|
||||||
|
>>> similarity = 1. / (1. + (W * (vec1-vec2)**2).sum())
|
||||||
|
|
||||||
|
Where W is a vector of dimension weights, initialized to 1.
|
||||||
|
"""
|
||||||
|
|
||||||
|
name = "similarity"
|
||||||
|
|
||||||
|
def __init__(self, vocab, model=True, **cfg):
|
||||||
|
self.vocab = vocab
|
||||||
|
self.model = model
|
||||||
|
self.cfg = dict(cfg)
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def Model(cls, length):
|
||||||
|
return Siamese(Pooling(max_pool, mean_pool), CauchySimilarity(length))
|
||||||
|
|
||||||
|
def __call__(self, doc):
|
||||||
|
"""Install similarity hook"""
|
||||||
|
doc.user_hooks["similarity"] = self.predict
|
||||||
|
return doc
|
||||||
|
|
||||||
|
def pipe(self, docs, **kwargs):
|
||||||
|
for doc in docs:
|
||||||
|
yield self(doc)
|
||||||
|
|
||||||
|
def predict(self, doc1, doc2):
|
||||||
|
self.require_model()
|
||||||
|
return self.model.predict([(doc1, doc2)])
|
||||||
|
|
||||||
|
def update(self, doc1_doc2, golds, sgd=None, drop=0.0):
|
||||||
|
self.require_model()
|
||||||
|
sims, bp_sims = self.model.begin_update(doc1_doc2, drop=drop)
|
||||||
|
|
||||||
|
def begin_training(self, _=tuple(), pipeline=None, sgd=None, **kwargs):
|
||||||
|
"""Allocate model, using width from tensorizer in pipeline.
|
||||||
|
|
||||||
|
gold_tuples (iterable): Gold-standard training data.
|
||||||
|
pipeline (list): The pipeline the model is part of.
|
||||||
|
"""
|
||||||
|
if self.model is True:
|
||||||
|
self.model = self.Model(pipeline[0].model.nO)
|
||||||
|
link_vectors_to_models(self.vocab)
|
||||||
|
if sgd is None:
|
||||||
|
sgd = self.create_optimizer()
|
||||||
|
return sgd
|
|
@ -3,271 +3,39 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import numpy
|
|
||||||
cimport numpy as np
|
cimport numpy as np
|
||||||
from collections import OrderedDict, defaultdict
|
|
||||||
|
import numpy
|
||||||
|
from collections import OrderedDict
|
||||||
import srsly
|
import srsly
|
||||||
|
|
||||||
from thinc.api import chain
|
from thinc.api import chain
|
||||||
from thinc.v2v import Affine, Maxout, Softmax
|
from thinc.v2v import Affine, Maxout, Softmax
|
||||||
from thinc.misc import LayerNorm
|
from thinc.misc import LayerNorm
|
||||||
from thinc.t2v import Pooling, max_pool, mean_pool
|
|
||||||
from thinc.neural.util import to_categorical, copy_array
|
from thinc.neural.util import to_categorical, copy_array
|
||||||
from thinc.neural._classes.difference import Siamese, CauchySimilarity
|
|
||||||
|
|
||||||
from .tokens.doc cimport Doc
|
from ..tokens.doc cimport Doc
|
||||||
from .syntax.nn_parser cimport Parser
|
from ..syntax.nn_parser cimport Parser
|
||||||
from .syntax import nonproj
|
from ..syntax.ner cimport BiluoPushDown
|
||||||
from .syntax.ner cimport BiluoPushDown
|
from ..syntax.arc_eager cimport ArcEager
|
||||||
from .syntax.arc_eager cimport ArcEager
|
from ..morphology cimport Morphology
|
||||||
from .morphology cimport Morphology
|
from ..vocab cimport Vocab
|
||||||
from .vocab cimport Vocab
|
|
||||||
from .syntax import nonproj
|
|
||||||
from .matcher import Matcher
|
|
||||||
|
|
||||||
from .matcher import Matcher, PhraseMatcher
|
from ..syntax import nonproj
|
||||||
from .tokens.span import Span
|
from ..attrs import POS, ID
|
||||||
from .attrs import POS, ID
|
from ..parts_of_speech import X
|
||||||
from .parts_of_speech import X
|
from .._ml import Tok2Vec, build_tagger_model, build_simple_cnn_text_classifier
|
||||||
from ._ml import Tok2Vec, build_text_classifier, build_tagger_model
|
from .._ml import link_vectors_to_models, zero_init, flatten
|
||||||
from ._ml import build_simple_cnn_text_classifier
|
from .._ml import masked_language_model, create_default_optimizer
|
||||||
from ._ml import link_vectors_to_models, zero_init, flatten
|
from ..errors import Errors, TempErrors
|
||||||
from ._ml import create_default_optimizer
|
from .. import util
|
||||||
from ._ml import masked_language_model
|
|
||||||
from .errors import Errors, TempErrors
|
|
||||||
from .compat import basestring_
|
|
||||||
from . import util
|
|
||||||
|
|
||||||
|
|
||||||
class SentenceSegmenter(object):
|
def _load_cfg(path):
|
||||||
"""A simple spaCy hook, to allow custom sentence boundary detection logic
|
if path.exists():
|
||||||
(that doesn't require the dependency parse). To change the sentence
|
return srsly.read_json(path)
|
||||||
boundary detection strategy, pass a generator function `strategy` on
|
else:
|
||||||
initialization, or assign a new strategy to the .strategy attribute.
|
return {}
|
||||||
Sentence detection strategies should be generators that take `Doc` objects
|
|
||||||
and yield `Span` objects for each sentence.
|
|
||||||
"""
|
|
||||||
name = 'sentencizer'
|
|
||||||
|
|
||||||
def __init__(self, vocab, strategy=None):
|
|
||||||
self.vocab = vocab
|
|
||||||
if strategy is None or strategy == 'on_punct':
|
|
||||||
strategy = self.split_on_punct
|
|
||||||
self.strategy = strategy
|
|
||||||
|
|
||||||
def __call__(self, doc):
|
|
||||||
doc.user_hooks['sents'] = self.strategy
|
|
||||||
return doc
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def split_on_punct(doc):
|
|
||||||
start = 0
|
|
||||||
seen_period = False
|
|
||||||
for i, word in enumerate(doc):
|
|
||||||
if seen_period and not word.is_punct:
|
|
||||||
yield doc[start:word.i]
|
|
||||||
start = word.i
|
|
||||||
seen_period = False
|
|
||||||
elif word.text in ['.', '!', '?']:
|
|
||||||
seen_period = True
|
|
||||||
if start < len(doc):
|
|
||||||
yield doc[start:len(doc)]
|
|
||||||
|
|
||||||
|
|
||||||
def merge_noun_chunks(doc):
|
|
||||||
"""Merge noun chunks into a single token.
|
|
||||||
|
|
||||||
doc (Doc): The Doc object.
|
|
||||||
RETURNS (Doc): The Doc object with merged noun chunks.
|
|
||||||
"""
|
|
||||||
if not doc.is_parsed:
|
|
||||||
return doc
|
|
||||||
spans = [(np.start_char, np.end_char, np.root.tag, np.root.dep)
|
|
||||||
for np in doc.noun_chunks]
|
|
||||||
for start, end, tag, dep in spans:
|
|
||||||
doc.merge(start, end, tag=tag, dep=dep)
|
|
||||||
return doc
|
|
||||||
|
|
||||||
|
|
||||||
def merge_entities(doc):
|
|
||||||
"""Merge entities into a single token.
|
|
||||||
|
|
||||||
doc (Doc): The Doc object.
|
|
||||||
RETURNS (Doc): The Doc object with merged noun entities.
|
|
||||||
"""
|
|
||||||
spans = [(e.start_char, e.end_char, e.root.tag, e.root.dep, e.label)
|
|
||||||
for e in doc.ents]
|
|
||||||
for start, end, tag, dep, ent_type in spans:
|
|
||||||
doc.merge(start, end, tag=tag, dep=dep, ent_type=ent_type)
|
|
||||||
return doc
|
|
||||||
|
|
||||||
|
|
||||||
def merge_subtokens(doc, label='subtok'):
|
|
||||||
merger = Matcher(doc.vocab)
|
|
||||||
merger.add('SUBTOK', None, [{'DEP': label, 'op': '+'}])
|
|
||||||
matches = merger(doc)
|
|
||||||
spans = [doc[start:end+1] for _, start, end in matches]
|
|
||||||
offsets = [(span.start_char, span.end_char) for span in spans]
|
|
||||||
for start_char, end_char in offsets:
|
|
||||||
doc.merge(start_char, end_char)
|
|
||||||
return doc
|
|
||||||
|
|
||||||
|
|
||||||
class EntityRuler(object):
|
|
||||||
name = 'entity_ruler'
|
|
||||||
|
|
||||||
def __init__(self, nlp, **cfg):
|
|
||||||
"""Initialise the entitiy ruler. If patterns are supplied here, they
|
|
||||||
need to be a list of dictionaries with a `"label"` and `"pattern"`
|
|
||||||
key. A pattern can either be a token pattern (list) or a phrase pattern
|
|
||||||
(string). For example: `{'label': 'ORG', 'pattern': 'Apple'}`.
|
|
||||||
|
|
||||||
nlp (Language): The shared nlp object to pass the vocab to the matchers
|
|
||||||
and process phrase patterns.
|
|
||||||
patterns (iterable): Optional patterns to load in.
|
|
||||||
overwrite_ents (bool): If existing entities are present, e.g. entities
|
|
||||||
added by the model, overwrite them by matches if necessary.
|
|
||||||
**cfg: Other config parameters. If pipeline component is loaded as part
|
|
||||||
of a model pipeline, this will include all keyword arguments passed
|
|
||||||
to `spacy.load`.
|
|
||||||
RETURNS (EntityRuler): The newly constructed object.
|
|
||||||
"""
|
|
||||||
self.nlp = nlp
|
|
||||||
self.overwrite = cfg.get('overwrite_ents', False)
|
|
||||||
self.token_patterns = defaultdict(list)
|
|
||||||
self.phrase_patterns = defaultdict(list)
|
|
||||||
self.matcher = Matcher(nlp.vocab)
|
|
||||||
self.phrase_matcher = PhraseMatcher(nlp.vocab)
|
|
||||||
patterns = cfg.get('patterns')
|
|
||||||
if patterns is not None:
|
|
||||||
self.add_patterns(patterns)
|
|
||||||
|
|
||||||
def __len__(self):
|
|
||||||
"""The number of all patterns added to the entity ruler."""
|
|
||||||
n_token_patterns = sum(len(p) for p in self.token_patterns.values())
|
|
||||||
n_phrase_patterns = sum(len(p) for p in self.phrase_patterns.values())
|
|
||||||
return n_token_patterns + n_phrase_patterns
|
|
||||||
|
|
||||||
def __contains__(self, label):
|
|
||||||
"""Whether a label is present in the patterns."""
|
|
||||||
return label in self.token_patterns or label in self.phrase_patterns
|
|
||||||
|
|
||||||
def __call__(self, doc):
|
|
||||||
"""Find matches in document and add them as entities.
|
|
||||||
|
|
||||||
doc (Doc): The Doc object in the pipeline.
|
|
||||||
RETURNS (Doc): The Doc with added entities, if available.
|
|
||||||
"""
|
|
||||||
matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
|
|
||||||
matches = set([(m_id, start, end) for m_id, start, end in matches
|
|
||||||
if start != end])
|
|
||||||
get_sort_key = lambda m: (m[2] - m[1], m[1])
|
|
||||||
matches = sorted(matches, key=get_sort_key, reverse=True)
|
|
||||||
entities = list(doc.ents)
|
|
||||||
new_entities = []
|
|
||||||
seen_tokens = set()
|
|
||||||
for match_id, start, end in matches:
|
|
||||||
if any(t.ent_type for t in doc[start:end]) and not self.overwrite:
|
|
||||||
continue
|
|
||||||
# check for end - 1 here because boundaries are inclusive
|
|
||||||
if start not in seen_tokens and end - 1 not in seen_tokens:
|
|
||||||
new_entities.append(Span(doc, start, end, label=match_id))
|
|
||||||
entities = [e for e in entities
|
|
||||||
if not (e.start < end and e.end > start)]
|
|
||||||
seen_tokens.update(range(start, end))
|
|
||||||
doc.ents = entities + new_entities
|
|
||||||
return doc
|
|
||||||
|
|
||||||
@property
|
|
||||||
def labels(self):
|
|
||||||
"""All labels present in the match patterns.
|
|
||||||
|
|
||||||
RETURNS (set): The string labels.
|
|
||||||
"""
|
|
||||||
all_labels = set(self.token_patterns.keys())
|
|
||||||
all_labels.update(self.phrase_patterns.keys())
|
|
||||||
return all_labels
|
|
||||||
|
|
||||||
@property
|
|
||||||
def patterns(self):
|
|
||||||
"""Get all patterns that were added to the entity ruler.
|
|
||||||
|
|
||||||
RETURNS (list): The original patterns, one dictionary per pattern.
|
|
||||||
"""
|
|
||||||
all_patterns = []
|
|
||||||
for label, patterns in self.token_patterns.items():
|
|
||||||
for pattern in patterns:
|
|
||||||
all_patterns.append({'label': label, 'pattern': pattern})
|
|
||||||
for label, patterns in self.phrase_patterns.items():
|
|
||||||
for pattern in patterns:
|
|
||||||
all_patterns.append({'label': label, 'pattern': pattern.text})
|
|
||||||
return all_patterns
|
|
||||||
|
|
||||||
def add_patterns(self, patterns):
|
|
||||||
"""Add patterns to the entitiy ruler. A pattern can either be a token
|
|
||||||
pattern (list of dicts) or a phrase pattern (string). For example:
|
|
||||||
{'label': 'ORG', 'pattern': 'Apple'}
|
|
||||||
{'label': 'GPE', 'pattern': [{'lower': 'san'}, {'lower': 'francisco'}]}
|
|
||||||
|
|
||||||
patterns (list): The patterns to add.
|
|
||||||
"""
|
|
||||||
for entry in patterns:
|
|
||||||
label = entry['label']
|
|
||||||
pattern = entry['pattern']
|
|
||||||
if isinstance(pattern, basestring_):
|
|
||||||
self.phrase_patterns[label].append(self.nlp(pattern))
|
|
||||||
elif isinstance(pattern, list):
|
|
||||||
self.token_patterns[label].append(pattern)
|
|
||||||
else:
|
|
||||||
raise ValueError(Errors.E097.format(pattern=pattern))
|
|
||||||
for label, patterns in self.token_patterns.items():
|
|
||||||
self.matcher.add(label, None, *patterns)
|
|
||||||
for label, patterns in self.phrase_patterns.items():
|
|
||||||
self.phrase_matcher.add(label, None, *patterns)
|
|
||||||
|
|
||||||
def from_bytes(self, patterns_bytes, **kwargs):
|
|
||||||
"""Load the entity ruler from a bytestring.
|
|
||||||
|
|
||||||
patterns_bytes (bytes): The bytestring to load.
|
|
||||||
**kwargs: Other config paramters, mostly for consistency.
|
|
||||||
RETURNS (EntityRuler): The loaded entity ruler.
|
|
||||||
"""
|
|
||||||
patterns = srsly.msgpack_loads(patterns_bytes)
|
|
||||||
self.add_patterns(patterns)
|
|
||||||
return self
|
|
||||||
|
|
||||||
def to_bytes(self, **kwargs):
|
|
||||||
"""Serialize the entity ruler patterns to a bytestring.
|
|
||||||
|
|
||||||
RETURNS (bytes): The serialized patterns.
|
|
||||||
"""
|
|
||||||
return srsly.msgpack_dumps(self.patterns)
|
|
||||||
|
|
||||||
def from_disk(self, path, **kwargs):
|
|
||||||
"""Load the entity ruler from a file. Expects a file containing
|
|
||||||
newline-delimited JSON (JSONL) with one entry per line.
|
|
||||||
|
|
||||||
path (unicode / Path): The JSONL file to load.
|
|
||||||
**kwargs: Other config paramters, mostly for consistency.
|
|
||||||
RETURNS (EntityRuler): The loaded entity ruler.
|
|
||||||
"""
|
|
||||||
path = util.ensure_path(path)
|
|
||||||
path = path.with_suffix('.jsonl')
|
|
||||||
patterns = srsly.read_jsonl(path)
|
|
||||||
self.add_patterns(patterns)
|
|
||||||
return self
|
|
||||||
|
|
||||||
def to_disk(self, path, **kwargs):
|
|
||||||
"""Save the entity ruler patterns to a directory. The patterns will be
|
|
||||||
saved as newline-delimited JSON (JSONL).
|
|
||||||
|
|
||||||
path (unicode / Path): The JSONL file to load.
|
|
||||||
**kwargs: Other config paramters, mostly for consistency.
|
|
||||||
RETURNS (EntityRuler): The loaded entity ruler.
|
|
||||||
"""
|
|
||||||
path = util.ensure_path(path)
|
|
||||||
path = path.with_suffix('.jsonl')
|
|
||||||
srsly.write_jsonl(path, self.patterns)
|
|
||||||
|
|
||||||
|
|
||||||
class Pipe(object):
|
class Pipe(object):
|
||||||
|
@ -275,6 +43,7 @@ class Pipe(object):
|
||||||
it defines the interface that components should follow to function as
|
it defines the interface that components should follow to function as
|
||||||
components in a spaCy analysis pipeline.
|
components in a spaCy analysis pipeline.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
name = None
|
name = None
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
@ -300,7 +69,7 @@ class Pipe(object):
|
||||||
|
|
||||||
def require_model(self):
|
def require_model(self):
|
||||||
"""Raise an error if the component's model is not initialized."""
|
"""Raise an error if the component's model is not initialized."""
|
||||||
if getattr(self, 'model', None) in (None, True, False):
|
if getattr(self, "model", None) in (None, True, False):
|
||||||
raise ValueError(Errors.E109.format(name=self.name))
|
raise ValueError(Errors.E109.format(name=self.name))
|
||||||
|
|
||||||
def pipe(self, stream, batch_size=128, n_threads=-1):
|
def pipe(self, stream, batch_size=128, n_threads=-1):
|
||||||
|
@ -326,7 +95,7 @@ class Pipe(object):
|
||||||
"""Modify a batch of documents, using pre-computed scores."""
|
"""Modify a batch of documents, using pre-computed scores."""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def update(self, docs, golds, drop=0., sgd=None, losses=None):
|
def update(self, docs, golds, drop=0.0, sgd=None, losses=None):
|
||||||
"""Learn from a batch of documents and gold-standard information,
|
"""Learn from a batch of documents and gold-standard information,
|
||||||
updating the pipe's model.
|
updating the pipe's model.
|
||||||
|
|
||||||
|
@ -353,11 +122,11 @@ class Pipe(object):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def create_optimizer(self):
|
def create_optimizer(self):
|
||||||
return create_default_optimizer(self.model.ops,
|
return create_default_optimizer(self.model.ops, **self.cfg.get("optimizer", {}))
|
||||||
**self.cfg.get('optimizer', {}))
|
|
||||||
|
|
||||||
def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None,
|
def begin_training(
|
||||||
**kwargs):
|
self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs
|
||||||
|
):
|
||||||
"""Initialize the pipe for training, using data exampes if available.
|
"""Initialize the pipe for training, using data exampes if available.
|
||||||
If no model has been initialized yet, the model is added."""
|
If no model has been initialized yet, the model is added."""
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
|
@ -375,70 +144,70 @@ class Pipe(object):
|
||||||
def to_bytes(self, **exclude):
|
def to_bytes(self, **exclude):
|
||||||
"""Serialize the pipe to a bytestring."""
|
"""Serialize the pipe to a bytestring."""
|
||||||
serialize = OrderedDict()
|
serialize = OrderedDict()
|
||||||
serialize['cfg'] = lambda: srsly.json_dumps(self.cfg)
|
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
|
||||||
if self.model in (True, False, None):
|
if self.model in (True, False, None):
|
||||||
serialize['model'] = lambda: self.model
|
serialize["model"] = lambda: self.model
|
||||||
else:
|
else:
|
||||||
serialize['model'] = self.model.to_bytes
|
serialize["model"] = self.model.to_bytes
|
||||||
serialize['vocab'] = self.vocab.to_bytes
|
serialize["vocab"] = self.vocab.to_bytes
|
||||||
return util.to_bytes(serialize, exclude)
|
return util.to_bytes(serialize, exclude)
|
||||||
|
|
||||||
def from_bytes(self, bytes_data, **exclude):
|
def from_bytes(self, bytes_data, **exclude):
|
||||||
"""Load the pipe from a bytestring."""
|
"""Load the pipe from a bytestring."""
|
||||||
|
|
||||||
def load_model(b):
|
def load_model(b):
|
||||||
# TODO: Remove this once we don't have to handle previous models
|
# TODO: Remove this once we don't have to handle previous models
|
||||||
if self.cfg.get('pretrained_dims') and 'pretrained_vectors' not in self.cfg:
|
if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg:
|
||||||
self.cfg['pretrained_vectors'] = self.vocab.vectors.name
|
self.cfg["pretrained_vectors"] = self.vocab.vectors.name
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
self.model = self.Model(**self.cfg)
|
self.model = self.Model(**self.cfg)
|
||||||
self.model.from_bytes(b)
|
self.model.from_bytes(b)
|
||||||
|
|
||||||
deserialize = OrderedDict((
|
deserialize = OrderedDict(
|
||||||
('cfg', lambda b: self.cfg.update(srsly.json_loads(b))),
|
(
|
||||||
('vocab', lambda b: self.vocab.from_bytes(b)),
|
("cfg", lambda b: self.cfg.update(srsly.json_loads(b))),
|
||||||
('model', load_model),
|
("vocab", lambda b: self.vocab.from_bytes(b)),
|
||||||
))
|
("model", load_model),
|
||||||
|
)
|
||||||
|
)
|
||||||
util.from_bytes(bytes_data, deserialize, exclude)
|
util.from_bytes(bytes_data, deserialize, exclude)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def to_disk(self, path, **exclude):
|
def to_disk(self, path, **exclude):
|
||||||
"""Serialize the pipe to disk."""
|
"""Serialize the pipe to disk."""
|
||||||
serialize = OrderedDict()
|
serialize = OrderedDict()
|
||||||
serialize['cfg'] = lambda p: srsly.write_json(p, self.cfg)
|
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
|
||||||
serialize['vocab'] = lambda p: self.vocab.to_disk(p)
|
serialize["vocab"] = lambda p: self.vocab.to_disk(p)
|
||||||
if self.model not in (None, True, False):
|
if self.model not in (None, True, False):
|
||||||
serialize['model'] = lambda p: p.open('wb').write(self.model.to_bytes())
|
serialize["model"] = lambda p: p.open("wb").write(self.model.to_bytes())
|
||||||
util.to_disk(path, serialize, exclude)
|
util.to_disk(path, serialize, exclude)
|
||||||
|
|
||||||
def from_disk(self, path, **exclude):
|
def from_disk(self, path, **exclude):
|
||||||
"""Load the pipe from disk."""
|
"""Load the pipe from disk."""
|
||||||
|
|
||||||
def load_model(p):
|
def load_model(p):
|
||||||
# TODO: Remove this once we don't have to handle previous models
|
# TODO: Remove this once we don't have to handle previous models
|
||||||
if self.cfg.get('pretrained_dims') and 'pretrained_vectors' not in self.cfg:
|
if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg:
|
||||||
self.cfg['pretrained_vectors'] = self.vocab.vectors.name
|
self.cfg["pretrained_vectors"] = self.vocab.vectors.name
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
self.model = self.Model(**self.cfg)
|
self.model = self.Model(**self.cfg)
|
||||||
self.model.from_bytes(p.open('rb').read())
|
self.model.from_bytes(p.open("rb").read())
|
||||||
|
|
||||||
deserialize = OrderedDict((
|
deserialize = OrderedDict(
|
||||||
('cfg', lambda p: self.cfg.update(_load_cfg(p))),
|
(
|
||||||
('vocab', lambda p: self.vocab.from_disk(p)),
|
("cfg", lambda p: self.cfg.update(_load_cfg(p))),
|
||||||
('model', load_model),
|
("vocab", lambda p: self.vocab.from_disk(p)),
|
||||||
))
|
("model", load_model),
|
||||||
|
)
|
||||||
|
)
|
||||||
util.from_disk(path, deserialize, exclude)
|
util.from_disk(path, deserialize, exclude)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
|
||||||
def _load_cfg(path):
|
|
||||||
if path.exists():
|
|
||||||
return srsly.read_json(path)
|
|
||||||
else:
|
|
||||||
return {}
|
|
||||||
|
|
||||||
|
|
||||||
class Tensorizer(Pipe):
|
class Tensorizer(Pipe):
|
||||||
"""Pre-train position-sensitive vectors for tokens."""
|
"""Pre-train position-sensitive vectors for tokens."""
|
||||||
name = 'tensorizer'
|
|
||||||
|
name = "tensorizer"
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def Model(cls, output_size=300, **cfg):
|
def Model(cls, output_size=300, **cfg):
|
||||||
|
@ -449,7 +218,7 @@ class Tensorizer(Pipe):
|
||||||
**cfg: Config parameters.
|
**cfg: Config parameters.
|
||||||
RETURNS (Model): A `thinc.neural.Model` or similar instance.
|
RETURNS (Model): A `thinc.neural.Model` or similar instance.
|
||||||
"""
|
"""
|
||||||
input_size = util.env_opt('token_vector_width', cfg.get('input_size', 96))
|
input_size = util.env_opt("token_vector_width", cfg.get("input_size", 96))
|
||||||
return zero_init(Affine(output_size, input_size, drop_factor=0.0))
|
return zero_init(Affine(output_size, input_size, drop_factor=0.0))
|
||||||
|
|
||||||
def __init__(self, vocab, model=True, **cfg):
|
def __init__(self, vocab, model=True, **cfg):
|
||||||
|
@ -470,7 +239,7 @@ class Tensorizer(Pipe):
|
||||||
self.model = model
|
self.model = model
|
||||||
self.input_models = []
|
self.input_models = []
|
||||||
self.cfg = dict(cfg)
|
self.cfg = dict(cfg)
|
||||||
self.cfg.setdefault('cnn_maxout_pieces', 3)
|
self.cfg.setdefault("cnn_maxout_pieces", 3)
|
||||||
|
|
||||||
def __call__(self, doc):
|
def __call__(self, doc):
|
||||||
"""Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
|
"""Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
|
||||||
|
@ -516,10 +285,12 @@ class Tensorizer(Pipe):
|
||||||
"""
|
"""
|
||||||
for doc, tensor in zip(docs, tensors):
|
for doc, tensor in zip(docs, tensors):
|
||||||
if tensor.shape[0] != len(doc):
|
if tensor.shape[0] != len(doc):
|
||||||
raise ValueError(Errors.E076.format(rows=tensor.shape[0], words=len(doc)))
|
raise ValueError(
|
||||||
|
Errors.E076.format(rows=tensor.shape[0], words=len(doc))
|
||||||
|
)
|
||||||
doc.tensor = tensor
|
doc.tensor = tensor
|
||||||
|
|
||||||
def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
|
def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None):
|
||||||
"""Update the model.
|
"""Update the model.
|
||||||
|
|
||||||
docs (iterable): A batch of `Doc` objects.
|
docs (iterable): A batch of `Doc` objects.
|
||||||
|
@ -545,7 +316,7 @@ class Tensorizer(Pipe):
|
||||||
for d_input, bp_input in zip(d_inputs, bp_inputs):
|
for d_input, bp_input in zip(d_inputs, bp_inputs):
|
||||||
bp_input(d_input, sgd=sgd)
|
bp_input(d_input, sgd=sgd)
|
||||||
if losses is not None:
|
if losses is not None:
|
||||||
losses.setdefault(self.name, 0.)
|
losses.setdefault(self.name, 0.0)
|
||||||
losses[self.name] += loss
|
losses[self.name] += loss
|
||||||
return loss
|
return loss
|
||||||
|
|
||||||
|
@ -553,11 +324,10 @@ class Tensorizer(Pipe):
|
||||||
ids = self.model.ops.flatten([doc.to_array(ID).ravel() for doc in docs])
|
ids = self.model.ops.flatten([doc.to_array(ID).ravel() for doc in docs])
|
||||||
target = self.vocab.vectors.data[ids]
|
target = self.vocab.vectors.data[ids]
|
||||||
d_scores = (prediction - target) / prediction.shape[0]
|
d_scores = (prediction - target) / prediction.shape[0]
|
||||||
loss = (d_scores**2).sum()
|
loss = (d_scores ** 2).sum()
|
||||||
return loss, d_scores
|
return loss, d_scores
|
||||||
|
|
||||||
def begin_training(self, gold_tuples=lambda: [], pipeline=None, sgd=None,
|
def begin_training(self, gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs):
|
||||||
**kwargs):
|
|
||||||
"""Allocate models, pre-process training data and acquire an
|
"""Allocate models, pre-process training data and acquire an
|
||||||
optimizer.
|
optimizer.
|
||||||
|
|
||||||
|
@ -566,7 +336,7 @@ class Tensorizer(Pipe):
|
||||||
"""
|
"""
|
||||||
if pipeline is not None:
|
if pipeline is not None:
|
||||||
for name, model in pipeline:
|
for name, model in pipeline:
|
||||||
if getattr(model, 'tok2vec', None):
|
if getattr(model, "tok2vec", None):
|
||||||
self.input_models.append(model.tok2vec)
|
self.input_models.append(model.tok2vec)
|
||||||
if self.model is True:
|
if self.model is True:
|
||||||
self.model = self.Model(**self.cfg)
|
self.model = self.Model(**self.cfg)
|
||||||
|
@ -1086,61 +856,6 @@ class ClozeMultitask(Pipe):
|
||||||
losses[self.name] += loss
|
losses[self.name] += loss
|
||||||
|
|
||||||
|
|
||||||
class SimilarityHook(Pipe):
|
|
||||||
"""
|
|
||||||
Experimental: A pipeline component to install a hook for supervised
|
|
||||||
similarity into `Doc` objects. Requires a `Tensorizer` to pre-process
|
|
||||||
documents. The similarity model can be any object obeying the Thinc `Model`
|
|
||||||
interface. By default, the model concatenates the elementwise mean and
|
|
||||||
elementwise max of the two tensors, and compares them using the
|
|
||||||
Cauchy-like similarity function from Chen (2013):
|
|
||||||
|
|
||||||
>>> similarity = 1. / (1. + (W * (vec1-vec2)**2).sum())
|
|
||||||
|
|
||||||
Where W is a vector of dimension weights, initialized to 1.
|
|
||||||
"""
|
|
||||||
name = 'similarity'
|
|
||||||
|
|
||||||
def __init__(self, vocab, model=True, **cfg):
|
|
||||||
self.vocab = vocab
|
|
||||||
self.model = model
|
|
||||||
self.cfg = dict(cfg)
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def Model(cls, length):
|
|
||||||
return Siamese(Pooling(max_pool, mean_pool), CauchySimilarity(length))
|
|
||||||
|
|
||||||
def __call__(self, doc):
|
|
||||||
"""Install similarity hook"""
|
|
||||||
doc.user_hooks['similarity'] = self.predict
|
|
||||||
return doc
|
|
||||||
|
|
||||||
def pipe(self, docs, **kwargs):
|
|
||||||
for doc in docs:
|
|
||||||
yield self(doc)
|
|
||||||
|
|
||||||
def predict(self, doc1, doc2):
|
|
||||||
self.require_model()
|
|
||||||
return self.model.predict([(doc1, doc2)])
|
|
||||||
|
|
||||||
def update(self, doc1_doc2, golds, sgd=None, drop=0.):
|
|
||||||
self.require_model()
|
|
||||||
sims, bp_sims = self.model.begin_update(doc1_doc2, drop=drop)
|
|
||||||
|
|
||||||
def begin_training(self, _=tuple(), pipeline=None, sgd=None, **kwargs):
|
|
||||||
"""Allocate model, using width from tensorizer in pipeline.
|
|
||||||
|
|
||||||
gold_tuples (iterable): Gold-standard training data.
|
|
||||||
pipeline (list): The pipeline the model is part of.
|
|
||||||
"""
|
|
||||||
if self.model is True:
|
|
||||||
self.model = self.Model(pipeline[0].model.nO)
|
|
||||||
link_vectors_to_models(self.vocab)
|
|
||||||
if sgd is None:
|
|
||||||
sgd = self.create_optimizer()
|
|
||||||
return sgd
|
|
||||||
|
|
||||||
|
|
||||||
class TextCategorizer(Pipe):
|
class TextCategorizer(Pipe):
|
||||||
name = 'textcat'
|
name = 'textcat'
|
||||||
|
|
||||||
|
@ -1299,13 +1014,12 @@ cdef class DependencyParser(Parser):
|
||||||
|
|
||||||
|
|
||||||
cdef class EntityRecognizer(Parser):
|
cdef class EntityRecognizer(Parser):
|
||||||
name = 'ner'
|
name = "ner"
|
||||||
TransitionSystem = BiluoPushDown
|
TransitionSystem = BiluoPushDown
|
||||||
|
|
||||||
nr_feature = 6
|
nr_feature = 6
|
||||||
|
|
||||||
def add_multitask_objective(self, target):
|
def add_multitask_objective(self, target):
|
||||||
if target == 'cloze':
|
if target == "cloze":
|
||||||
cloze = ClozeMultitask(self.vocab)
|
cloze = ClozeMultitask(self.vocab)
|
||||||
self._multitasks.append(cloze)
|
self._multitasks.append(cloze)
|
||||||
else:
|
else:
|
||||||
|
@ -1326,8 +1040,8 @@ cdef class EntityRecognizer(Parser):
|
||||||
def labels(self):
|
def labels(self):
|
||||||
# Get the labels from the model by looking at the available moves, e.g.
|
# Get the labels from the model by looking at the available moves, e.g.
|
||||||
# B-PERSON, I-PERSON, L-PERSON, U-PERSON
|
# B-PERSON, I-PERSON, L-PERSON, U-PERSON
|
||||||
return [move.split('-')[1] for move in self.move_names
|
return [move.split("-")[1] for move in self.move_names
|
||||||
if move[0] in ('B', 'I', 'L', 'U')]
|
if move[0] in ("B", "I", "L", "U")]
|
||||||
|
|
||||||
|
|
||||||
__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer', 'TextCategorizer']
|
__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer', 'TextCategorizer']
|
Loading…
Reference in New Issue
Block a user