Mirror of https://github.com/explosion/spaCy.git
💫 Break up large pipeline.pyx (#3246)
* Break up large pipeline.pyx
* Merge some components back together
* Fix typo
parent e7593b791e
commit a9f8d17632

setup.py (2 lines changed)
@@ -41,7 +41,7 @@ MOD_NAMES = [
    "spacy.vocab",
    "spacy.attrs",
    "spacy.morphology",
    "spacy.pipeline",
    "spacy.pipeline.pipes",
    "spacy.syntax.stateclass",
    "spacy.syntax._state",
    "spacy.tokenizer",
spacy/pipeline/__init__.py (new file, 8 lines)

@@ -0,0 +1,8 @@
# coding: utf8
from __future__ import unicode_literals

from .pipes import Tagger, DependencyParser, EntityRecognizer  # noqa
from .pipes import TextCategorizer, Tensorizer, Pipe  # noqa
from .entityruler import EntityRuler  # noqa
from .hooks import SentenceSegmenter, SimilarityHook  # noqa
from .functions import merge_entities, merge_noun_chunks, merge_subtokens  # noqa
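Usage note (illustrative, not part of the commit): the re-exports above keep the public import path stable, so code that imported components from spacy.pipeline before the split keeps working. A minimal sketch, assuming a build of this branch:

# The package-level names still resolve; callers don't need to know
# which submodule a component now lives in.
from spacy.pipeline import EntityRuler, SentenceSegmenter, merge_entities

print(EntityRuler.name)           # "entity_ruler"
print(SentenceSegmenter.name)     # "sentencizer"
print(merge_entities.__module__)  # "spacy.pipeline.functions"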
spacy/pipeline/entityruler.py (new file, 170 lines)

@@ -0,0 +1,170 @@
# coding: utf8
from __future__ import unicode_literals

from collections import defaultdict
import srsly

from ..errors import Errors
from ..compat import basestring_
from ..util import ensure_path
from ..tokens import Span
from ..matcher import Matcher, PhraseMatcher


class EntityRuler(object):
    name = "entity_ruler"

    def __init__(self, nlp, **cfg):
        """Initialise the entity ruler. If patterns are supplied here, they
        need to be a list of dictionaries with a `"label"` and `"pattern"`
        key. A pattern can either be a token pattern (list) or a phrase pattern
        (string). For example: `{'label': 'ORG', 'pattern': 'Apple'}`.

        nlp (Language): The shared nlp object to pass the vocab to the matchers
            and process phrase patterns.
        patterns (iterable): Optional patterns to load in.
        overwrite_ents (bool): If existing entities are present, e.g. entities
            added by the model, overwrite them by matches if necessary.
        **cfg: Other config parameters. If pipeline component is loaded as part
            of a model pipeline, this will include all keyword arguments passed
            to `spacy.load`.
        RETURNS (EntityRuler): The newly constructed object.
        """
        self.nlp = nlp
        self.overwrite = cfg.get("overwrite_ents", False)
        self.token_patterns = defaultdict(list)
        self.phrase_patterns = defaultdict(list)
        self.matcher = Matcher(nlp.vocab)
        self.phrase_matcher = PhraseMatcher(nlp.vocab)
        patterns = cfg.get("patterns")
        if patterns is not None:
            self.add_patterns(patterns)

    def __len__(self):
        """The number of all patterns added to the entity ruler."""
        n_token_patterns = sum(len(p) for p in self.token_patterns.values())
        n_phrase_patterns = sum(len(p) for p in self.phrase_patterns.values())
        return n_token_patterns + n_phrase_patterns

    def __contains__(self, label):
        """Whether a label is present in the patterns."""
        return label in self.token_patterns or label in self.phrase_patterns

    def __call__(self, doc):
        """Find matches in document and add them as entities.

        doc (Doc): The Doc object in the pipeline.
        RETURNS (Doc): The Doc with added entities, if available.
        """
        matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
        matches = set(
            [(m_id, start, end) for m_id, start, end in matches if start != end]
        )
        get_sort_key = lambda m: (m[2] - m[1], m[1])
        matches = sorted(matches, key=get_sort_key, reverse=True)
        entities = list(doc.ents)
        new_entities = []
        seen_tokens = set()
        for match_id, start, end in matches:
            if any(t.ent_type for t in doc[start:end]) and not self.overwrite:
                continue
            # check for end - 1 here because boundaries are inclusive
            if start not in seen_tokens and end - 1 not in seen_tokens:
                new_entities.append(Span(doc, start, end, label=match_id))
                entities = [
                    e for e in entities if not (e.start < end and e.end > start)
                ]
                seen_tokens.update(range(start, end))
        doc.ents = entities + new_entities
        return doc

    @property
    def labels(self):
        """All labels present in the match patterns.

        RETURNS (set): The string labels.
        """
        all_labels = set(self.token_patterns.keys())
        all_labels.update(self.phrase_patterns.keys())
        return all_labels

    @property
    def patterns(self):
        """Get all patterns that were added to the entity ruler.

        RETURNS (list): The original patterns, one dictionary per pattern.
        """
        all_patterns = []
        for label, patterns in self.token_patterns.items():
            for pattern in patterns:
                all_patterns.append({"label": label, "pattern": pattern})
        for label, patterns in self.phrase_patterns.items():
            for pattern in patterns:
                all_patterns.append({"label": label, "pattern": pattern.text})
        return all_patterns

    def add_patterns(self, patterns):
        """Add patterns to the entity ruler. A pattern can either be a token
        pattern (list of dicts) or a phrase pattern (string). For example:
        {'label': 'ORG', 'pattern': 'Apple'}
        {'label': 'GPE', 'pattern': [{'lower': 'san'}, {'lower': 'francisco'}]}

        patterns (list): The patterns to add.
        """
        for entry in patterns:
            label = entry["label"]
            pattern = entry["pattern"]
            if isinstance(pattern, basestring_):
                self.phrase_patterns[label].append(self.nlp(pattern))
            elif isinstance(pattern, list):
                self.token_patterns[label].append(pattern)
            else:
                raise ValueError(Errors.E097.format(pattern=pattern))
        for label, patterns in self.token_patterns.items():
            self.matcher.add(label, None, *patterns)
        for label, patterns in self.phrase_patterns.items():
            self.phrase_matcher.add(label, None, *patterns)

    def from_bytes(self, patterns_bytes, **kwargs):
        """Load the entity ruler from a bytestring.

        patterns_bytes (bytes): The bytestring to load.
        **kwargs: Other config parameters, mostly for consistency.
        RETURNS (EntityRuler): The loaded entity ruler.
        """
        patterns = srsly.msgpack_loads(patterns_bytes)
        self.add_patterns(patterns)
        return self

    def to_bytes(self, **kwargs):
        """Serialize the entity ruler patterns to a bytestring.

        RETURNS (bytes): The serialized patterns.
        """
        return srsly.msgpack_dumps(self.patterns)

    def from_disk(self, path, **kwargs):
        """Load the entity ruler from a file. Expects a file containing
        newline-delimited JSON (JSONL) with one entry per line.

        path (unicode / Path): The JSONL file to load.
        **kwargs: Other config parameters, mostly for consistency.
        RETURNS (EntityRuler): The loaded entity ruler.
        """
        path = ensure_path(path)
        path = path.with_suffix(".jsonl")
        patterns = srsly.read_jsonl(path)
        self.add_patterns(patterns)
        return self

    def to_disk(self, path, **kwargs):
        """Save the entity ruler patterns to a file. The patterns will be
        saved as newline-delimited JSON (JSONL).

        path (unicode / Path): The JSONL file to save.
        **kwargs: Other config parameters, mostly for consistency.
        """
        path = ensure_path(path)
        path = path.with_suffix(".jsonl")
        srsly.write_jsonl(path, self.patterns)
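Usage sketch (illustrative, not part of the commit): how the new EntityRuler is meant to be wired into a pipeline, based on the docstrings above. The patterns and the blank "en" pipeline are placeholder examples.

import spacy
from spacy.pipeline import EntityRuler

nlp = spacy.blank("en")  # any shared nlp object works; it supplies the vocab
ruler = EntityRuler(nlp, overwrite_ents=False)
ruler.add_patterns([
    {"label": "ORG", "pattern": "Apple"},  # phrase pattern (string)
    {"label": "GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},  # token pattern
])
nlp.add_pipe(ruler)

doc = nlp("Apple is opening an office in San Francisco")
print([(ent.text, ent.label_) for ent in doc.ents])

# patterns round-trip as JSONL / msgpack via the serialization methods above
ruler.to_disk("my_patterns")  # writes my_patterns.jsonl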
spacy/pipeline/functions.py (new file, 48 lines)

@@ -0,0 +1,48 @@
# coding: utf8
from __future__ import unicode_literals

from ..matcher import Matcher


# TODO: replace doc.merge with doc.retokenize


def merge_noun_chunks(doc):
    """Merge noun chunks into a single token.

    doc (Doc): The Doc object.
    RETURNS (Doc): The Doc object with merged noun chunks.
    """
    if not doc.is_parsed:
        return doc
    spans = [
        (np.start_char, np.end_char, np.root.tag, np.root.dep) for np in doc.noun_chunks
    ]
    for start, end, tag, dep in spans:
        doc.merge(start, end, tag=tag, dep=dep)
    return doc


def merge_entities(doc):
    """Merge entities into a single token.

    doc (Doc): The Doc object.
    RETURNS (Doc): The Doc object with merged entities.
    """
    spans = [
        (e.start_char, e.end_char, e.root.tag, e.root.dep, e.label) for e in doc.ents
    ]
    for start, end, tag, dep, ent_type in spans:
        doc.merge(start, end, tag=tag, dep=dep, ent_type=ent_type)
    return doc


def merge_subtokens(doc, label="subtok"):
    merger = Matcher(doc.vocab)
    merger.add("SUBTOK", None, [{"DEP": label, "op": "+"}])
    matches = merger(doc)
    spans = [doc[start : end + 1] for _, start, end in matches]
    offsets = [(span.start_char, span.end_char) for span in spans]
    for start_char, end_char in offsets:
        doc.merge(start_char, end_char)
    return doc
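Usage sketch (illustrative, not part of the commit): because these helpers are plain Doc-to-Doc functions, they can be added to a pipeline directly. The en_core_web_sm model name is an assumption; any model with a parser and NER works.

import spacy
from spacy.pipeline import merge_entities

nlp = spacy.load("en_core_web_sm")         # assumed installed model
nlp.add_pipe(merge_entities, after="ner")  # run right after the entity recognizer

doc = nlp("Angela Merkel visited New York City last week")
# each entity is now a single token
print([(t.text, t.ent_type_) for t in doc if t.ent_type_])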
spacy/pipeline/hooks.py (new file, 100 lines)

@@ -0,0 +1,100 @@
# coding: utf8
from __future__ import unicode_literals

from thinc.t2v import Pooling, max_pool, mean_pool
from thinc.neural._classes.difference import Siamese, CauchySimilarity

from .pipes import Pipe
from .._ml import link_vectors_to_models


class SentenceSegmenter(object):
    """A simple spaCy hook, to allow custom sentence boundary detection logic
    (that doesn't require the dependency parse). To change the sentence
    boundary detection strategy, pass a generator function `strategy` on
    initialization, or assign a new strategy to the .strategy attribute.
    Sentence detection strategies should be generators that take `Doc` objects
    and yield `Span` objects for each sentence.
    """

    name = "sentencizer"

    def __init__(self, vocab, strategy=None):
        self.vocab = vocab
        if strategy is None or strategy == "on_punct":
            strategy = self.split_on_punct
        self.strategy = strategy

    def __call__(self, doc):
        doc.user_hooks["sents"] = self.strategy
        return doc

    @staticmethod
    def split_on_punct(doc):
        start = 0
        seen_period = False
        for i, word in enumerate(doc):
            if seen_period and not word.is_punct:
                yield doc[start : word.i]
                start = word.i
                seen_period = False
            elif word.text in [".", "!", "?"]:
                seen_period = True
        if start < len(doc):
            yield doc[start : len(doc)]


class SimilarityHook(Pipe):
    """
    Experimental: A pipeline component to install a hook for supervised
    similarity into `Doc` objects. Requires a `Tensorizer` to pre-process
    documents. The similarity model can be any object obeying the Thinc `Model`
    interface. By default, the model concatenates the elementwise mean and
    elementwise max of the two tensors, and compares them using the
    Cauchy-like similarity function from Chen (2013):

    >>> similarity = 1. / (1. + (W * (vec1-vec2)**2).sum())

    Where W is a vector of dimension weights, initialized to 1.
    """

    name = "similarity"

    def __init__(self, vocab, model=True, **cfg):
        self.vocab = vocab
        self.model = model
        self.cfg = dict(cfg)

    @classmethod
    def Model(cls, length):
        return Siamese(Pooling(max_pool, mean_pool), CauchySimilarity(length))

    def __call__(self, doc):
        """Install similarity hook"""
        doc.user_hooks["similarity"] = self.predict
        return doc

    def pipe(self, docs, **kwargs):
        for doc in docs:
            yield self(doc)

    def predict(self, doc1, doc2):
        self.require_model()
        return self.model.predict([(doc1, doc2)])

    def update(self, doc1_doc2, golds, sgd=None, drop=0.0):
        self.require_model()
        sims, bp_sims = self.model.begin_update(doc1_doc2, drop=drop)

    def begin_training(self, _=tuple(), pipeline=None, sgd=None, **kwargs):
        """Allocate model, using width from tensorizer in pipeline.

        gold_tuples (iterable): Gold-standard training data.
        pipeline (list): The pipeline the model is part of.
        """
        if self.model is True:
            self.model = self.Model(pipeline[0].model.nO)
            link_vectors_to_models(self.vocab)
        if sgd is None:
            sgd = self.create_optimizer()
        return sgd
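Usage sketch (illustrative, not part of the commit): as the SentenceSegmenter docstring says, a strategy is just a generator that takes a Doc and yields Span objects, installed via doc.user_hooks["sents"]. A custom newline-based strategy might look like this:

import spacy
from spacy.pipeline import SentenceSegmenter

def split_on_newlines(doc):
    # yield one Span per newline-delimited chunk
    start = 0
    for i, token in enumerate(doc):
        if token.text == "\n":
            if i > start:
                yield doc[start:i]
            start = i + 1
    if start < len(doc):
        yield doc[start:len(doc)]

nlp = spacy.blank("en")
nlp.add_pipe(SentenceSegmenter(nlp.vocab, strategy=split_on_newlines))

doc = nlp("First line\nSecond line")
print([sent.text for sent in doc.sents])  # ["First line", "Second line"]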
spacy/pipeline/pipes.pyx (moved from spacy/pipeline.pyx; relative imports updated for the new package location, quoting and formatting normalized, and the code extracted into the new modules above removed. Where a line was rewritten, the hunks below show the old version immediately followed by its replacement.)

@@ -3,271 +3,39 @@
# coding: utf8
from __future__ import unicode_literals

import numpy
cimport numpy as np
from collections import OrderedDict, defaultdict

import numpy
from collections import OrderedDict
import srsly

from thinc.api import chain
from thinc.v2v import Affine, Maxout, Softmax
from thinc.misc import LayerNorm
from thinc.t2v import Pooling, max_pool, mean_pool
from thinc.neural.util import to_categorical, copy_array
from thinc.neural._classes.difference import Siamese, CauchySimilarity

from .tokens.doc cimport Doc
from .syntax.nn_parser cimport Parser
from .syntax import nonproj
from .syntax.ner cimport BiluoPushDown
from .syntax.arc_eager cimport ArcEager
from .morphology cimport Morphology
from .vocab cimport Vocab
from .syntax import nonproj
from .matcher import Matcher
from ..tokens.doc cimport Doc
from ..syntax.nn_parser cimport Parser
from ..syntax.ner cimport BiluoPushDown
from ..syntax.arc_eager cimport ArcEager
from ..morphology cimport Morphology
from ..vocab cimport Vocab

from .matcher import Matcher, PhraseMatcher
from .tokens.span import Span
from .attrs import POS, ID
from .parts_of_speech import X
from ._ml import Tok2Vec, build_text_classifier, build_tagger_model
from ._ml import build_simple_cnn_text_classifier
from ._ml import link_vectors_to_models, zero_init, flatten
from ._ml import create_default_optimizer
from ._ml import masked_language_model
from .errors import Errors, TempErrors
from .compat import basestring_
from . import util
from ..syntax import nonproj
from ..attrs import POS, ID
from ..parts_of_speech import X
from .._ml import Tok2Vec, build_tagger_model, build_simple_cnn_text_classifier
from .._ml import link_vectors_to_models, zero_init, flatten
from .._ml import masked_language_model, create_default_optimizer
from ..errors import Errors, TempErrors
from .. import util


class SentenceSegmenter(object):
    """A simple spaCy hook, to allow custom sentence boundary detection logic
    (that doesn't require the dependency parse). To change the sentence
    boundary detection strategy, pass a generator function `strategy` on
    initialization, or assign a new strategy to the .strategy attribute.
    Sentence detection strategies should be generators that take `Doc` objects
    and yield `Span` objects for each sentence.
    """
    name = 'sentencizer'

    def __init__(self, vocab, strategy=None):
        self.vocab = vocab
        if strategy is None or strategy == 'on_punct':
            strategy = self.split_on_punct
        self.strategy = strategy

    def __call__(self, doc):
        doc.user_hooks['sents'] = self.strategy
        return doc

    @staticmethod
    def split_on_punct(doc):
        start = 0
        seen_period = False
        for i, word in enumerate(doc):
            if seen_period and not word.is_punct:
                yield doc[start:word.i]
                start = word.i
                seen_period = False
            elif word.text in ['.', '!', '?']:
                seen_period = True
        if start < len(doc):
            yield doc[start:len(doc)]


def merge_noun_chunks(doc):
    """Merge noun chunks into a single token.

    doc (Doc): The Doc object.
    RETURNS (Doc): The Doc object with merged noun chunks.
    """
    if not doc.is_parsed:
        return doc
    spans = [(np.start_char, np.end_char, np.root.tag, np.root.dep)
             for np in doc.noun_chunks]
    for start, end, tag, dep in spans:
        doc.merge(start, end, tag=tag, dep=dep)
    return doc


def merge_entities(doc):
    """Merge entities into a single token.

    doc (Doc): The Doc object.
    RETURNS (Doc): The Doc object with merged noun entities.
    """
    spans = [(e.start_char, e.end_char, e.root.tag, e.root.dep, e.label)
             for e in doc.ents]
    for start, end, tag, dep, ent_type in spans:
        doc.merge(start, end, tag=tag, dep=dep, ent_type=ent_type)
    return doc


def merge_subtokens(doc, label='subtok'):
    merger = Matcher(doc.vocab)
    merger.add('SUBTOK', None, [{'DEP': label, 'op': '+'}])
    matches = merger(doc)
    spans = [doc[start:end+1] for _, start, end in matches]
    offsets = [(span.start_char, span.end_char) for span in spans]
    for start_char, end_char in offsets:
        doc.merge(start_char, end_char)
    return doc


class EntityRuler(object):
    name = 'entity_ruler'

    def __init__(self, nlp, **cfg):
        """Initialise the entitiy ruler. If patterns are supplied here, they
        need to be a list of dictionaries with a `"label"` and `"pattern"`
        key. A pattern can either be a token pattern (list) or a phrase pattern
        (string). For example: `{'label': 'ORG', 'pattern': 'Apple'}`.

        nlp (Language): The shared nlp object to pass the vocab to the matchers
            and process phrase patterns.
        patterns (iterable): Optional patterns to load in.
        overwrite_ents (bool): If existing entities are present, e.g. entities
            added by the model, overwrite them by matches if necessary.
        **cfg: Other config parameters. If pipeline component is loaded as part
            of a model pipeline, this will include all keyword arguments passed
            to `spacy.load`.
        RETURNS (EntityRuler): The newly constructed object.
        """
        self.nlp = nlp
        self.overwrite = cfg.get('overwrite_ents', False)
        self.token_patterns = defaultdict(list)
        self.phrase_patterns = defaultdict(list)
        self.matcher = Matcher(nlp.vocab)
        self.phrase_matcher = PhraseMatcher(nlp.vocab)
        patterns = cfg.get('patterns')
        if patterns is not None:
            self.add_patterns(patterns)

    def __len__(self):
        """The number of all patterns added to the entity ruler."""
        n_token_patterns = sum(len(p) for p in self.token_patterns.values())
        n_phrase_patterns = sum(len(p) for p in self.phrase_patterns.values())
        return n_token_patterns + n_phrase_patterns

    def __contains__(self, label):
        """Whether a label is present in the patterns."""
        return label in self.token_patterns or label in self.phrase_patterns

    def __call__(self, doc):
        """Find matches in document and add them as entities.

        doc (Doc): The Doc object in the pipeline.
        RETURNS (Doc): The Doc with added entities, if available.
        """
        matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
        matches = set([(m_id, start, end) for m_id, start, end in matches
                       if start != end])
        get_sort_key = lambda m: (m[2] - m[1], m[1])
        matches = sorted(matches, key=get_sort_key, reverse=True)
        entities = list(doc.ents)
        new_entities = []
        seen_tokens = set()
        for match_id, start, end in matches:
            if any(t.ent_type for t in doc[start:end]) and not self.overwrite:
                continue
            # check for end - 1 here because boundaries are inclusive
            if start not in seen_tokens and end - 1 not in seen_tokens:
                new_entities.append(Span(doc, start, end, label=match_id))
                entities = [e for e in entities
                            if not (e.start < end and e.end > start)]
                seen_tokens.update(range(start, end))
        doc.ents = entities + new_entities
        return doc

    @property
    def labels(self):
        """All labels present in the match patterns.

        RETURNS (set): The string labels.
        """
        all_labels = set(self.token_patterns.keys())
        all_labels.update(self.phrase_patterns.keys())
        return all_labels

    @property
    def patterns(self):
        """Get all patterns that were added to the entity ruler.

        RETURNS (list): The original patterns, one dictionary per pattern.
        """
        all_patterns = []
        for label, patterns in self.token_patterns.items():
            for pattern in patterns:
                all_patterns.append({'label': label, 'pattern': pattern})
        for label, patterns in self.phrase_patterns.items():
            for pattern in patterns:
                all_patterns.append({'label': label, 'pattern': pattern.text})
        return all_patterns

    def add_patterns(self, patterns):
        """Add patterns to the entitiy ruler. A pattern can either be a token
        pattern (list of dicts) or a phrase pattern (string). For example:
        {'label': 'ORG', 'pattern': 'Apple'}
        {'label': 'GPE', 'pattern': [{'lower': 'san'}, {'lower': 'francisco'}]}

        patterns (list): The patterns to add.
        """
        for entry in patterns:
            label = entry['label']
            pattern = entry['pattern']
            if isinstance(pattern, basestring_):
                self.phrase_patterns[label].append(self.nlp(pattern))
            elif isinstance(pattern, list):
                self.token_patterns[label].append(pattern)
            else:
                raise ValueError(Errors.E097.format(pattern=pattern))
        for label, patterns in self.token_patterns.items():
            self.matcher.add(label, None, *patterns)
        for label, patterns in self.phrase_patterns.items():
            self.phrase_matcher.add(label, None, *patterns)

    def from_bytes(self, patterns_bytes, **kwargs):
        """Load the entity ruler from a bytestring.

        patterns_bytes (bytes): The bytestring to load.
        **kwargs: Other config paramters, mostly for consistency.
        RETURNS (EntityRuler): The loaded entity ruler.
        """
        patterns = srsly.msgpack_loads(patterns_bytes)
        self.add_patterns(patterns)
        return self

    def to_bytes(self, **kwargs):
        """Serialize the entity ruler patterns to a bytestring.

        RETURNS (bytes): The serialized patterns.
        """
        return srsly.msgpack_dumps(self.patterns)

    def from_disk(self, path, **kwargs):
        """Load the entity ruler from a file. Expects a file containing
        newline-delimited JSON (JSONL) with one entry per line.

        path (unicode / Path): The JSONL file to load.
        **kwargs: Other config paramters, mostly for consistency.
        RETURNS (EntityRuler): The loaded entity ruler.
        """
        path = util.ensure_path(path)
        path = path.with_suffix('.jsonl')
        patterns = srsly.read_jsonl(path)
        self.add_patterns(patterns)
        return self

    def to_disk(self, path, **kwargs):
        """Save the entity ruler patterns to a directory. The patterns will be
        saved as newline-delimited JSON (JSONL).

        path (unicode / Path): The JSONL file to load.
        **kwargs: Other config paramters, mostly for consistency.
        RETURNS (EntityRuler): The loaded entity ruler.
        """
        path = util.ensure_path(path)
        path = path.with_suffix('.jsonl')
        srsly.write_jsonl(path, self.patterns)
def _load_cfg(path):
    if path.exists():
        return srsly.read_json(path)
    else:
        return {}


class Pipe(object):

@@ -275,6 +43,7 @@ class Pipe(object):
    it defines the interface that components should follow to function as
    components in a spaCy analysis pipeline.
    """

    name = None

    @classmethod

@@ -300,7 +69,7 @@ class Pipe(object):

    def require_model(self):
        """Raise an error if the component's model is not initialized."""
        if getattr(self, 'model', None) in (None, True, False):
        if getattr(self, "model", None) in (None, True, False):
            raise ValueError(Errors.E109.format(name=self.name))

    def pipe(self, stream, batch_size=128, n_threads=-1):

@@ -326,7 +95,7 @@ class Pipe(object):
        """Modify a batch of documents, using pre-computed scores."""
        raise NotImplementedError

    def update(self, docs, golds, drop=0., sgd=None, losses=None):
    def update(self, docs, golds, drop=0.0, sgd=None, losses=None):
        """Learn from a batch of documents and gold-standard information,
        updating the pipe's model.

@@ -353,11 +122,11 @@ class Pipe(object):
        raise NotImplementedError

    def create_optimizer(self):
        return create_default_optimizer(self.model.ops,
                                        **self.cfg.get('optimizer', {}))
        return create_default_optimizer(self.model.ops, **self.cfg.get("optimizer", {}))

    def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None,
                       **kwargs):
    def begin_training(
        self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs
    ):
        """Initialize the pipe for training, using data exampes if available.
        If no model has been initialized yet, the model is added."""
        if self.model is True:

@@ -375,70 +144,70 @@ class Pipe(object):
    def to_bytes(self, **exclude):
        """Serialize the pipe to a bytestring."""
        serialize = OrderedDict()
        serialize['cfg'] = lambda: srsly.json_dumps(self.cfg)
        serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
        if self.model in (True, False, None):
            serialize['model'] = lambda: self.model
            serialize["model"] = lambda: self.model
        else:
            serialize['model'] = self.model.to_bytes
        serialize['vocab'] = self.vocab.to_bytes
            serialize["model"] = self.model.to_bytes
        serialize["vocab"] = self.vocab.to_bytes
        return util.to_bytes(serialize, exclude)

    def from_bytes(self, bytes_data, **exclude):
        """Load the pipe from a bytestring."""

        def load_model(b):
            # TODO: Remove this once we don't have to handle previous models
            if self.cfg.get('pretrained_dims') and 'pretrained_vectors' not in self.cfg:
                self.cfg['pretrained_vectors'] = self.vocab.vectors.name
            if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg:
                self.cfg["pretrained_vectors"] = self.vocab.vectors.name
            if self.model is True:
                self.model = self.Model(**self.cfg)
            self.model.from_bytes(b)

        deserialize = OrderedDict((
            ('cfg', lambda b: self.cfg.update(srsly.json_loads(b))),
            ('vocab', lambda b: self.vocab.from_bytes(b)),
            ('model', load_model),
        ))
        deserialize = OrderedDict(
            (
                ("cfg", lambda b: self.cfg.update(srsly.json_loads(b))),
                ("vocab", lambda b: self.vocab.from_bytes(b)),
                ("model", load_model),
            )
        )
        util.from_bytes(bytes_data, deserialize, exclude)
        return self

    def to_disk(self, path, **exclude):
        """Serialize the pipe to disk."""
        serialize = OrderedDict()
        serialize['cfg'] = lambda p: srsly.write_json(p, self.cfg)
        serialize['vocab'] = lambda p: self.vocab.to_disk(p)
        serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
        serialize["vocab"] = lambda p: self.vocab.to_disk(p)
        if self.model not in (None, True, False):
            serialize['model'] = lambda p: p.open('wb').write(self.model.to_bytes())
            serialize["model"] = lambda p: p.open("wb").write(self.model.to_bytes())
        util.to_disk(path, serialize, exclude)

    def from_disk(self, path, **exclude):
        """Load the pipe from disk."""

        def load_model(p):
            # TODO: Remove this once we don't have to handle previous models
            if self.cfg.get('pretrained_dims') and 'pretrained_vectors' not in self.cfg:
                self.cfg['pretrained_vectors'] = self.vocab.vectors.name
            if self.cfg.get("pretrained_dims") and "pretrained_vectors" not in self.cfg:
                self.cfg["pretrained_vectors"] = self.vocab.vectors.name
            if self.model is True:
                self.model = self.Model(**self.cfg)
            self.model.from_bytes(p.open('rb').read())
            self.model.from_bytes(p.open("rb").read())

        deserialize = OrderedDict((
            ('cfg', lambda p: self.cfg.update(_load_cfg(p))),
            ('vocab', lambda p: self.vocab.from_disk(p)),
            ('model', load_model),
        ))
        deserialize = OrderedDict(
            (
                ("cfg", lambda p: self.cfg.update(_load_cfg(p))),
                ("vocab", lambda p: self.vocab.from_disk(p)),
                ("model", load_model),
            )
        )
        util.from_disk(path, deserialize, exclude)
        return self


def _load_cfg(path):
    if path.exists():
        return srsly.read_json(path)
    else:
        return {}


class Tensorizer(Pipe):
    """Pre-train position-sensitive vectors for tokens."""
    name = 'tensorizer'

    name = "tensorizer"

    @classmethod
    def Model(cls, output_size=300, **cfg):

@@ -449,7 +218,7 @@ class Tensorizer(Pipe):
        **cfg: Config parameters.
        RETURNS (Model): A `thinc.neural.Model` or similar instance.
        """
        input_size = util.env_opt('token_vector_width', cfg.get('input_size', 96))
        input_size = util.env_opt("token_vector_width", cfg.get("input_size", 96))
        return zero_init(Affine(output_size, input_size, drop_factor=0.0))

    def __init__(self, vocab, model=True, **cfg):

@@ -470,7 +239,7 @@ class Tensorizer(Pipe):
        self.model = model
        self.input_models = []
        self.cfg = dict(cfg)
        self.cfg.setdefault('cnn_maxout_pieces', 3)
        self.cfg.setdefault("cnn_maxout_pieces", 3)

    def __call__(self, doc):
        """Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM

@@ -516,10 +285,12 @@ class Tensorizer(Pipe):
        """
        for doc, tensor in zip(docs, tensors):
            if tensor.shape[0] != len(doc):
                raise ValueError(Errors.E076.format(rows=tensor.shape[0], words=len(doc)))
                raise ValueError(
                    Errors.E076.format(rows=tensor.shape[0], words=len(doc))
                )
            doc.tensor = tensor

    def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
    def update(self, docs, golds, state=None, drop=0.0, sgd=None, losses=None):
        """Update the model.

        docs (iterable): A batch of `Doc` objects.

@@ -545,7 +316,7 @@ class Tensorizer(Pipe):
        for d_input, bp_input in zip(d_inputs, bp_inputs):
            bp_input(d_input, sgd=sgd)
        if losses is not None:
            losses.setdefault(self.name, 0.)
            losses.setdefault(self.name, 0.0)
            losses[self.name] += loss
        return loss

@@ -553,11 +324,10 @@ class Tensorizer(Pipe):
        ids = self.model.ops.flatten([doc.to_array(ID).ravel() for doc in docs])
        target = self.vocab.vectors.data[ids]
        d_scores = (prediction - target) / prediction.shape[0]
        loss = (d_scores**2).sum()
        loss = (d_scores ** 2).sum()
        return loss, d_scores

    def begin_training(self, gold_tuples=lambda: [], pipeline=None, sgd=None,
                       **kwargs):
    def begin_training(self, gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs):
        """Allocate models, pre-process training data and acquire an
        optimizer.

@@ -566,7 +336,7 @@ class Tensorizer(Pipe):
        """
        if pipeline is not None:
            for name, model in pipeline:
                if getattr(model, 'tok2vec', None):
                if getattr(model, "tok2vec", None):
                    self.input_models.append(model.tok2vec)
        if self.model is True:
            self.model = self.Model(**self.cfg)

@@ -1086,61 +856,6 @@ class ClozeMultitask(Pipe):
            losses[self.name] += loss


class SimilarityHook(Pipe):
    """
    Experimental: A pipeline component to install a hook for supervised
    similarity into `Doc` objects. Requires a `Tensorizer` to pre-process
    documents. The similarity model can be any object obeying the Thinc `Model`
    interface. By default, the model concatenates the elementwise mean and
    elementwise max of the two tensors, and compares them using the
    Cauchy-like similarity function from Chen (2013):

    >>> similarity = 1. / (1. + (W * (vec1-vec2)**2).sum())

    Where W is a vector of dimension weights, initialized to 1.
    """
    name = 'similarity'

    def __init__(self, vocab, model=True, **cfg):
        self.vocab = vocab
        self.model = model
        self.cfg = dict(cfg)

    @classmethod
    def Model(cls, length):
        return Siamese(Pooling(max_pool, mean_pool), CauchySimilarity(length))

    def __call__(self, doc):
        """Install similarity hook"""
        doc.user_hooks['similarity'] = self.predict
        return doc

    def pipe(self, docs, **kwargs):
        for doc in docs:
            yield self(doc)

    def predict(self, doc1, doc2):
        self.require_model()
        return self.model.predict([(doc1, doc2)])

    def update(self, doc1_doc2, golds, sgd=None, drop=0.):
        self.require_model()
        sims, bp_sims = self.model.begin_update(doc1_doc2, drop=drop)

    def begin_training(self, _=tuple(), pipeline=None, sgd=None, **kwargs):
        """Allocate model, using width from tensorizer in pipeline.

        gold_tuples (iterable): Gold-standard training data.
        pipeline (list): The pipeline the model is part of.
        """
        if self.model is True:
            self.model = self.Model(pipeline[0].model.nO)
            link_vectors_to_models(self.vocab)
        if sgd is None:
            sgd = self.create_optimizer()
        return sgd


class TextCategorizer(Pipe):
    name = 'textcat'

@@ -1299,13 +1014,12 @@ cdef class DependencyParser(Parser):


cdef class EntityRecognizer(Parser):
    name = 'ner'
    name = "ner"
    TransitionSystem = BiluoPushDown

    nr_feature = 6

    def add_multitask_objective(self, target):
        if target == 'cloze':
        if target == "cloze":
            cloze = ClozeMultitask(self.vocab)
            self._multitasks.append(cloze)
        else:

@@ -1326,8 +1040,8 @@ cdef class EntityRecognizer(Parser):
    def labels(self):
        # Get the labels from the model by looking at the available moves, e.g.
        # B-PERSON, I-PERSON, L-PERSON, U-PERSON
        return [move.split('-')[1] for move in self.move_names
                if move[0] in ('B', 'I', 'L', 'U')]
        return [move.split("-")[1] for move in self.move_names
                if move[0] in ("B", "I", "L", "U")]


__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'Tensorizer', 'TextCategorizer']
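Closing illustration (not part of the diff): the Pipe serialization hooks and the EntityRecognizer labels property touched in the hunks above can be exercised like this. The en_core_web_sm model name is an assumption.

import spacy

nlp = spacy.load("en_core_web_sm")  # assumed installed model
ner = nlp.get_pipe("ner")

# labels are derived from the move names, with the B-/I-/L-/U- prefix stripped
print(ner.labels)

# every Pipe-style component serializes its cfg, vocab and model weights
data = ner.to_bytes()
ner.from_bytes(data)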