
346 lines
12 KiB
Raw Normal View History

# coding: utf8
from __future__ import absolute_import, unicode_literals
from contextlib import contextmanager
import dill
2017-05-18 12:25:19 +03:00
import numpy
from thinc.neural import Model
from thinc.neural.ops import NumpyOps, CupyOps
2017-05-21 17:07:06 +03:00
from thinc.neural.optimizers import Adam
2017-05-18 12:25:19 +03:00
from .tokenizer import Tokenizer
from .vocab import Vocab
from .tagger import Tagger
from .lemmatizer import Lemmatizer
from .syntax.parser import get_templates
from .syntax.nonproj import PseudoProjectivity
from .pipeline import NeuralDependencyParser, EntityRecognizer
from .pipeline import TokenVectorEncoder, NeuralTagger, NeuralEntityRecognizer
2017-04-17 02:46:14 +03:00
from .compat import json_dumps
from .attrs import IS_STOP
2017-05-09 00:58:31 +03:00
from .lang.tokenizer_exceptions import TOKEN_MATCH
from .lang.tag_map import TAG_MAP
2017-05-09 01:58:10 +03:00
from .lang.lex_attrs import LEX_ATTRS
from . import util
2017-05-21 17:07:06 +03:00
from .scorer import Scorer
2016-09-24 21:26:17 +03:00
class BaseDefaults(object):
    """Default data and component factories used to assemble a `Language`.

    `Language` reaches this class through its `Defaults` attribute and calls
    the `create_*` classmethods to build the vocab, the tokenizer and the
    pipeline components. The class-level attributes at the bottom hold the
    data the factories read; presumably language-specific subclasses override
    them -- verify against the subclasses.
    """

    @classmethod
    def create_lemmatizer(cls, nlp=None):
        """Build a `Lemmatizer` from the class-level lemma tables.

        nlp: Unused here; accepted so all factories share one signature.
        RETURNS (Lemmatizer): The lemmatizer.
        """
        return Lemmatizer(cls.lemma_index, cls.lemma_exc, cls.lemma_rules)

    @classmethod
    def create_vocab(cls, nlp=None):
        """Build a `Vocab` with this class's lexical attributes and tag map.

        nlp: Forwarded to `create_lemmatizer`.
        RETURNS (Vocab): The vocab, with the morphological special cases from
            `morph_rules` registered.
        """
        lemmatizer = cls.create_lemmatizer(nlp)
        # Copy, so the shared class-level mapping is not mutated below.
        lex_attr_getters = dict(cls.lex_attr_getters)
        # This is messy, but it's the minimal working fix to Issue #639.
        lex_attr_getters[IS_STOP] = lambda string: string.lower() in cls.stop_words
        vocab = Vocab(lex_attr_getters=lex_attr_getters, tag_map=cls.tag_map,
                      lemmatizer=lemmatizer)
        for tag_str, exc in cls.morph_rules.items():
            for orth_str, attrs in exc.items():
                vocab.morphology.add_special_case(tag_str, orth_str, attrs)
        return vocab

    @classmethod
    def create_tokenizer(cls, nlp=None):
        """Build a `Tokenizer` from the class-level tokenization rules.

        nlp: If given, its `vocab` is reused; otherwise a new vocab is
            created with `create_vocab`.
        RETURNS (Tokenizer): The tokenizer.
        """
        rules = cls.tokenizer_exceptions
        token_match = cls.token_match
        # An empty rule set disables the corresponding matcher.
        prefix_search = None
        if cls.prefixes:
            prefix_search = util.compile_prefix_regex(cls.prefixes).search
        suffix_search = None
        if cls.suffixes:
            suffix_search = util.compile_suffix_regex(cls.suffixes).search
        infix_finditer = None
        if cls.infixes:
            infix_finditer = util.compile_infix_regex(cls.infixes).finditer
        vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
        return Tokenizer(vocab, rules=rules,
                         prefix_search=prefix_search,
                         suffix_search=suffix_search,
                         infix_finditer=infix_finditer,
                         token_match=token_match)

    @classmethod
    def create_tagger(cls, nlp=None, **cfg):
        """Build a `NeuralTagger` on `nlp.vocab`, or on a new vocab if `nlp`
        is None. Extra keyword arguments are passed to the component.
        """
        if nlp is None:
            return NeuralTagger(cls.create_vocab(nlp), **cfg)
        return NeuralTagger(nlp.vocab, **cfg)

    @classmethod
    def create_parser(cls, nlp=None, **cfg):
        """Build a `NeuralDependencyParser` on `nlp.vocab`, or on a new vocab
        if `nlp` is None. Extra keyword arguments are passed to the component.
        """
        if nlp is None:
            return NeuralDependencyParser(cls.create_vocab(nlp), **cfg)
        return NeuralDependencyParser(nlp.vocab, **cfg)

    @classmethod
    def create_entity(cls, nlp=None, **cfg):
        """Build a `NeuralEntityRecognizer` on `nlp.vocab`, or on a new vocab
        if `nlp` is None. Extra keyword arguments are passed to the component.
        """
        if nlp is None:
            return NeuralEntityRecognizer(cls.create_vocab(nlp), **cfg)
        return NeuralEntityRecognizer(nlp.vocab, **cfg)

    @classmethod
    def create_pipeline(cls, nlp=None):
        """Instantiate every component named in `cls.pipeline`.

        nlp: Passed to each factory; `nlp.meta[name]`, when present, supplies
            that component's keyword arguments.
        RETURNS (list): The instantiated components, in order.
        """
        meta = nlp.meta if nlp is not None else {}
        # NOTE(review): `pipeline` is not defined on BaseDefaults in this
        # file; presumably subclasses provide it -- confirm.
        pipeline = []
        for entry in cls.pipeline:
            # `cls` already is the defaults class, so the registry lives on
            # `cls` itself (there is no `cls.Defaults`).
            factory = cls.factories[entry]
            pipeline.append(factory(nlp, **meta.get(entry, {})))
        return pipeline

    # Maps a component name to a callable `(nlp, **cfg) -> component`.
    # NOTE(review): inside the class body `create_tokenizer` is the raw
    # classmethod object, which cannot be called directly -- confirm whether
    # the 'make_doc' entry is ever invoked through this table.
    factories = {
        'make_doc': create_tokenizer,
        'token_vectors': lambda nlp, **cfg: TokenVectorEncoder(nlp.vocab, **cfg),
        'tags': lambda nlp, **cfg: NeuralTagger(nlp.vocab, **cfg),
        'dependencies': lambda nlp, **cfg: NeuralDependencyParser(nlp.vocab, **cfg),
        'entities': lambda nlp, **cfg: NeuralEntityRecognizer(nlp.vocab, **cfg),
    }

    # Tokenizer data.
    # NOTE(review): TOKENIZER_PREFIXES / _SUFFIXES / _INFIXES must be imported
    # at module level; that import is not visible in this view -- confirm.
    token_match = TOKEN_MATCH
    prefixes = tuple(TOKENIZER_PREFIXES)
    suffixes = tuple(TOKENIZER_SUFFIXES)
    infixes = tuple(TOKENIZER_INFIXES)
    tag_map = dict(TAG_MAP)
    tokenizer_exceptions = {}
    # Feature templates.
    parser_features = get_templates('parser')
    entity_features = get_templates('ner')
    tagger_features = Tagger.feature_templates  # TODO -- fix this
    # Lexical and morphological data, empty by default.
    stop_words = set()
    lemma_rules = {}
    lemma_exc = {}
    lemma_index = {}
    morph_rules = {}
    lex_attr_getters = LEX_ATTRS
2015-09-14 10:48:51 +03:00
class Language(object):
    """A text-processing pipeline. Usually you'll load this once per process,
    and pass the instance around your program.
    """
    Defaults = BaseDefaults
    lang = None

    def __init__(self, vocab=True, make_doc=True, pipeline=None, meta={}):
        """Set up the vocab, the tokenizer and the pipeline components.

        vocab: A vocab object, or True to build one with
            `Defaults.create_vocab` (configured by `meta['vocab']`).
        make_doc: A callable turning text into a Doc, or True to build one
            with `Defaults.create_tokenizer` (configured by
            `meta['tokenizer']`).
        pipeline: True to build the default pipeline, an iterable of
            components and/or factory names, or a falsy value for an empty
            pipeline.
        meta (dict): Per-component settings, keyed by component name. The
            default dict is only read and copied, never mutated.
        """
        self.meta = dict(meta)
        if vocab is True:
            factory = self.Defaults.create_vocab
            vocab = factory(self, **meta.get('vocab', {}))
        self.vocab = vocab
        if make_doc is True:
            factory = self.Defaults.create_tokenizer
            make_doc = factory(self, **meta.get('tokenizer', {}))
        self.make_doc = make_doc
        if pipeline is True:
            self.pipeline = self.Defaults.create_pipeline(self)
        elif pipeline:
            self.pipeline = list(pipeline)
            # Resolve strings, like "cnn", "lstm", etc
            for i, entry in enumerate(self.pipeline):
                if entry in self.Defaults.factories:
                    factory = self.Defaults.factories[entry]
                    self.pipeline[i] = factory(self, **meta.get(entry, {}))
        else:
            self.pipeline = []

    def __call__(self, text, **disabled):
        """Apply the pipeline to some text. The text can span multiple
        sentences, and can contain arbitrary whitespace. Alignment into the
        original string is preserved.

        text (unicode): The text to be processed.
        **disabled: Keyed by component `name`; a falsy value skips that
            component.
        RETURNS (Doc): A container for accessing the annotations.

        EXAMPLE:
            >>> from spacy.en import English
            >>> nlp = English()
            >>> tokens = nlp('An example sentence. Another example sentence.')
            >>> tokens[0].orth_, tokens[0].head.tag_
            ('An', 'NN')
        """
        doc = self.make_doc(text)
        for proc in self.pipeline:
            name = getattr(proc, 'name', None)
            if name in disabled and not disabled[name]:
                continue
            # NOTE(review): assumes components annotate the Doc in place --
            # confirm against the pipeline components.
            proc(doc)
        return doc

    def update(self, docs, golds, drop=0., sgd=None):
        """Run one training update of every component after the first.

        The first pipeline entry is treated as the token-vector encoder: its
        output is fed to each later component, and the gradient each
        component returns is backpropagated through it.

        docs: The batch of documents.
        golds: The gold-standard annotations for `docs`.
        drop (float): Dropout rate, passed to the encoder and the components.
        sgd: Optimizer callable, invoked as `sgd(W, dW, key=key)`; may be
            None.
        """
        grads = {}
        # NOTE(review): `get_grads` is never passed on below (the calls
        # receive `sgd` directly), so `grads` stays empty in this method
        # unless something else fills it -- confirm intent.
        def get_grads(W, dW, key=None):
            grads[key] = (W, dW)
        tok2vec = self.pipeline[0]
        feats = tok2vec.doc2feats(docs)
        for proc in self.pipeline[1:]:
            grads = {}
            tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop)
            d_tokvecses = proc.update((docs, tokvecses), golds, sgd=sgd, drop=drop)
            bp_tokvecses(d_tokvecses, sgd=sgd)
            if sgd is not None:
                for key, (W, dW) in grads.items():
                    # TODO: Unhack this when thinc improves
                    if isinstance(W, numpy.ndarray):
                        sgd.ops = NumpyOps()
                    else:
                        sgd.ops = CupyOps()
                    sgd(W, dW, key=key)
                # Drop the applied gradients.
                for key in list(grads.keys()):
                    grads.pop(key)
        for doc in docs:
            doc.tensor = None

    def preprocess_gold(self, docs_golds):
        """Pass (doc, gold) pairs through every component that defines
        `preprocess_gold`, then yield the resulting pairs.

        docs_golds (iterable): Pairs of (doc, gold).
        YIELDS (tuple): The processed (doc, gold) pairs.
        """
        for proc in self.pipeline:
            if hasattr(proc, 'preprocess_gold'):
                docs_golds = proc.preprocess_gold(docs_golds)
        for doc, gold in docs_golds:
            yield doc, gold

    def begin_training(self, get_gold_tuples, **cfg):
        """Prepare the vocab and the components for training.

        get_gold_tuples (callable): Returns the gold-standard training data
            each time it is called.
        **cfg: `use_gpu`, if truthy, switches thinc's `Model` to `CupyOps`.
        RETURNS (Adam): The optimizer.
        """
        # Populate vocab
        for _, annots_brackets in get_gold_tuples():
            for annots, _ in annots_brackets:
                for word in annots[1]:
                    _ = self.vocab[word]
        contexts = []
        if cfg.get('use_gpu'):
            Model.ops = CupyOps()
            Model.Ops = CupyOps
            print("Use GPU")
        for proc in self.pipeline:
            if hasattr(proc, 'begin_training'):
                # NOTE(review): the continuation of this call was truncated
                # in the reviewed copy; `pipeline=self.pipeline` is a
                # reconstruction -- confirm against the components.
                context = proc.begin_training(get_gold_tuples(),
                                              pipeline=self.pipeline)
                contexts.append(context)
        optimizer = Adam(Model.ops, 0.001)
        return optimizer

    def evaluate(self, docs_golds):
        """Score the pipeline's output against gold-standard annotations.

        docs_golds (iterable): Pairs of (doc, gold).
        RETURNS (Scorer): The scorer holding the accumulated results.
        """
        docs, golds = zip(*docs_golds)
        scorer = Scorer()
        for doc, gold in zip(self.pipe(docs), golds):
            scorer.score(doc, gold)
        return scorer

    @contextmanager
    def use_params(self, params, **cfg):
        """Temporarily make every component that supports it use `params`.

        params: Passed to each component's `use_params`.
        **cfg: Currently unused.
        """
        contexts = [pipe.use_params(params) for pipe
                    in self.pipeline if hasattr(pipe, 'use_params')]
        # TODO: Having trouble with contextlib
        # Workaround: these aren't actually context managers atm.
        # Each one is advanced by hand: once to enter...
        for context in contexts:
            try:
                next(context)
            except StopIteration:
                pass
        yield
        # ...and once more to exit.
        for context in contexts:
            try:
                next(context)
            except StopIteration:
                pass

    def pipe(self, texts, n_threads=2, batch_size=1000, **disabled):
        """Process texts as a stream, and yield Doc objects in order.

        Supports GIL-free multi-threading.

        texts (iterator): The input stream. It is handed to the first
            component unchanged (no tokenization is applied here).
        n_threads (int): Passed to each component's `pipe` method.
        batch_size (int): Passed to each component's `pipe` method.
        **disabled: Keyed by component `name`; a falsy value skips that
            component.
        YIELDS: The processed objects, in order.
        """
        #docs = (self.make_doc(text) for text in texts)
        docs = texts
        for proc in self.pipeline:
            name = getattr(proc, 'name', None)
            if name in disabled and not disabled[name]:
                continue
            if hasattr(proc, 'pipe'):
                docs = proc.pipe(docs, n_threads=n_threads, batch_size=batch_size)
            else:
                docs = (proc(doc) for doc in docs)
        for doc in docs:
            yield doc

    def to_disk(self, path, **exclude):
        """Save the current state to a directory.

        Attributes that have a `to_disk` method write themselves to
        `path/<name>`; all others are pickled together with dill into
        `path/props.pickle`.

        path: A path to a directory, which will be created if it doesn't
            exist. Paths may be either strings or pathlib.Path-like objects.
        **exclude: Prevent named attributes from being saved.
        """
        path = util.ensure_path(path)
        if not path.exists():
            path.mkdir()
        if not path.is_dir():
            raise IOError("Output path must be a directory")
        props = {}
        for name, value in self.__dict__.items():
            if name in exclude:
                continue
            if hasattr(value, 'to_disk'):
                value.to_disk(path / name)
            else:
                props[name] = value
        with (path / 'props.pickle').open('wb') as file_:
            dill.dump(props, file_)

    def from_disk(self, path, **exclude):
        """Load the current state from a directory.

        path: A path to a directory. Paths may be either strings or
            pathlib.Path-like objects.
        **exclude: Prevent named attributes from being loaded.
        RETURNS (Language): This object, modified in place.
        """
        path = util.ensure_path(path)
        for entry in path.iterdir():
            # iterdir() yields full paths; the attribute name is only the
            # final path component.
            name = entry.name
            if name not in exclude and hasattr(self, name):
                getattr(self, name).from_disk(entry)
        # SECURITY: the pickle is deserialized with dill, which can execute
        # arbitrary code. Only load directories from trusted sources.
        with (path / 'props.pickle').open('rb') as file_:
            bytes_data = file_.read()
        self.from_bytes(bytes_data, **exclude)
        return self

    def to_bytes(self, **exclude):
        """Serialize the current state to a binary string.

        **exclude: Prevent named attributes from being serialized.
        RETURNS (bytes): The dill-pickled instance attributes.
        """
        props = dict(self.__dict__)
        for key in exclude:
            props.pop(key, None)
        return dill.dumps(props, -1)

    def from_bytes(self, bytes_data, **exclude):
        """Load state from a binary string.

        bytes_data (bytes): The data to load from. It is deserialized with
            dill, which can execute arbitrary code -- never pass untrusted
            data.
        **exclude: Prevent named attributes from being loaded.
        RETURNS (Language): This object, modified in place.
        """
        props = dill.loads(bytes_data)
        for key, value in props.items():
            if key not in exclude:
                setattr(self, key, value)
        return self
2016-12-18 18:54:52 +03:00