2017-04-15 13:05:47 +03:00
|
|
|
# coding: utf8
|
|
|
|
from __future__ import absolute_import, unicode_literals
|
2016-10-09 13:24:24 +03:00
|
|
|
from contextlib import contextmanager
|
2015-08-27 10:16:11 +03:00
|
|
|
|
2015-08-26 20:16:09 +03:00
|
|
|
from .tokenizer import Tokenizer
|
|
|
|
from .vocab import Vocab
|
|
|
|
from .tagger import Tagger
|
2016-09-25 16:37:33 +03:00
|
|
|
from .lemmatizer import Lemmatizer
|
2016-10-09 13:24:24 +03:00
|
|
|
from .train import Trainer
|
2016-09-26 12:57:54 +03:00
|
|
|
from .syntax.parser import get_templates
|
2016-10-09 13:24:24 +03:00
|
|
|
from .syntax.nonproj import PseudoProjectivity
|
2017-05-16 17:17:30 +03:00
|
|
|
from .pipeline import NeuralDependencyParser, EntityRecognizer
|
|
|
|
from .pipeline import TokenVectorEncoder, NeuralTagger, NeuralEntityRecognizer
|
2017-04-17 02:46:14 +03:00
|
|
|
from .compat import json_dumps
|
2017-04-15 13:05:47 +03:00
|
|
|
from .attrs import IS_STOP
|
2017-05-09 00:58:31 +03:00
|
|
|
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
|
|
|
|
from .lang.tokenizer_exceptions import TOKEN_MATCH
|
|
|
|
from .lang.tag_map import TAG_MAP
|
2017-05-09 01:58:10 +03:00
|
|
|
from .lang.lex_attrs import LEX_ATTRS
|
2017-04-15 13:05:47 +03:00
|
|
|
from . import util
|
2016-10-09 13:24:24 +03:00
|
|
|
|
2015-08-27 10:16:11 +03:00
|
|
|
|
2016-09-24 21:26:17 +03:00
|
|
|
class BaseDefaults(object):
    """
    Language-independent default data and factory methods.

    The class attributes at the bottom of the class hold the default
    language data (punctuation rules, tag map, stop words, lemmatizer
    tables, ...). The ``create_*`` classmethods build components from
    those attributes.
    """

    @classmethod
    def create_lemmatizer(cls, nlp=None):
        """
        Build a Lemmatizer from the class-level lemma tables.

        Args:
            nlp: Accepted for signature parity with the other factories;
                not used here.
        Returns:
            Lemmatizer built from `lemma_index`, `lemma_exc`, `lemma_rules`.
        """
        return Lemmatizer(cls.lemma_index, cls.lemma_exc, cls.lemma_rules)

    @classmethod
    def create_vocab(cls, nlp=None):
        """
        Build a Vocab from the class-level language data.

        Args:
            nlp: Forwarded to `create_lemmatizer`.
        Returns:
            Vocab with the lexical attribute getters, tag map, lemmatizer
            and morphology special cases installed.
        """
        lemmatizer = cls.create_lemmatizer(nlp)
        # Copy, so that installing the IS_STOP getter below does not modify
        # the mapping held on the class.
        lex_attr_getters = dict(cls.lex_attr_getters)
        # This is messy, but it's the minimal working fix to Issue #639.
        lex_attr_getters[IS_STOP] = lambda string: string.lower() in cls.stop_words
        vocab = Vocab(lex_attr_getters=lex_attr_getters, tag_map=cls.tag_map,
                      lemmatizer=lemmatizer)
        for tag_str, exc in cls.morph_rules.items():
            for orth_str, attrs in exc.items():
                vocab.morphology.add_special_case(tag_str, orth_str, attrs)
        return vocab

    @classmethod
    def create_tokenizer(cls, nlp=None):
        """
        Build a Tokenizer from the class-level tokenization rules.

        Args:
            nlp: If given, its `vocab` is reused; otherwise a new vocab is
                created with `create_vocab`.
        Returns:
            Tokenizer configured with the exceptions, the token_match value
            and the prefix/suffix/infix matchers.
        """
        rules = cls.tokenizer_exceptions
        token_match = cls.token_match
        # Each matcher is only compiled when the rule collection is non-empty.
        prefix_search = (util.compile_prefix_regex(cls.prefixes).search
                         if cls.prefixes else None)
        suffix_search = (util.compile_suffix_regex(cls.suffixes).search
                         if cls.suffixes else None)
        infix_finditer = (util.compile_infix_regex(cls.infixes).finditer
                          if cls.infixes else None)
        vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
        return Tokenizer(vocab, rules=rules,
                         prefix_search=prefix_search, suffix_search=suffix_search,
                         infix_finditer=infix_finditer, token_match=token_match)

    @classmethod
    def create_tagger(cls, nlp=None, **cfg):
        """
        Build a NeuralTagger.

        Args:
            nlp: If given, its `vocab` is used; otherwise a new vocab is
                created with `create_vocab`.
            **cfg: Forwarded to the NeuralTagger constructor.
        Returns:
            NeuralTagger
        """
        if nlp is None:
            return NeuralTagger(cls.create_vocab(nlp), **cfg)
        else:
            return NeuralTagger(nlp.vocab, **cfg)

    @classmethod
    def create_parser(cls, nlp=None, **cfg):
        """
        Build a NeuralDependencyParser.

        Args:
            nlp: If given, its `vocab` is used; otherwise a new vocab is
                created with `create_vocab`.
            **cfg: Forwarded to the NeuralDependencyParser constructor.
        Returns:
            NeuralDependencyParser
        """
        if nlp is None:
            return NeuralDependencyParser(cls.create_vocab(nlp), **cfg)
        else:
            return NeuralDependencyParser(nlp.vocab, **cfg)

    @classmethod
    def create_entity(cls, nlp=None, **cfg):
        """
        Build a NeuralEntityRecognizer.

        Args:
            nlp: If given, its `vocab` is used; otherwise a new vocab is
                created with `create_vocab`.
            **cfg: Forwarded to the NeuralEntityRecognizer constructor.
        Returns:
            NeuralEntityRecognizer
        """
        if nlp is None:
            return NeuralEntityRecognizer(cls.create_vocab(nlp), **cfg)
        else:
            return NeuralEntityRecognizer(nlp.vocab, **cfg)

    @classmethod
    def create_pipeline(cls, nlp=None):
        """
        Build the list of pipeline components named by `cls.pipeline`.

        Args:
            nlp: Passed as the first argument to every factory. If given,
                `nlp.meta` supplies per-component keyword arguments, looked
                up by the entry name.
        Returns:
            list: One component per entry, in order. Empty when the class
            defines no `pipeline` attribute.
        Raises:
            KeyError: If an entry has no matching key in `factories`.
        """
        meta = nlp.meta if nlp is not None else {}
        # Resolve strings, like "cnn", "lstm", etc
        pipeline = []
        # `cls` is already the defaults class, so the factories live directly
        # on it. The `pipeline` attribute is optional: this base class does
        # not define one.
        for entry in getattr(cls, 'pipeline', ()):
            factory = cls.factories[entry]
            pipeline.append(factory(nlp, **meta.get(entry, {})))
        return pipeline

    # Maps a pipeline entry name to a callable taking (nlp, **cfg).
    # NOTE(review): 'make_doc' stores the classmethod object captured in the
    # class body, not a bound method -- confirm it is callable as a factory.
    factories = {
        'make_doc': create_tokenizer,
        'token_vectors': lambda nlp, **cfg: TokenVectorEncoder(nlp.vocab, **cfg),
        'tags': lambda nlp, **cfg: NeuralTagger(nlp.vocab, **cfg),
        'dependencies': lambda nlp, **cfg: NeuralDependencyParser(nlp.vocab, **cfg),
        'entities': lambda nlp, **cfg: NeuralEntityRecognizer(nlp.vocab, **cfg),
    }

    # Tokenizer data, read by create_tokenizer().
    token_match = TOKEN_MATCH
    prefixes = tuple(TOKENIZER_PREFIXES)
    suffixes = tuple(TOKENIZER_SUFFIXES)
    infixes = tuple(TOKENIZER_INFIXES)
    tag_map = dict(TAG_MAP)
    tokenizer_exceptions = {}
    # Feature templates for the parser, entity recognizer and tagger.
    parser_features = get_templates('parser')
    entity_features = get_templates('ner')
    tagger_features = Tagger.feature_templates # TODO -- fix this
    # Vocabulary and lemmatizer data, read by create_vocab() and
    # create_lemmatizer().
    stop_words = set()
    lemma_rules = {}
    lemma_exc = {}
    lemma_index = {}
    morph_rules = {}
    lex_attr_getters = LEX_ATTRS
|
2015-09-14 10:48:51 +03:00
|
|
|
|
2015-08-26 20:16:09 +03:00
|
|
|
|
2016-09-24 15:08:53 +03:00
|
|
|
class Language(object):
    """
    A text-processing pipeline. Usually you'll load this once per process, and
    pass the instance around your program.
    """
    # Class supplying the default factories and language data.
    Defaults = BaseDefaults
    lang = None

    def __init__(self, vocab=True, make_doc=True, pipeline=None, meta=None):
        """
        Set up the vocab, the tokenizer and the processing pipeline.

        Args:
            vocab: The vocab to use. If True, one is created with
                `Defaults.create_vocab`.
            make_doc: Callable that turns a text into a Doc. If True, one is
                created with `Defaults.create_tokenizer`.
            pipeline: If True, the pipeline is built with
                `Defaults.create_pipeline`. If a non-empty iterable, entries
                that are keys of `Defaults.factories` are replaced by the
                component the factory builds, and all other entries are kept
                as given. Otherwise the pipeline is empty.
            meta (dict): Configuration. A copy is stored on `self.meta`. The
                values under 'vocab', 'tokenizer' and each pipeline entry
                name are passed as keyword arguments to the matching factory.
        """
        # Avoid a shared mutable default, and tolerate an explicit None.
        meta = {} if meta is None else meta
        self.meta = dict(meta)

        if vocab is True:
            factory = self.Defaults.create_vocab
            vocab = factory(self, **meta.get('vocab', {}))
        self.vocab = vocab
        if make_doc is True:
            factory = self.Defaults.create_tokenizer
            make_doc = factory(self, **meta.get('tokenizer', {}))
        self.make_doc = make_doc
        if pipeline is True:
            self.pipeline = self.Defaults.create_pipeline(self)
        elif pipeline:
            self.pipeline = list(pipeline)
            # Resolve strings, like "cnn", "lstm", etc
            for i, entry in enumerate(self.pipeline):
                if entry in self.Defaults.factories:
                    factory = self.Defaults.factories[entry]
                    self.pipeline[i] = factory(self, **meta.get(entry, {}))
        else:
            self.pipeline = []

    def __call__(self, text, state=None, **disabled):
        """
        Apply the pipeline to some text. The text can span multiple sentences,
        and can contain arbitrary whitespace. Alignment into the original string
        is preserved.

        Args:
            text (unicode): The text to be processed.
            state: Arbitrary value passed to the first component as `state`.
                Each component's return value is passed to the next one.
            **disabled: Flags keyed by component name. A component is skipped
                when the value given for its `name` attribute is falsy.

        Returns:
            doc (Doc): A container for accessing the annotations.

        Example:
            >>> from spacy.en import English
            >>> nlp = English()
            >>> tokens = nlp('An example sentence. Another example sentence.')
            >>> tokens[0].orth_, tokens[0].head.tag_
            ('An', 'NN')
        """
        doc = self.make_doc(text)
        for proc in self.pipeline:
            name = getattr(proc, 'name', None)
            if name in disabled and not disabled[name]:
                continue
            state = proc(doc, state=state)
        return doc

    def update(self, docs, golds, state=None, drop=0., sgd=None):
        """
        Run one update over the pipeline.

        Components that have an `update` method are updated with the docs
        and golds. All other components are simply called on the docs.

        Args:
            docs: Passed to every component.
            golds: Passed to each component's `update` method.
            state: Passed to the first component; a new dict if None. Each
                `update` call's return value becomes the state for the next
                component.
            drop: Forwarded to each component's `update` method.
            sgd: Forwarded to each component's `update` method.

        Returns:
            The state returned by the last `update` call, or the initial
            state if no component has an `update` method.
        """
        grads = {}
        # NOTE(review): get_grads is never handed to a component in this
        # method, so `grads` stays empty and the loop at the end does
        # nothing. Confirm whether it was meant to be passed as `sgd`.
        def get_grads(W, dW, key=None):
            grads[key] = (W, dW)
        state = {} if state is None else state
        for process in self.pipeline:
            if hasattr(process, 'update'):
                state = process.update(docs, golds,
                                       state=state,
                                       drop=drop,
                                       sgd=sgd)
            else:
                # The return value is discarded here, unlike in __call__.
                process(docs, state=state)
        if sgd is not None:
            for key, (W, dW) in grads.items():
                sgd(W, dW, key=key)
        return state

    @contextmanager
    def begin_training(self, gold_tuples, **cfg):
        """
        Context manager that prepares the pipeline for training.

        Calls `begin_training` on every component that defines it, then
        creates a Trainer.

        Args:
            gold_tuples: Passed to each component's `begin_training` and to
                the Trainer.
            **cfg: Forwarded to the Trainer constructor.

        Yields:
            tuple: `(trainer, trainer.optimizer)`.
        """
        # The return values are collected but not used further in this
        # method.
        contexts = []
        for proc in self.pipeline:
            if hasattr(proc, 'begin_training'):
                context = proc.begin_training(gold_tuples, pipeline=self.pipeline)
                contexts.append(context)
        trainer = Trainer(self, gold_tuples, **cfg)
        yield trainer, trainer.optimizer

    def pipe(self, texts, n_threads=2, batch_size=1000, **disabled):
        """
        Process texts as a stream, and yield Doc objects in order.

        Arguments:
            texts (iterator): The texts to process.
            n_threads (int): Forwarded to each component's `pipe` method.
            batch_size (int): Forwarded to each component's `pipe` method.
            **disabled: Flags keyed by component name. A component is skipped
                when the value given for its `name` attribute is falsy.

        Yields:
            Doc: One document per input text.
        """
        # The stream carries (doc, state) pairs, starting with no state.
        stream = ((self.make_doc(text), None) for text in texts)
        for proc in self.pipeline:
            name = getattr(proc, 'name', None)
            if name in disabled and not disabled[name]:
                continue

            if hasattr(proc, 'pipe'):
                stream = proc.pipe(stream, n_threads=n_threads, batch_size=batch_size)
            else:
                # NOTE(review): the result is unpacked as (doc, state) below,
                # so this presumably expects proc to return such a pair --
                # confirm against the component implementations.
                stream = (proc(doc, state) for doc, state in stream)
        for doc, state in stream:
            yield doc

    def to_disk(self, path):
        """Not implemented yet. Always raises NotImplementedError."""
        raise NotImplementedError

    def from_disk(self, path):
        """Not implemented yet. Always raises NotImplementedError."""
        raise NotImplementedError

    def to_bytes(self, path):
        """Not implemented yet. Always raises NotImplementedError."""
        raise NotImplementedError

    def from_bytes(self, path):
        """Not implemented yet. Always raises NotImplementedError."""
        raise NotImplementedError
|
2016-12-18 18:54:52 +03:00
|
|
|
|