# coding: utf8 from __future__ import absolute_import, unicode_literals from contextlib import contextmanager import dill import numpy from thinc.neural import Model from thinc.neural.ops import NumpyOps, CupyOps from thinc.neural.optimizers import Adam, SGD import random import ujson from collections import OrderedDict import itertools from .tokenizer import Tokenizer from .vocab import Vocab from .tagger import Tagger from .lemmatizer import Lemmatizer from .syntax.parser import get_templates from .syntax import nonproj from .pipeline import NeuralDependencyParser, EntityRecognizer from .pipeline import TokenVectorEncoder, NeuralTagger, NeuralEntityRecognizer from .pipeline import NeuralLabeller from .pipeline import SimilarityHook from .pipeline import TextCategorizer from . import about from .compat import json_dumps, izip from .attrs import IS_STOP from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .lang.tokenizer_exceptions import TOKEN_MATCH from .lang.tag_map import TAG_MAP from .lang.lex_attrs import LEX_ATTRS from . import util from .scorer import Scorer class BaseDefaults(object): @classmethod def create_lemmatizer(cls, nlp=None): return Lemmatizer(cls.lemma_index, cls.lemma_exc, cls.lemma_rules) @classmethod def create_vocab(cls, nlp=None): lemmatizer = cls.create_lemmatizer(nlp) lex_attr_getters = dict(cls.lex_attr_getters) # This is messy, but it's the minimal working fix to Issue #639. lex_attr_getters[IS_STOP] = lambda string: string.lower() in cls.stop_words vocab = Vocab(lex_attr_getters=lex_attr_getters, tag_map=cls.tag_map, lemmatizer=lemmatizer) for tag_str, exc in cls.morph_rules.items(): for orth_str, attrs in exc.items(): vocab.morphology.add_special_case(tag_str, orth_str, attrs) return vocab @classmethod def create_tokenizer(cls, nlp=None): rules = cls.tokenizer_exceptions token_match = cls.token_match prefix_search = util.compile_prefix_regex(cls.prefixes).search \ if cls.prefixes else None suffix_search = util.compile_suffix_regex(cls.suffixes).search \ if cls.suffixes else None infix_finditer = util.compile_infix_regex(cls.infixes).finditer \ if cls.infixes else None vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp) return Tokenizer(vocab, rules=rules, prefix_search=prefix_search, suffix_search=suffix_search, infix_finditer=infix_finditer, token_match=token_match) @classmethod def create_tagger(cls, nlp=None, **cfg): if nlp is None: return NeuralTagger(cls.create_vocab(nlp), **cfg) else: return NeuralTagger(nlp.vocab, **cfg) @classmethod def create_parser(cls, nlp=None, **cfg): if nlp is None: return NeuralDependencyParser(cls.create_vocab(nlp), **cfg) else: return NeuralDependencyParser(nlp.vocab, **cfg) @classmethod def create_entity(cls, nlp=None, **cfg): if nlp is None: return NeuralEntityRecognizer(cls.create_vocab(nlp), **cfg) else: return NeuralEntityRecognizer(nlp.vocab, **cfg) @classmethod def create_pipeline(cls, nlp=None, disable=tuple()): meta = nlp.meta if nlp is not None else {} # Resolve strings, like "cnn", "lstm", etc pipeline = [] for entry in meta.get('pipeline', []): if entry in disable or getattr(entry, 'name', entry) in disable: continue factory = cls.Defaults.factories[entry] pipeline.append(factory(nlp, **meta.get(entry, {}))) return pipeline factories = { 'make_doc': create_tokenizer, 'tensorizer': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)], 'tagger': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)], 'parser': lambda nlp, **cfg: [ NeuralDependencyParser(nlp.vocab, **cfg), nonproj.deprojectivize], 'ner': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)], 'similarity': lambda nlp, **cfg: [SimilarityHook(nlp.vocab, **cfg)], 'textcat': lambda nlp, **cfg: [TextCategorizer(nlp.vocab, **cfg)], # Temporary compatibility -- delete after pivot 'token_vectors': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)], 'tags': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)], 'dependencies': lambda nlp, **cfg: [ NeuralDependencyParser(nlp.vocab, **cfg), nonproj.deprojectivize, ], 'entities': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)], } token_match = TOKEN_MATCH prefixes = tuple(TOKENIZER_PREFIXES) suffixes = tuple(TOKENIZER_SUFFIXES) infixes = tuple(TOKENIZER_INFIXES) tag_map = dict(TAG_MAP) tokenizer_exceptions = {} parser_features = get_templates('parser') entity_features = get_templates('ner') tagger_features = Tagger.feature_templates # TODO -- fix this stop_words = set() lemma_rules = {} lemma_exc = {} lemma_index = {} morph_rules = {} lex_attr_getters = LEX_ATTRS syntax_iterators = {} class Language(object): """A text-processing pipeline. Usually you'll load this once per process, and pass the instance around your application. Defaults (class): Settings, data and factory methods for creating the `nlp` object and processing pipeline. lang (unicode): Two-letter language ID, i.e. ISO code. """ Defaults = BaseDefaults lang = None def __init__(self, vocab=True, make_doc=True, pipeline=None, meta={}, disable=tuple(), **kwargs): """Initialise a Language object. vocab (Vocab): A `Vocab` object. If `True`, a vocab is created via `Language.Defaults.create_vocab`. make_doc (callable): A function that takes text and returns a `Doc` object. Usually a `Tokenizer`. pipeline (list): A list of annotation processes or IDs of annotation, processes, e.g. a `Tagger` object, or `'tagger'`. IDs are looked up in `Language.Defaults.factories`. disable (list): A list of component names to exclude from the pipeline. The disable list has priority over the pipeline list -- if the same string occurs in both, the component is not loaded. meta (dict): Custom meta data for the Language class. Is written to by models to add model meta data. RETURNS (Language): The newly constructed object. """ self._meta = dict(meta) if vocab is True: factory = self.Defaults.create_vocab vocab = factory(self, **meta.get('vocab', {})) self.vocab = vocab if make_doc is True: factory = self.Defaults.create_tokenizer make_doc = factory(self, **meta.get('tokenizer', {})) self.tokenizer = make_doc if pipeline is True: self.pipeline = self.Defaults.create_pipeline(self, disable) elif pipeline: # Careful not to do getattr(p, 'name', None) here # If we had disable=[None], we'd disable everything! self.pipeline = [p for p in pipeline if p not in disable and getattr(p, 'name', p) not in disable] # Resolve strings, like "cnn", "lstm", etc for i, entry in enumerate(self.pipeline): if entry in self.Defaults.factories: factory = self.Defaults.factories[entry] self.pipeline[i] = factory(self, **meta.get(entry, {})) else: self.pipeline = [] flat_list = [] for pipe in self.pipeline: if isinstance(pipe, list): flat_list.extend(pipe) else: flat_list.append(pipe) self.pipeline = flat_list self._optimizer = None @property def meta(self): self._meta.setdefault('lang', self.vocab.lang) self._meta.setdefault('name', '') self._meta.setdefault('version', '0.0.0') self._meta.setdefault('spacy_version', about.__version__) self._meta.setdefault('description', '') self._meta.setdefault('author', '') self._meta.setdefault('email', '') self._meta.setdefault('url', '') self._meta.setdefault('license', '') pipeline = [] for component in self.pipeline: if hasattr(component, 'name'): pipeline.append(component.name) self._meta['pipeline'] = pipeline return self._meta @meta.setter def meta(self, value): self._meta = value # Conveniences to access pipeline components @property def tensorizer(self): return self.get_component('tensorizer') @property def tagger(self): return self.get_component('tagger') @property def parser(self): return self.get_component('parser') @property def entity(self): return self.get_component('ner') @property def matcher(self): return self.get_component('matcher') def get_component(self, name): if self.pipeline in (True, None): return None for proc in self.pipeline: if hasattr(proc, 'name') and proc.name.endswith(name): return proc return None def __call__(self, text, disable=[]): """'Apply the pipeline to some text. The text can span multiple sentences, and can contain arbtrary whitespace. Alignment into the original string is preserved. text (unicode): The text to be processed. disable (list): Names of the pipeline components to disable. RETURNS (Doc): A container for accessing the annotations. EXAMPLE: >>> tokens = nlp('An example sentence. Another example sentence.') >>> tokens[0].text, tokens[0].head.tag_ ('An', 'NN') """ doc = self.make_doc(text) for proc in self.pipeline: name = getattr(proc, 'name', None) if name in disable: continue doc = proc(doc) return doc def make_doc(self, text): return self.tokenizer(text) def update(self, docs, golds, drop=0., sgd=None, losses=None, update_shared=False): """Update the models in the pipeline. docs (iterable): A batch of `Doc` objects. golds (iterable): A batch of `GoldParse` objects. drop (float): The droput rate. sgd (callable): An optimizer. RETURNS (dict): Results from the update. EXAMPLE: >>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer): >>> for epoch in trainer.epochs(gold): >>> for docs, golds in epoch: >>> state = nlp.update(docs, golds, sgd=optimizer) """ if len(docs) != len(golds): raise IndexError("Update expects same number of docs and golds " "Got: %d, %d" % (len(docs), len(golds))) if len(docs) == 0: return if sgd is None: if self._optimizer is None: self._optimizer = Adam(Model.ops, 0.001) sgd = self._optimizer tok2vec = self.pipeline[0] grads = {} def get_grads(W, dW, key=None): grads[key] = (W, dW) pipes = list(self.pipeline[1:]) random.shuffle(pipes) tokvecses, bp_tokvecses = tok2vec.model.begin_update(docs, drop=drop) all_d_tokvecses = [tok2vec.model.ops.allocate(tv.shape) for tv in tokvecses] for proc in pipes: if not hasattr(proc, 'update'): continue d_tokvecses = proc.update((docs, tokvecses), golds, drop=drop, sgd=get_grads, losses=losses) if update_shared and d_tokvecses is not None: for i, d_tv in enumerate(d_tokvecses): all_d_tokvecses[i] += d_tv if update_shared and bp_tokvecses is not None: bp_tokvecses(all_d_tokvecses, sgd=sgd) for key, (W, dW) in grads.items(): sgd(W, dW, key=key) # Clear the tensor variable, to free GPU memory. # If we don't do this, the memory leak gets pretty # bad, because we may be holding part of a batch. for doc in docs: doc.tensor = None def preprocess_gold(self, docs_golds): """Can be called before training to pre-process gold data. By default, it handles nonprojectivity and adds missing tags to the tag map. docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects. YIELDS (tuple): Tuples of preprocessed `Doc` and `GoldParse` objects. """ for proc in self.pipeline: if hasattr(proc, 'preprocess_gold'): docs_golds = proc.preprocess_gold(docs_golds) for doc, gold in docs_golds: yield doc, gold def resume_training(self, **cfg): if cfg.get('device', -1) >= 0: device = util.use_gpu(cfg['device']) if self.vocab.vectors.data.shape[1] >= 1: self.vocab.vectors.data = Model.ops.asarray( self.vocab.vectors.data) else: device = None learn_rate = util.env_opt('learn_rate', 0.001) beta1 = util.env_opt('optimizer_B1', 0.9) beta2 = util.env_opt('optimizer_B2', 0.999) eps = util.env_opt('optimizer_eps', 1e-08) L2 = util.env_opt('L2_penalty', 1e-6) max_grad_norm = util.env_opt('grad_norm_clip', 1.) self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1, beta2=beta2, eps=eps) self._optimizer.max_grad_norm = max_grad_norm self._optimizer.device = device return self._optimizer def begin_training(self, get_gold_tuples=None, **cfg): """Allocate models, pre-process training data and acquire a trainer and optimizer. Used as a contextmanager. get_gold_tuples (function): Function returning gold data **cfg: Config parameters. returns: An optimizer """ if self.parser: self.pipeline.append(NeuralLabeller(self.vocab)) # Populate vocab if get_gold_tuples is not None: for _, annots_brackets in get_gold_tuples(): for annots, _ in annots_brackets: for word in annots[1]: _ = self.vocab[word] contexts = [] if cfg.get('device', -1) >= 0: device = util.use_gpu(cfg['device']) if self.vocab.vectors.data.shape[1] >= 1: self.vocab.vectors.data = Model.ops.asarray( self.vocab.vectors.data) else: device = None for proc in self.pipeline: if hasattr(proc, 'begin_training'): context = proc.begin_training(get_gold_tuples(), pipeline=self.pipeline) contexts.append(context) learn_rate = util.env_opt('learn_rate', 0.001) beta1 = util.env_opt('optimizer_B1', 0.9) beta2 = util.env_opt('optimizer_B2', 0.999) eps = util.env_opt('optimizer_eps', 1e-08) L2 = util.env_opt('L2_penalty', 1e-6) max_grad_norm = util.env_opt('grad_norm_clip', 1.) self._optimizer = Adam(Model.ops, learn_rate, L2=L2, beta1=beta1, beta2=beta2, eps=eps) self._optimizer.max_grad_norm = max_grad_norm self._optimizer.device = device return self._optimizer def evaluate(self, docs_golds): scorer = Scorer() docs, golds = zip(*docs_golds) docs = list(docs) golds = list(golds) for pipe in self.pipeline: if not hasattr(pipe, 'pipe'): for doc in docs: pipe(doc) else: docs = list(pipe.pipe(docs)) assert len(docs) == len(golds) for doc, gold in zip(docs, golds): scorer.score(doc, gold) doc.tensor = None return scorer @contextmanager def use_params(self, params, **cfg): """Replace weights of models in the pipeline with those provided in the params dictionary. Can be used as a contextmanager, in which case, models go back to their original weights after the block. params (dict): A dictionary of parameters keyed by model ID. **cfg: Config parameters. EXAMPLE: >>> with nlp.use_params(optimizer.averages): >>> nlp.to_disk('/tmp/checkpoint') """ contexts = [pipe.use_params(params) for pipe in self.pipeline if hasattr(pipe, 'use_params')] # TODO: Having trouble with contextlib # Workaround: these aren't actually context managers atm. for context in contexts: try: next(context) except StopIteration: pass yield for context in contexts: try: next(context) except StopIteration: pass def pipe(self, texts, as_tuples=False, n_threads=2, batch_size=1000, disable=[]): """Process texts as a stream, and yield `Doc` objects in order. Supports GIL-free multi-threading. texts (iterator): A sequence of texts to process. as_tuples (bool): If set to True, inputs should be a sequence of (text, context) tuples. Output will then be a sequence of (doc, context) tuples. Defaults to False. n_threads (int): The number of worker threads to use. If -1, OpenMP will decide how many to use at run time. Default is 2. batch_size (int): The number of texts to buffer. disable (list): Names of the pipeline components to disable. YIELDS (Doc): Documents in the order of the original text. EXAMPLE: >>> texts = [u'One document.', u'...', u'Lots of documents'] >>> for doc in nlp.pipe(texts, batch_size=50, n_threads=4): >>> assert doc.is_parsed """ if as_tuples: text_context1, text_context2 = itertools.tee(texts) texts = (tc[0] for tc in text_context1) contexts = (tc[1] for tc in text_context2) docs = self.pipe(texts, n_threads=n_threads, batch_size=batch_size, disable=disable) for doc, context in izip(docs, contexts): yield (doc, context) return docs = (self.make_doc(text) for text in texts) for proc in self.pipeline: name = getattr(proc, 'name', None) if name in disable: continue if hasattr(proc, 'pipe'): docs = proc.pipe(docs, n_threads=n_threads, batch_size=batch_size) else: # Apply the function, but yield the doc docs = _pipe(proc, docs) for doc in docs: yield doc def to_disk(self, path, disable=tuple()): """Save the current state to a directory. If a model is loaded, this will include the model. path (unicode or Path): A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. disable (list): Names of pipeline components to disable and prevent from being saved. EXAMPLE: >>> nlp.to_disk('/path/to/models') """ path = util.ensure_path(path) serializers = OrderedDict(( ('vocab', lambda p: self.vocab.to_disk(p)), ('tokenizer', lambda p: self.tokenizer.to_disk(p, vocab=False)), ('meta.json', lambda p: p.open('w').write(json_dumps(self.meta))) )) for proc in self.pipeline: if not hasattr(proc, 'name'): continue if proc.name in disable: continue if not hasattr(proc, 'to_disk'): continue serializers[proc.name] = lambda p, proc=proc: proc.to_disk(p, vocab=False) util.to_disk(path, serializers, {p: False for p in disable}) def from_disk(self, path, disable=tuple()): """Loads state from a directory. Modifies the object in place and returns it. If the saved `Language` object contains a model, the model will be loaded. path (unicode or Path): A path to a directory. Paths may be either strings or `Path`-like objects. disable (list): Names of the pipeline components to disable. RETURNS (Language): The modified `Language` object. EXAMPLE: >>> from spacy.language import Language >>> nlp = Language().from_disk('/path/to/models') """ path = util.ensure_path(path) deserializers = OrderedDict(( ('vocab', lambda p: self.vocab.from_disk(p)), ('tokenizer', lambda p: self.tokenizer.from_disk(p, vocab=False)), ('meta.json', lambda p: p.open('w').write(json_dumps(self.meta))) )) for proc in self.pipeline: if not hasattr(proc, 'name'): continue if proc.name in disable: continue if not hasattr(proc, 'to_disk'): continue deserializers[proc.name] = lambda p, proc=proc: proc.from_disk(p, vocab=False) exclude = {p: False for p in disable} if not (path / 'vocab').exists(): exclude['vocab'] = True util.from_disk(path, deserializers, exclude) return self def to_bytes(self, disable=[]): """Serialize the current state to a binary string. disable (list): Nameds of pipeline components to disable and prevent from being serialized. RETURNS (bytes): The serialized form of the `Language` object. """ serializers = OrderedDict(( ('vocab', lambda: self.vocab.to_bytes()), ('tokenizer', lambda: self.tokenizer.to_bytes(vocab=False)), ('meta', lambda: ujson.dumps(self.meta)) )) for i, proc in enumerate(self.pipeline): if getattr(proc, 'name', None) in disable: continue if not hasattr(proc, 'to_bytes'): continue serializers[i] = lambda proc=proc: proc.to_bytes(vocab=False) return util.to_bytes(serializers, {}) def from_bytes(self, bytes_data, disable=[]): """Load state from a binary string. bytes_data (bytes): The data to load from. disable (list): Names of the pipeline components to disable. RETURNS (Language): The `Language` object. """ deserializers = OrderedDict(( ('vocab', lambda b: self.vocab.from_bytes(b)), ('tokenizer', lambda b: self.tokenizer.from_bytes(b, vocab=False)), ('meta', lambda b: self.meta.update(ujson.loads(b))) )) for i, proc in enumerate(self.pipeline): if getattr(proc, 'name', None) in disable: continue if not hasattr(proc, 'from_bytes'): continue deserializers[i] = lambda b, proc=proc: proc.from_bytes(b, vocab=False) msg = util.from_bytes(bytes_data, deserializers, {}) return self def _pipe(func, docs): for doc in docs: func(doc) yield doc