Tidy up language, lemmatizer and scorer

ines 2017-10-27 14:40:14 +02:00
parent 778212efea
commit 91899d337b
3 changed files with 52 additions and 51 deletions

View File: spacy/language.py

@@ -11,21 +11,18 @@ from collections import OrderedDict
import itertools
import weakref
import functools
import tqdm
from .tokenizer import Tokenizer
from .vocab import Vocab
from .tagger import Tagger
from .lemmatizer import Lemmatizer
from .pipeline import DependencyParser, Tensorizer, Tagger
from .pipeline import EntityRecognizer, SimilarityHook, TextCategorizer
-from .compat import json_dumps, izip, copy_reg
+from .compat import json_dumps, izip
from .scorer import Scorer
from ._ml import link_vectors_to_models
from .attrs import IS_STOP
-from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
+from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
+from .lang.punctuation import TOKENIZER_INFIXES
from .lang.tokenizer_exceptions import TOKEN_MATCH
from .lang.tag_map import TAG_MAP
from .lang.lex_attrs import LEX_ATTRS, is_stop
@@ -57,16 +54,18 @@ class BaseDefaults(object):
def create_tokenizer(cls, nlp=None):
rules = cls.tokenizer_exceptions
token_match = cls.token_match
-prefix_search = util.compile_prefix_regex(cls.prefixes).search \
-if cls.prefixes else None
-suffix_search = util.compile_suffix_regex(cls.suffixes).search \
-if cls.suffixes else None
-infix_finditer = util.compile_infix_regex(cls.infixes).finditer \
-if cls.infixes else None
+prefix_search = (util.compile_prefix_regex(cls.prefixes).search
+if cls.prefixes else None)
+suffix_search = (util.compile_suffix_regex(cls.suffixes).search
+if cls.suffixes else None)
+infix_finditer = (util.compile_infix_regex(cls.infixes).finditer
+if cls.infixes else None)
vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
return Tokenizer(vocab, rules=rules,
-prefix_search=prefix_search, suffix_search=suffix_search,
-infix_finditer=infix_finditer, token_match=token_match)
+prefix_search=prefix_search,
+suffix_search=suffix_search,
+infix_finditer=infix_finditer,
+token_match=token_match)
pipe_names = ['tensorizer', 'tagger', 'parser', 'ner']
token_match = TOKEN_MATCH
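
For reference, the helpers the refactored `create_tokenizer()` relies on are public, so the same construction can be reproduced outside the class. A minimal sketch, assuming a spaCy install of this era; `English` stands in for any `Language` subclass:

    from spacy.lang.en import English
    from spacy.tokenizer import Tokenizer
    from spacy import util

    nlp = English()
    defaults = nlp.Defaults
    # Compile each pattern list into a single regex and keep only the bound
    # method, exactly as create_tokenizer() does above.
    prefix_search = util.compile_prefix_regex(defaults.prefixes).search
    suffix_search = util.compile_suffix_regex(defaults.suffixes).search
    infix_finditer = util.compile_infix_regex(defaults.infixes).finditer
    tokenizer = Tokenizer(nlp.vocab, rules=defaults.tokenizer_exceptions,
                          prefix_search=prefix_search,
                          suffix_search=suffix_search,
                          infix_finditer=infix_finditer,
                          token_match=defaults.token_match)
    doc = tokenizer('Hello, world!')
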
@@ -98,7 +97,7 @@ class Language(object):
factories = {
'tokenizer': lambda nlp: nlp.Defaults.create_tokenizer(nlp),
-'tensorizer': lambda nlp, **cfg: TokenVectorEncoder(nlp.vocab, **cfg),
+'tensorizer': lambda nlp, **cfg: Tensorizer(nlp.vocab, **cfg),
'tagger': lambda nlp, **cfg: Tagger(nlp.vocab, **cfg),
'parser': lambda nlp, **cfg: DependencyParser(nlp.vocab, **cfg),
'ner': lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg),
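
This factories table is what components are built from when requested by name. A short usage sketch, assuming the v2-era `Language.create_pipe`, which looks names up in this dict:

    import spacy

    nlp = spacy.blank('en')             # empty pipeline, no components
    tagger = nlp.create_pipe('tagger')  # resolves to factories['tagger'](nlp)
    nlp.add_pipe(tagger)
    print(nlp.pipe_names)               # ['tagger']
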
@@ -218,14 +217,14 @@ class Language(object):
def add_pipe(self, component, name=None, before=None, after=None,
first=None, last=None):
"""Add a component to the processing pipeline. Valid components are
-callables that take a `Doc` object, modify it and return it. Only one of
-before, after, first or last can be set. Default behaviour is "last".
+callables that take a `Doc` object, modify it and return it. Only one
+of before/after/first/last can be set. Default behaviour is "last".
component (callable): The pipeline component.
name (unicode): Name of pipeline component. Overwrites existing
component.name attribute if available. If no name is set and
the component exposes no name attribute, component.__name__ is
-used. An error is raised if the name already exists in the pipeline.
+used. An error is raised if a name already exists in the pipeline.
before (unicode): Component name to insert component directly before.
after (unicode): Component name to insert component directly after.
first (bool): Insert component first / not first in the pipeline.
@@ -240,7 +239,8 @@ class Language(object):
name = component.name
elif hasattr(component, '__name__'):
name = component.__name__
-elif hasattr(component, '__class__') and hasattr(component.__class__, '__name__'):
+elif (hasattr(component, '__class__') and
+hasattr(component.__class__, '__name__')):
name = component.__class__.__name__
else:
name = repr(component)
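
A sketch of the fallback chain above: a plain function is registered under its `__name__`, while an instance that exposes neither `.name` nor `__name__` falls through to its class name (continuing the previous sketch's `nlp`):

    def print_length(doc):
        # Any callable that takes a Doc and returns it is a valid component.
        print(len(doc))
        return doc

    class LengthLogger(object):
        def __call__(self, doc):
            return doc

    nlp.add_pipe(print_length, first=True)   # registered as 'print_length'
    nlp.add_pipe(LengthLogger(), last=True)  # registered as 'LengthLogger'
    assert nlp.has_pipe('print_length') and nlp.has_pipe('LengthLogger')
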
@@ -269,7 +269,7 @@ class Language(object):
`name in nlp.pipe_names`.
name (unicode): Name of the component.
-RETURNS (bool): Whether a component of that name exists in the pipeline.
+RETURNS (bool): Whether a component of the name exists in the pipeline.
"""
return name in self.pipe_names
@@ -332,15 +332,12 @@ class Language(object):
return doc
def disable_pipes(self, *names):
-'''Disable one or more pipeline components.
-If used as a context manager, the pipeline will be restored to the initial
-state at the end of the block. Otherwise, a DisabledPipes object is
-returned, that has a `.restore()` method you can use to undo your
-changes.
+"""Disable one or more pipeline components. If used as a context
+manager, the pipeline will be restored to the initial state at the end
+of the block. Otherwise, a DisabledPipes object is returned, that has
+a `.restore()` method you can use to undo your changes.
EXAMPLE:
>>> nlp.add_pipe('parser')
>>> nlp.add_pipe('tagger')
>>> with nlp.disable_pipes('parser', 'tagger'):
@@ -351,7 +348,7 @@ class Language(object):
>>> assert not nlp.has_pipe('parser')
>>> disabled.restore()
>>> assert nlp.has_pipe('parser')
-'''
+"""
return DisabledPipes(self, *names)
def make_doc(self, text):
@@ -367,7 +364,7 @@ class Language(object):
RETURNS (dict): Results from the update.
EXAMPLE:
->>> with nlp.begin_training(gold, use_gpu=True) as (trainer, optimizer):
+>>> with nlp.begin_training(gold) as (trainer, optimizer):
>>> for epoch in trainer.epochs(gold):
>>> for docs, golds in epoch:
>>> state = nlp.update(docs, golds, sgd=optimizer)
@@ -382,8 +379,10 @@ class Language(object):
self._optimizer = Adam(Model.ops, 0.001)
sgd = self._optimizer
grads = {}
def get_grads(W, dW, key=None):
grads[key] = (W, dW)
pipes = list(self.pipeline)
random.shuffle(pipes)
for name, proc in pipes:
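
The `get_grads` closure has the same `(W, dW, key)` signature as the optimizer, so it can be passed wherever an `sgd` callable is expected; it records the weight/gradient pairs instead of applying them, so they can be applied once per parameter afterwards. A self-contained sketch of the pattern:

    import numpy

    grads = {}

    def get_grads(W, dW, key=None):
        # Optimizer-shaped callable that captures instead of updating.
        grads[key] = (W, dW)

    W = numpy.zeros((2, 2), dtype='f')
    dW = numpy.ones((2, 2), dtype='f')
    get_grads(W, dW, key='layer1')
    assert grads['layer1'][0] is W   # recorded, weights untouched
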
@@ -513,16 +512,16 @@ class Language(object):
def pipe(self, texts, as_tuples=False, n_threads=2, batch_size=1000,
disable=[]):
"""Process texts as a stream, and yield `Doc` objects in order. Supports
GIL-free multi-threading.
"""Process texts as a stream, and yield `Doc` objects in order.
Supports GIL-free multi-threading.
texts (iterator): A sequence of texts to process.
as_tuples (bool):
If set to True, inputs should be a sequence of
(text, context) tuples. Output will then be a sequence of
(doc, context) tuples. Defaults to False.
-n_threads (int): The number of worker threads to use. If -1, OpenMP will
-decide how many to use at run time. Default is 2.
+n_threads (int): The number of worker threads to use. If -1, OpenMP
+will decide how many to use at run time. Default is 2.
batch_size (int): The number of texts to buffer.
disable (list): Names of the pipeline components to disable.
YIELDS (Doc): Documents in the order of the original text.
@@ -546,7 +545,8 @@ class Language(object):
if name in disable:
continue
if hasattr(proc, 'pipe'):
-docs = proc.pipe(docs, n_threads=n_threads, batch_size=batch_size)
+docs = proc.pipe(docs, n_threads=n_threads,
+batch_size=batch_size)
else:
# Apply the function, but yield the doc
docs = _pipe(proc, docs)
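
A usage sketch of the `as_tuples` contract documented above, assuming a loaded pipeline `nlp`; the texts and ids are made up:

    data = [('A first text.', {'id': 1}),
            ('A second text.', {'id': 2})]
    for doc, context in nlp.pipe(data, as_tuples=True, batch_size=50):
        print(context['id'], len(doc))
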
@@ -583,7 +583,7 @@ class Language(object):
will include the model.
path (unicode or Path): A path to a directory, which will be created if
-it doesn't exist. Paths may be either strings or `Path`-like objects.
+it doesn't exist. Paths may be strings or `Path`-like objects.
disable (list): Names of pipeline components to disable and prevent
from being saved.
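
A round-trip sketch, assuming a loaded pipeline `nlp` and the v2 `spacy.load`, which accepts a directory path; `/tmp/my_model` is a placeholder:

    import spacy

    nlp.to_disk('/tmp/my_model', disable=['ner'])  # save everything except NER
    nlp2 = spacy.load('/tmp/my_model')             # reload the saved pipeline
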
@@ -682,7 +682,7 @@ class Language(object):
class DisabledPipes(list):
-'''Manager for temporary pipeline disabling.'''
+"""Manager for temporary pipeline disabling."""
def __init__(self, nlp, *names):
self.nlp = nlp
self.names = names
@@ -702,7 +702,8 @@ class DisabledPipes(list):
def restore(self):
'''Restore the pipeline to its state when DisabledPipes was created.'''
current, self.nlp.pipeline = self.nlp.pipeline, self.original_pipeline
-unexpected = [name for name, pipe in current if not self.nlp.has_pipe(name)]
+unexpected = [name for name, pipe in current
+if not self.nlp.has_pipe(name)]
if unexpected:
# Don't change the pipeline if we're raising an error.
self.nlp.pipeline = current

View File: spacy/lemmatizer.py

@@ -43,16 +43,15 @@ class Lemmatizer(object):
morphology = {} if morphology is None else morphology
others = [key for key in morphology
if key not in (POS, 'Number', 'POS', 'VerbForm', 'Tense')]
-true_morph_key = morphology.get('morph', 0)
if univ_pos == 'noun' and morphology.get('Number') == 'sing':
return True
elif univ_pos == 'verb' and morphology.get('VerbForm') == 'inf':
return True
# This maps 'VBP' to base form -- probably just need 'IS_BASE'
# morphology
-elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and \
-morphology.get('Tense') == 'pres' and \
-morphology.get('Number') is None and \
+elif univ_pos == 'verb' and (morphology.get('VerbForm') == 'fin' and
+morphology.get('Tense') == 'pres' and
+morphology.get('Number') is None and
not others):
return True
elif univ_pos == 'adj' and morphology.get('Degree') == 'pos':
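
These checks come from the lemmatizer's base-form test (named `is_base_form` in the released source; that name is taken from the file, not from this hunk). A toy sketch with empty tables, which this code path never touches:

    from spacy.lemmatizer import Lemmatizer

    lemmatizer = Lemmatizer(index={}, exceptions={}, rules={})
    print(lemmatizer.is_base_form('noun', {'Number': 'sing'}))   # True
    # The VBP-style case: finite present verb with no Number feature.
    print(lemmatizer.is_base_form('verb', {'VerbForm': 'fin',
                                           'Tense': 'pres'}))    # True
    print(lemmatizer.is_base_form('verb', {'Tense': 'past'}))    # False
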
@@ -89,9 +88,6 @@ class Lemmatizer(object):
def lemmatize(string, index, exceptions, rules):
string = string.lower()
forms = []
-# TODO: Is this correct? See discussion in Issue #435.
-#if string in index:
-# forms.append(string)
forms.extend(exceptions.get(string, []))
oov_forms = []
if not forms:
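
A toy call to `lemmatize()` with stand-in tables; it assumes the rest of the function (the suffix-rule loop below this hunk) matches the released v2 source:

    from spacy.lemmatizer import lemmatize

    index = set(['pony'])                  # known base forms
    exceptions = {'ponies': ['pony']}      # irregular forms, checked first
    rules = [['ies', 'y'], ['s', '']]      # suffix-rewrite rules
    print(lemmatize('Ponies', index, exceptions, rules))   # set(['pony'])
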

View File: spacy/scorer.py

@@ -74,8 +74,11 @@ class Scorer(object):
@property
def scores(self):
return {
-'uas': self.uas, 'las': self.las,
-'ents_p': self.ents_p, 'ents_r': self.ents_r, 'ents_f': self.ents_f,
+'uas': self.uas,
+'las': self.las,
+'ents_p': self.ents_p,
+'ents_r': self.ents_r,
+'ents_f': self.ents_f,
'tags_acc': self.tags_acc,
'token_acc': self.token_acc
}
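
The property simply snapshots the running totals; a sketch of the surrounding evaluation loop, assuming the v2 `Scorer.score(doc, gold)` method that feeds them:

    from spacy.scorer import Scorer

    scorer = Scorer()
    for doc, gold in zip(docs, golds):   # docs/golds: your evaluation data
        scorer.score(doc, gold)
    print(scorer.scores['uas'], scorer.scores['ents_f'])
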
@@ -85,7 +88,8 @@ class Scorer(object):
gold_deps = set()
gold_tags = set()
-gold_ents = set(tags_to_entities([annot[-1] for annot in gold.orig_annot]))
+gold_ents = set(tags_to_entities([annot[-1]
+for annot in gold.orig_annot]))
for id_, word, tag, head, dep, ner in gold.orig_annot:
gold_tags.add((id_, tag))
if dep not in (None, "") and dep.lower() not in punct_labels: