* Begin refactor

Matthew Honnibal 2015-07-07 14:00:07 +02:00
parent 52fd80c6c6
commit 6788c86b2f
3 changed files with 116 additions and 54 deletions

View File

@@ -1,9 +1,42 @@
-===
-API
-===
+=====
+Usage
+=====
+
+Overview
+--------
+
+spaCy is a suite of natural language processing tools, arranged into a
+pipeline. It is substantially more opinionated than most similar libraries,
+which often give users a choice of several models that compute the same
+annotation. spaCy's philosophy is to have just one: the best one. Our
+perspective is that the redundant options are really only useful to
+researchers, who need to replicate some prior work exactly.
+
+Being opinionated allows us to keep the library small, fast and up to date.
+It also makes the API much simpler. Normal usage proceeds in three steps:
+
+1. Loading resources;
+2. Processing text;
+3. Accessing annotations.
+
+This document is divided into three parts, to match these stages. We focus
+here on the library's API. See also: Installation, Annotation Standards,
+Algorithmic Details, and Benchmarks.
+
+Loading Resources
+-----------------
+
+99\% of the time, you will load spaCy's resources using a language pipeline
+class, e.g. ``spacy.en.English``. The pipeline class reads its data from disk,
+from a specified directory. By default, spaCy installs data into each
+language's package directory, and loads it from there.
 
 .. autoclass:: spacy.en.English
+    :members:
+
+The class ``spacy.en.English`` is the main entry point for the English
+pipeline (other languages to come).
 
 +------------+----------------------------------------+-------------+--------------------------+
 | Attribute  | Type                                   | Attr API    | Notes                    |
@@ -24,7 +57,8 @@ API
 +------------+----------------------------------------+-------------+--------------------------+
 
-.. automethod:: spacy.en.English.__call__
+.. autoclass:: spacy.en.English
+    :members:
 
 .. autoclass:: spacy.tokens.Tokens
@@ -249,7 +283,7 @@ API
 .. py:method:: load_morph_exceptions(self, exc: Dict[unicode, Dict])
 
-.. py:class:: syntax.parser.GreedyParser(self, model_dir: unicode)
+.. py:class:: syntax.parser.Parser(self, model_dir: unicode)
 
 .. py:method:: __call__(self, tokens: spacy.tokens.Tokens) --> None
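
A minimal sketch of the three-step flow described above, assuming data is
installed in the default location; iterating over the returned Tokens object
and the `orth_` attribute name are assumptions, not taken from this diff:

    # Hypothetical usage sketch (not part of this commit)
    from spacy.en import English

    nlp = English()                                    # 1. Load resources
    tokens = nlp(u'Some text to process.', tag=True)   # 2. Process text
    for token in tokens:                               # 3. Access annotations
        print(token.orth_)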

View File

@@ -44,32 +44,82 @@ parse_if_model_present = -1
 class English(object):
     """The English NLP pipeline.
 
-    Provides a tokenizer, lexicon, part-of-speech tagger and parser.
+    Example:
+        Load data from default directory:
+
+        >>> nlp = English()
+        >>> nlp = English(data_dir=u'')
+
+        Load data from specified directory:
+
+        >>> nlp = English(data_dir=u'path/to/data_directory')
+
+        Disable (and avoid loading) parts of the processing pipeline:
+
+        >>> nlp = English(Vectors=False, Parser=False, Tagger=False, Entity=False)
+
+        Start with nothing loaded:
+
+        >>> nlp = English(data_dir=None)
 
     Keyword args:
         data_dir (unicode):
-            A path to a directory, from which to load the pipeline.
-
-            By default, data is installed within the spaCy package directory. So
-            if no data_dir is specified, spaCy attempts to load from a
-            directory named "data" that is a sibling of the spacy/en/__init__.py
-            file. You can find the location of this file by running:
-
-                $ python -c "import spacy.en; print spacy.en.__file__"
-
-            To prevent any data files from being loaded, pass data_dir=None. This
-            is useful if you want to construct a lexicon, which you'll then save
-            for later loading.
+            A path to a directory from which to load the pipeline;
+            or '', to load the default; or None, to load nothing.
+
+        Tokenizer (bool or callable):
+            desc
+
+        Vectors (bool or callable):
+            desc
+
+        Parser (bool or callable):
+            desc
+
+        Tagger (bool or callable):
+            desc
+
+        Entity (bool or callable):
+            desc
+
+        Senser (bool or callable):
+            desc
     """
     ParserTransitionSystem = ArcEager
     EntityTransitionSystem = BiluoPushDown
 
-    def __init__(self, data_dir='', load_vectors=True):
+    def __init__(self, data_dir='', Tokenizer=True, Vectors=True, Parser=True,
+                 Tagger=True, Entity=True, Senser=True, load_vectors=True):
         if data_dir == '':
             data_dir = LOCAL_DATA_DIR
-        self._data_dir = data_dir
+        # TODO: Deprecation warning
+        if load_vectors is False:
+            Vectors = False
         self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab') if data_dir else None,
-                           get_lex_props=get_lex_props, load_vectors=load_vectors)
+                           get_lex_props=get_lex_props, vectors=Vectors)
+        if Tokenizer is True:
+            Tokenizer = tokenizer.Tokenizer
+        if Tagger is True:
+            Tagger = pos.EnPosTagger
+        if Parser is True:
+            transition_system = self.ParserTransitionSystem
+            # Bind via default argument so each factory keeps its own transition system
+            Parser = lambda s, d, ts=transition_system: parser.Parser(s, d, ts)
+        if Entity is True:
+            transition_system = self.EntityTransitionSystem
+            Entity = lambda s, d, ts=transition_system: parser.Parser(s, d, ts)
+        if Senser is True:
+            Senser = wsd.SuperSenseTagger
+        self.tokenizer = Tokenizer(self.vocab, data_dir) if Tokenizer else None
+        self.tagger = Tagger(self.vocab.strings, data_dir) if Tagger else None
+        self.parser = Parser(self.vocab.strings, data_dir) if Parser else None
+        self.entity = Entity(self.vocab.strings, data_dir) if Entity else None
+        self.senser = Senser(self.vocab.strings, data_dir) if Senser else None
+        self._data_dir = data_dir
 
         tag_names = list(POS_TAGS.keys())
         tag_names.sort()
         if data_dir is None:
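
The constructor in the hunk above treats each pipeline component as a
boolean-or-callable keyword: True loads the default, False or None skips it,
and a callable is used as a factory. A short sketch of what that enables; the
MyTagger class is hypothetical:

    from spacy.en import English

    # Skip loading the components you don't need:
    nlp = English(Parser=False, Entity=False, Senser=False)

    # Or supply your own factory. Per the code above, the Tokenizer factory is
    # called with (vocab, data_dir); the other components with (vocab.strings, data_dir).
    def my_tagger_factory(strings, data_dir):
        return MyTagger(strings, data_dir)  # hypothetical component

    nlp = English(Tagger=my_tagger_factory)
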
@@ -77,53 +127,22 @@ class English(object):
             prefix_re = None
             suffix_re = None
             infix_re = None
-            self.has_parser_model = False
-            self.has_tagger_model = False
-            self.has_entity_model = False
         else:
             tok_data_dir = path.join(data_dir, 'tokenizer')
             tok_rules, prefix_re, suffix_re, infix_re = read_lang_data(tok_data_dir)
             prefix_re = re.compile(prefix_re)
             suffix_re = re.compile(suffix_re)
             infix_re = re.compile(infix_re)
-            self.has_parser_model = path.exists(path.join(self._data_dir, 'deps'))
-            self.has_tagger_model = path.exists(path.join(self._data_dir, 'pos'))
-            self.has_entity_model = path.exists(path.join(self._data_dir, 'ner'))
 
         self.tokenizer = Tokenizer(self.vocab, tok_rules, prefix_re,
                                    suffix_re, infix_re,
                                    POS_TAGS, tag_names)
 
         self.mwe_merger = RegexMerger([
             ('IN', 'O', regexes.MW_PREPOSITIONS_RE),
             ('CD', 'TIME', regexes.TIME_RE),
             ('NNP', 'DATE', regexes.DAYS_RE),
             ('CD', 'MONEY', regexes.MONEY_RE)])
 
-        # These are lazy-loaded
-        self._tagger = None
-        self._parser = None
-        self._entity = None
-
-    @property
-    def tagger(self):
-        if self._tagger is None:
-            self._tagger = EnPosTagger(self.vocab.strings, self._data_dir)
-        return self._tagger
-
-    @property
-    def parser(self):
-        if self._parser is None:
-            self._parser = Parser(self.vocab.strings,
-                                  path.join(self._data_dir, 'deps'),
-                                  self.ParserTransitionSystem)
-        return self._parser
-
-    @property
-    def entity(self):
-        if self._entity is None:
-            self._entity = Parser(self.vocab.strings,
-                                  path.join(self._data_dir, 'ner'),
-                                  self.EntityTransitionSystem)
-        return self._entity
-
     def __call__(self, text, tag=True, parse=parse_if_model_present,
                  entity=parse_if_model_present, merge_mwes=False):
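
The __call__ signature kept as context at the end of this hunk is the
processing entry point; tag, parse, entity and merge_mwes control which
annotations are computed. A hedged sketch, with illustrative sentences only:

    from spacy.en import English

    nlp = English()
    tokens = nlp(u'The default flags tag, and parse if a model is present.')
    tokens = nlp(u'Tag only.', parse=False, entity=False)
    tokens = nlp(u'Also merge multi-word expressions.', merge_mwes=True)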

View File

@@ -20,8 +20,7 @@ from .tokens import Tokens


 cdef class Tokenizer:
-    def __init__(self, Vocab vocab, rules, prefix_re, suffix_re, infix_re,
-                 pos_tags, tag_names):
+    def __init__(self, Vocab vocab, rules, prefix_re, suffix_re, infix_re, pos_tags):
         self.mem = Pool()
         self._cache = PreshMap()
         self._specials = PreshMap()
@@ -29,7 +28,17 @@ cdef class Tokenizer:
         self._suffix_re = suffix_re
         self._infix_re = infix_re
         self.vocab = vocab
-        self._load_special_tokenization(rules, pos_tags, tag_names)
+        self._load_special_tokenization(rules, pos_tags)
+
+    @classmethod
+    def from_dir(cls, Vocab vocab, directory, pos_tags):
+        data_dir = path.join(directory, 'tokenizer')
+        rules, prefix_re, suffix_re, infix_re = read_lang_data(data_dir)
+        prefix_re = re.compile(prefix_re)
+        suffix_re = re.compile(suffix_re)
+        infix_re = re.compile(infix_re)
+        return cls(vocab, rules, prefix_re, suffix_re, infix_re,
+                   pos_tags)
 
     cpdef Tokens tokens_from_list(self, list strings):
         cdef int length = sum([len(s) for s in strings])
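
The new from_dir constructor bundles the read_lang_data / re.compile setup that
English.__init__ currently performs inline. A hedged usage sketch, assuming a
Vocab instance and the POS_TAGS tag map are already available (for example via
an English pipeline), and that the import paths and data directory are as
guessed here:

    from spacy.tokenizer import Tokenizer       # import path assumed
    from spacy.en import English, POS_TAGS      # POS_TAGS location assumed

    nlp = English(Parser=False, Tagger=False, Entity=False, Senser=False)
    tokenizer = Tokenizer.from_dir(nlp.vocab, u'path/to/data', POS_TAGS)
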
@@ -224,7 +233,7 @@ cdef class Tokenizer:
         match = self._suffix_re.search(string)
         return (match.end() - match.start()) if match is not None else 0
 
-    def _load_special_tokenization(self, object rules, object tag_map, object tag_names):
+    def _load_special_tokenization(self, object rules, object tag_map):
         '''Add a special-case tokenization rule.
         '''
         cdef int i
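
_load_special_tokenization installs the special-case rules handed to the
constructor, while tokens_from_list (shown earlier in this file) accepts text
that is already split into strings. A small sketch, reusing the tokenizer built
in the previous example:

    # Bypass the prefix/suffix/infix handling for pre-split input
    tokens = tokenizer.tokens_from_list([u'Hello', u',', u'world', u'!'])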