* Begin refactor

This commit is contained in:
Matthew Honnibal 2015-07-07 14:00:07 +02:00
parent 52fd80c6c6
commit 6788c86b2f
3 changed files with 116 additions and 54 deletions

View File

@@ -1,9 +1,42 @@
===
API
===
=====
Usage
=====
Overview
--------
spaCy is a suite of natural language processing tools, arranged into
a pipeline. It is substantially more opinionated than most similar libraries,
which often give users the choice of multiple models that compute the same annotation.
spaCy's philosophy is to have just one --- the best one. In our view, redundant
options are really only useful to researchers, who need to replicate some prior
work exactly.
Being opinionated allows us to keep the library small, fast, and up-to-date. It
also makes the API much simpler. Normal usage proceeds in three steps (sketched below):
1. Loading resources;
2. Processing text;
3. Accessing annotations.
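A minimal sketch of these three steps (illustrative only; it assumes the default
English data is installed, and that each token exposes an `orth_` attribute
holding its string):

>>> from spacy.en import English
>>> nlp = English()                            # 1. Load resources
>>> tokens = nlp(u'Hello, beautiful world!')   # 2. Process text
>>> tokens[0].orth_                            # 3. Access annotations
u'Hello'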
This document is divided into three parts, to match these stages. We focus here
on the library's API. See also: Installation, Annotation Standards, Algorithmic Details,
and Benchmarks.
Loading Resources
-----------------
99% of the time, you will load spaCy's resources using a language pipeline class,
e.g. `spacy.en.English`. The pipeline class reads its data from a specified
directory on disk. By default, spaCy installs data into each language's
package directory, and loads it from there.
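If you want to check where that default data lives, the layout described in the
`English` docstring below implies a path like the following (a sketch, assuming
the standard install layout):

>>> from os import path
>>> import spacy.en
>>> print(path.join(path.dirname(spacy.en.__file__), 'data'))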
.. autoclass:: spacy.en.English
:members:
The class `spacy.en.English` is the main entry-point for the English pipeline
(other languages to come).
+------------+----------------------------------------+-------------+--------------------------+
| Attribute  | Type                                   | Attr API    | Notes                    |
@@ -24,7 +57,8 @@ API
+------------+----------------------------------------+-------------+--------------------------+
.. automethod:: spacy.en.English.__call__
.. autoclass:: spacy.en.English
:members:
.. autoclass:: spacy.tokens.Tokens
@@ -249,7 +283,7 @@ API
.. py:method:: load_morph_exceptions(self, exc: Dict[unicode, Dict])
.. py:class:: syntax.parser.GreedyParser(self, model_dir: unicode)
.. py:class:: syntax.parser.Parser(self, model_dir: unicode)
.. py:method:: __call__(self, tokens: spacy.tokens.Tokens) --> None
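The pipeline components can also be invoked one at a time. A hedged sketch (it
assumes `nlp = English()` as above, and that each component annotates the
`Tokens` object in place, which is what the `--> None` signature indicates):

>>> tokens = nlp.tokenizer(u'An example sentence.')
>>> nlp.tagger(tokens)
>>> nlp.parser(tokens)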

View File

@@ -44,32 +44,82 @@ parse_if_model_present = -1
class English(object):
"""The English NLP pipeline.
Provides a tokenizer, lexicon, part-of-speech tagger and parser.
Example:
Load data from default directory:
>>> nlp = English()
>>> nlp = English(data_dir=u'')
Load data from specified directory:
>>> nlp = English(data_dir=u'path/to/data_directory')
Disable (and avoid loading) parts of the processing pipeline:
>>> nlp = English(vectors=False, parser=False, tagger=False, entity=False)
Start with nothing loaded:
>>> nlp = English(data_dir=None)
Keyword args:
data_dir (unicode):
A path to a directory, from which to load the pipeline.
A path to a directory from which to load the pipeline;
or '', to load default; or None, to load nothing.
By default, data is installed within the spaCy package directory. So
if no data_dir is specified, spaCy attempts to load from a
directory named "data" that is a sibling of the spacy/en/__init__.py
file. You can find the location of this file by running:
Tokenizer (bool or callable):
True to load the default tokenizer; False to load none; or a callable
that takes (vocab, data_dir) and returns a tokenizer.
$ python -c "import spacy.en; print spacy.en.__file__"
Vectors (bool or callable):
True to load the packaged word vectors; False to skip loading them. The
value is passed through to the Vocab as its vectors argument.
To prevent any data files from being loaded, pass data_dir=None. This
is useful if you want to construct a lexicon, which you'll then save
for later loading.
Parser (bool or callable):
True to load the default dependency parser; False to load none; or a
callable that takes (strings, data_dir) and returns a parser.
Tagger (bool or callable):
True to load the default part-of-speech tagger; False to load none; or a
callable that takes (strings, data_dir) and returns a tagger.
Entity (bool or callable):
True to load the default named-entity recognizer; False to load none; or a
callable that takes (strings, data_dir) and returns an entity recognizer.
Senser (bool or callable):
True to load the default supersense tagger; False to load none; or a
callable that takes (strings, data_dir) and returns a supersense tagger.
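For example (an illustrative sketch only; MyTagger is a stand-in for any
user-supplied class or factory that accepts (strings, data_dir), the same
arguments the defaults receive):

>>> nlp = English(Parser=False, Senser=False)
>>> nlp = English(Tagger=MyTagger)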
"""
ParserTransitionSystem = ArcEager
EntityTransitionSystem = BiluoPushDown
def __init__(self, data_dir='', load_vectors=True):
def __init__(self, data_dir='', Tokenizer=True, Vectors=True, Parser=True,
Tagger=True, Entity=True, Senser=True, load_vectors=True):
if data_dir == '':
data_dir = LOCAL_DATA_DIR
self._data_dir = data_dir
# TODO: Deprecation warning
if load_vectors is False:
Vectors = False
self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab') if data_dir else None,
get_lex_props=get_lex_props, load_vectors=load_vectors)
get_lex_props=get_lex_props, vectors=Vectors)
if Tokenizer is True:
Tokenizer = tokenizer.Tokenizer
if Tagger is True:
Tagger = pos.EnPosTagger
if Parser is True:
transition_system = self.ParserTransitionSystem
Parser = lambda s, d: parser.Parser(s, d, transition_system)
if Entity is True:
transition_system = self.EntityTransitionSystem
Entity = lambda s, d: parser.Parser(s, d, transition_system)
if Senser is True:
Senser = wsd.SuperSenseTagger
self.tokenizer = Tokenizer(self.vocab, data_dir) if Tokenizer else None
self.tagger = Tagger(self.vocab.strings, data_dir) if Tagger else None
self.parser = Parser(self.vocab.strings, data_dir) if Parser else None
self.entity = Entity(self.vocab.strings, data_dir) if Entity else None
self.senser = Senser(self.vocab.strings, data_dir) if Senser else None
self._data_dir = data_dir
tag_names = list(POS_TAGS.keys())
tag_names.sort()
if data_dir is None:
@@ -77,53 +127,22 @@ class English(object):
prefix_re = None
suffix_re = None
infix_re = None
self.has_parser_model = False
self.has_tagger_model = False
self.has_entity_model = False
else:
tok_data_dir = path.join(data_dir, 'tokenizer')
tok_rules, prefix_re, suffix_re, infix_re = read_lang_data(tok_data_dir)
prefix_re = re.compile(prefix_re)
suffix_re = re.compile(suffix_re)
infix_re = re.compile(infix_re)
self.has_parser_model = path.exists(path.join(self._data_dir, 'deps'))
self.has_tagger_model = path.exists(path.join(self._data_dir, 'pos'))
self.has_entity_model = path.exists(path.join(self._data_dir, 'ner'))
self.tokenizer = Tokenizer(self.vocab, tok_rules, prefix_re,
suffix_re, infix_re,
POS_TAGS, tag_names)
self.mwe_merger = RegexMerger([
('IN', 'O', regexes.MW_PREPOSITIONS_RE),
('CD', 'TIME', regexes.TIME_RE),
('NNP', 'DATE', regexes.DAYS_RE),
('CD', 'MONEY', regexes.MONEY_RE)])
# These are lazy-loaded
self._tagger = None
self._parser = None
self._entity = None
@property
def tagger(self):
if self._tagger is None:
self._tagger = EnPosTagger(self.vocab.strings, self._data_dir)
return self._tagger
@property
def parser(self):
if self._parser is None:
self._parser = Parser(self.vocab.strings,
path.join(self._data_dir, 'deps'),
self.ParserTransitionSystem)
return self._parser
@property
def entity(self):
if self._entity is None:
self._entity = Parser(self.vocab.strings,
path.join(self._data_dir, 'ner'),
self.EntityTransitionSystem)
return self._entity
def __call__(self, text, tag=True, parse=parse_if_model_present,
entity=parse_if_model_present, merge_mwes=False):
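# A hypothetical usage sketch for the sentinel default above (assumption: the
# -1 value of parse_if_model_present means "parse only if the model files are
# present", which is how the has_*_model flags read):
#
#     tokens = nlp(u'Some text.')                # parse/entity only if models exist
#     tokens = nlp(u'Some text.', parse=True)    # require the parser
#     tokens = nlp(u'Some text.', parse=False)   # skip parsing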

View File

@@ -20,8 +20,7 @@ from .tokens import Tokens
cdef class Tokenizer:
def __init__(self, Vocab vocab, rules, prefix_re, suffix_re, infix_re,
pos_tags, tag_names):
def __init__(self, Vocab vocab, rules, prefix_re, suffix_re, infix_re, pos_tags):
self.mem = Pool()
self._cache = PreshMap()
self._specials = PreshMap()
@@ -29,7 +28,17 @@ cdef class Tokenizer:
self._suffix_re = suffix_re
self._infix_re = infix_re
self.vocab = vocab
self._load_special_tokenization(rules, pos_tags, tag_names)
self._load_special_tokenization(rules, pos_tags)
@classmethod
def from_dir(cls, Vocab vocab, directory, pos_tags=None):
# NB: relies on path, re and read_lang_data being imported at module level
# (their imports fall outside this hunk).
tok_data_dir = path.join(directory, 'tokenizer')
rules, prefix_re, suffix_re, infix_re = read_lang_data(tok_data_dir)
prefix_re = re.compile(prefix_re)
suffix_re = re.compile(suffix_re)
infix_re = re.compile(infix_re)
return cls(vocab, rules, prefix_re, suffix_re, infix_re,
pos_tags)
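# Hypothetical usage sketch (not part of this diff; POS_TAGS stands for whatever
# tag map the caller already has, e.g. the one spacy.en passes in):
#
#     tokenizer = Tokenizer.from_dir(vocab, data_dir, POS_TAGS)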
cpdef Tokens tokens_from_list(self, list strings):
cdef int length = sum([len(s) for s in strings])
@@ -224,7 +233,7 @@ cdef class Tokenizer:
match = self._suffix_re.search(string)
return (match.end() - match.start()) if match is not None else 0
def _load_special_tokenization(self, object rules, object tag_map, object tag_names):
def _load_special_tokenization(self, object rules, object tag_map):
'''Add a special-case tokenization rule.
'''
cdef int i