mirror of https://github.com/explosion/spaCy.git (synced 2025-11-04 01:48:04 +03:00)

commit 6788c86b2f (parent 52fd80c6c6)

    Begin refactor

In the documentation index (reStructuredText; file paths are not shown in this view):

@@ -1,9 +1,42 @@
-===
-API
-===
+=====
+Usage
+=====
+
+Overview
+--------
+
+spaCy is a suite of natural language processing tools, arranged into
+a pipeline.  It is substantially more opinionated than most similar libraries,
+which often give users the choice of multiple models that compute the same annotation.
+spaCy's philosophy is to just have one --- the best one.  Our perspective is that
+the redundant options are really only useful to researchers, who need to replicate
+some prior work exactly.
+
+Being opinionated allows us to keep the library small, fast, and up-to-date.  It
+also makes the API much simpler.  Normal usage proceeds in three steps:
+
+1. Loading resources;
+2. Processing text;
+3. Accessing annotations.
+
+This document is divided into three parts, to match these stages.  We focus here
+on the library's API. See also: Installation, Annotation Standards, Algorithmic Details,
+and Benchmarks.
+
+Loading Resources
+-----------------
+
+99\% of the time, you will load spaCy's resources using a language pipeline class,
+e.g. `spacy.en.English`. The pipeline class reads the data from disk, from a
+specified directory.  By default, spaCy installs data into each language's
+package directory, and loads it from there.
 
 .. autoclass:: spacy.en.English
   :members:
 
+
+The class `spacy.en.English` is the main entry-point for the English pipeline
+(other languages to come).
+
   +------------+----------------------------------------+-------------+--------------------------+
   | Attribute  | Type                                   | Attr API    | Notes                    |
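The three-step flow the new Usage page describes can be sketched as a short script. This is a minimal example against the pre-1.0 `spacy.en` API shown in this diff (modern spaCy loads pipelines via `spacy.load` instead), and it assumes the English data package is installed:

    # Minimal sketch of the three steps above, against the old spacy.en API.
    from spacy.en import English

    nlp = English()                          # 1. Loading resources
    tokens = nlp(u'Hello, world. Two steps remain.')  # 2. Processing text
    for token in tokens:                     # 3. Accessing annotations
        print(token)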
@@ -24,7 +57,8 @@ API
   +------------+----------------------------------------+-------------+--------------------------+
 
 
-  .. automethod:: spacy.en.English.__call__
+.. autoclass:: spacy.en.English
+  :members:
 
 
 .. autoclass:: spacy.tokens.Tokens
@@ -249,7 +283,7 @@ API
 
   .. py:method:: load_morph_exceptions(self, exc: Dict[unicode, Dict])
 
-.. py:class:: syntax.parser.GreedyParser(self, model_dir: unicode)
+.. py:class:: syntax.parser.Parser(self, model_dir: unicode)
 
   .. py:method:: __call__(self, tokens: spacy.tokens.Tokens) --> None
 
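The documented signature `__call__(self, tokens) --> None` reflects a design where the parser annotates the `Tokens` object in place rather than returning a new value. A generic sketch of that pattern (the `head` attribute and class names here are hypothetical, not spaCy's internals):

    # Generic annotate-in-place pattern implied by "__call__(tokens) --> None".
    class InPlaceParser(object):
        def __call__(self, tokens):
            for i, token in enumerate(tokens):
                token.head = i   # write annotations onto the shared object
            return None          # nothing returned; callers keep their Tokens

    class Tok(object):
        def __init__(self):
            self.head = None

    toks = [Tok(), Tok()]
    InPlaceParser()(toks)
    assert toks[0].head == 0 and toks[1].head == 1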
In spacy/en/__init__.py (path inferred from context; this view omits file names):

@@ -44,32 +44,82 @@ parse_if_model_present = -1
 class English(object):
     """The English NLP pipeline.
 
-    Provides a tokenizer, lexicon, part-of-speech tagger and parser.
+    Example:
+
+        Load data from default directory:
+
+            >>> nlp = English()
+            >>> nlp = English(data_dir=u'')
+
+        Load data from specified directory:
+
+            >>> nlp = English(data_dir=u'path/to/data_directory')
+
+        Disable (and avoid loading) parts of the processing pipeline:
+
+            >>> nlp = English(vectors=False, parser=False, tagger=False, entity=False)
+
+        Start with nothing loaded:
+
+            >>> nlp = English(data_dir=None)
 
     Keyword args:
         data_dir (unicode):
-            A path to a directory, from which to load the pipeline.
-
-            By default, data is installed within the spaCy package directory. So
-            if no data_dir is specified, spaCy attempts to load from a
-            directory named "data" that is a sibling of the spacy/en/__init__.py
-            file.  You can find the location of this file by running:
-
-                $ python -c "import spacy.en; print spacy.en.__file__"
-
-            To prevent any data files from being loaded, pass data_dir=None. This
-            is useful if you want to construct a lexicon, which you'll then save
-            for later loading.
+            A path to a directory from which to load the pipeline;
+            or '', to load default; or None, to load nothing.
+
+        Tokenizer (bool or callable):
+            desc
+
+        Vectors (bool or callable):
+            desc
+
+        Parser (bool or callable):
+            desc
+
+        Tagger (bool or callable):
+            desc
+
+        Entity (bool or callable):
+            desc
+
+        Senser (bool or callable):
+            desc
     """
     ParserTransitionSystem = ArcEager
     EntityTransitionSystem = BiluoPushDown
 
-    def __init__(self, data_dir='', load_vectors=True):
+    def __init__(self, data_dir='', Tokenizer=True, Vectors=True, Parser=True,
+                 Tagger=True, Entity=True, Senser=True, load_vectors=True):
         if data_dir == '':
             data_dir = LOCAL_DATA_DIR
-        self._data_dir = data_dir
+        # TODO: Deprecation warning
+        if load_vectors is False:
+            vectors = False
+
         self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab') if data_dir else None,
-                           get_lex_props=get_lex_props, load_vectors=load_vectors)
+                           get_lex_props=get_lex_props, vectors=Vectors)
+
+        if Tokenizer is True:
+            Tokenizer = tokenizer.Tokenizer
+        if Tagger is True:
+            Tagger = pos.EnPosTagger
+        if Parser is True:
+            transition_system = self.ParserTransitionSystem
+            Parser = lambda s, d: parser.Parser(s, d, transition_system)
+        if Entity is True:
+            transition_system = self.EntityTransitionSystem
+            Entity = lambda s, d: parser.Parser(s, d, transition_system)
+        if Senser is True:
+            Senser = wsd.SuperSenseTagger
+
+        self.tokenizer = Tokenizer(self.vocab, data_dir) if Tokenizer else None
+        self.tagger = Tagger(self.vocab.strings, data_dir) if Tagger else None
+        self.parser = Parser(self.vocab.strings, data_dir) if Parser else None
+        self.entity = Entity(self.vocab.strings, data_dir) if Entity else None
+        self.senser = Senser(self.vocab.strings, data_dir) if Senser else None
+
+        self._data_dir = data_dir
         tag_names = list(POS_TAGS.keys())
         tag_names.sort()
         if data_dir is None:
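Each pipeline argument in the refactored `__init__` follows one convention: `True` selects the default implementation, a callable acts as a user-supplied factory, and `False`/`None` disables the component, leaving the matching attribute as `None`. A self-contained sketch of that convention (toy names, not spaCy code):

    # Toy illustration of the True / callable / False convention used above.
    class DefaultTagger(object):
        def __init__(self, strings, data_dir):
            self.strings = strings
            self.data_dir = data_dir

    def build(Component, default, strings, data_dir):
        if Component is True:        # True -> use the default class
            Component = default
        # callable -> user factory; False/None -> component disabled
        return Component(strings, data_dir) if Component else None

    assert isinstance(build(True, DefaultTagger, [], 'data'), DefaultTagger)
    assert build(lambda s, d: (s, d), DefaultTagger, [], 'data') == ([], 'data')
    assert build(False, DefaultTagger, [], 'data') is None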
@@ -77,53 +127,22 @@ class English(object):
             prefix_re = None
             suffix_re = None
             infix_re = None
             self.has_parser_model = False
             self.has_tagger_model = False
             self.has_entity_model = False
         else:
-            tok_data_dir = path.join(data_dir, 'tokenizer')
-            tok_rules, prefix_re, suffix_re, infix_re = read_lang_data(tok_data_dir)
-            prefix_re = re.compile(prefix_re)
-            suffix_re = re.compile(suffix_re)
-            infix_re = re.compile(infix_re)
             self.has_parser_model = path.exists(path.join(self._data_dir, 'deps'))
             self.has_tagger_model = path.exists(path.join(self._data_dir, 'pos'))
             self.has_entity_model = path.exists(path.join(self._data_dir, 'ner'))
 
-        self.tokenizer = Tokenizer(self.vocab, tok_rules, prefix_re,
-                                   suffix_re, infix_re,
-                                   POS_TAGS, tag_names)
-
         self.mwe_merger = RegexMerger([
             ('IN', 'O', regexes.MW_PREPOSITIONS_RE),
             ('CD', 'TIME', regexes.TIME_RE),
             ('NNP', 'DATE', regexes.DAYS_RE),
             ('CD', 'MONEY', regexes.MONEY_RE)])
-        # These are lazy-loaded
-        self._tagger = None
-        self._parser = None
-        self._entity = None
-
-    @property
-    def tagger(self):
-        if self._tagger is None:
-            self._tagger = EnPosTagger(self.vocab.strings, self._data_dir)
-        return self._tagger
-
-    @property
-    def parser(self):
-        if self._parser is None:
-            self._parser = Parser(self.vocab.strings,
-                                  path.join(self._data_dir, 'deps'),
-                                  self.ParserTransitionSystem)
-        return self._parser
-
-    @property
-    def entity(self):
-        if self._entity is None:
-            self._entity = Parser(self.vocab.strings,
-                                  path.join(self._data_dir, 'ner'),
-                                  self.EntityTransitionSystem)
-        return self._entity
 
     def __call__(self, text, tag=True, parse=parse_if_model_present,
                  entity=parse_if_model_present, merge_mwes=False):
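The deleted properties implemented lazy loading: the heavy model object was constructed on first attribute access and then cached, which the refactor replaces with eager construction in `__init__`. Eager construction makes load cost explicit and lets `English(parser=False, ...)` skip it entirely, at the price of paying for everything up front. A generic sketch of the removed idiom:

    # Generic sketch of the lazy-loading @property idiom deleted above.
    class Pipeline(object):
        def __init__(self):
            self._tagger = None          # placeholder until first use

        @property
        def tagger(self):
            if self._tagger is None:     # build the expensive object on demand
                self._tagger = self._load_tagger()
            return self._tagger          # cached for every later access

        def _load_tagger(self):
            return object()              # stand-in for loading a real model

    p = Pipeline()
    assert p.tagger is p.tagger          # same cached instance both times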
In spacy/tokenizer.pyx (path inferred from context):

@@ -20,8 +20,7 @@ from .tokens import Tokens
 
 
 cdef class Tokenizer:
-    def __init__(self, Vocab vocab, rules, prefix_re, suffix_re, infix_re,
-                 pos_tags, tag_names):
+    def __init__(self, Vocab vocab, rules, prefix_re, suffix_re, infix_re, pos_tags):
         self.mem = Pool()
         self._cache = PreshMap()
         self._specials = PreshMap()
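The `prefix_re`, `suffix_re`, and `infix_re` arguments drive affix splitting: prefixes are peeled off the front of a string, suffixes off the back, and infixes split what remains. An illustrative pure-Python sketch of the prefix/suffix half (toy patterns, not spaCy's actual rules or its Cython implementation):

    # Illustrative affix splitting with prefix/suffix regexes (toy patterns).
    import re

    prefix_re = re.compile(r'^[\["\'(]')     # one opening bracket/quote
    suffix_re = re.compile(r'[\]"\').,;]$')  # one closing bracket/quote/punct

    def split_affixes(string):
        prefixes, suffixes = [], []
        while string:
            m = prefix_re.search(string)
            if m is None:
                break
            prefixes.append(m.group())
            string = string[m.end():]
        while string:
            m = suffix_re.search(string)
            if m is None:
                break
            suffixes.insert(0, m.group())
            string = string[:m.start()]
        return prefixes, string, suffixes

    assert split_affixes(u'("example").') == ([u'(', u'"'], u'example', [u'"', u')', u'.'])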
@@ -29,7 +28,17 @@ cdef class Tokenizer:
         self._suffix_re = suffix_re
         self._infix_re = infix_re
         self.vocab = vocab
-        self._load_special_tokenization(rules, pos_tags, tag_names)
+        self._load_special_tokenization(rules, pos_tags)
+
+    @classmethod
+    def from_dir(cls, Vocab vocab, directory):
+        data_dir = path.join(directory, 'tokenizer')
+        rules, prefix_re, suffix_re, infix_re = read_lang_data(data_dir)
+        prefix_re = re.compile(prefix_re)
+        suffix_re = re.compile(suffix_re)
+        infix_re = re.compile(infix_re)
+        return cls(vocab, rules, prefix_re, suffix_re, infix_re,
+                   pos_tags)
 
     cpdef Tokens tokens_from_list(self, list strings):
         cdef int length = sum([len(s) for s in strings])
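`from_dir` is the classic alternate-constructor idiom: a `@classmethod` gathers resources from disk and then delegates to `__init__`, keeping `__init__` free of I/O so objects stay constructible from in-memory data in tests. A generic, self-contained sketch (the file layout here is hypothetical, not spaCy's `read_lang_data` format):

    # Generic alternate-constructor sketch: the @classmethod does the I/O
    # and delegates to __init__. File layout here is hypothetical.
    import json
    import re
    from os import path

    class RuleSet(object):
        def __init__(self, rules, prefix_re):
            self.rules = rules           # plain data: __init__ does no I/O
            self.prefix_re = prefix_re

        @classmethod
        def from_dir(cls, directory):
            with open(path.join(directory, 'rules.json')) as f:
                rules = json.load(f)
            with open(path.join(directory, 'prefix.txt')) as f:
                prefix_re = re.compile(f.read().strip())
            return cls(rules, prefix_re)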
@@ -224,7 +233,7 @@ cdef class Tokenizer:
         match = self._suffix_re.search(string)
         return (match.end() - match.start()) if match is not None else 0
 
-    def _load_special_tokenization(self, object rules, object tag_map, object tag_names):
+    def _load_special_tokenization(self, object rules, object tag_map):
         '''Add a special-case tokenization rule.
         '''
         cdef int i
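`_load_special_tokenization` installs exact-match exceptions: strings whose tokenization is fixed in a table and consulted before the general affix rules ever run. A toy sketch of the lookup (the entries are illustrative, not spaCy's actual exception data):

    # Toy special-case table of the kind _load_special_tokenization consumes:
    # exact strings mapped to their fixed tokenizations. Entries illustrative.
    SPECIAL_CASES = {
        u"don't": [u"do", u"n't"],
        u"U.K.": [u"U.K."],   # protect the trailing period from suffix splitting
    }

    def tokenize(string):
        # Exact-match exceptions win; otherwise fall back to the general rules.
        return SPECIAL_CASES.get(string, [string])

    assert tokenize(u"don't") == [u"do", u"n't"]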