# spaCy/spacy/en/__init__.py

from __future__ import unicode_literals
from os import path
import re

from .. import orth
from ..vocab import Vocab
from ..tokenizer import Tokenizer
from ..syntax.parser import GreedyParser
from ..tokens import Tokens
from .pos import EnPosTagger
from .pos import POS_TAGS
from .attrs import get_flags


from ..util import read_lang_data


def get_lex_props(string):
    """Build the default lexical attribute dict for a word form.

    This function is handed to ``Vocab`` (as ``get_lex_props``) by the
    ``English`` class in this module.

    Args:
        string (unicode): The word form to describe.  May be empty.

    Returns:
        dict: The attributes computed from ``string``:

            flags:     result of ``get_flags(string)``
            length:    ``len(string)``
            orth:      the string itself
            lower:     the lower-cased string
            norm:      the string itself (no normalisation is applied here)
            shape:     result of ``orth.word_shape(string)``
            prefix:    the first character, or '' for an empty string
            suffix:    the last three characters (fewer if the string is shorter)
            cluster, prob, sentiment: initialised to 0
    """
    return {
        'flags': get_flags(string),
        'length': len(string),
        'orth': string,
        'lower': string.lower(),
        'norm': string,
        'shape': orth.word_shape(string),
        # Slice rather than index: string[0] raises IndexError on '', whereas
        # string[:1] yields '' and matches the tolerant string[-3:] below.
        'prefix': string[:1],
        'suffix': string[-3:],
        'cluster': 0,
        'prob': 0,
        'sentiment': 0
    }


# Default model directory: the "data" folder located next to this file.
# English.__init__ falls back to it when data_dir is the empty string.
LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data')

# Sentinel used as the default for the `parse` argument of English.__call__.
parse_if_model_present = -1


class English(object):
    """The English NLP pipeline.

    Provides a tokenizer, lexicon, part-of-speech tagger and parser.

    Keyword args:
        data_dir (unicode): A path to a directory, from which to load the pipeline.
            If empty string ('') --- the default --- it looks for a directory
            named "data/" in the same directory as the present file, i.e.

                >>> data_dir = path.join(path.dirname(__file__), 'data')

            If path.join(data_dir, 'pos') exists, the tagger is loaded from there.

            If path.join(data_dir, 'deps') exists, the parser is loaded from there.

            To prevent any data files from being loaded, pass data_dir=None. This
            is useful if you want to construct a lexicon, which you'll then save
            for later loading.
    """
    def __init__(self, data_dir=''):
        if data_dir == '':
            data_dir = LOCAL_DATA_DIR
        self._data_dir = data_dir

        # With data_dir=None the vocabulary is created without a data directory.
        vocab_dir = path.join(data_dir, 'vocab') if data_dir else None
        self.vocab = Vocab(data_dir=vocab_dir, get_lex_props=get_lex_props)

        tag_names = sorted(POS_TAGS.keys())

        if data_dir is None:
            # Nothing is read from disk: no tokenizer rules, no affix
            # patterns, and no statistical models are considered available.
            tok_rules = {}
            prefix_re = None
            suffix_re = None
            infix_re = None
            self.has_parser_model = False
            self.has_tagger_model = False
        else:
            tok_data_dir = path.join(data_dir, 'tokenizer')
            tok_rules, prefix_re, suffix_re, infix_re = read_lang_data(tok_data_dir)
            prefix_re = re.compile(prefix_re)
            suffix_re = re.compile(suffix_re)
            infix_re = re.compile(infix_re)
            # Only record whether the model directories exist; the models
            # themselves are loaded on first use (see the properties below).
            self.has_parser_model = path.exists(path.join(self._data_dir, 'deps'))
            self.has_tagger_model = path.exists(path.join(self._data_dir, 'pos'))

        self.tokenizer = Tokenizer(self.vocab, tok_rules, prefix_re,
                                   suffix_re, infix_re,
                                   POS_TAGS, tag_names)
        # These are lazy-loaded
        self._tagger = None
        self._parser = None

    @property
    def tagger(self):
        """The part-of-speech tagger, constructed on first access and cached."""
        if self._tagger is None:
            self._tagger = EnPosTagger(self.vocab.strings, self._data_dir)
        return self._tagger

    @property
    def parser(self):
        """The dependency parser, constructed on first access and cached.

        It is built from path.join(data_dir, 'deps').
        """
        if self._parser is None:
            self._parser = GreedyParser(path.join(self._data_dir, 'deps'))
        return self._parser

    def __call__(self, text, tag=True, parse=parse_if_model_present):
        """Apply the pipeline to some text.  The text can span multiple sentences,
        and can contain arbitrary whitespace.

        NOTE(review): the original docstring carried an unfinished sentence,
        "Alignment into the original string"; presumably the tokenizer keeps
        that alignment --- TODO confirm.

        The tagger and parser are lazy-loaded the first time they are required.
        Loading the parser model usually takes 5-10 seconds.

        Args:
            text (unicode): The text to be processed.

        Keyword args:
            tag (bool): Whether to add part-of-speech tags to the text.  Also
                sets morphological analysis and lemmas.  Tagging is skipped
                when no tagger model was found under data_dir.

            parse (True, False, -1): Whether to add labelled syntactic dependencies.

              -1 (default) is "guess": It will guess True if tag=True and the
                model has been installed.

        Returns:
            tokens (spacy.tokens.Tokens):

        Raises:
            ValueError: If parsing is explicitly requested while tag is false.
            IOError: If parsing is explicitly requested but no parser model
                was found under data_dir.

        >>> from spacy.en import English
        >>> nlp = English()
        >>> tokens = nlp('An example sentence. Another example sentence.')
        >>> tokens[0].orth_, tokens[0].head.tag_
        ('An', 'NN')
        """
        # -1 is the "guess" sentinel; anything else is an explicit request.
        guess = parse == -1
        if not guess and parse and not tag:
            msg = ("Incompatible arguments: tag=False, parse=True. "
                   "Part-of-speech tags are required for parsing.")
            raise ValueError(msg)
        tokens = self.tokenizer(text)
        if guess:
            # Parse only when tagging was requested and a parser model exists.
            parse = bool(tag) and self.has_parser_model
        if tag and self.has_tagger_model:
            self.tagger(tokens)
        if parse:
            if not self.has_parser_model:
                # Only reachable for an explicit request: a guessed parse is
                # already False when the model is missing.
                msg = ("Receive parse=True, but parser model not found.\n\n"
                       "Run:\n"
                       "$ python -m spacy.en.download\n"
                       "To install the model.")
                raise IOError(msg)
            self.parser(tokens)
        return tokens

    @property
    def tags(self):
        """List of part-of-speech tag names.

        Read from the tagger, so the first access constructs the tagger.
        """
        return self.tagger.tag_names
* Work on train 2014-12-21 23:25:43 +03:00			`from __future__ import unicode_literals`
			`from os import path`
* Work on word vectors, and other stuff 2015-01-17 08:21:17 +03:00			`import re`
* Work on train 2014-12-21 23:25:43 +03:00
* Tmp commit. Refactoring to create a Python Lexeme class. 2015-01-12 02:26:22 +03:00			`from .. import orth`
* Work on train 2014-12-21 23:25:43 +03:00			`from ..vocab import Vocab`
			`from ..tokenizer import Tokenizer`
			`from ..syntax.parser import GreedyParser`
			`from ..tokens import Tokens`
			`from .pos import EnPosTagger`
* POS tagger training working after reorg 2014-12-22 00:54:47 +03:00			`from .pos import POS_TAGS`
* Work on train 2014-12-21 23:25:43 +03:00			`from .attrs import get_flags`


* Work on word vectors, and other stuff 2015-01-17 08:21:17 +03:00			`from ..util import read_lang_data`


* Work on train 2014-12-21 23:25:43 +03:00			`def get_lex_props(string):`
* Tests passing after refactor. API has obvious warts, particularly in Token and Lexeme 2015-01-14 16:33:16 +03:00			`return {`
			`'flags': get_flags(string),`
			`'length': len(string),`
* Rename sic to orth 2015-01-22 18:08:25 +03:00			`'orth': string,`
* Rename NORM1 and NORM2 attrs to lower and norm 2015-01-23 22:17:03 +03:00			`'lower': string.lower(),`
			`'norm': string,`
* Tests passing after refactor. API has obvious warts, particularly in Token and Lexeme 2015-01-14 16:33:16 +03:00			`'shape': orth.word_shape(string),`
			`'prefix': string[0],`
			`'suffix': string[-3:],`
			`'cluster': 0,`
			`'prob': 0,`
			`'sentiment': 0`
			`}`

* Work on train 2014-12-21 23:25:43 +03:00
* Tmp commit. Refactoring to create a Python Lexeme class. 2015-01-12 02:26:22 +03:00			`LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data')`
* Work on train 2014-12-21 23:25:43 +03:00
* Add docstring to English class 2015-01-26 18:45:21 +03:00			`parse_if_model_present = -1`

* Tests passing after refactor. API has obvious warts, particularly in Token and Lexeme 2015-01-14 16:33:16 +03:00
* Work on train 2014-12-21 23:25:43 +03:00			`class English(object):`
* Upd docstrings 2014-12-27 10:45:16 +03:00			`"""The English NLP pipeline.`

			`Provides a tokenizer, lexicon, part-of-speech tagger and parser.`

			`Keyword args:`
			`data_dir (unicode): A path to a directory, from which to load the pipeline.`
* Fix default model path for English 2015-01-31 08:38:27 +03:00			`If empty string ('') --- the default --- it looks for a directory`
			`named "data/" in the same directory as the present file, i.e.`

			`>>> data_dir = path.join(path.dirname(__file__, 'data'))`
* Upd docstrings 2014-12-27 10:45:16 +03:00
* Add docstring to English class 2015-01-26 18:45:21 +03:00			`If path.join(data_dir, 'pos') exists, the tagger is loaded from there.`
* Upd docstrings 2014-12-27 10:45:16 +03:00
* Add docstring to English class 2015-01-26 18:45:21 +03:00			`If path.join(data_dir, 'deps') exists, the parser is loaded from there.`
* Fix default model path for English 2015-01-31 08:38:27 +03:00
			`To prevent any data files from being loaded, pass data_dir=None. This`
			`is useful if you want to construct a lexicon, which you'll then save`
			`for later loading.`
* Upd docstrings 2014-12-27 10:45:16 +03:00			`"""`
* Fix default model path for English 2015-01-31 08:38:27 +03:00			`def __init__(self, data_dir=''):`
			`if data_dir == '':`
			`data_dir = LOCAL_DATA_DIR`
* Lazy-load tagger and parser 2014-12-30 15:25:09 +03:00			`self._data_dir = data_dir`
* Tmp. Working on refactor. Compiles, must hook up lexical feats. 2015-01-13 16:03:48 +03:00			`self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab') if data_dir else None,`
* Move around data files for test release 2015-01-02 17:59:22 +03:00			`get_lex_props=get_lex_props)`
* Lazy-load tagger and parser 2014-12-30 15:25:09 +03:00			`tag_names = list(POS_TAGS.keys())`
			`tag_names.sort()`
* Tmp commit. Refactoring to create a Python Lexeme class. 2015-01-12 02:26:22 +03:00			`if data_dir is None:`
* Work on word vectors, and other stuff 2015-01-17 08:21:17 +03:00			`tok_rules = {}`
			`prefix_re = None`
			`suffix_re = None`
			`infix_re = None`
* Fix default model path for English 2015-01-31 08:38:27 +03:00			`self.has_parser_model = False`
			`self.has_tagger_model = False`
* Tmp commit. Refactoring to create a Python Lexeme class. 2015-01-12 02:26:22 +03:00			`else:`
* Work on word vectors, and other stuff 2015-01-17 08:21:17 +03:00			`tok_data_dir = path.join(data_dir, 'tokenizer')`
			`tok_rules, prefix_re, suffix_re, infix_re = read_lang_data(tok_data_dir)`
* Fix data_dir=None argument to English class 2015-01-21 10:27:31 +03:00			`prefix_re = re.compile(prefix_re)`
			`suffix_re = re.compile(suffix_re)`
			`infix_re = re.compile(infix_re)`
* Fix default model path for English 2015-01-31 08:38:27 +03:00			`self.has_parser_model = path.exists(path.join(self._data_dir, 'deps'))`
			`self.has_tagger_model = path.exists(path.join(self._data_dir, 'pos'))`

* Fix data_dir=None argument to English class 2015-01-21 10:27:31 +03:00			`self.tokenizer = Tokenizer(self.vocab, tok_rules, prefix_re,`
			`suffix_re, infix_re,`
* Work on word vectors, and other stuff 2015-01-17 08:21:17 +03:00			`POS_TAGS, tag_names)`
* Fix default model path for English 2015-01-31 08:38:27 +03:00			`# These are lazy-loaded`
* Lazy-load tagger and parser 2014-12-30 15:25:09 +03:00			`self._tagger = None`
			`self._parser = None`

* Silently don't parse if data is not present 2015-01-25 06:47:38 +03:00
* Lazy-load tagger and parser 2014-12-30 15:25:09 +03:00			`@property`
			`def tagger(self):`
			`if self._tagger is None:`
			`self._tagger = EnPosTagger(self.vocab.strings, self._data_dir)`
			`return self._tagger`

			`@property`
			`def parser(self):`
			`if self._parser is None:`
			`self._parser = GreedyParser(path.join(self._data_dir, 'deps'))`
			`return self._parser`
* Work on train 2014-12-21 23:25:43 +03:00
* Add docstring to English class 2015-01-26 18:45:21 +03:00			`def __call__(self, text, tag=True, parse=parse_if_model_present):`
			`"""Apply the pipeline to some text. The text can span multiple sentences,`
			`and can contain arbtrary whitespace. Alignment into the original string`

			`The tagger and parser are lazy-loaded the first time they are required.`
			`Loading the parser model usually takes 5-10 seconds.`
* Upd docstrings 2014-12-27 10:45:16 +03:00
			`Args:`
			`text (unicode): The text to be processed.`

			`Keyword args:`
* Add docstring to English class 2015-01-26 18:45:21 +03:00			`tag (bool): Whether to add part-of-speech tags to the text. Also`
			`sets morphological analysis and lemmas.`

			`parse (True, False, -1): Whether to add labelled syntactic dependencies.`

			`-1 (default) is "guess": It will guess True if tag=True and the`
			`model has been installed.`
* Upd docstrings 2014-12-27 10:45:16 +03:00
			`Returns:`
			`tokens (spacy.tokens.Tokens):`
* Add docstring to English class 2015-01-26 18:45:21 +03:00
			`>>> from spacy.en import English`
			`>>> nlp = English()`
			`>>> tokens = nlp('An example sentence. Another example sentence.')`
			`>>> tokens[0].orth_, tokens[0].head.tag_`
			`('An', 'NN')`
* Upd docstrings 2014-12-27 10:45:16 +03:00			`"""`
* Add docstring to English class 2015-01-26 18:45:21 +03:00			`if parse == True and tag == False:`
			`msg = ("Incompatible arguments: tag=False, parse=True"`
			`"Part-of-speech tags are required for parsing.")`
			`raise ValueError(msg)`
* Work on word vectors, and other stuff 2015-01-17 08:21:17 +03:00			`tokens = self.tokenizer(text)`
* Add docstring to English class 2015-01-26 18:45:21 +03:00			`if parse == -1 and tag == False:`
			`parse = False`
			`elif parse == -1 and not self.has_parser_model:`
			`parse = False`
			`if tag and self.has_tagger_model:`
* Tests passing except for morphology/lemmatization stuff 2014-12-23 03:40:32 +03:00			`self.tagger(tokens)`
* Add docstring to English class 2015-01-26 18:45:21 +03:00			`if parse == True and not self.has_parser_model:`
			`msg = ("Receive parse=True, but parser model not found.\n\n"`
			`"Run:\n"`
			`"$ python -m spacy.en.download\n"`
			`"To install the model.")`
			`raise IOError(msg)`
* Silently don't parse if data is not present 2015-01-25 06:47:38 +03:00			`if parse and self.has_parser_model:`
* Work on word vectors, and other stuff 2015-01-17 08:21:17 +03:00			`self.parser(tokens)`
* Work on train 2014-12-21 23:25:43 +03:00			`return tokens`
* Tmp 2014-12-24 09:42:00 +03:00
			`@property`
			`def tags(self):`
* Upd docstrings 2014-12-27 10:45:16 +03:00			`"""List of part-of-speech tag names."""`
* Refactor _ml.Model, and finish implementing HastyModel so far not worthwhile. 2014-12-31 11:40:59 +03:00			`return self.tagger.tag_names`