spaCy/spacy/en/__init__.py

from __future__ import unicode_literals
from os import path
import re

from .. import orth
from ..vocab import Vocab
from ..tokenizer import Tokenizer
from ..syntax.parser import GreedyParser
from ..tokens import Tokens
from .pos import EnPosTagger
from .pos import POS_TAGS
from .attrs import get_flags


from ..util import read_lang_data


def get_lex_props(string):
    """Build the dict of lexical properties for a single word form.

    Args:
        string (unicode): The word form to describe.

    Returns:
        dict: The keys 'flags', 'length', 'orth', 'lower', 'norm', 'shape',
            'prefix', 'suffix', 'cluster', 'prob' and 'sentiment'.
            'cluster', 'prob' and 'sentiment' are always 0 here, and 'norm'
            is the string unchanged.
    """
    return {
        'flags': get_flags(string),
        'length': len(string),
        'orth': string,
        'lower': string.lower(),
        'norm': string,
        'shape': orth.word_shape(string),
        # Slice rather than index: identical for non-empty strings, but an
        # empty string yields '' instead of raising IndexError.
        # NOTE(review): get_flags/word_shape on '' are not verified here.
        'prefix': string[:1],
        'suffix': string[-3:],
        'cluster': 0,
        'prob': 0,
        'sentiment': 0
    }


# Default pipeline data directory: the 'data' folder located next to this
# module file. Used as the default for English(data_dir=...).
LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data')


class English(object):
    """The English NLP pipeline.

    Provides a tokenizer, lexicon, part-of-speech tagger and parser.

    Keyword args:
        data_dir (unicode): A path to a directory, from which to load the pipeline.
            Defaults to LOCAL_DATA_DIR, i.e.
            path.join(path.dirname(__file__), 'data').
            The vocabulary is given path.join(data_dir, 'vocab') and the
            tokenizer data is read from path.join(data_dir, 'tokenizer').
            The tagger is given data_dir itself and the parser is given
            path.join(data_dir, 'deps'); both are created lazily, on first
            access.
            If None, no data is loaded: the vocabulary is created without a
            data directory, and the tokenizer receives no special-case rules
            and no prefix/suffix/infix patterns.
            See Pipeline Directory Structure for details.

    Attributes:
        vocab (spacy.vocab.Vocab): The lexicon.

        strings (spacy.strings.StringStore): Encode/decode strings to/from integer IDs.

        tokenizer (spacy.tokenizer.Tokenizer): The start of the pipeline.

        tagger (spacy.en.pos.EnPosTagger):
            The part-of-speech tagger, which also performs lemmatization and
            morphological analysis.

        parser (spacy.syntax.parser.GreedyParser):
            A greedy shift-reduce dependency parser.
    """
    def __init__(self, data_dir=LOCAL_DATA_DIR):
        self._data_dir = data_dir
        self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab') if data_dir else None,
                           get_lex_props=get_lex_props)
        # Sorted so the tag-name order handed to the tokenizer is stable.
        tag_names = sorted(POS_TAGS.keys())
        if data_dir is None:
            # No data directory: tokenize without special cases or affix
            # patterns.
            tok_rules = {}
            prefix_re = None
            suffix_re = None
            infix_re = None
        else:
            tok_data_dir = path.join(data_dir, 'tokenizer')
            tok_rules, prefix_re, suffix_re, infix_re = read_lang_data(tok_data_dir)
            # read_lang_data's patterns are compiled here before being
            # passed to the tokenizer.
            prefix_re = re.compile(prefix_re)
            suffix_re = re.compile(suffix_re)
            infix_re = re.compile(infix_re)
        self.tokenizer = Tokenizer(self.vocab, tok_rules, prefix_re,
                                   suffix_re, infix_re,
                                   POS_TAGS, tag_names)
        # The tagger and parser are created on first use by the properties
        # below, so constructing the pipeline stays cheap.
        self._tagger = None
        self._parser = None

    @property
    def strings(self):
        """The string store of the vocabulary (self.vocab.strings).

        Previously listed in the class docstring but never defined, so
        accessing it raised AttributeError.
        """
        return self.vocab.strings

    @property
    def tagger(self):
        """The part-of-speech tagger, created on first access and cached."""
        if self._tagger is None:
            self._tagger = EnPosTagger(self.vocab.strings, self._data_dir)
        return self._tagger

    @property
    def parser(self):
        """The dependency parser, created on first access and cached.

        Built from path.join(data_dir, 'deps').  path.join needs a string,
        so this cannot be built when the pipeline was created with
        data_dir=None.
        """
        if self._parser is None:
            self._parser = GreedyParser(path.join(self._data_dir, 'deps'))
        return self._parser

    def __call__(self, text, tag=True, parse=True):
        """Apply the pipeline to some text.

        Args:
            text (unicode): The text to be processed.

        Keyword args:
            tag (bool): Whether to add part-of-speech tags to the text.  This
                will also set morphological analysis and lemmas.

            parse (bool): Whether to add dependency-heads and labels to the text.

        Returns:
            tokens (spacy.tokens.Tokens): The tokenized text, annotated in
                place by the tagger and parser when they are requested.
        """
        tokens = self.tokenizer(text)
        if tag:
            self.tagger(tokens)
        if parse:
            self.parser(tokens)
        return tokens

    @property
    def tags(self):
        """List of part-of-speech tag names.

        Accessing this creates the tagger if it has not been created yet.
        """
        return self.tagger.tag_names
* Work on train 2014-12-21 23:25:43 +03:00			`from __future__ import unicode_literals`
			`from os import path`
* Work on word vectors, and other stuff 2015-01-17 08:21:17 +03:00			`import re`
* Work on train 2014-12-21 23:25:43 +03:00
* Tmp commit. Refactoring to create a Python Lexeme class. 2015-01-12 02:26:22 +03:00			`from .. import orth`
* Work on train 2014-12-21 23:25:43 +03:00			`from ..vocab import Vocab`
			`from ..tokenizer import Tokenizer`
			`from ..syntax.parser import GreedyParser`
			`from ..tokens import Tokens`
			`from .pos import EnPosTagger`
* POS tagger training working after reorg 2014-12-22 00:54:47 +03:00			`from .pos import POS_TAGS`
* Work on train 2014-12-21 23:25:43 +03:00			`from .attrs import get_flags`


* Work on word vectors, and other stuff 2015-01-17 08:21:17 +03:00			`from ..util import read_lang_data`


* Work on train 2014-12-21 23:25:43 +03:00			`def get_lex_props(string):`
* Tests passing after refactor. API has obvious warts, particularly in Token and Lexeme 2015-01-14 16:33:16 +03:00			`return {`
			`'flags': get_flags(string),`
			`'length': len(string),`
* Rename sic to orth 2015-01-22 18:08:25 +03:00			`'orth': string,`
* Rename NORM1 and NORM2 attrs to lower and norm 2015-01-23 22:17:03 +03:00			`'lower': string.lower(),`
			`'norm': string,`
* Tests passing after refactor. API has obvious warts, particularly in Token and Lexeme 2015-01-14 16:33:16 +03:00			`'shape': orth.word_shape(string),`
			`'prefix': string[0],`
			`'suffix': string[-3:],`
			`'cluster': 0,`
			`'prob': 0,`
			`'sentiment': 0`
			`}`

* Work on train 2014-12-21 23:25:43 +03:00
* Tmp commit. Refactoring to create a Python Lexeme class. 2015-01-12 02:26:22 +03:00			`LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data')`
* Work on train 2014-12-21 23:25:43 +03:00
* Tests passing after refactor. API has obvious warts, particularly in Token and Lexeme 2015-01-14 16:33:16 +03:00
* Work on train 2014-12-21 23:25:43 +03:00			`class English(object):`
* Upd docstrings 2014-12-27 10:45:16 +03:00			`"""The English NLP pipeline.`

			`Provides a tokenizer, lexicon, part-of-speech tagger and parser.`

			`Keyword args:`
			`data_dir (unicode): A path to a directory, from which to load the pipeline.`
			`If None, looks for a directory named "data/" in the same directory as`
			`the present file, i.e. path.join(path.dirname(__file__, 'data')).`
			`If path.join(data_dir, 'pos') exists, the tagger is loaded from it.`
			`If path.join(data_dir, 'deps') exists, the parser is loaded from it.`
			`See Pipeline Directory Structure for details.`

			`Attributes:`
			`vocab (spacy.vocab.Vocab): The lexicon.`

			`strings (spacy.strings.StringStore): Encode/decode strings to/from integer IDs.`

			`tokenizer (spacy.tokenizer.Tokenizer): The start of the pipeline.`

			`tagger (spacy.en.pos.EnPosTagger):`
			`The part-of-speech tagger, which also performs lemmatization and`
			`morphological analysis.`

			`parser (spacy.syntax.parser.GreedyParser):`
			`A greedy shift-reduce dependency parser.`
			`"""`
* Tmp commit. Refactoring to create a Python Lexeme class. 2015-01-12 02:26:22 +03:00			`def __init__(self, data_dir=LOCAL_DATA_DIR):`
* Lazy-load tagger and parser 2014-12-30 15:25:09 +03:00			`self._data_dir = data_dir`
* Tmp. Working on refactor. Compiles, must hook up lexical feats. 2015-01-13 16:03:48 +03:00			`self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab') if data_dir else None,`
* Move around data files for test release 2015-01-02 17:59:22 +03:00			`get_lex_props=get_lex_props)`
* Lazy-load tagger and parser 2014-12-30 15:25:09 +03:00			`tag_names = list(POS_TAGS.keys())`
			`tag_names.sort()`
* Tmp commit. Refactoring to create a Python Lexeme class. 2015-01-12 02:26:22 +03:00			`if data_dir is None:`
* Work on word vectors, and other stuff 2015-01-17 08:21:17 +03:00			`tok_rules = {}`
			`prefix_re = None`
			`suffix_re = None`
			`infix_re = None`
* Tmp commit. Refactoring to create a Python Lexeme class. 2015-01-12 02:26:22 +03:00			`else:`
* Work on word vectors, and other stuff 2015-01-17 08:21:17 +03:00			`tok_data_dir = path.join(data_dir, 'tokenizer')`
			`tok_rules, prefix_re, suffix_re, infix_re = read_lang_data(tok_data_dir)`
* Fix data_dir=None argument to English class 2015-01-21 10:27:31 +03:00			`prefix_re = re.compile(prefix_re)`
			`suffix_re = re.compile(suffix_re)`
			`infix_re = re.compile(infix_re)`
			`self.tokenizer = Tokenizer(self.vocab, tok_rules, prefix_re,`
			`suffix_re, infix_re,`
* Work on word vectors, and other stuff 2015-01-17 08:21:17 +03:00			`POS_TAGS, tag_names)`
* Lazy-load tagger and parser 2014-12-30 15:25:09 +03:00			`self._tagger = None`
			`self._parser = None`

			`@property`
			`def tagger(self):`
			`if self._tagger is None:`
			`self._tagger = EnPosTagger(self.vocab.strings, self._data_dir)`
			`return self._tagger`

			`@property`
			`def parser(self):`
			`if self._parser is None:`
			`self._parser = GreedyParser(path.join(self._data_dir, 'deps'))`
			`return self._parser`
* Work on train 2014-12-21 23:25:43 +03:00
* Set parse=True by default --- i.e. parse unless told not to. 2015-01-24 17:28:28 +03:00			`def __call__(self, text, tag=True, parse=True):`
* Upd docstrings 2014-12-27 10:45:16 +03:00			`"""Apply the pipeline to some text.`

			`Args:`
			`text (unicode): The text to be processed.`

			`Keyword args:`
			`tag (bool): Whether to add part-of-speech tags to the text. This`
			`will also set morphological analysis and lemmas.`

			`parse (bool): Whether to add dependency-heads and labels to the text.`

			`Returns:`
			`tokens (spacy.tokens.Tokens):`
			`"""`
* Work on word vectors, and other stuff 2015-01-17 08:21:17 +03:00			`tokens = self.tokenizer(text)`
* Refactor _ml.Model, and finish implementing HastyModel so far not worthwhile. 2014-12-31 11:40:59 +03:00			`if tag:`
* Tests passing except for morphology/lemmatization stuff 2014-12-23 03:40:32 +03:00			`self.tagger(tokens)`
* Refactor _ml.Model, and finish implementing HastyModel so far not worthwhile. 2014-12-31 11:40:59 +03:00			`if parse:`
* Work on word vectors, and other stuff 2015-01-17 08:21:17 +03:00			`self.parser(tokens)`
* Work on train 2014-12-21 23:25:43 +03:00			`return tokens`
* Tmp 2014-12-24 09:42:00 +03:00
			`@property`
			`def tags(self):`
* Upd docstrings 2014-12-27 10:45:16 +03:00			`"""List of part-of-speech tag names."""`
* Refactor _ml.Model, and finish implementing HastyModel so far not worthwhile. 2014-12-31 11:40:59 +03:00			`return self.tagger.tag_names`