spaCy/spacy/en/__init__.py

from __future__ import unicode_literals
from os import path

from .. import orth
from ..vocab import Vocab
from ..tokenizer import Tokenizer
from ..syntax.parser import GreedyParser
from ..tokens import Tokens
from .pos import EnPosTagger
from .pos import POS_TAGS
from .attrs import get_flags


def get_lex_props(string):
    """Build the lexical property dict for a single string.

    Args:
        string (unicode): The string to compute properties for.

    Returns:
        dict: The result of get_flags(string) under 'flags', and the
            constant 1 under 'dense'.
    """
    props = {}
    props['flags'] = get_flags(string)
    props['dense'] = 1
    return props


class English(object):
    """The English NLP pipeline.

    Provides a tokenizer, lexicon, part-of-speech tagger and parser.

    Keyword args:
        data_dir (unicode): A path to a directory, from which to load the pipeline.
            If None, looks for a directory named "data/" in the same directory as
            the present file, i.e. path.join(path.dirname(__file__), 'data').
            If path.join(data_dir, 'pos') exists, the tagger is loaded from it.
            If path.join(data_dir, 'deps') exists, the parser is loaded from it.
            See Pipeline Directory Structure for details.

    Attributes:
        vocab (spacy.vocab.Vocab): The lexicon.

        strings (spacy.strings.StringStore): Encode/decode strings to/from integer IDs.

        tokenizer (spacy.tokenizer.Tokenizer): The start of the pipeline.

        tagger (spacy.en.pos.EnPosTagger):
            The part-of-speech tagger, which also performs lemmatization and
            morphological analysis.  None if data_dir has no 'pos' entry.

        parser (spacy.syntax.parser.GreedyParser):
            A greedy shift-reduce dependency parser.  None if data_dir has no
            'deps' entry.
    """
    def __init__(self, data_dir=None):
        if data_dir is None:
            data_dir = path.join(path.dirname(__file__), 'data')
        self.vocab = Vocab(data_dir=data_dir, get_lex_props=get_lex_props)
        self.tokenizer = Tokenizer.from_dir(self.vocab, data_dir)
        # The tagger and parser are optional: each is only loaded when its
        # sub-directory is present, and is left as None otherwise.
        if path.exists(path.join(data_dir, 'pos')):
            self.tagger = EnPosTagger(self.vocab.strings, data_dir)
        else:
            self.tagger = None
        if path.exists(path.join(data_dir, 'deps')):
            self.parser = GreedyParser(path.join(data_dir, 'deps'))
        else:
            self.parser = None
        self.strings = self.vocab.strings

    def __call__(self, text, tag=True, parse=True):
        """Apply the pipeline to some text.

        Args:
            text (unicode): The text to be processed.

        Keyword args:
            tag (bool): Whether to add part-of-speech tags to the text.  This
                will also set morphological analysis and lemmas.  Ignored if
                no tagger was loaded.

            parse (bool): Whether to add dependency-heads and labels to the text.
                Ignored if no parser was loaded.

        Returns:
            tokens (spacy.tokens.Tokens): The tokenized text, as returned by
                the tokenizer and then passed to the tagger and parser.
        """
        tokens = self.tokenizer.tokenize(text)
        # Test for absence explicitly (as the `tags` property does) instead of
        # relying on the truth value of the component: a loaded component that
        # happens to evaluate as false must still be run.
        if self.tagger is not None and tag:
            self.tagger(tokens)
        if self.parser is not None and parse:
            self.parser.parse(tokens)
        return tokens

    @property
    def tags(self):
        """List of part-of-speech tag names (empty if no tagger is loaded)."""
        if self.tagger is None:
            return []
        return self.tagger.tag_names
* Work on train 2014-12-21 23:25:43 +03:00			`from __future__ import unicode_literals`
			`from os import path`

			`from .. import orth`
			`from ..vocab import Vocab`
			`from ..tokenizer import Tokenizer`
			`from ..syntax.parser import GreedyParser`
			`from ..tokens import Tokens`
			`from .pos import EnPosTagger`
* POS tagger training working after reorg 2014-12-22 00:54:47 +03:00			`from .pos import POS_TAGS`
* Work on train 2014-12-21 23:25:43 +03:00			`from .attrs import get_flags`


			`def get_lex_props(string):`
			`return {'flags': get_flags(string), 'dense': 1}`


			`class English(object):`
* Upd docstrings 2014-12-27 10:45:16 +03:00			`"""The English NLP pipeline.`

			`Provides a tokenizer, lexicon, part-of-speech tagger and parser.`

			`Keyword args:`
			`data_dir (unicode): A path to a directory, from which to load the pipeline.`
			`If None, looks for a directory named "data/" in the same directory as`
			`the present file, i.e. path.join(path.dirname(__file__, 'data')).`
			`If path.join(data_dir, 'pos') exists, the tagger is loaded from it.`
			`If path.join(data_dir, 'deps') exists, the parser is loaded from it.`
			`See Pipeline Directory Structure for details.`

			`Attributes:`
			`vocab (spacy.vocab.Vocab): The lexicon.`

			`strings (spacy.strings.StringStore): Encode/decode strings to/from integer IDs.`

			`tokenizer (spacy.tokenizer.Tokenizer): The start of the pipeline.`

			`tagger (spacy.en.pos.EnPosTagger):`
			`The part-of-speech tagger, which also performs lemmatization and`
			`morphological analysis.`

			`parser (spacy.syntax.parser.GreedyParser):`
			`A greedy shift-reduce dependency parser.`


			`"""`
			`def __init__(self, data_dir=None):`
* Work on train 2014-12-21 23:25:43 +03:00			`if data_dir is None:`
			`data_dir = path.join(path.dirname(__file__), 'data')`
* Tests passing except for morphology/lemmatization stuff 2014-12-23 03:40:32 +03:00			`self.vocab = Vocab(data_dir=data_dir, get_lex_props=get_lex_props)`
* Work on train 2014-12-21 23:25:43 +03:00			`self.tokenizer = Tokenizer.from_dir(self.vocab, data_dir)`
* Upd docstrings 2014-12-27 10:45:16 +03:00			`if path.exists(path.join(data_dir, 'pos')):`
			`self.tagger = EnPosTagger(self.vocab.strings, data_dir)`
			`else:`
			`self.tagger = None`
			`if path.exists(path.join(data_dir, 'deps')):`
			`self.parser = GreedyParser(path.join(data_dir, 'deps'))`
			`else:`
			`self.parser = None`
* Tmp 2014-12-24 09:42:00 +03:00			`self.strings = self.vocab.strings`
* Work on train 2014-12-21 23:25:43 +03:00
* Tests passing except for morphology/lemmatization stuff 2014-12-23 03:40:32 +03:00			`def __call__(self, text, tag=True, parse=True):`
* Upd docstrings 2014-12-27 10:45:16 +03:00			`"""Apply the pipeline to some text.`

			`Args:`
			`text (unicode): The text to be processed.`

			`Keyword args:`
			`tag (bool): Whether to add part-of-speech tags to the text. This`
			`will also set morphological analysis and lemmas.`

			`parse (bool): Whether to add dependency-heads and labels to the text.`

			`Returns:`
			`tokens (spacy.tokens.Tokens):`
			`"""`
* Work on train 2014-12-21 23:25:43 +03:00			`tokens = self.tokenizer.tokenize(text)`
* Tests passing except for morphology/lemmatization stuff 2014-12-23 03:40:32 +03:00			`if self.tagger and tag:`
			`self.tagger(tokens)`
* Work on train 2014-12-21 23:25:43 +03:00			`if self.parser and parse:`
			`self.parser.parse(tokens)`
			`return tokens`
* Tmp 2014-12-24 09:42:00 +03:00
			`@property`
			`def tags(self):`
* Upd docstrings 2014-12-27 10:45:16 +03:00			`"""List of part-of-speech tag names."""`
* Tmp 2014-12-24 09:42:00 +03:00			`if self.tagger is None:`
			`return []`
			`else:`
			`return self.tagger.tag_names`
* Upd docstrings 2014-12-27 10:45:16 +03:00