spaCy/spacy/en/__init__.py

from __future__ import unicode_literals
from os import path
import re

from .. import orth
from ..vocab import Vocab
from ..tokenizer import Tokenizer
from ..syntax.parser import Parser
from ..syntax.arc_eager import ArcEager
from ..syntax.ner import BiluoPushDown
from ..tokens import Tokens
from ..multi_words import RegexMerger

from .pos import EnPosTagger
from .pos import POS_TAGS
from .attrs import get_flags
from . import regexes


from ..util import read_lang_data


def get_lex_props(string):
    return {
        'flags': get_flags(string),
        'length': len(string),
        'orth': string,
        'lower': string.lower(),
        'norm': string,
        'shape': orth.word_shape(string),
        'prefix': string[0],
        'suffix': string[-3:],
        'cluster': 0,
        'prob': 0,
        'sentiment': 0
    }


LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data')

parse_if_model_present = -1


class English(object):
    """The English NLP pipeline.

    Example:

        Load data from default directory:

            >>> nlp = English()
            >>> nlp = English(data_dir=u'')

        Load data from specified directory:

            >>> nlp = English(data_dir=u'path/to/data_directory')

        Disable (and avoid loading) parts of the processing pipeline:

            >>> nlp = English(vectors=False, parser=False, tagger=False, entity=False)

        Start with nothing loaded:

            >>> nlp = English(data_dir=None)

    Keyword args:
        data_dir (unicode):
            A path to a directory from which to load the pipeline;
            or '', to load default; or None, to load nothing.

        Tokenizer (bool or callable):
            desc

        Vectors (bool or callable):
            desc

        Parser (bool or callable):
            desc

        Tagger (bool or callable):
            desc

        Entity (bool or callable):
            desc

        Senser (bool or callable):
            desc
    """
    ParserTransitionSystem = ArcEager
    EntityTransitionSystem = BiluoPushDown

    def __init__(self, data_dir='', Tokenizer=True, Vectors=True, Parser=True,
                 Tagger=True, Entity=True, Senser=True, load_vectors=True):
        if data_dir == '':
            data_dir = LOCAL_DATA_DIR
        # TODO: Deprecation warning
        if load_vectors is False:
            vectors = False

        self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab') if data_dir else None,
                           get_lex_props=get_lex_props, vectors=Vectors)

        if Tokenizer is True:
            Tokenizer = tokenizer.Tokenizer
        if Tagger is True:
            Tagger = pos.EnPosTagger
        if Parser is True:
            transition_system = self.ParserTransitionSystem
            Parser = lambda s, d: parser.Parser(s, d, transition_system
        if Entity is True:
            transition_system = self.EntityTransitionSystem
            Entity = lambda s, d: parser.Parser(s, d, transition_system)
        if Senser is True:
            Senser = wsd.SuperSenseTagger

        self.tokenizer = Tokenizer(self.vocab, data_dir) if Tokenizer else None
        self.tagger = Tagger(self.vocab.strings, data_dir) if Tagger else None
        self.parser = Parser(self.vocab.strings, data_dir) if Parser else None
        self.entity = Entity(self.vocab.strings, data_dir) if Entity else None
        self.senser = Senser(self.vocab.strings, data_dir) if Senser else None

        self._data_dir = data_dir
        tag_names = list(POS_TAGS.keys())
        tag_names.sort()
        if data_dir is None:
            tok_rules = {}
            prefix_re = None
            suffix_re = None
            infix_re = None
        else:
            tok_data_dir = path.join(data_dir, 'tokenizer')
            tok_rules, prefix_re, suffix_re, infix_re = read_lang_data(tok_data_dir)
            prefix_re = re.compile(prefix_re)
            suffix_re = re.compile(suffix_re)
            infix_re = re.compile(infix_re)

        self.tokenizer = Tokenizer(self.vocab, tok_rules, prefix_re,
                                   suffix_re, infix_re,
                                   POS_TAGS, tag_names)

        self.mwe_merger = RegexMerger([
            ('IN', 'O', regexes.MW_PREPOSITIONS_RE),
            ('CD', 'TIME', regexes.TIME_RE),
            ('NNP', 'DATE', regexes.DAYS_RE),
            ('CD', 'MONEY', regexes.MONEY_RE)])

    def __call__(self, text, tag=True, parse=parse_if_model_present,
                 entity=parse_if_model_present, merge_mwes=False):
        """Apply the pipeline to some text.  The text can span multiple sentences,
        and can contain arbtrary whitespace.  Alignment into the original string

        The tagger and parser are lazy-loaded the first time they are required.
        Loading the parser model usually takes 5-10 seconds.

        Args:
            text (unicode): The text to be processed.

        Keyword args:
            tag (bool): Whether to add part-of-speech tags to the text.  Also
                sets morphological analysis and lemmas.

            parse (True, False, -1): Whether to add labelled syntactic dependencies.

              -1 (default) is "guess": It will guess True if tag=True and the
                model has been installed.

        Returns:
            tokens (spacy.tokens.Tokens):

        >>> from spacy.en import English
        >>> nlp = English()
        >>> tokens = nlp('An example sentence. Another example sentence.')
        >>> tokens[0].orth_, tokens[0].head.tag_
        ('An', 'NN')
        """
        if parse == True and tag == False:
            msg = ("Incompatible arguments: tag=False, parse=True"
                   "Part-of-speech tags are required for parsing.")
            raise ValueError(msg)
        if entity == True and tag == False:
            msg = ("Incompatible arguments: tag=False, entity=True"
                   "Part-of-speech tags are required for entity recognition.")
            raise ValueError(msg)

        tokens = self.tokenizer(text)
        if parse == -1 and tag == False:
            parse = False
        elif parse == -1 and not self.has_parser_model:
            parse = False
        if entity == -1 and tag == False:
            entity = False
        elif entity == -1 and not self.has_entity_model:
            entity = False
        if tag and self.has_tagger_model:
            self.tagger(tokens)
        if parse == True and not self.has_parser_model:
            msg = ("Received parse=True, but parser model not found.\n\n"
                  "Run:\n"
                  "$ python -m spacy.en.download\n"
                  "To install the model.")
            raise IOError(msg)
        if entity == True and not self.has_entity_model:
            msg = ("Received entity=True, but entity model not found.\n\n"
                  "Run:\n"
                  "$ python -m spacy.en.download\n"
                  "To install the model.")
            raise IOError(msg)

        if parse and self.has_parser_model:
            self.parser(tokens)
        if entity and self.has_entity_model:
            self.entity(tokens)
        if merge_mwes and self.mwe_merger is not None:
            self.mwe_merger(tokens)
        return tokens

    @property
    def tags(self):
        """List of part-of-speech tag names."""
        return self.tagger.tag_names