spaCy/spacy/en/__init__.py

from __future__ import unicode_literals
from os import path
import re

from .. import orth
from ..vocab import Vocab
from ..tokenizer import Tokenizer
from ..syntax.parser import GreedyParser
from ..syntax.arc_eager import ArcEager
from ..syntax.ner import BiluoPushDown
from ..tokens import Tokens
from ..multi_words import RegexMerger

from .pos import EnPosTagger
from .pos import POS_TAGS
from .attrs import get_flags
from . import regexes


from ..util import read_lang_data


def get_lex_props(string):
    """Build the initial lexical properties for a word form.

    This function is handed to ``Vocab`` as its ``get_lex_props`` callback
    when the ``English`` pipeline is constructed.

    Args:
        string (unicode): The word form to describe.

    Returns:
        dict: The properties of the word form, keyed by attribute name.
            ``cluster``, ``prob`` and ``sentiment`` are initialised to 0.
    """
    return {
        'flags': get_flags(string),
        'length': len(string),
        'orth': string,
        'lower': string.lower(),
        'norm': string,
        'shape': orth.word_shape(string),
        # Slice instead of indexing so that an empty string yields an empty
        # prefix rather than raising IndexError (the suffix slice below is
        # already safe for short strings).
        'prefix': string[:1],
        'suffix': string[-3:],
        'cluster': 0,
        'prob': 0,
        'sentiment': 0
    }


# Default data location: a directory named "data" next to this module file.
LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data')

# Sentinel used as the default for the `parse` and `entity` arguments of
# English.__call__: run the component only if its model is present.
parse_if_model_present = -1


class English(object):
    """The English NLP pipeline.

    Provides a tokenizer, lexicon, part-of-speech tagger, parser and
    entity recogniser.

    Keyword args:
        data_dir (unicode):
            A path to a directory, from which to load the pipeline.

            By default, data is installed within the spaCy package directory. So
            if no data_dir is specified, spaCy attempts to load from a
            directory named "data" that is a sibling of the spacy/en/__init__.py
            file.  You can find the location of this file by running:

                $ python -c "import spacy.en; print(spacy.en.__file__)"

            To prevent any data files from being loaded, pass data_dir=None. This
            is useful if you want to construct a lexicon, which you'll then save
            for later loading.
    """
    ParserTransitionSystem = ArcEager
    EntityTransitionSystem = BiluoPushDown

    def __init__(self, data_dir=''):
        # The empty string means "use the packaged data directory"; None means
        # "load no data files at all".
        if data_dir == '':
            data_dir = LOCAL_DATA_DIR
        self._data_dir = data_dir
        self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab') if data_dir else None,
                           get_lex_props=get_lex_props)
        tag_names = sorted(POS_TAGS.keys())
        if data_dir is None:
            # No data: the tokenizer gets no special-case rules and no
            # affix patterns, and no statistical model is available.
            tok_rules = {}
            prefix_re = None
            suffix_re = None
            infix_re = None
            self.has_parser_model = False
            self.has_tagger_model = False
            self.has_entity_model = False
        else:
            tok_data_dir = path.join(data_dir, 'tokenizer')
            tok_rules, prefix_re, suffix_re, infix_re = read_lang_data(tok_data_dir)
            prefix_re = re.compile(prefix_re)
            suffix_re = re.compile(suffix_re)
            infix_re = re.compile(infix_re)
            # A model counts as installed when its sub-directory exists.
            self.has_parser_model = path.exists(path.join(self._data_dir, 'deps'))
            self.has_tagger_model = path.exists(path.join(self._data_dir, 'pos'))
            self.has_entity_model = path.exists(path.join(self._data_dir, 'ner'))

        self.tokenizer = Tokenizer(self.vocab, tok_rules, prefix_re,
                                   suffix_re, infix_re,
                                   POS_TAGS, tag_names)
        self.mwe_merger = RegexMerger([
            ('IN', 'O', regexes.MW_PREPOSITIONS_RE),
            ('CD', 'TIME', regexes.TIME_RE),
            ('NNP', 'DATE', regexes.DAYS_RE),
            ('CD', 'MONEY', regexes.MONEY_RE)])
        # These are lazy-loaded
        self._tagger = None
        self._parser = None
        self._entity = None

    @property
    def tagger(self):
        """The part-of-speech tagger, created on first access."""
        if self._tagger is None:
            self._tagger = EnPosTagger(self.vocab.strings, self._data_dir)
        return self._tagger

    @property
    def parser(self):
        """The dependency parser, created on first access.

        The model is read from the "deps" sub-directory of the data
        directory, so the pipeline must have been built with a data_dir.
        """
        if self._parser is None:
            self._parser = GreedyParser(self.vocab.strings,
                                        path.join(self._data_dir, 'deps'),
                                        self.ParserTransitionSystem)
        return self._parser

    @property
    def entity(self):
        """The entity recogniser, created on first access.

        The model is read from the "ner" sub-directory of the data
        directory, so the pipeline must have been built with a data_dir.
        """
        if self._entity is None:
            self._entity = GreedyParser(self.vocab.strings,
                                        path.join(self._data_dir, 'ner'),
                                        self.EntityTransitionSystem)
        return self._entity

    def __call__(self, text, tag=True, parse=parse_if_model_present,
                 entity=parse_if_model_present, merge_mwes=True):
        """Apply the pipeline to some text.  The text can span multiple sentences,
        and can contain arbitrary whitespace.

        The tagger, parser and entity recogniser are lazy-loaded the first
        time they are required.  Loading the parser model usually takes
        5-10 seconds.

        Args:
            text (unicode): The text to be processed.

        Keyword args:
            tag (bool): Whether to add part-of-speech tags to the text.  Also
                sets morphological analysis and lemmas.  Tagging is skipped
                when no tagger model is installed.

            parse (True, False, -1): Whether to add labelled syntactic dependencies.

              -1 (default) is "guess": It will guess True if tag=True and the
                model has been installed.

            entity (True, False, -1): Whether to run the entity recogniser.

              -1 (default) is "guess", resolved the same way as for parse.

            merge_mwes (bool): Whether to apply the regular-expression based
                multi-word merger to the tokens.

        Returns:
            tokens (spacy.tokens.Tokens):

        Raises:
            ValueError: If parse=True or entity=True is combined with tag=False.
            IOError: If parse=True or entity=True is requested but the
                corresponding model is not installed.

        >>> from spacy.en import English
        >>> nlp = English()
        >>> tokens = nlp('An example sentence. Another example sentence.')
        >>> tokens[0].orth_, tokens[0].head.tag_
        ('An', 'NN')
        """
        # NOTE: the `== True` / `== False` / `== -1` comparisons are deliberate.
        # parse and entity are tri-state, and the -1 sentinel is truthy, so
        # plain truthiness tests cannot tell "guess" apart from True.
        if parse == True and tag == False:
            msg = ("Incompatible arguments: tag=False, parse=True. "
                   "Part-of-speech tags are required for parsing.")
            raise ValueError(msg)
        if entity == True and tag == False:
            msg = ("Incompatible arguments: tag=False, entity=True. "
                   "Part-of-speech tags are required for entity recognition.")
            raise ValueError(msg)

        # Resolve the "guess" sentinel: leave it enabled only when tagging is
        # on and the relevant model is installed.
        if parse == -1 and (tag == False or not self.has_parser_model):
            parse = False
        if entity == -1 and (tag == False or not self.has_entity_model):
            entity = False

        # Fail before doing any work if an explicitly requested model is missing.
        if parse == True and not self.has_parser_model:
            msg = ("Received parse=True, but parser model not found.\n\n"
                   "Run:\n"
                   "$ python -m spacy.en.download\n"
                   "To install the model.")
            raise IOError(msg)
        if entity == True and not self.has_entity_model:
            msg = ("Received entity=True, but entity model not found.\n\n"
                   "Run:\n"
                   "$ python -m spacy.en.download\n"
                   "To install the model.")
            raise IOError(msg)

        tokens = self.tokenizer(text)
        if tag and self.has_tagger_model:
            self.tagger(tokens)
        if parse and self.has_parser_model:
            self.parser(tokens)
        if entity and self.has_entity_model:
            self.entity(tokens)
        if merge_mwes and self.mwe_merger is not None:
            self.mwe_merger(tokens)
        return tokens

    @property
    def tags(self):
        """List of part-of-speech tag names.

        Reads the names from the tagger, so accessing this property creates
        the tagger if it has not been loaded yet.
        """
        return self.tagger.tag_names
* Work on train 2014-12-21 23:25:43 +03:00			`from __future__ import unicode_literals`
			`from os import path`
* Work on word vectors, and other stuff 2015-01-17 08:21:17 +03:00			`import re`
* Work on train 2014-12-21 23:25:43 +03:00
* Tmp commit. Refactoring to create a Python Lexeme class. 2015-01-12 02:26:22 +03:00			`from .. import orth`
* Work on train 2014-12-21 23:25:43 +03:00			`from ..vocab import Vocab`
			`from ..tokenizer import Tokenizer`
			`from ..syntax.parser import GreedyParser`
* Specify parser transition system in language 2015-02-22 08:32:33 +03:00			`from ..syntax.arc_eager import ArcEager`
* Prepare English class for NER 2015-03-09 02:04:00 +03:00			`from ..syntax.ner import BiluoPushDown`
* Work on train 2014-12-21 23:25:43 +03:00			`from ..tokens import Tokens`
* Add support for units to English.__init__, by loading and applying regular expressions 2015-04-07 05:02:32 +03:00			`from ..multi_words import RegexMerger`

* Work on train 2014-12-21 23:25:43 +03:00			`from .pos import EnPosTagger`
* POS tagger training working after reorg 2014-12-22 00:54:47 +03:00			`from .pos import POS_TAGS`
* Work on train 2014-12-21 23:25:43 +03:00			`from .attrs import get_flags`
* Add support for units to English.__init__, by loading and applying regular expressions 2015-04-07 05:02:32 +03:00			`from . import regexes`
* Work on train 2014-12-21 23:25:43 +03:00

* Work on word vectors, and other stuff 2015-01-17 08:21:17 +03:00			`from ..util import read_lang_data`


* Work on train 2014-12-21 23:25:43 +03:00			`def get_lex_props(string):`
* Tests passing after refactor. API has obvious warts, particularly in Token and Lexeme 2015-01-14 16:33:16 +03:00			`return {`
			`'flags': get_flags(string),`
			`'length': len(string),`
* Rename sic to orth 2015-01-22 18:08:25 +03:00			`'orth': string,`
* Rename NORM1 and NORM2 attrs to lower and norm 2015-01-23 22:17:03 +03:00			`'lower': string.lower(),`
			`'norm': string,`
* Tests passing after refactor. API has obvious warts, particularly in Token and Lexeme 2015-01-14 16:33:16 +03:00			`'shape': orth.word_shape(string),`
			`'prefix': string[0],`
			`'suffix': string[-3:],`
			`'cluster': 0,`
			`'prob': 0,`
			`'sentiment': 0`
			`}`

* Work on train 2014-12-21 23:25:43 +03:00
* Tmp commit. Refactoring to create a Python Lexeme class. 2015-01-12 02:26:22 +03:00			`LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data')`
* Work on train 2014-12-21 23:25:43 +03:00
* Add docstring to English class 2015-01-26 18:45:21 +03:00			`parse_if_model_present = -1`

* Tests passing after refactor. API has obvious warts, particularly in Token and Lexeme 2015-01-14 16:33:16 +03:00
* Work on train 2014-12-21 23:25:43 +03:00			`class English(object):`
* Upd docstrings 2014-12-27 10:45:16 +03:00			`"""The English NLP pipeline.`

			`Provides a tokenizer, lexicon, part-of-speech tagger and parser.`

			`Keyword args:`
* Improve docstring on English 2015-02-11 23:13:20 +03:00			`data_dir (unicode):`
			`A path to a directory, from which to load the pipeline.`
* Upd docstrings 2014-12-27 10:45:16 +03:00
* Improve docstring on English 2015-02-11 23:13:20 +03:00			`By default, data is installed within the spaCy package directory. So`
			`if no data_dir is specified, spaCy attempts to load from a`
			`directory named "data" that is a sibling of the spacy/en/__init__.py`
			`file. You can find the location of this file by running:`
* Upd docstrings 2014-12-27 10:45:16 +03:00
* Improve docstring on English 2015-02-11 23:13:20 +03:00			`$ python -c "import spacy.en; print spacy.en.__file__"`
* Fix default model path for English 2015-01-31 08:38:27 +03:00
			`To prevent any data files from being loaded, pass data_dir=None. This`
			`is useful if you want to construct a lexicon, which you'll then save`
			`for later loading.`
* Upd docstrings 2014-12-27 10:45:16 +03:00			`"""`
* Specify parser transition system in language 2015-02-22 08:32:33 +03:00			`ParserTransitionSystem = ArcEager`
* Prepare English class for NER 2015-03-09 02:04:00 +03:00			`EntityTransitionSystem = BiluoPushDown`
* Specify parser transition system in language 2015-02-22 08:32:33 +03:00
* Fix default model path for English 2015-01-31 08:38:27 +03:00			`def __init__(self, data_dir=''):`
			`if data_dir == '':`
			`data_dir = LOCAL_DATA_DIR`
* Lazy-load tagger and parser 2014-12-30 15:25:09 +03:00			`self._data_dir = data_dir`
* Tmp. Working on refactor. Compiles, must hook up lexical feats. 2015-01-13 16:03:48 +03:00			`self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab') if data_dir else None,`
* Move around data files for test release 2015-01-02 17:59:22 +03:00			`get_lex_props=get_lex_props)`
* Lazy-load tagger and parser 2014-12-30 15:25:09 +03:00			`tag_names = list(POS_TAGS.keys())`
			`tag_names.sort()`
* Tmp commit. Refactoring to create a Python Lexeme class. 2015-01-12 02:26:22 +03:00			`if data_dir is None:`
* Work on word vectors, and other stuff 2015-01-17 08:21:17 +03:00			`tok_rules = {}`
			`prefix_re = None`
			`suffix_re = None`
			`infix_re = None`
* Fix default model path for English 2015-01-31 08:38:27 +03:00			`self.has_parser_model = False`
			`self.has_tagger_model = False`
* Prepare English class for NER 2015-03-09 02:04:00 +03:00			`self.has_entity_model = False`
* Tmp commit. Refactoring to create a Python Lexeme class. 2015-01-12 02:26:22 +03:00			`else:`
* Work on word vectors, and other stuff 2015-01-17 08:21:17 +03:00			`tok_data_dir = path.join(data_dir, 'tokenizer')`
			`tok_rules, prefix_re, suffix_re, infix_re = read_lang_data(tok_data_dir)`
* Fix data_dir=None argument to English class 2015-01-21 10:27:31 +03:00			`prefix_re = re.compile(prefix_re)`
			`suffix_re = re.compile(suffix_re)`
			`infix_re = re.compile(infix_re)`
* Fix default model path for English 2015-01-31 08:38:27 +03:00			`self.has_parser_model = path.exists(path.join(self._data_dir, 'deps'))`
			`self.has_tagger_model = path.exists(path.join(self._data_dir, 'pos'))`
* Prepare English class for NER 2015-03-09 02:04:00 +03:00			`self.has_entity_model = path.exists(path.join(self._data_dir, 'ner'))`
* Fix default model path for English 2015-01-31 08:38:27 +03:00
* Fix data_dir=None argument to English class 2015-01-21 10:27:31 +03:00			`self.tokenizer = Tokenizer(self.vocab, tok_rules, prefix_re,`
			`suffix_re, infix_re,`
* Work on word vectors, and other stuff 2015-01-17 08:21:17 +03:00			`POS_TAGS, tag_names)`
* Add support for units to English.__init__, by loading and applying regular expressions 2015-04-07 05:02:32 +03:00			`self.mwe_merger = RegexMerger([`
			`('IN', 'O', regexes.MW_PREPOSITIONS_RE),`
			`('CD', 'TIME', regexes.TIME_RE),`
			`('NNP', 'DATE', regexes.DAYS_RE),`
			`('CD', 'MONEY', regexes.MONEY_RE)])`
* Fix default model path for English 2015-01-31 08:38:27 +03:00			`# These are lazy-loaded`
* Lazy-load tagger and parser 2014-12-30 15:25:09 +03:00			`self._tagger = None`
			`self._parser = None`
* Prepare English class for NER 2015-03-09 02:04:00 +03:00			`self._entity = None`
* Lazy-load tagger and parser 2014-12-30 15:25:09 +03:00
			`@property`
			`def tagger(self):`
			`if self._tagger is None:`
			`self._tagger = EnPosTagger(self.vocab.strings, self._data_dir)`
			`return self._tagger`

			`@property`
			`def parser(self):`
			`if self._parser is None:`
* Clean up handling of dep_strings and ent_strings, using StringStore to encode the label names. 2015-03-14 18:10:27 +03:00			`self._parser = GreedyParser(self.vocab.strings,`
			`path.join(self._data_dir, 'deps'),`
* Specify parser transition system in language 2015-02-22 08:32:33 +03:00			`self.ParserTransitionSystem)`
* Lazy-load tagger and parser 2014-12-30 15:25:09 +03:00			`return self._parser`
* Work on train 2014-12-21 23:25:43 +03:00
* Prepare English class for NER 2015-03-09 02:04:00 +03:00			`@property`
			`def entity(self):`
			`if self._entity is None:`
* Clean up handling of dep_strings and ent_strings, using StringStore to encode the label names. 2015-03-14 18:10:27 +03:00			`self._entity = GreedyParser(self.vocab.strings,`
			`path.join(self._data_dir, 'ner'),`
* Prepare English class for NER 2015-03-09 02:04:00 +03:00			`self.EntityTransitionSystem)`
			`return self._entity`

			`def __call__(self, text, tag=True, parse=parse_if_model_present,`
* Add support for units to English.__init__, by loading and applying regular expressions 2015-04-07 05:02:32 +03:00			`entity=parse_if_model_present, merge_mwes=True):`
* Add docstring to English class 2015-01-26 18:45:21 +03:00			`"""Apply the pipeline to some text. The text can span multiple sentences,`
			`and can contain arbtrary whitespace. Alignment into the original string`

			`The tagger and parser are lazy-loaded the first time they are required.`
			`Loading the parser model usually takes 5-10 seconds.`
* Upd docstrings 2014-12-27 10:45:16 +03:00
			`Args:`
			`text (unicode): The text to be processed.`

			`Keyword args:`
* Add docstring to English class 2015-01-26 18:45:21 +03:00			`tag (bool): Whether to add part-of-speech tags to the text. Also`
			`sets morphological analysis and lemmas.`

			`parse (True, False, -1): Whether to add labelled syntactic dependencies.`

			`-1 (default) is "guess": It will guess True if tag=True and the`
			`model has been installed.`
* Upd docstrings 2014-12-27 10:45:16 +03:00
			`Returns:`
			`tokens (spacy.tokens.Tokens):`
* Add docstring to English class 2015-01-26 18:45:21 +03:00
			`>>> from spacy.en import English`
			`>>> nlp = English()`
			`>>> tokens = nlp('An example sentence. Another example sentence.')`
			`>>> tokens[0].orth_, tokens[0].head.tag_`
			`('An', 'NN')`
* Upd docstrings 2014-12-27 10:45:16 +03:00			`"""`
* Add docstring to English class 2015-01-26 18:45:21 +03:00			`if parse == True and tag == False:`
			`msg = ("Incompatible arguments: tag=False, parse=True"`
			`"Part-of-speech tags are required for parsing.")`
			`raise ValueError(msg)`
* Prepare English class for NER 2015-03-09 02:04:00 +03:00			`if entity == True and tag == False:`
			`msg = ("Incompatible arguments: tag=False, entity=True"`
			`"Part-of-speech tags are required for entity recognition.")`
			`raise ValueError(msg)`

* Work on word vectors, and other stuff 2015-01-17 08:21:17 +03:00			`tokens = self.tokenizer(text)`
* Add docstring to English class 2015-01-26 18:45:21 +03:00			`if parse == -1 and tag == False:`
			`parse = False`
			`elif parse == -1 and not self.has_parser_model:`
			`parse = False`
* Prepare English class for NER 2015-03-09 02:04:00 +03:00			`if entity == -1 and tag == False:`
			`entity = False`
			`elif entity == -1 and not self.has_entity_model:`
			`entity = False`
* Add docstring to English class 2015-01-26 18:45:21 +03:00			`if tag and self.has_tagger_model:`
* Tests passing except for morphology/lemmatization stuff 2014-12-23 03:40:32 +03:00			`self.tagger(tokens)`
* Add docstring to English class 2015-01-26 18:45:21 +03:00			`if parse == True and not self.has_parser_model:`
* Prepare English class for NER 2015-03-09 02:04:00 +03:00			`msg = ("Received parse=True, but parser model not found.\n\n"`
			`"Run:\n"`
			`"$ python -m spacy.en.download\n"`
			`"To install the model.")`
			`raise IOError(msg)`
			`if entity == True and not self.has_entity_model:`
			`msg = ("Received entity=True, but entity model not found.\n\n"`
* Add docstring to English class 2015-01-26 18:45:21 +03:00			`"Run:\n"`
			`"$ python -m spacy.en.download\n"`
			`"To install the model.")`
			`raise IOError(msg)`
* Prepare English class for NER 2015-03-09 02:04:00 +03:00
* Silently don't parse if data is not present 2015-01-25 06:47:38 +03:00			`if parse and self.has_parser_model:`
* Work on word vectors, and other stuff 2015-01-17 08:21:17 +03:00			`self.parser(tokens)`
* Prepare English class for NER 2015-03-09 02:04:00 +03:00			`if entity and self.has_entity_model:`
			`self.entity(tokens)`
* Add support for units to English.__init__, by loading and applying regular expressions 2015-04-07 05:02:32 +03:00			`if merge_mwes and self.mwe_merger is not None:`
			`self.mwe_merger(tokens)`
* Work on train 2014-12-21 23:25:43 +03:00			`return tokens`
* Tmp 2014-12-24 09:42:00 +03:00
			`@property`
			`def tags(self):`
* Upd docstrings 2014-12-27 10:45:16 +03:00			`"""List of part-of-speech tag names."""`
* Refactor _ml.Model, and finish implementing HastyModel so far not worthwhile. 2014-12-31 11:40:59 +03:00			`return self.tagger.tag_names`