2014-12-21 23:25:43 +03:00
|
|
|
from __future__ import unicode_literals
|
|
|
|
from os import path
|
2015-01-17 08:21:17 +03:00
|
|
|
import re
|
2014-12-21 23:25:43 +03:00
|
|
|
|
2015-01-12 02:26:22 +03:00
|
|
|
from .. import orth
|
2014-12-21 23:25:43 +03:00
|
|
|
from ..vocab import Vocab
|
|
|
|
from ..tokenizer import Tokenizer
|
|
|
|
from ..syntax.parser import GreedyParser
|
|
|
|
from ..tokens import Tokens
|
|
|
|
from .pos import EnPosTagger
|
2014-12-22 00:54:47 +03:00
|
|
|
from .pos import POS_TAGS
|
2014-12-21 23:25:43 +03:00
|
|
|
from .attrs import get_flags
|
|
|
|
|
|
|
|
|
2015-01-17 08:21:17 +03:00
|
|
|
from ..util import read_lang_data
|
|
|
|
|
|
|
|
|
2014-12-21 23:25:43 +03:00
|
|
|
def get_lex_props(string):
    """Build the default lexical properties for a word form.

    Args:
        string (unicode): The text of the word.

    Returns:
        dict: A mapping with the keys 'flags', 'length', 'orth', 'lower',
            'norm', 'shape', 'prefix', 'suffix', 'cluster', 'prob' and
            'sentiment'. 'flags' comes from get_flags() and 'shape' from
            orth.word_shape(); 'orth' and 'norm' are the string itself;
            'prefix' is its first character and 'suffix' its last three
            characters (or fewer, for short strings); 'cluster', 'prob'
            and 'sentiment' are initialised to 0.
    """
    return {
        'flags': get_flags(string),
        'length': len(string),
        'orth': string,
        'lower': string.lower(),
        'norm': string,
        'shape': orth.word_shape(string),
        # Slice rather than index: an empty string yields an empty prefix
        # instead of raising IndexError. Identical to string[0] otherwise.
        'prefix': string[:1],
        'suffix': string[-3:],
        'cluster': 0,
        'prob': 0,
        'sentiment': 0
    }
|
|
|
|
|
2014-12-21 23:25:43 +03:00
|
|
|
|
2015-01-12 02:26:22 +03:00
|
|
|
# Default pipeline data directory: a 'data' folder located next to this module.
LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data')
|
2014-12-21 23:25:43 +03:00
|
|
|
|
2015-01-14 16:33:16 +03:00
|
|
|
|
2014-12-21 23:25:43 +03:00
|
|
|
class English(object):
    """The English NLP pipeline.

    Bundles a lexicon, a tokenizer, a part-of-speech tagger and a dependency
    parser. The lexicon and the tokenizer are built by the constructor; the
    tagger and the parser are built the first time they are accessed.

    Keyword args:
        data_dir (unicode): A path to a directory from which to load the
            pipeline. Defaults to LOCAL_DATA_DIR, i.e.
            path.join(path.dirname(__file__), 'data').
            The lexicon is given path.join(data_dir, 'vocab'), tokenizer
            data is read from path.join(data_dir, 'tokenizer'), the parser
            is given path.join(data_dir, 'deps') and the tagger is given
            data_dir itself.
            If None, the lexicon gets no data directory and the tokenizer is
            built with an empty rule table and no prefix/suffix/infix
            patterns.
            NOTE(review): the parser property joins data_dir with 'deps'
            unconditionally, so it cannot be used when data_dir is None;
            whether the tagger accepts None is not visible here.

    Attributes:
        vocab (Vocab): The lexicon.

        tokenizer (Tokenizer): The start of the pipeline.

        tagger (EnPosTagger): The part-of-speech tagger, created lazily.

        parser (GreedyParser): The dependency parser, created lazily.

        tags: The tagger's tag names.
    """

    def __init__(self, data_dir=LOCAL_DATA_DIR):
        self._data_dir = data_dir

        # Truthiness test here, identity test further down: an empty-string
        # data_dir gives the vocab no directory, yet tokenizer data is still
        # read from the relative path 'tokenizer'.
        vocab_dir = None
        if data_dir:
            vocab_dir = path.join(data_dir, 'vocab')
        self.vocab = Vocab(data_dir=vocab_dir, get_lex_props=get_lex_props)

        tag_names = sorted(POS_TAGS.keys())

        if data_dir is not None:
            rules, prefix, suffix, infix = read_lang_data(
                path.join(data_dir, 'tokenizer'))
            # The values handed to re.compile() are replaced by the compiled
            # pattern objects before being passed to the tokenizer.
            prefix = re.compile(prefix)
            suffix = re.compile(suffix)
            infix = re.compile(infix)
        else:
            rules = {}
            prefix = suffix = infix = None

        self.tokenizer = Tokenizer(self.vocab, rules, prefix, suffix, infix,
                                   POS_TAGS, tag_names)

        # Created on demand by the `tagger` and `parser` properties.
        self._tagger = None
        self._parser = None

    @property
    def tagger(self):
        """The part-of-speech tagger, constructed on first access."""
        if self._tagger is not None:
            return self._tagger
        self._tagger = EnPosTagger(self.vocab.strings, self._data_dir)
        return self._tagger

    @property
    def parser(self):
        """The dependency parser, constructed on first access."""
        if self._parser is not None:
            return self._parser
        self._parser = GreedyParser(path.join(self._data_dir, 'deps'))
        return self._parser

    def __call__(self, text, tag=True, parse=True):
        """Apply the pipeline to some text.

        Args:
            text (unicode): The text to be processed.

        Keyword args:
            tag (bool): Whether to run the part-of-speech tagger over the
                tokens.

            parse (bool): Whether to run the dependency parser over the
                tokens.

        Returns:
            tokens: The object returned by the tokenizer. The return values
                of the tagger and the parser are discarded, so any
                annotation they add has to be made on this object.
        """
        doc = self.tokenizer(text)
        if tag:
            self.tagger(doc)
        if parse:
            self.parser(doc)
        return doc

    @property
    def tags(self):
        """List of part-of-speech tag names, as reported by the tagger."""
        return self.tagger.tag_names
|