spaCy/spacy/en/__init__.py

139 lines
4.1 KiB
Python
Raw Normal View History

2014-12-21 23:25:43 +03:00
from __future__ import unicode_literals
from os import path
import re
2014-12-21 23:25:43 +03:00
from .. import orth
2014-12-21 23:25:43 +03:00
from ..vocab import Vocab
from ..tokenizer import Tokenizer
from ..syntax.arc_eager import ArcEager
2015-03-09 02:04:00 +03:00
from ..syntax.ner import BiluoPushDown
from ..syntax.parser import ParserFactory
2015-07-08 19:53:00 +03:00
from ..tokens import Doc
from ..multi_words import RegexMerger
2014-12-21 23:25:43 +03:00
from .pos import EnPosTagger
from .pos import POS_TAGS
2014-12-21 23:25:43 +03:00
from .attrs import get_flags
from . import regexes
2014-12-21 23:25:43 +03:00
from ..util import read_lang_data
2014-12-21 23:25:43 +03:00
def get_lex_props(string):
    """Build the default lexical-property dict for a single word string.

    Args:
        string (unicode): The word form to describe.

    Returns:
        dict: The lexical attributes of ``string``: its flags (from
            ``get_flags``), length, verbatim form (``orth`` and ``norm``),
            lower-cased form, word shape, first character (``prefix``) and
            last three characters (``suffix``).  ``cluster``, ``prob`` and
            ``sentiment`` are always set to 0 here.
    """
    return {
        'flags': get_flags(string),
        'length': len(string),
        'orth': string,
        'lower': string.lower(),
        'norm': string,
        'shape': orth.word_shape(string),
        # Slice rather than index so an empty string yields '' instead of
        # raising IndexError (mirrors how 'suffix' already behaves).
        'prefix': string[:1],
        'suffix': string[-3:],
        'cluster': 0,
        'prob': 0,
        'sentiment': 0,
    }
# NOTE(review): this constant is not referenced by any other code visible in
# this module; presumably it is a sentinel meaning "enable a component only if
# its model data is present" -- confirm against callers before relying on it.
if_model_present = -1
2015-01-26 18:45:21 +03:00
2014-12-21 23:25:43 +03:00
class English(object):
    """The English NLP pipeline.

    Bundles a vocabulary, a tokenizer and the optional tagger, dependency
    parser and entity recogniser behind a single callable object.

    Example:

        Load data from default directory:

            >>> nlp = English()
            >>> nlp = English(data_dir=u'')

        Load data from specified directory:

            >>> nlp = English(data_dir=u'path/to/data_directory')

        Disable (and avoid loading) parts of the processing pipeline:

            >>> nlp = English(load_vectors=False, Parser=False, Tagger=False, Entity=False)

        Start with nothing loaded:

            >>> nlp = English(data_dir=None)

        NOTE(review): with ``data_dir=None`` the constructor still joins
        ``data_dir`` with sub-directory names, which raises; confirm the
        intended handling of this mode.
    """
    ParserTransitionSystem = ArcEager
    EntityTransitionSystem = BiluoPushDown

    def __init__(self,
                 data_dir=path.join(path.dirname(__file__), 'data'),
                 Tokenizer=Tokenizer.from_dir,
                 Tagger=EnPosTagger,
                 Parser=ParserFactory(ParserTransitionSystem),
                 Entity=ParserFactory(EntityTransitionSystem),
                 load_vectors=True
                 ):
        """Create the pipeline components.

        Args:
            data_dir (unicode or None): Root directory of the model data.
                Sub-directories ``vocab``, ``tokenizer``, ``deps`` and ``ner``
                are derived from it.
            Tokenizer (callable): Called as ``Tokenizer(vocab, directory)``
                to build the tokenizer.
            Tagger (callable, True or falsy): Called as
                ``Tagger(strings, data_dir)``.  ``True`` selects
                ``EnPosTagger``; a falsy value disables tagging.
            Parser (callable, True or falsy): Called as
                ``Parser(strings, directory)``.  ``True`` selects a parser
                built from ``self.ParserTransitionSystem``; a falsy value
                disables parsing.
            Entity (callable, True or falsy): Same contract as ``Parser``,
                using ``self.EntityTransitionSystem``.
            load_vectors (bool): Forwarded to ``Vocab``.
        """
        self._data_dir = data_dir
        vocab_dir = path.join(data_dir, 'vocab') if data_dir else None
        self.vocab = Vocab(data_dir=vocab_dir,
                           get_lex_props=get_lex_props,
                           load_vectors=load_vectors,
                           pos_tags=POS_TAGS)

        # ``True`` means "use the default implementation" for each component.
        if Tagger is True:
            Tagger = EnPosTagger
        # The module imports only ``ParserFactory`` (not a ``parser`` module),
        # so the factory is used here exactly as in the default arguments.
        # ``self.`` lookup lets subclasses override the transition systems.
        if Parser is True:
            Parser = ParserFactory(self.ParserTransitionSystem)
        if Entity is True:
            Entity = ParserFactory(self.EntityTransitionSystem)

        # NOTE(review): path.join raises when data_dir is None, so the
        # documented ``English(data_dir=None)`` mode fails at this line.
        self.tokenizer = Tokenizer(self.vocab, path.join(data_dir, 'tokenizer'))

        if Tagger:
            self.tagger = Tagger(self.vocab.strings, data_dir)
        else:
            self.tagger = None
        if Parser:
            self.parser = Parser(self.vocab.strings, path.join(data_dir, 'deps'))
        else:
            self.parser = None
        if Entity:
            self.entity = Entity(self.vocab.strings, path.join(data_dir, 'ner'))
        else:
            self.entity = None

        # Built here but not applied by __call__.
        self.mwe_merger = RegexMerger([
            ('IN', 'O', regexes.MW_PREPOSITIONS_RE),
            ('CD', 'TIME', regexes.TIME_RE),
            ('NNP', 'DATE', regexes.DAYS_RE),
            ('CD', 'MONEY', regexes.MONEY_RE)])

    def __call__(self, text, tag=True, parse=True, entity=True):
        """Apply the pipeline to some text.  The text can span multiple sentences,
        and can contain arbitrary whitespace.  Alignment into the original string
        is preserved.

        Args:
            text (unicode): The text to be processed.
            tag (bool): Run the tagger, if one is loaded.
            parse (bool): Run the dependency parser, if one is loaded.
            entity (bool): Run the entity recogniser, if one is loaded.

        Returns:
            tokens (spacy.tokens.Doc):

        >>> from spacy.en import English
        >>> nlp = English()
        >>> tokens = nlp('An example sentence. Another example sentence.')
        >>> tokens[0].orth_, tokens[0].head.tag_
        ('An', 'NN')
        """
        tokens = self.tokenizer(text)
        # Each stage runs only when it is both loaded and requested.
        if self.tagger and tag:
            self.tagger(tokens)
        if self.parser and parse:
            self.parser(tokens)
        if self.entity and entity:
            self.entity(tokens)
        return tokens

    @property
    def tags(self):
        """List of part-of-speech tag names.

        Read from the loaded tagger; raises AttributeError when the pipeline
        was created without one (``self.tagger`` is None).
        """
        return self.tagger.tag_names