mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
* Refactor __init__ for simplicity. Allow parse=True, tag=True etc flags to be passed at top-level. Do not lazy-load parser.
This commit is contained in:
parent
4d24d513ad
commit
4e4fac452b
|
@ -5,9 +5,10 @@ import re
|
|||
from .. import orth
|
||||
from ..vocab import Vocab
|
||||
from ..tokenizer import Tokenizer
|
||||
from ..syntax import parser
|
||||
from ..syntax.arc_eager import ArcEager
|
||||
from ..syntax.ner import BiluoPushDown
|
||||
from ..syntax.parser import ParserFactory
|
||||
|
||||
from ..tokens import Tokens
|
||||
from ..multi_words import RegexMerger
|
||||
|
||||
|
@ -36,10 +37,7 @@ def get_lex_props(string):
|
|||
'sentiment': 0
|
||||
}
|
||||
|
||||
|
||||
LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data')
|
||||
|
||||
parse_if_model_present = -1
|
||||
if_model_present = -1
|
||||
|
||||
|
||||
class English(object):
|
||||
|
@ -63,45 +61,23 @@ class English(object):
|
|||
Start with nothing loaded:
|
||||
|
||||
>>> nlp = English(data_dir=None)
|
||||
|
||||
Keyword args:
|
||||
data_dir (unicode):
|
||||
A path to a directory from which to load the pipeline;
|
||||
or '', to load default; or None, to load nothing.
|
||||
|
||||
Tokenizer (bool or callable):
|
||||
desc
|
||||
|
||||
Vectors (bool or callable):
|
||||
desc
|
||||
|
||||
Parser (bool or callable):
|
||||
desc
|
||||
|
||||
Tagger (bool or callable):
|
||||
desc
|
||||
|
||||
Entity (bool or callable):
|
||||
desc
|
||||
|
||||
Senser (bool or callable):
|
||||
desc
|
||||
"""
|
||||
ParserTransitionSystem = ArcEager
|
||||
EntityTransitionSystem = BiluoPushDown
|
||||
|
||||
def __init__(self, data_dir='', Tokenizer=Tokenizer.from_dir, Vectors=True,
|
||||
Parser=True, Tagger=EnPosTagger, Entity=True, load_vectors=True):
|
||||
if data_dir == '':
|
||||
data_dir = LOCAL_DATA_DIR
|
||||
def __init__(self,
|
||||
data_dir=path.join(path.dirname(__file__), 'data'),
|
||||
Tokenizer=Tokenizer.from_dir,
|
||||
Tagger=EnPosTagger,
|
||||
Parser=ParserFactory(ParserTransitionSystem),
|
||||
Entity=ParserFactory(EntityTransitionSystem),
|
||||
load_vectors=True
|
||||
):
|
||||
|
||||
self._data_dir = data_dir
|
||||
|
||||
# TODO: Deprecation warning
|
||||
if load_vectors is False:
|
||||
vectors = False
|
||||
|
||||
self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab') if data_dir else None,
|
||||
get_lex_props=get_lex_props, load_vectors=Vectors,
|
||||
get_lex_props=get_lex_props, load_vectors=load_vectors,
|
||||
pos_tags=POS_TAGS)
|
||||
if Tagger is True:
|
||||
Tagger = EnPosTagger
|
||||
|
@ -112,10 +88,8 @@ class English(object):
|
|||
transition_system = self.EntityTransitionSystem
|
||||
Entity = lambda s, d: parser.Parser(s, d, transition_system)
|
||||
|
||||
if Tokenizer:
|
||||
self.tokenizer = Tokenizer(self.vocab, path.join(data_dir, 'tokenizer'))
|
||||
else:
|
||||
self.tokenizer = None
|
||||
self.tokenizer = Tokenizer(self.vocab, path.join(data_dir, 'tokenizer'))
|
||||
|
||||
if Tagger:
|
||||
self.tagger = Tagger(self.vocab.strings, data_dir)
|
||||
else:
|
||||
|
@ -128,33 +102,20 @@ class English(object):
|
|||
self.entity = Entity(self.vocab.strings, path.join(data_dir, 'ner'))
|
||||
else:
|
||||
self.entity = None
|
||||
|
||||
self.mwe_merger = RegexMerger([
|
||||
('IN', 'O', regexes.MW_PREPOSITIONS_RE),
|
||||
('CD', 'TIME', regexes.TIME_RE),
|
||||
('NNP', 'DATE', regexes.DAYS_RE),
|
||||
('CD', 'MONEY', regexes.MONEY_RE)])
|
||||
|
||||
def __call__(self, text, tag=True, parse=parse_if_model_present,
|
||||
entity=parse_if_model_present, merge_mwes=False):
|
||||
def __call__(self, text, tag=True, parse=True, entity=True):
|
||||
"""Apply the pipeline to some text. The text can span multiple sentences,
|
||||
and can contain arbitrary whitespace. Alignment into the original string
|
||||
|
||||
The tagger and parser are lazy-loaded the first time they are required.
|
||||
Loading the parser model usually takes 5-10 seconds.
|
||||
|
||||
is preserved.
|
||||
|
||||
Args:
|
||||
text (unicode): The text to be processed.
|
||||
|
||||
Keyword args:
|
||||
tag (bool): Whether to add part-of-speech tags to the text. Also
|
||||
sets morphological analysis and lemmas.
|
||||
|
||||
parse (True, False, -1): Whether to add labelled syntactic dependencies.
|
||||
|
||||
-1 (default) is "guess": It will guess True if tag=True and the
|
||||
model has been installed.
|
||||
|
||||
Returns:
|
||||
tokens (spacy.tokens.Tokens):
|
||||
|
||||
|
@ -164,36 +125,13 @@ class English(object):
|
|||
>>> tokens[0].orth_, tokens[0].head.tag_
|
||||
('An', 'NN')
|
||||
"""
|
||||
if parse == True and tag == False:
|
||||
msg = ("Incompatible arguments: tag=False, parse=True"
|
||||
"Part-of-speech tags are required for parsing.")
|
||||
raise ValueError(msg)
|
||||
if entity == True and tag == False:
|
||||
msg = ("Incompatible arguments: tag=False, entity=True"
|
||||
"Part-of-speech tags are required for entity recognition.")
|
||||
raise ValueError(msg)
|
||||
|
||||
tokens = self.tokenizer(text)
|
||||
if parse == -1 and tag == False:
|
||||
parse = False
|
||||
elif parse == -1 and self.parser is None:
|
||||
parse = False
|
||||
if entity == -1 and tag == False:
|
||||
entity = False
|
||||
elif entity == -1 and self.entity is None:
|
||||
entity = False
|
||||
if tag:
|
||||
ModelNotLoaded.check(self.tagger, 'tagger')
|
||||
if self.tagger and tag:
|
||||
self.tagger(tokens)
|
||||
if parse:
|
||||
ModelNotLoaded.check(self.parser, 'parser')
|
||||
if self.parser and parse:
|
||||
self.parser(tokens)
|
||||
if entity:
|
||||
ModelNotLoaded.check(self.entity, 'entity')
|
||||
if self.entity and entity:
|
||||
self.entity(tokens)
|
||||
|
||||
if merge_mwes and self.mwe_merger is not None:
|
||||
self.mwe_merger(tokens)
|
||||
return tokens
|
||||
|
||||
@property
|
||||
|
|
Loading…
Reference in New Issue
Block a user