* Prepare English class for NER

This commit is contained in:
Matthew Honnibal 2015-03-08 19:04:00 -04:00
parent f5830dc1c1
commit 220ce8bfed

View File

@ -7,6 +7,7 @@ from ..vocab import Vocab
from ..tokenizer import Tokenizer from ..tokenizer import Tokenizer
from ..syntax.parser import GreedyParser from ..syntax.parser import GreedyParser
from ..syntax.arc_eager import ArcEager from ..syntax.arc_eager import ArcEager
from ..syntax.ner import BiluoPushDown
from ..tokens import Tokens from ..tokens import Tokens
from .pos import EnPosTagger from .pos import EnPosTagger
from .pos import POS_TAGS from .pos import POS_TAGS
@ -58,6 +59,7 @@ class English(object):
for later loading. for later loading.
""" """
ParserTransitionSystem = ArcEager ParserTransitionSystem = ArcEager
EntityTransitionSystem = BiluoPushDown
def __init__(self, data_dir=''): def __init__(self, data_dir=''):
if data_dir == '': if data_dir == '':
@ -74,6 +76,7 @@ class English(object):
infix_re = None infix_re = None
self.has_parser_model = False self.has_parser_model = False
self.has_tagger_model = False self.has_tagger_model = False
self.has_entity_model = False
else: else:
tok_data_dir = path.join(data_dir, 'tokenizer') tok_data_dir = path.join(data_dir, 'tokenizer')
tok_rules, prefix_re, suffix_re, infix_re = read_lang_data(tok_data_dir) tok_rules, prefix_re, suffix_re, infix_re = read_lang_data(tok_data_dir)
@ -82,6 +85,7 @@ class English(object):
infix_re = re.compile(infix_re) infix_re = re.compile(infix_re)
self.has_parser_model = path.exists(path.join(self._data_dir, 'deps')) self.has_parser_model = path.exists(path.join(self._data_dir, 'deps'))
self.has_tagger_model = path.exists(path.join(self._data_dir, 'pos')) self.has_tagger_model = path.exists(path.join(self._data_dir, 'pos'))
self.has_entity_model = path.exists(path.join(self._data_dir, 'ner'))
self.tokenizer = Tokenizer(self.vocab, tok_rules, prefix_re, self.tokenizer = Tokenizer(self.vocab, tok_rules, prefix_re,
suffix_re, infix_re, suffix_re, infix_re,
@ -89,6 +93,7 @@ class English(object):
# These are lazy-loaded # These are lazy-loaded
self._tagger = None self._tagger = None
self._parser = None self._parser = None
self._entity = None
@property @property
def tagger(self): def tagger(self):
@ -103,7 +108,15 @@ class English(object):
self.ParserTransitionSystem) self.ParserTransitionSystem)
return self._parser return self._parser
def __call__(self, text, tag=True, parse=parse_if_model_present): @property
def entity(self):
    """Return the entity recogniser, building it on first access.

    The recogniser is a GreedyParser constructed from the 'ner'
    sub-directory of the data directory together with the class's
    EntityTransitionSystem.  Once built it is stored on ``self._entity``
    so that subsequent accesses reuse the same object.
    """
    # Fast path: the model has already been constructed and cached.
    if self._entity is not None:
        return self._entity
    self._entity = GreedyParser(
        path.join(self._data_dir, 'ner'),
        self.EntityTransitionSystem,
    )
    return self._entity
def __call__(self, text, tag=True, parse=parse_if_model_present,
entity=parse_if_model_present):
"""Apply the pipeline to some text. The text can span multiple sentences, """Apply the pipeline to some text. The text can span multiple sentences,
and can contain arbitrary whitespace. Alignment into the original string and can contain arbitrary whitespace. Alignment into the original string
@ -135,21 +148,39 @@ class English(object):
msg = ("Incompatible arguments: tag=False, parse=True" msg = ("Incompatible arguments: tag=False, parse=True"
"Part-of-speech tags are required for parsing.") "Part-of-speech tags are required for parsing.")
raise ValueError(msg) raise ValueError(msg)
if entity == True and tag == False:
msg = ("Incompatible arguments: tag=False, entity=True"
"Part-of-speech tags are required for entity recognition.")
raise ValueError(msg)
tokens = self.tokenizer(text) tokens = self.tokenizer(text)
if parse == -1 and tag == False: if parse == -1 and tag == False:
parse = False parse = False
elif parse == -1 and not self.has_parser_model: elif parse == -1 and not self.has_parser_model:
parse = False parse = False
if entity == -1 and tag == False:
entity = False
elif entity == -1 and not self.has_entity_model:
entity = False
if tag and self.has_tagger_model: if tag and self.has_tagger_model:
self.tagger(tokens) self.tagger(tokens)
if parse == True and not self.has_parser_model: if parse == True and not self.has_parser_model:
msg = ("Receive parse=True, but parser model not found.\n\n" msg = ("Received parse=True, but parser model not found.\n\n"
"Run:\n" "Run:\n"
"$ python -m spacy.en.download\n" "$ python -m spacy.en.download\n"
"To install the model.") "To install the model.")
raise IOError(msg) raise IOError(msg)
if entity == True and not self.has_entity_model:
msg = ("Received entity=True, but entity model not found.\n\n"
"Run:\n"
"$ python -m spacy.en.download\n"
"To install the model.")
raise IOError(msg)
if parse and self.has_parser_model: if parse and self.has_parser_model:
self.parser(tokens) self.parser(tokens)
if entity and self.has_entity_model:
self.entity(tokens)
return tokens return tokens
@property @property