* Add support for units to English.__init__, by loading and applying regular expressions

This commit is contained in:
Matthew Honnibal 2015-04-07 04:02:32 +02:00
parent 0ea5af88b6
commit b8d34531c4

View File

@ -9,9 +9,12 @@ from ..syntax.parser import GreedyParser
from ..syntax.arc_eager import ArcEager from ..syntax.arc_eager import ArcEager
from ..syntax.ner import BiluoPushDown from ..syntax.ner import BiluoPushDown
from ..tokens import Tokens from ..tokens import Tokens
from ..multi_words import RegexMerger
from .pos import EnPosTagger from .pos import EnPosTagger
from .pos import POS_TAGS from .pos import POS_TAGS
from .attrs import get_flags from .attrs import get_flags
from . import regexes
from ..util import read_lang_data from ..util import read_lang_data
@ -90,6 +93,11 @@ class English(object):
self.tokenizer = Tokenizer(self.vocab, tok_rules, prefix_re, self.tokenizer = Tokenizer(self.vocab, tok_rules, prefix_re,
suffix_re, infix_re, suffix_re, infix_re,
POS_TAGS, tag_names) POS_TAGS, tag_names)
self.mwe_merger = RegexMerger([
('IN', 'O', regexes.MW_PREPOSITIONS_RE),
('CD', 'TIME', regexes.TIME_RE),
('NNP', 'DATE', regexes.DAYS_RE),
('CD', 'MONEY', regexes.MONEY_RE)])
# These are lazy-loaded # These are lazy-loaded
self._tagger = None self._tagger = None
self._parser = None self._parser = None
@ -118,7 +126,7 @@ class English(object):
return self._entity return self._entity
def __call__(self, text, tag=True, parse=parse_if_model_present, def __call__(self, text, tag=True, parse=parse_if_model_present,
entity=parse_if_model_present): entity=parse_if_model_present, merge_mwes=True):
"""Apply the pipeline to some text. The text can span multiple sentences, """Apply the pipeline to some text. The text can span multiple sentences,
and can contain arbitrary whitespace. Alignment into the original string and can contain arbitrary whitespace. Alignment into the original string
@ -183,6 +191,8 @@ class English(object):
self.parser(tokens) self.parser(tokens)
if entity and self.has_entity_model: if entity and self.has_entity_model:
self.entity(tokens) self.entity(tokens)
if merge_mwes and self.mwe_merger is not None:
self.mwe_merger(tokens)
return tokens return tokens
@property @property