mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
* Add support for units to English.__init__, by loading and applying regular expressions
This commit is contained in:
parent
0ea5af88b6
commit
b8d34531c4
|
@ -9,9 +9,12 @@ from ..syntax.parser import GreedyParser
|
||||||
from ..syntax.arc_eager import ArcEager
|
from ..syntax.arc_eager import ArcEager
|
||||||
from ..syntax.ner import BiluoPushDown
|
from ..syntax.ner import BiluoPushDown
|
||||||
from ..tokens import Tokens
|
from ..tokens import Tokens
|
||||||
|
from ..multi_words import RegexMerger
|
||||||
|
|
||||||
from .pos import EnPosTagger
|
from .pos import EnPosTagger
|
||||||
from .pos import POS_TAGS
|
from .pos import POS_TAGS
|
||||||
from .attrs import get_flags
|
from .attrs import get_flags
|
||||||
|
from . import regexes
|
||||||
|
|
||||||
|
|
||||||
from ..util import read_lang_data
|
from ..util import read_lang_data
|
||||||
|
@ -90,6 +93,11 @@ class English(object):
|
||||||
self.tokenizer = Tokenizer(self.vocab, tok_rules, prefix_re,
|
self.tokenizer = Tokenizer(self.vocab, tok_rules, prefix_re,
|
||||||
suffix_re, infix_re,
|
suffix_re, infix_re,
|
||||||
POS_TAGS, tag_names)
|
POS_TAGS, tag_names)
|
||||||
|
self.mwe_merger = RegexMerger([
|
||||||
|
('IN', 'O', regexes.MW_PREPOSITIONS_RE),
|
||||||
|
('CD', 'TIME', regexes.TIME_RE),
|
||||||
|
('NNP', 'DATE', regexes.DAYS_RE),
|
||||||
|
('CD', 'MONEY', regexes.MONEY_RE)])
|
||||||
# These are lazy-loaded
|
# These are lazy-loaded
|
||||||
self._tagger = None
|
self._tagger = None
|
||||||
self._parser = None
|
self._parser = None
|
||||||
|
@ -118,7 +126,7 @@ class English(object):
|
||||||
return self._entity
|
return self._entity
|
||||||
|
|
||||||
def __call__(self, text, tag=True, parse=parse_if_model_present,
|
def __call__(self, text, tag=True, parse=parse_if_model_present,
|
||||||
entity=parse_if_model_present):
|
entity=parse_if_model_present, merge_mwes=True):
|
||||||
"""Apply the pipeline to some text. The text can span multiple sentences,
|
"""Apply the pipeline to some text. The text can span multiple sentences,
|
||||||
and can contain arbtrary whitespace. Alignment into the original string
|
and can contain arbtrary whitespace. Alignment into the original string
|
||||||
|
|
||||||
|
@ -183,6 +191,8 @@ class English(object):
|
||||||
self.parser(tokens)
|
self.parser(tokens)
|
||||||
if entity and self.has_entity_model:
|
if entity and self.has_entity_model:
|
||||||
self.entity(tokens)
|
self.entity(tokens)
|
||||||
|
if merge_mwes and self.mwe_merger is not None:
|
||||||
|
self.mwe_merger(tokens)
|
||||||
return tokens
|
return tokens
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|
Loading…
Reference in New Issue
Block a user