mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-27 09:44:36 +03:00
34 lines
1.1 KiB
Python
34 lines
1.1 KiB
Python
from __future__ import unicode_literals
|
|
from os import path
|
|
|
|
from .. import orth
|
|
from ..vocab import Vocab
|
|
from ..tokenizer import Tokenizer
|
|
from ..syntax.parser import GreedyParser
|
|
from ..tokens import Tokens
|
|
from .pos import EnPosTagger
|
|
from .pos import POS_TAGS
|
|
from .attrs import get_flags
|
|
|
|
|
|
def get_lex_props(string):
    """Build the lexical property dict for `string`.

    The result has two entries: 'flags', holding whatever
    `get_flags(string)` returns, and 'dense', which is always 1.
    """
    props = {}
    props['flags'] = get_flags(string)
    props['dense'] = 1
    return props
|
|
|
|
|
|
class English(object):
    """Callable pipeline: tokenize text, then optionally tag and parse it.

    Attributes set by the constructor:
        vocab: `Vocab` built from the data directory, using `get_lex_props`.
        tokenizer: `Tokenizer` loaded from the data directory.
        tagger: `EnPosTagger` instance, or None when tagging was not loaded.
        parser: `GreedyParser` instance, or None when parsing was not loaded.
    """

    def __init__(self, data_dir=None, tag=True, parse=False):
        """Load the pipeline components.

        Args:
            data_dir: Directory holding the model data. When None, the
                'data' directory next to this module is used.
            tag: If true, construct the tagger; otherwise `self.tagger`
                is None.
            parse: If true, construct the parser from the 'deps'
                sub-directory of `data_dir`; otherwise `self.parser`
                is None.
        """
        if data_dir is None:
            data_dir = path.join(path.dirname(__file__), 'data')
        self.vocab = Vocab(data_dir=data_dir, get_lex_props=get_lex_props)
        self.tokenizer = Tokenizer.from_dir(self.vocab, data_dir)
        self.tagger = EnPosTagger(self.vocab.strings, data_dir) if tag else None
        self.parser = GreedyParser(path.join(data_dir, 'deps')) if parse else None

    def __call__(self, text, tag=True, parse=True):
        """Tokenize `text` and run the requested, loaded stages on it.

        Args:
            text: The string handed to the tokenizer.
            tag: If true and a tagger was loaded, call it on the tokens.
            parse: If true and a parser was loaded, call its `parse`
                method on the tokens.

        Returns:
            The object returned by `self.tokenizer.tokenize(text)`, after
            the tagger and parser (if any) have been applied to it.
        """
        tokens = self.tokenizer.tokenize(text)
        # Compare against the None sentinel set in __init__ rather than
        # relying on truthiness: a loaded component that happens to be
        # falsy (e.g. defines __len__ and is empty) must still run.
        if tag and self.tagger is not None:
            self.tagger(tokens)
        if parse and self.parser is not None:
            self.parser.parse(tokens)
        return tokens
|