# spaCy/spacy/en/__init__.py
#
# 34 lines, 1.1 KiB, Python
# (Source-viewer chrome: Raw / Normal View / History)
#
# Blame timestamp: 2014-12-21 23:25:43 +03:00
from __future__ import unicode_literals
from os import path
from .. import orth
from ..vocab import Vocab
from ..tokenizer import Tokenizer
from ..syntax.parser import GreedyParser
from ..tokens import Tokens
from .pos import EnPosTagger
from .pos import POS_TAGS
# Blame timestamp: 2014-12-21 23:25:43 +03:00
from .attrs import get_flags
def get_lex_props(string):
    """Build the lexical-property dict for a single string.

    Passed to ``Vocab`` as its ``get_lex_props`` callback.

    Args:
        string: The string to compute properties for.

    Returns:
        A dict with two keys: ``'flags'``, holding whatever
        ``get_flags(string)`` returns, and ``'dense'``, always ``1``.
    """
    return {'flags': get_flags(string), 'dense': 1}
class English(object):
    """English pipeline: a tokenizer plus an optional tagger and parser.

    Instances are callable: ``English()(text)`` tokenizes ``text`` and then
    runs whichever of the tagger / parser were loaded and requested.

    Attributes:
        vocab: ``Vocab`` built from ``data_dir`` with ``get_lex_props``.
        tokenizer: ``Tokenizer`` loaded from ``data_dir``.
        tagger: ``EnPosTagger`` instance, or ``None`` when ``tag`` was false.
        parser: ``GreedyParser`` instance, or ``None`` when ``parse`` was
            false.
    """

    def __init__(self, data_dir=None, tag=True, parse=False):
        """Load the vocabulary, tokenizer and optional components.

        Args:
            data_dir: Directory holding the model data. When ``None``, the
                ``data`` directory next to this module is used.
            tag: If true, load the POS tagger; otherwise ``self.tagger`` is
                ``None``.
            parse: If true, load the parser from ``<data_dir>/deps``;
                otherwise ``self.parser`` is ``None``.
        """
        if data_dir is None:
            # Fall back to the 'data' directory that sits beside this file.
            data_dir = path.join(path.dirname(__file__), 'data')

        self.vocab = Vocab(data_dir=data_dir, get_lex_props=get_lex_props)
        self.tokenizer = Tokenizer.from_dir(self.vocab, data_dir)
        self.tagger = EnPosTagger(self.vocab.strings, data_dir) if tag else None
        self.parser = GreedyParser(path.join(data_dir, 'deps')) if parse else None

    def __call__(self, text, tag=True, parse=True):
        """Tokenize ``text`` and apply the loaded components to the result.

        Args:
            text: The string to process.
            tag: If true and a tagger is loaded, call the tagger on the
                tokens.
            parse: If true and a parser is loaded, call the parser on the
                tokens.

        Returns:
            The object returned by ``self.tokenizer.tokenize(text)``. The
            return values of the tagger and parser are discarded, so they
            presumably annotate the tokens in place (confirm against their
            implementations).
        """
        tokens = self.tokenizer.tokenize(text)
        # A component runs only if it was loaded at construction time AND
        # the caller asked for it on this call.
        if self.tagger and tag:
            self.tagger(tokens)
        if self.parser and parse:
            self.parser.parse(tokens)
        return tokens