Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-13 10:46:29 +03:00)
Commit f5d41028b5 (parent: 47c71ce1c0)
Move around data files for test release
@@ -11,6 +11,9 @@ from .pos import POS_TAGS
 from .attrs import get_flags
 
 
+DATA_DIR = path.join(path.dirname(__file__), 'data')
+
+
 def get_lex_props(string):
     return {'flags': get_flags(string), 'dense': 1}
 
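The added DATA_DIR constant just resolves a 'data' directory sitting next to the module itself. A minimal standalone sketch of that resolution, outside of spaCy (run it as a script; __file__ here stands in for spaCy's own module file):

from os import path

# Resolve a 'data' directory next to the current source file, the same way
# the DATA_DIR constant added above does for the spaCy package.
DATA_DIR = path.join(path.dirname(__file__), 'data')

if __name__ == '__main__':
    print(DATA_DIR)
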
@@ -46,10 +49,12 @@ class English(object):
         if data_dir is None:
             data_dir = path.join(path.dirname(__file__), 'data')
         self._data_dir = data_dir
-        self.vocab = Vocab(data_dir=data_dir, get_lex_props=get_lex_props)
+        self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab'),
+                           get_lex_props=get_lex_props)
         tag_names = list(POS_TAGS.keys())
         tag_names.sort()
-        self.tokenizer = Tokenizer.from_dir(self.vocab, data_dir, POS_TAGS, tag_names)
+        self.tokenizer = Tokenizer.from_dir(self.vocab, path.join(data_dir, 'tokenizer'),
+                                            POS_TAGS, tag_names)
         self.strings = self.vocab.strings
         self._tagger = None
         self._parser = None
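Taken together with the hunks below, the English constructor now expects per-component sub-directories under data_dir rather than one flat directory. The following standalone sketch lays out the layout this implies; it is inferred from the paths in this diff (and assumes read_lang_data is handed the same tokenizer directory that Tokenizer.from_dir now receives), not from a packaging manifest:

from os import path

# Illustrative root; by default this is the package-relative DATA_DIR above.
data_dir = path.join('spacy', 'en', 'data')

expected = [
    path.join(data_dir, 'vocab', 'strings.txt'),     # loaded into Vocab.strings
    path.join(data_dir, 'vocab', 'lexemes.bin'),     # loaded by Vocab.load
    path.join(data_dir, 'tokenizer', 'prefix.txt'),  # read by read_prefix
    path.join(data_dir, 'tokenizer', 'suffix.txt'),  # read by read_suffix
    path.join(data_dir, 'tokenizer', 'infix.txt'),   # read by read_infix
]
for loc in expected:
    print(loc)
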
@@ -21,21 +21,21 @@ def read_lang_data(data_dir):
 
 
 def read_prefix(data_dir):
-    with utf8open(path.join(data_dir, 'prefix')) as file_:
+    with utf8open(path.join(data_dir, 'prefix.txt')) as file_:
         entries = file_.read().split('\n')
         expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
         return expression
 
 
 def read_suffix(data_dir):
-    with utf8open(path.join(data_dir, 'suffix')) as file_:
+    with utf8open(path.join(data_dir, 'suffix.txt')) as file_:
         entries = file_.read().split('\n')
         expression = '|'.join([piece + '$' for piece in entries if piece.strip()])
         return expression
 
 
 def read_infix(data_dir):
-    with utf8open(path.join(data_dir, 'infix')) as file_:
+    with utf8open(path.join(data_dir, 'infix.txt')) as file_:
         entries = file_.read().split('\n')
         expression = '|'.join([piece for piece in entries if piece.strip()])
         return expression
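Only the on-disk file names change in this hunk; the regular-expression construction is untouched. For reference, a self-contained sketch of what read_prefix does with the entries it reads, using made-up entries in place of prefix.txt:

import re

# Pretend contents of a prefix.txt-style file: one candidate prefix per line.
entries = '(\n[\n$\n*'.split('\n')

# Escape each prefix, anchor it at the start of the string, and OR the pieces
# together, exactly as read_prefix builds its expression.
expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])

print(expression)                             # ^\(|^\[|^\$|^\*
print(bool(re.search(expression, '(hello')))  # True: the leading bracket matches
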
@@ -47,8 +47,8 @@ cdef class Vocab:
         if data_dir is not None:
             if not path.isdir(data_dir):
                 raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
-            self.strings.load(path.join(data_dir, 'strings'))
-            self.load(path.join(data_dir, 'lexemes'))
+            self.strings.load(path.join(data_dir, 'strings.txt'))
+            self.load(path.join(data_dir, 'lexemes.bin'))
 
     def __len__(self):
         """The current number of lexemes stored."""
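The Vocab loader keeps its directory check and simply points at the renamed files. A rough standalone sketch of that guard-then-resolve step, with the actual deserialization left out (vocab_locations is a hypothetical helper, not spaCy API):

from os import path

def vocab_locations(data_dir):
    # Mirror the guard above: refuse anything that is not a directory.
    if not path.isdir(data_dir):
        raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
    # The renamed files this commit expects inside the vocab directory.
    strings_loc = path.join(data_dir, 'strings.txt')
    lexemes_loc = path.join(data_dir, 'lexemes.bin')
    return strings_loc, lexemes_loc
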