* Move around data files for test release

This commit is contained in:
Matthew Honnibal 2015-01-03 01:59:22 +11:00
parent 47c71ce1c0
commit f5d41028b5
3 changed files with 12 additions and 7 deletions

View File

@@ -11,6 +11,9 @@ from .pos import POS_TAGS
from .attrs import get_flags
DATA_DIR = path.join(path.dirname(__file__), 'data')
def get_lex_props(string):
return {'flags': get_flags(string), 'dense': 1}
@@ -46,10 +49,12 @@ class English(object):
if data_dir is None:
data_dir = path.join(path.dirname(__file__), 'data')
self._data_dir = data_dir
self.vocab = Vocab(data_dir=data_dir, get_lex_props=get_lex_props)
self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab'),
get_lex_props=get_lex_props)
tag_names = list(POS_TAGS.keys())
tag_names.sort()
self.tokenizer = Tokenizer.from_dir(self.vocab, data_dir, POS_TAGS, tag_names)
self.tokenizer = Tokenizer.from_dir(self.vocab, path.join(data_dir, 'tokenizer'),
POS_TAGS, tag_names)
self.strings = self.vocab.strings
self._tagger = None
self._parser = None

View File

@@ -21,21 +21,21 @@ def read_lang_data(data_dir):
def read_prefix(data_dir):
with utf8open(path.join(data_dir, 'prefix')) as file_:
with utf8open(path.join(data_dir, 'prefix.txt')) as file_:
entries = file_.read().split('\n')
expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
return expression
def read_suffix(data_dir):
with utf8open(path.join(data_dir, 'suffix')) as file_:
with utf8open(path.join(data_dir, 'suffix.txt')) as file_:
entries = file_.read().split('\n')
expression = '|'.join([piece + '$' for piece in entries if piece.strip()])
return expression
def read_infix(data_dir):
with utf8open(path.join(data_dir, 'infix')) as file_:
with utf8open(path.join(data_dir, 'infix.txt')) as file_:
entries = file_.read().split('\n')
expression = '|'.join([piece for piece in entries if piece.strip()])
return expression

View File

@@ -47,8 +47,8 @@ cdef class Vocab:
if data_dir is not None:
if not path.isdir(data_dir):
raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
self.strings.load(path.join(data_dir, 'strings'))
self.load(path.join(data_dir, 'lexemes'))
self.strings.load(path.join(data_dir, 'strings.txt'))
self.load(path.join(data_dir, 'lexemes.bin'))
def __len__(self):
"""The current number of lexemes stored."""