From f5d41028b58cb9e36c9cc338c52391e97f1466c4 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 3 Jan 2015 01:59:22 +1100 Subject: [PATCH] * Move around data files for test release --- spacy/en/__init__.py | 9 +++++++-- spacy/util.py | 6 +++--- spacy/vocab.pyx | 4 ++-- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py index df2b26b42..d862672cc 100644 --- a/spacy/en/__init__.py +++ b/spacy/en/__init__.py @@ -11,6 +11,9 @@ from .pos import POS_TAGS from .attrs import get_flags +DATA_DIR = path.join(path.dirname(__file__), 'data') + + def get_lex_props(string): return {'flags': get_flags(string), 'dense': 1} @@ -46,10 +49,12 @@ class English(object): if data_dir is None: data_dir = path.join(path.dirname(__file__), 'data') self._data_dir = data_dir - self.vocab = Vocab(data_dir=data_dir, get_lex_props=get_lex_props) + self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab'), + get_lex_props=get_lex_props) tag_names = list(POS_TAGS.keys()) tag_names.sort() - self.tokenizer = Tokenizer.from_dir(self.vocab, data_dir, POS_TAGS, tag_names) + self.tokenizer = Tokenizer.from_dir(self.vocab, path.join(data_dir, 'tokenizer'), + POS_TAGS, tag_names) self.strings = self.vocab.strings self._tagger = None self._parser = None diff --git a/spacy/util.py b/spacy/util.py index 0bb5868ce..c2d3ba150 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -21,21 +21,21 @@ def read_lang_data(data_dir): def read_prefix(data_dir): - with utf8open(path.join(data_dir, 'prefix')) as file_: + with utf8open(path.join(data_dir, 'prefix.txt')) as file_: entries = file_.read().split('\n') expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()]) return expression def read_suffix(data_dir): - with utf8open(path.join(data_dir, 'suffix')) as file_: + with utf8open(path.join(data_dir, 'suffix.txt')) as file_: entries = file_.read().split('\n') expression = '|'.join([piece + '$' for piece in entries if piece.strip()]) return expression def read_infix(data_dir): - with utf8open(path.join(data_dir, 'infix')) as file_: + with utf8open(path.join(data_dir, 'infix.txt')) as file_: entries = file_.read().split('\n') expression = '|'.join([piece for piece in entries if piece.strip()]) return expression diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 1b5fb9443..b843261ee 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -47,8 +47,8 @@ cdef class Vocab: if data_dir is not None: if not path.isdir(data_dir): raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir) - self.strings.load(path.join(data_dir, 'strings')) - self.load(path.join(data_dir, 'lexemes')) + self.strings.load(path.join(data_dir, 'strings.txt')) + self.load(path.join(data_dir, 'lexemes.bin')) def __len__(self): """The current number of lexemes stored."""