* Move around data files for test release

2025-07-30 01:50:03 +03:00 · 2015-01-03 01:59:22 +11:00 · 2015-01-03 01:59:22 +11:00 · f5d41028b5
commit f5d41028b5
parent 47c71ce1c0
3 changed files with 12 additions and 7 deletions
--- a/spacy/en/__init__.py
+++ b/spacy/en/__init__.py
@@ -11,6 +11,9 @@ from .pos import POS_TAGS
 from .attrs import get_flags


+DATA_DIR = path.join(path.dirname(__file__), 'data')
+
+
 def get_lex_props(string):
    return {'flags': get_flags(string), 'dense': 1}

@@ -46,10 +49,12 @@ class English(object):
        if data_dir is None:
            data_dir = path.join(path.dirname(__file__), 'data')
        self._data_dir = data_dir
-        self.vocab = Vocab(data_dir=data_dir, get_lex_props=get_lex_props)
+        self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab'),
+                           get_lex_props=get_lex_props)
        tag_names = list(POS_TAGS.keys())
        tag_names.sort()
-        self.tokenizer = Tokenizer.from_dir(self.vocab, data_dir, POS_TAGS, tag_names)
+        self.tokenizer = Tokenizer.from_dir(self.vocab, path.join(data_dir, 'tokenizer'),
+                                            POS_TAGS, tag_names)
        self.strings = self.vocab.strings
        self._tagger = None
        self._parser = None
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -21,21 +21,21 @@ def read_lang_data(data_dir):


 def read_prefix(data_dir):
-    with  utf8open(path.join(data_dir, 'prefix')) as file_:
+    with  utf8open(path.join(data_dir, 'prefix.txt')) as file_:
        entries = file_.read().split('\n')
        expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
    return expression


 def read_suffix(data_dir):
-    with utf8open(path.join(data_dir, 'suffix')) as file_:
+    with utf8open(path.join(data_dir, 'suffix.txt')) as file_:
        entries = file_.read().split('\n')
        expression = '|'.join([piece + '$' for piece in entries if piece.strip()])
    return expression


 def read_infix(data_dir):
-    with utf8open(path.join(data_dir, 'infix')) as file_:
+    with utf8open(path.join(data_dir, 'infix.txt')) as file_:
        entries = file_.read().split('\n')
        expression = '|'.join([piece for piece in entries if piece.strip()])
    return expression
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -47,8 +47,8 @@ cdef class Vocab:
        if data_dir is not None:
            if not path.isdir(data_dir):
                raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
-            self.strings.load(path.join(data_dir, 'strings'))
-            self.load(path.join(data_dir, 'lexemes'))
+            self.strings.load(path.join(data_dir, 'strings.txt'))
+            self.load(path.join(data_dir, 'lexemes.bin'))

    def __len__(self):
        """The current number of lexemes stored."""