mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-28 02:04:07 +03:00
* Move around data files for test release
This commit is contained in:
parent
47c71ce1c0
commit
f5d41028b5
|
@ -11,6 +11,9 @@ from .pos import POS_TAGS
|
||||||
from .attrs import get_flags
|
from .attrs import get_flags
|
||||||
|
|
||||||
|
|
||||||
|
DATA_DIR = path.join(path.dirname(__file__), 'data')
|
||||||
|
|
||||||
|
|
||||||
def get_lex_props(string):
|
def get_lex_props(string):
|
||||||
return {'flags': get_flags(string), 'dense': 1}
|
return {'flags': get_flags(string), 'dense': 1}
|
||||||
|
|
||||||
|
@ -46,10 +49,12 @@ class English(object):
|
||||||
if data_dir is None:
|
if data_dir is None:
|
||||||
data_dir = path.join(path.dirname(__file__), 'data')
|
data_dir = path.join(path.dirname(__file__), 'data')
|
||||||
self._data_dir = data_dir
|
self._data_dir = data_dir
|
||||||
self.vocab = Vocab(data_dir=data_dir, get_lex_props=get_lex_props)
|
self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab'),
|
||||||
|
get_lex_props=get_lex_props)
|
||||||
tag_names = list(POS_TAGS.keys())
|
tag_names = list(POS_TAGS.keys())
|
||||||
tag_names.sort()
|
tag_names.sort()
|
||||||
self.tokenizer = Tokenizer.from_dir(self.vocab, data_dir, POS_TAGS, tag_names)
|
self.tokenizer = Tokenizer.from_dir(self.vocab, path.join(data_dir, 'tokenizer'),
|
||||||
|
POS_TAGS, tag_names)
|
||||||
self.strings = self.vocab.strings
|
self.strings = self.vocab.strings
|
||||||
self._tagger = None
|
self._tagger = None
|
||||||
self._parser = None
|
self._parser = None
|
||||||
|
|
|
@ -21,21 +21,21 @@ def read_lang_data(data_dir):
|
||||||
|
|
||||||
|
|
||||||
def read_prefix(data_dir):
|
def read_prefix(data_dir):
|
||||||
with utf8open(path.join(data_dir, 'prefix')) as file_:
|
with utf8open(path.join(data_dir, 'prefix.txt')) as file_:
|
||||||
entries = file_.read().split('\n')
|
entries = file_.read().split('\n')
|
||||||
expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
|
expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
|
||||||
return expression
|
return expression
|
||||||
|
|
||||||
|
|
||||||
def read_suffix(data_dir):
|
def read_suffix(data_dir):
|
||||||
with utf8open(path.join(data_dir, 'suffix')) as file_:
|
with utf8open(path.join(data_dir, 'suffix.txt')) as file_:
|
||||||
entries = file_.read().split('\n')
|
entries = file_.read().split('\n')
|
||||||
expression = '|'.join([piece + '$' for piece in entries if piece.strip()])
|
expression = '|'.join([piece + '$' for piece in entries if piece.strip()])
|
||||||
return expression
|
return expression
|
||||||
|
|
||||||
|
|
||||||
def read_infix(data_dir):
|
def read_infix(data_dir):
|
||||||
with utf8open(path.join(data_dir, 'infix')) as file_:
|
with utf8open(path.join(data_dir, 'infix.txt')) as file_:
|
||||||
entries = file_.read().split('\n')
|
entries = file_.read().split('\n')
|
||||||
expression = '|'.join([piece for piece in entries if piece.strip()])
|
expression = '|'.join([piece for piece in entries if piece.strip()])
|
||||||
return expression
|
return expression
|
||||||
|
|
|
@ -47,8 +47,8 @@ cdef class Vocab:
|
||||||
if data_dir is not None:
|
if data_dir is not None:
|
||||||
if not path.isdir(data_dir):
|
if not path.isdir(data_dir):
|
||||||
raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
|
raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
|
||||||
self.strings.load(path.join(data_dir, 'strings'))
|
self.strings.load(path.join(data_dir, 'strings.txt'))
|
||||||
self.load(path.join(data_dir, 'lexemes'))
|
self.load(path.join(data_dir, 'lexemes.bin'))
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
"""The current number of lexemes stored."""
|
"""The current number of lexemes stored."""
|
||||||
|
|
Loading…
Reference in New Issue
Block a user