access model via sputnik

Henning Peters 2015-12-07 06:01:28 +01:00
parent a9fc35d3bf
commit 9027cef3bc
20 changed files with 161 additions and 199 deletions

View File

@@ -10,4 +10,4 @@ plac
 six
 ujson
 cloudpickle
-sputnik == 0.5.2
+sputnik == 0.6.0

View File

@@ -179,7 +179,7 @@ def run_setup(exts):
         license="MIT",
         install_requires=['numpy', 'murmurhash == 0.24', 'cymem == 1.30', 'preshed == 0.44',
                           'thinc == 4.0.0', "text_unidecode", 'plac', 'six',
-                          'ujson', 'cloudpickle', 'sputnik == 0.5.2'],
+                          'ujson', 'cloudpickle', 'sputnik == 0.6.0'],
         setup_requires=["headers_workaround"],
         cmdclass = {'build_ext': build_ext_subclass },
     )

View File

@@ -6,6 +6,4 @@ from ..language import Language
 class German(Language):
-    @classmethod
-    def default_data_dir(cls):
-        return path.join(path.dirname(__file__), 'data')
+    pass

View File

@@ -4,8 +4,6 @@ from os import path
 from ..language import Language

-LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data')
-
 # improved list from Stone, Denis, Kwantes (2010)
 STOPWORDS = """
@@ -35,10 +33,6 @@ your yours yourself yourselves
 STOPWORDS = set(w for w in STOPWORDS.split() if w)


 class English(Language):
-    @classmethod
-    def default_data_dir(cls):
-        return LOCAL_DATA_DIR
-
     @staticmethod
     def is_stop(string):
         return 1 if string.lower() in STOPWORDS else 0
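
With default_data_dir() and LOCAL_DATA_DIR gone, a language class no longer knows where its data lives; the location is resolved through sputnik instead. In user code the change amounts to the following (a sketch mirroring the test updates later in this commit):

    from spacy.en import English

    # before this commit, callers passed a data directory explicitly:
    #     nlp = English(data_dir=os.environ.get('SPACY_DATA', LOCAL_DATA_DIR))
    # after it, the installed 'en_default' sputnik package is located automatically,
    # still honouring the SPACY_DATA environment variable:
    nlp = English()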

View File

@@ -6,6 +6,4 @@ from ..language import Language
 class Finnish(Language):
-    @classmethod
-    def default_data_dir(cls):
-        return path.join(path.dirname(__file__), 'data')
+    pass

View File

@@ -6,6 +6,4 @@ from ..language import Language
 class Italian(Language):
-    @classmethod
-    def default_data_dir(cls):
-        return path.join(path.dirname(__file__), 'data')
+    pass

View File

@@ -20,6 +20,7 @@ from .syntax.ner import BiluoPushDown
 from .syntax.arc_eager import ArcEager
 from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD
+from .util import default_package


 class Language(object):
@@ -100,7 +101,7 @@ class Language(object):
         return 0

     @classmethod
-    def default_lex_attrs(cls, data_dir=None):
+    def default_lex_attrs(cls):
         return {
             attrs.LOWER: cls.lower,
             attrs.NORM: cls.norm,
@@ -134,73 +135,42 @@ class Language(object):
         return {0: {'PER': True, 'LOC': True, 'ORG': True, 'MISC': True}}

     @classmethod
-    def default_data_dir(cls):
-        return path.join(path.dirname(__file__), 'data')
-
-    @classmethod
-    def default_vocab(cls, data_dir=None, get_lex_attr=None):
-        if data_dir is None:
-            data_dir = cls.default_data_dir()
+    def default_vocab(cls, package=None, get_lex_attr=None):
+        if package is None:
+            package = default_package()
         if get_lex_attr is None:
-            get_lex_attr = cls.default_lex_attrs(data_dir)
-        return Vocab.from_dir(
-            path.join(data_dir, 'vocab'),
-            get_lex_attr=get_lex_attr)
+            get_lex_attr = cls.default_lex_attrs()
+        return Vocab.from_package(package, get_lex_attr=get_lex_attr)

     @classmethod
-    def default_tokenizer(cls, vocab, data_dir):
-        if path.exists(data_dir):
-            return Tokenizer.from_dir(vocab, data_dir)
-        else:
-            return Tokenizer(vocab, {}, None, None, None)
-
-    @classmethod
-    def default_tagger(cls, vocab, data_dir):
-        if path.exists(data_dir):
-            return Tagger.from_dir(data_dir, vocab)
-        else:
-            return None
-
-    @classmethod
-    def default_parser(cls, vocab, data_dir):
-        if path.exists(data_dir):
-            return Parser.from_dir(data_dir, vocab.strings, ArcEager)
-        else:
-            return None
+    def default_parser(cls, package, vocab):
+        data_dir = package.dir_path('data', 'deps')
+        return Parser.from_dir(data_dir, vocab.strings, ArcEager)

     @classmethod
-    def default_entity(cls, vocab, data_dir):
-        if path.exists(data_dir):
-            return Parser.from_dir(data_dir, vocab.strings, BiluoPushDown)
-        else:
-            return None
-
-    @classmethod
-    def default_matcher(cls, vocab, data_dir):
-        if path.exists(data_dir):
-            return Matcher.from_dir(data_dir, vocab)
-        else:
-            return None
-
-    def __init__(self, data_dir=None, vocab=None, tokenizer=None, tagger=None,
+    def default_entity(cls, package, vocab):
+        data_dir = package.dir_path('data', 'ner')
+        return Parser.from_dir(data_dir, vocab.strings, BiluoPushDown)
+
+    def __init__(self, package=None, vocab=None, tokenizer=None, tagger=None,
                  parser=None, entity=None, matcher=None, serializer=None,
                  load_vectors=True):
         if load_vectors is not True:
             warn("load_vectors is deprecated", DeprecationWarning)
-        if data_dir in (None, True):
-            data_dir = self.default_data_dir()
+        if package in (None, True):
+            package = default_package()
         if vocab in (None, True):
-            vocab = self.default_vocab(data_dir)
+            vocab = self.default_vocab(package)
         if tokenizer in (None, True):
-            tokenizer = self.default_tokenizer(vocab, data_dir=path.join(data_dir, 'tokenizer'))
+            tokenizer = Tokenizer.from_package(package, vocab)
         if tagger in (None, True):
-            tagger = self.default_tagger(vocab, data_dir=path.join(data_dir, 'pos'))
+            tagger = Tagger.from_package(package, vocab)
         if entity in (None, True):
-            entity = self.default_entity(vocab, data_dir=path.join(data_dir, 'ner'))
+            entity = self.default_entity(package, vocab)
         if parser in (None, True):
-            parser = self.default_parser(vocab, data_dir=path.join(data_dir, 'deps'))
+            parser = self.default_parser(package, vocab)
         if matcher in (None, True):
-            matcher = self.default_matcher(vocab, data_dir=data_dir)
+            matcher = Matcher.from_package(package, vocab)
         self.vocab = vocab
         self.tokenizer = tokenizer
         self.tagger = tagger
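
The constructor now threads a single sputnik package handle through every component instead of joining sub-directories onto data_dir. Spelled out at user level, the defaults above resolve roughly as follows (a sketch of what the code already does, not an additional API; it assumes the 'en_default' package is installed):

    from spacy.util import default_package
    from spacy.en import English

    package = default_package()                       # locate installed 'en_default' data
    vocab = English.default_vocab(package)            # data/vocab/*
    parser = English.default_parser(package, vocab)   # data/deps
    entity = English.default_entity(package, vocab)   # data/ner
    nlp = English(package=package, vocab=vocab, parser=parser, entity=entity)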

View File

@@ -12,16 +12,21 @@ from .parts_of_speech import NOUN, VERB, ADJ, PUNCT

 class Lemmatizer(object):
     @classmethod
-    def from_dir(cls, data_dir):
+    def from_package(cls, package):
         index = {}
         exc = {}
         for pos in ['adj', 'noun', 'verb']:
-            index[pos] = read_index(path.join(data_dir, 'wordnet', 'index.%s' % pos))
-            exc[pos] = read_exc(path.join(data_dir, 'wordnet', '%s.exc' % pos))
-        if path.exists(path.join(data_dir, 'vocab', 'lemma_rules.json')):
-            rules = json.load(codecs.open(path.join(data_dir, 'vocab', 'lemma_rules.json'), encoding='utf_8'))
-        else:
-            rules = {}
+            index[pos] = package.load_utf8(read_index,
+                'data', 'wordnet', 'index.%s' % pos,
+                default=set())  # TODO: really optional?
+            exc[pos] = package.load_utf8(read_exc,
+                'data', 'wordnet', '%s.exc' % pos,
+                default={})  # TODO: really optional?
+        rules = package.load_utf8(json.load,
+            'data', 'vocab', 'lemma_rules.json',
+            default={})  # TODO: really optional?
         return cls(index, exc, rules)

     def __init__(self, index, exceptions, rules):
@@ -70,11 +75,9 @@ def lemmatize(string, index, exceptions, rules):
     return set(forms)


-def read_index(loc):
+def read_index(fileobj):
     index = set()
-    if not path.exists(loc):
-        return index
-    for line in codecs.open(loc, 'r', 'utf8'):
+    for line in fileobj:
         if line.startswith(' '):
             continue
         pieces = line.split()
@@ -84,11 +87,9 @@ def read_index(loc):
     return index


-def read_exc(loc):
+def read_exc(fileobj):
     exceptions = {}
-    if not path.exists(loc):
-        return exceptions
-    for line in codecs.open(loc, 'r', 'utf8'):
+    for line in fileobj:
         if line.startswith(' '):
             continue
         pieces = line.split()
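
The repeated package.load_utf8(parse, *path_parts, default=...) calls replace the old path.exists() checks. sputnik 0.6's actual implementation is not part of this diff; the contract these call sites rely on is roughly the following (an illustrative sketch only, not sputnik's source):

    import io
    import os

    def load_utf8(package, parse, *path_parts, **kwargs):
        # file_path(..., require=False) is assumed here to return None for missing files
        loc = package.file_path(*path_parts, require=False)
        if loc is None or not os.path.exists(loc):
            return kwargs.get('default')   # e.g. set(), {}, or cls.default_templates()
        with io.open(loc, encoding='utf8') as file_:
            return parse(file_)            # parse is read_index, read_exc, json.load, ...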

View File

@@ -169,14 +169,11 @@ cdef class Matcher:
     cdef object _patterns

     @classmethod
-    def from_dir(cls, data_dir, Vocab vocab):
-        patterns_loc = path.join(data_dir, 'vocab', 'gazetteer.json')
-        if path.exists(patterns_loc):
-            patterns_data = open(patterns_loc).read()
-            patterns = json.loads(patterns_data)
-            return cls(vocab, patterns)
-        else:
-            return cls(vocab, {})
+    def from_package(cls, package, Vocab vocab):
+        patterns = package.load_utf8(json.load,
+            'data', 'vocab', 'gazetteer.json',
+            default={})  # TODO: really optional?
+        return cls(vocab, patterns)

     def __init__(self, vocab, patterns):
         self.vocab = vocab

View File

@@ -146,15 +146,17 @@ cdef class Tagger:
         return cls(vocab, model)

     @classmethod
-    def from_dir(cls, data_dir, vocab):
-        if path.exists(path.join(data_dir, 'templates.json')):
-            templates = json.loads(open(path.join(data_dir, 'templates.json')))
-        else:
-            templates = cls.default_templates()
+    def from_package(cls, package, vocab):
+        # TODO: templates.json deprecated? not present in latest package
+        templates = package.load_utf8(json.load,
+            'data', 'pos', 'templates.json',
+            default=cls.default_templates())
         model = TaggerModel(vocab.morphology.n_tags,
                             ConjunctionExtracter(N_CONTEXT_FIELDS, templates))
-        if path.exists(path.join(data_dir, 'model')):
-            model.load(path.join(data_dir, 'model'))
+        model.load(package.file_path('data', 'pos', 'model', require=False))  # TODO: really optional?
         return cls(vocab, model)

     def __init__(self, Vocab vocab, TaggerModel model):

View File

@@ -1,12 +1,11 @@
+from spacy.en import English
 import pytest
-from spacy.en import English, LOCAL_DATA_DIR
-import os

 @pytest.fixture(scope="session")
 def EN():
-    data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
-    return English(data_dir=data_dir)
+    return English()

 def pytest_addoption(parser):

View File

@@ -10,7 +10,6 @@ from spacy.en import English
 from spacy.vocab import Vocab
 from spacy.tokens.doc import Doc
 from spacy.tokenizer import Tokenizer
-from spacy.en import LOCAL_DATA_DIR
 from os import path
 from spacy.attrs import ORTH, SPACY, TAG, DEP, HEAD

View File

@@ -1,9 +1,8 @@
 import pytest
-from spacy.en import English, LOCAL_DATA_DIR
+from spacy.en import English
 import os

 @pytest.fixture(scope="session")
 def en_nlp():
-    data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
-    return English(data_dir=data_dir)
+    return English()

View File

@@ -4,31 +4,33 @@ import io
 import pickle

 from spacy.lemmatizer import Lemmatizer, read_index, read_exc
-from spacy.en import LOCAL_DATA_DIR
-from os import path
+from spacy.util import default_package

 import pytest


-def test_read_index():
-    wn = path.join(LOCAL_DATA_DIR, 'wordnet')
-    index = read_index(path.join(wn, 'index.noun'))
+@pytest.fixture
+def package():
+    return default_package()
+
+
+@pytest.fixture
+def lemmatizer(package):
+    return Lemmatizer.from_package(package)
+
+
+def test_read_index(package):
+    index = package.load_utf8(read_index, 'data', 'wordnet', 'index.noun')
     assert 'man' in index
     assert 'plantes' not in index
     assert 'plant' in index


-def test_read_exc():
-    wn = path.join(LOCAL_DATA_DIR, 'wordnet')
-    exc = read_exc(path.join(wn, 'verb.exc'))
+def test_read_exc(package):
+    exc = package.load_utf8(read_exc, 'data', 'wordnet', 'verb.exc')
     assert exc['was'] == ('be',)


-@pytest.fixture
-def lemmatizer():
-    return Lemmatizer.from_dir(path.join(LOCAL_DATA_DIR))
-
-
 def test_noun_lemmas(lemmatizer):
     do = lemmatizer.noun
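
Because read_index and read_exc now take file objects rather than paths, they can also be exercised without any installed data package, which is part of the point of the refactor. A minimal sketch ('was be' is the verb.exc entry the test above checks for):

    import io
    from spacy.lemmatizer import read_exc

    exc = read_exc(io.StringIO(u'was be\n'))
    assert exc[u'was'] == (u'be',)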

View File

@@ -2,16 +2,15 @@ from __future__ import unicode_literals
 import pytest
 import gc

-from spacy.en import English, LOCAL_DATA_DIR
+from spacy.en import English
 import os

-data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)

 # Let this have its own instances, as we have to be careful about memory here
 # that's the point, after all

 @pytest.mark.models
 def get_orphan_token(text, i):
-    nlp = English(data_dir=data_dir)
+    nlp = English()
     tokens = nlp(text)
     gc.collect()
     token = tokens[i]
@@ -41,7 +40,7 @@ def _orphan_from_list(toks):
 @pytest.mark.models
 def test_list_orphans():
     # Test case from NSchrading
-    nlp = English(data_dir=data_dir)
+    nlp = English()
     samples = ["a", "test blah wat okay"]
     lst = []
     for sample in samples:

View File

@@ -5,9 +5,8 @@ import os

 @pytest.fixture(scope='session')
 def nlp():
-    from spacy.en import English, LOCAL_DATA_DIR
-    data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
-    return English(data_dir=data_dir)
+    from spacy.en import English
+    return English()


 @pytest.fixture()

View File

@@ -10,9 +10,8 @@ def token(doc):

 def test_load_resources_and_process_text():
-    from spacy.en import English, LOCAL_DATA_DIR
-    data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
-    nlp = English(data_dir=data_dir)
+    from spacy.en import English
+    nlp = English()
     doc = nlp('Hello, world. Here are two sentences.')

View File

@@ -41,8 +41,8 @@ cdef class Tokenizer:
         return (self.__class__, args, None, None)

     @classmethod
-    def from_dir(cls, Vocab vocab, data_dir):
-        rules, prefix_re, suffix_re, infix_re = read_lang_data(data_dir)
+    def from_package(cls, package, Vocab vocab):
+        rules, prefix_re, suffix_re, infix_re = read_lang_data(package)
         prefix_re = re.compile(prefix_re)
         suffix_re = re.compile(suffix_re)
         infix_re = re.compile(infix_re)

View File

@@ -1,10 +1,23 @@
-from os import path
+import os
 import io
 import json
 import re

+from sputnik import Sputnik
+
 from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE

-DATA_DIR = path.join(path.dirname(__file__), '..', 'data')
+
+def default_package():
+    if os.environ.get('SPACY_DATA'):
+        data_path = os.environ.get('SPACY_DATA')
+    else:
+        data_path = os.path.abspath(
+            os.path.join(os.path.dirname(__file__), 'data'))
+
+    sputnik = Sputnik('spacy', '0.99.0')  # TODO: retrieve version
+    pool = sputnik.pool(data_path)
+    return pool.get('en_default')


 def normalize_slice(length, start, stop, step=None):
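
default_package() is now the only place that knows about sputnik: it builds a package pool over either $SPACY_DATA or the bundled spacy/data directory and returns the installed 'en_default' package. Pointing spaCy at another data location therefore stays an environment variable away (sketch; the path below is made up):

    import os
    os.environ['SPACY_DATA'] = '/opt/spacy-data'   # hypothetical install location

    from spacy.util import default_package
    package = default_package()                    # resolved against /opt/spacy-data
    print(package.dir_path('data', 'vocab'))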
@@ -31,67 +44,63 @@ def utf8open(loc, mode='r'):
     return io.open(loc, mode, encoding='utf8')


-def read_lang_data(data_dir):
-    with open(path.join(data_dir, 'specials.json')) as file_:
-        tokenization = json.load(file_)
-    prefix = read_prefix(data_dir)
-    suffix = read_suffix(data_dir)
-    infix = read_infix(data_dir)
+def read_lang_data(package):
+    tokenization = package.load_utf8(json.load, 'data', 'tokenizer', 'specials.json')
+    prefix = package.load_utf8(read_prefix, 'data', 'tokenizer', 'prefix.txt')
+    suffix = package.load_utf8(read_suffix, 'data', 'tokenizer', 'suffix.txt')
+    infix = package.load_utf8(read_infix, 'data', 'tokenizer', 'infix.txt')
     return tokenization, prefix, suffix, infix


-def read_prefix(data_dir):
-    with utf8open(path.join(data_dir, 'prefix.txt')) as file_:
-        entries = file_.read().split('\n')
-        expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
+def read_prefix(fileobj):
+    entries = fileobj.read().split('\n')
+    expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
     return expression


-def read_suffix(data_dir):
-    with utf8open(path.join(data_dir, 'suffix.txt')) as file_:
-        entries = file_.read().split('\n')
-        expression = '|'.join([piece + '$' for piece in entries if piece.strip()])
+def read_suffix(fileobj):
+    entries = fileobj.read().split('\n')
+    expression = '|'.join([piece + '$' for piece in entries if piece.strip()])
     return expression


-def read_infix(data_dir):
-    with utf8open(path.join(data_dir, 'infix.txt')) as file_:
-        entries = file_.read().split('\n')
-        expression = '|'.join([piece for piece in entries if piece.strip()])
+def read_infix(fileobj):
+    entries = fileobj.read().split('\n')
+    expression = '|'.join([piece for piece in entries if piece.strip()])
     return expression
-def read_tokenization(lang):
-    loc = path.join(DATA_DIR, lang, 'tokenization')
-    entries = []
-    seen = set()
-    with utf8open(loc) as file_:
-        for line in file_:
-            line = line.strip()
-            if line.startswith('#'):
-                continue
-            if not line:
-                continue
-            pieces = line.split()
-            chunk = pieces.pop(0)
-            assert chunk not in seen, chunk
-            seen.add(chunk)
-            entries.append((chunk, list(pieces)))
-            if chunk[0].isalpha() and chunk[0].islower():
-                chunk = chunk[0].title() + chunk[1:]
-                pieces[0] = pieces[0][0].title() + pieces[0][1:]
-                seen.add(chunk)
-                entries.append((chunk, pieces))
-    return entries
-
-
-def read_detoken_rules(lang):  # Deprecated?
-    loc = path.join(DATA_DIR, lang, 'detokenize')
-    entries = []
-    with utf8open(loc) as file_:
-        for line in file_:
-            entries.append(line.strip())
-    return entries
+# def read_tokenization(lang):
+#     loc = path.join(DATA_DIR, lang, 'tokenization')
+#     entries = []
+#     seen = set()
+#     with utf8open(loc) as file_:
+#         for line in file_:
+#             line = line.strip()
+#             if line.startswith('#'):
+#                 continue
+#             if not line:
+#                 continue
+#             pieces = line.split()
+#             chunk = pieces.pop(0)
+#             assert chunk not in seen, chunk
+#             seen.add(chunk)
+#             entries.append((chunk, list(pieces)))
+#             if chunk[0].isalpha() and chunk[0].islower():
+#                 chunk = chunk[0].title() + chunk[1:]
+#                 pieces[0] = pieces[0][0].title() + pieces[0][1:]
+#                 seen.add(chunk)
+#                 entries.append((chunk, pieces))
+#     return entries
+
+
+# def read_detoken_rules(lang):  # Deprecated?
+#     loc = path.join(DATA_DIR, lang, 'detokenize')
+#     entries = []
+#     with utf8open(loc) as file_:
+#         for line in file_:
+#             entries.append(line.strip())
+#     return entries


 def align_tokens(ref, indices):  # Deprecated, surely?
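
The read_prefix/read_suffix/read_infix helpers now build their regular expressions from a file object instead of opening paths themselves, so any file-like source will do. A small sketch with two hypothetical prefix entries, one per line as in data/tokenizer/prefix.txt:

    import io
    import re
    from spacy.util import read_prefix

    expression = read_prefix(io.StringIO(u'(\n$\n'))   # -> '^\\(|^\\$'
    prefix_re = re.compile(expression)
    assert prefix_re.match(u'(hello')                  # matches a leading '('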

View File

@@ -47,28 +47,27 @@ cdef class Vocab:
     '''A map container for a language's LexemeC structs.
     '''
     @classmethod
-    def from_dir(cls, data_dir, get_lex_attr=None):
-        if not path.exists(data_dir):
-            raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
-        if not path.isdir(data_dir):
-            raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
-
-        tag_map = json.load(open(path.join(data_dir, 'tag_map.json')))
-        lemmatizer = Lemmatizer.from_dir(path.join(data_dir, '..'))
-        if path.exists(path.join(data_dir, 'serializer.json')):
-            serializer_freqs = json.load(open(path.join(data_dir, 'serializer.json')))
-        else:
-            serializer_freqs = None
+    def from_package(cls, package, get_lex_attr=None):
+        tag_map = package.load_utf8(json.load,
+            'data', 'vocab', 'tag_map.json')
+
+        lemmatizer = Lemmatizer.from_package(package)
+
+        serializer_freqs = package.load_utf8(json.load,
+            'data', 'vocab', 'serializer.json',
+            require=False)  # TODO: really optional?

         cdef Vocab self = cls(get_lex_attr=get_lex_attr, tag_map=tag_map,
                               lemmatizer=lemmatizer, serializer_freqs=serializer_freqs)

-        if path.exists(path.join(data_dir, 'strings.json')):
-            with io.open(path.join(data_dir, 'strings.json'), 'r', encoding='utf8') as file_:
-                self.strings.load(file_)
-            self.load_lexemes(path.join(data_dir, 'lexemes.bin'))
-
-        if path.exists(path.join(data_dir, 'vec.bin')):
-            self.vectors_length = self.load_vectors_from_bin_loc(path.join(data_dir, 'vec.bin'))
+        if package.has_file('data', 'vocab', 'strings.json'):  # TODO: really optional?
+            package.load_utf8(self.strings.load, 'data', 'vocab', 'strings.json')
+            self.load_lexemes(package.file_path('data', 'vocab', 'lexemes.bin'))
+
+        if package.has_file('data', 'vocab', 'vec.bin'):  # TODO: really optional?
+            self.vectors_length = self.load_vectors_from_bin_loc(
+                package.file_path('data', 'vocab', 'vec.bin'))
         return self

     def __init__(self, get_lex_attr=None, tag_map=None, lemmatizer=None, serializer_freqs=None):
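
Taken together, the vocabulary can now be built straight from the installed package, without knowing anything about the directory layout. A sketch, assuming the 'en_default' package has been installed via sputnik:

    from spacy.util import default_package
    from spacy.vocab import Vocab
    from spacy.en import English

    package = default_package()
    vocab = Vocab.from_package(package, get_lex_attr=English.default_lex_attrs())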