* Tests passing except for morphology/lemmatization stuff

Matthew Honnibal 2014-12-23 11:40:32 +11:00
parent cf8d26c3d2
commit 73f200436f
27 changed files with 197 additions and 155 deletions

View File

@@ -6,8 +6,6 @@ from ..vocab import Vocab
 from ..tokenizer import Tokenizer
 from ..syntax.parser import GreedyParser
 from ..tokens import Tokens
-from ..morphology import Morphologizer
-from .lemmatizer import Lemmatizer
 from .pos import EnPosTagger
 from .pos import POS_TAGS
 from .attrs import get_flags
@@ -18,28 +16,18 @@ def get_lex_props(string):
 class English(object):
-    def __init__(self, data_dir=None, pos_tag=True, parse=False):
+    def __init__(self, data_dir=None, tag=True, parse=False):
         if data_dir is None:
             data_dir = path.join(path.dirname(__file__), 'data')
-        self.vocab = Vocab.from_dir(data_dir, get_lex_props=get_lex_props)
-        for pos_str in POS_TAGS:
-            _ = self.vocab.strings.pos_tags[pos_str]
+        self.vocab = Vocab(data_dir=data_dir, get_lex_props=get_lex_props)
         self.tokenizer = Tokenizer.from_dir(self.vocab, data_dir)
-        if pos_tag:
-            morph = Morphologizer(self.vocab.strings, POS_TAGS,
-                                  Lemmatizer(path.join(data_dir, 'wordnet')))
-            self.pos_tagger = EnPosTagger(data_dir, morph)
-        else:
-            self.pos_tagger = None
-        if parse:
-            self.parser = GreedyParser(data_dir)
-        else:
-            self.parser = None
+        self.tagger = EnPosTagger(self.vocab.strings, data_dir) if tag else None
+        self.parser = GreedyParser(data_dir) if parse else None
-    def __call__(self, text, pos_tag=True, parse=True):
+    def __call__(self, text, tag=True, parse=True):
         tokens = self.tokenizer.tokenize(text)
-        if self.pos_tagger and pos_tag:
-            self.pos_tagger(tokens)
+        if self.tagger and tag:
+            self.tagger(tokens)
         if self.parser and parse:
             self.parser.parse(tokens)
         return tokens
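Taken together, the two hunks above slim the English pipeline down to a constructor with tag/parse switches (replacing pos_tag) and conditional-expression assignments for the tagger and parser. A minimal usage sketch of the post-commit API, assuming the bundled data directory is installed at its default location; the sample sentence is purely illustrative:

    from spacy.en import English

    # tag/parse replace the old pos_tag/parse keyword arguments.
    nlp = English(tag=True, parse=False)

    # Calling the pipeline tokenizes, then tags; the parser stays off here.
    tokens = nlp(u'Bananas in pyjamas are geese.', tag=True, parse=False)
    for token in tokens:
        print(token.string)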

View File

@@ -1,6 +1,9 @@
 from ..tagger cimport Tagger
 from ..morphology cimport Morphologizer
+from ..strings cimport StringStore
 cdef class EnPosTagger(Tagger):
+    cdef readonly StringStore strings
+    cdef readonly StringStore tags
     cdef readonly Morphologizer morphologizer

View File

@@ -1,10 +1,13 @@
 from os import path
+import json
 from thinc.typedefs cimport atom_t
 from ..typedefs cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
 from ..typedefs cimport X, PUNCT, EOL
 from ..structs cimport TokenC, Morphology
 from ..tokens cimport Tokens
+from .lemmatizer import Lemmatizer
 cpdef enum en_person_t:
@@ -192,10 +195,18 @@ POS_TEMPLATES = (
 cdef class EnPosTagger(Tagger):
-    def __init__(self, data_dir, morphologizer):
+    def __init__(self, StringStore strings, data_dir):
         model_dir = path.join(data_dir, 'pos')
         Tagger.__init__(self, path.join(model_dir))
-        self.morphologizer = morphologizer
+        self.strings = strings
+        cfg = json.load(open(path.join(data_dir, 'pos', 'config.json')))
+        self.tags = StringStore()
+        for tag in sorted(cfg['tag_names']):
+            _ = self.tags[tag]
+        self.morphologizer = Morphologizer(self.strings, cfg['tag_names'],
+                                           cfg['tag_map'],
+                                           Lemmatizer(path.join(data_dir, 'wordnet'),
+                                                      NOUN, VERB, ADJ))
     def __call__(self, Tokens tokens):
         cdef int i
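The rewritten constructor pulls everything it needs from pos/config.json instead of receiving a pre-built Morphologizer: it interns the sorted tag names into its own StringStore and builds the Morphologizer (and its Lemmatizer) itself. A plain-Python paraphrase of the loading step, stdlib only; load_tagger_config is a hypothetical helper for illustration, not part of the diff:

    import json
    from os import path

    def load_tagger_config(data_dir):
        # pos/config.json now carries both the tag inventory and the
        # tag -> (coarse POS, morphological features) map written by setup_model_dir.
        with open(path.join(data_dir, 'pos', 'config.json')) as file_:
            cfg = json.load(file_)
        tag_names = sorted(cfg['tag_names'])  # sorted, matching the StringStore fill order
        tag_map = cfg['tag_map']
        return tag_names, tag_map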

View File

@@ -35,15 +35,15 @@ cdef struct _Cached:
 cdef class Morphologizer:
     """Given a POS tag and a Lexeme, find its lemma and morphological analysis.
     """
-    def __init__(self, StringStore strings, object tag_map, object lemmatizer,
-                 irregulars=None):
+    def __init__(self, StringStore strings, object tag_names, object tag_map,
+                 object lemmatizer, irregulars=None):
         self.mem = Pool()
         self.strings = strings
         self.lemmatizer = lemmatizer
-        cdef int n_tags = len(self.strings.pos_tags) + 1
+        cdef int n_tags = len(tag_names) + 1
         self._cache = PreshMapArray(n_tags)
         self.tags = <PosTag*>self.mem.alloc(n_tags, sizeof(PosTag))
-        for tag, i in self.strings.pos_tags:
+        for i, tag in enumerate(sorted(tag_names)):
             pos, props = tag_map[tag]
             self.tags[i].id = i
             self.tags[i].pos = pos
@@ -65,13 +65,7 @@ cdef class Morphologizer:
         cdef bytes py_string = self.strings[lex.sic]
         cdef set lemma_strings
         cdef bytes lemma_string
-        if pos == NOUN:
-            lemma_strings = self.lemmatizer.noun(py_string)
-        elif pos == VERB:
-            lemma_strings = self.lemmatizer.verb(py_string)
-        else:
-            assert pos == ADJ
-            lemma_strings = self.lemmatizer.adj(py_string)
+        lemma_strings = self.lemmatizer(py_string, pos)
         lemma_string = sorted(lemma_strings)[0]
         lemma = self.strings.intern(lemma_string, len(lemma_string)).i
         return lemma
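The three pos-specific branches collapse into a single self.lemmatizer(py_string, pos) call, so the dispatch on part of speech now lives inside the Lemmatizer itself. The lemmatizer module is not part of this diff, so the sketch below is only a guess at its shape: a __call__ that routes to per-POS methods keyed on the integer codes passed to its constructor, returning a set of candidate lemmas for the caller to sort:

    class Lemmatizer(object):
        def __init__(self, wn_dir, noun_code, verb_code, adj_code):
            # wn_dir would point at the WordNet data; the codes are the
            # NOUN/VERB/ADJ constants cimported from spacy.typedefs.
            self.wn_dir = wn_dir
            self.table = {noun_code: self.noun, verb_code: self.verb, adj_code: self.adj}

        def __call__(self, string, pos):
            return self.table[pos](string)

        def noun(self, string):
            return set([string])  # real WordNet/rule lookups elided

        def verb(self, string):
            return set([string])

        def adj(self, string):
            return set([string])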

View File

@@ -11,16 +11,9 @@ cdef inline void slice_unicode(UniStr* s, Py_UNICODE* chars, int start, int end)
     s.key = hash64(s.chars, <int>(s.n * sizeof(Py_UNICODE)), 0)
-cdef class _SymbolMap:
-    cdef dict _string_to_id
-    cdef list _id_to_string
 cdef class StringStore:
     cdef Pool mem
     cdef Utf8Str* strings
-    cdef readonly _SymbolMap pos_tags
-    cdef readonly _SymbolMap dep_tags
     cdef size_t size
     cdef PreshMap _map

View File

@@ -9,7 +9,8 @@ from .typedefs cimport hash_t
 SEPARATOR = '\n|-SEP-|\n'
-cdef class _SymbolMap:
+"""
+cdef class SymbolMap:
     def __init__(self):
         self._string_to_id = {'': 0}
         self._id_to_string = ['']
@@ -38,6 +39,7 @@ cdef class _SymbolMap:
         self._string_to_id[string] = id_
         self._id_to_string.append(string)
         return id_
+"""
 cdef class StringStore:
@@ -47,8 +49,6 @@ cdef class StringStore:
         self._resize_at = 10000
         self.size = 1
         self.strings = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
-        self.pos_tags = _SymbolMap()
-        self.dep_tags = _SymbolMap()
     property size:
         def __get__(self):

View File

@@ -12,13 +12,14 @@ import cython
 from thinc.features cimport Feature, count_feats
-def setup_model_dir(tag_names, templates, model_dir):
+def setup_model_dir(tag_names, tag_map, templates, model_dir):
     if path.exists(model_dir):
         shutil.rmtree(model_dir)
     os.mkdir(model_dir)
     config = {
         'templates': templates,
         'tag_names': tag_names,
+        'tag_map': tag_map
     }
     with open(path.join(model_dir, 'config.json'), 'w') as file_:
         json.dump(config, file_)
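setup_model_dir now serializes the tag_map next to tag_names and the feature templates, which is exactly what the new EnPosTagger.__init__ reads back. An illustrative shape for the resulting config.json; the tag names are real Penn tags, but the feature dicts are invented placeholders, and the real file presumably stores the integer POS codes used by the Cython layer rather than the readable names shown here:

    import json

    config = {
        'templates': [],  # feature templates, elided
        'tag_names': ['IN', 'NN', 'NNS', 'VBP'],
        'tag_map': {
            # tag -> [coarse POS, morphological properties]
            'NN':  ['NOUN', {}],
            'NNS': ['NOUN', {'number': 'plur'}],
            'VBP': ['VERB', {'tense': 'pres'}],
            'IN':  ['ADP',  {}],
        },
    }
    print(json.dumps(config, indent=4, sort_keys=True))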

View File

@@ -43,6 +43,7 @@ cdef class Token:
     cdef readonly int dep_id
     cdef int lemma
     cdef public int head
+    cdef public int dep_tag
     cdef public atom_t id
     cdef public atom_t cluster

View File

@@ -158,7 +158,8 @@ cdef class Token:
     property dep:
         def __get__(self):
-            return self.string_store.dep_tags[self.dep]
+            return self.string_store.dep_tags[self.dep_id]
     property pos:
         def __get__(self):
-            return self.string_store.pos_tags[self.pos]
+            return self.pos_id
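With pos_tags and dep_tags gone from StringStore, Token.pos now hands back the raw integer tag id; the human-readable tag is recovered through the tagger's own tag store, which is how the updated tagging test reads it. A short sketch, assuming the default English data is available:

    from spacy.en import English

    nlp = English(tag=True, parse=False)
    tokens = nlp(u'Bananas in pyjamas are geese.')

    # token.pos is an integer id after this commit; the tagger's StringStore
    # of tag names maps it back to the Penn tag string.
    for token in tokens:
        print('%s\t%s' % (token.string, nlp.tagger.tags[token.pos]))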

View File

@@ -12,23 +12,21 @@ cdef class Vocab:
     Also interns UTF-8 strings, and maps them to consecutive integer IDs.
     '''
-    def __init__(self, object get_lex_props):
+    def __init__(self, data_dir=None, get_lex_props=None):
         self.mem = Pool()
         self._map = PreshMap(2 ** 20)
         self.strings = StringStore()
         self.lexemes.push_back(&EMPTY_LEXEME)
         self.get_lex_props = get_lex_props
-    @classmethod
-    def from_dir(cls, object data_dir, object get_lex_props=None):
-        if not path.exists(data_dir):
-            raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
-        if not path.isdir(data_dir):
-            raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
-        cdef Vocab self = cls(get_lex_props)
-        self.strings.load(path.join(data_dir, 'strings'))
-        self.load(path.join(data_dir, 'lexemes'))
-        return self
+        if data_dir is not None:
+            if not path.exists(data_dir):
+                raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
+        if data_dir is not None:
+            if not path.isdir(data_dir):
+                raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
+            self.strings.load(path.join(data_dir, 'strings'))
+            self.load(path.join(data_dir, 'lexemes'))
     def __len__(self):
         return self.lexemes.size()
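Vocab.from_dir is folded into the constructor: passing data_dir now loads the string table and lexemes in one step, while omitting it yields an empty vocabulary. A usage sketch; the path is illustrative, and get_lex_props is the lexeme-property function that English passes through:

    from os import path
    from spacy.vocab import Vocab
    from spacy.en import get_lex_props

    data_dir = path.join('spacy', 'en', 'data')  # illustrative location of the packaged data

    # New style: construction and loading in one call.
    vocab = Vocab(data_dir=data_dir, get_lex_props=get_lex_props)
    print(len(vocab))

    # Old style, removed by this commit:
    # vocab = Vocab.from_dir(data_dir, get_lex_props=get_lex_props)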

View File

@@ -1,14 +1,20 @@
 from spacy.en import English
 import pytest
 @pytest.fixture
 def EN():
-    return English(pos_tag=True, parse=False)
+    return English(tag=True, parse=False)
 @pytest.fixture
 def tagged(EN):
     string = u'Bananas in pyjamas are geese.'
-    tokens = EN(string, pos_tag=True)
+    tokens = EN(string, tag=True)
+    assert EN.tagger.tags[tokens[0].pos] == 'NNP'
+    assert EN.tagger.tags[tokens[1].pos] == 'IN'
+    assert EN.tagger.tags[tokens[2].pos] == 'NNS'
+    assert EN.tagger.tags[tokens[3].pos] == 'VBP'
+    assert EN.tagger.tags[tokens[3].pos] == 'NNS'
     return tokens

View File

@@ -5,7 +5,7 @@ from spacy.en import English
 @pytest.fixture
 def EN():
-    return English(pos_tag=False)
+    return English(tag=False)
 def test_possess(EN):

View File

@@ -6,7 +6,7 @@ from spacy.en import English
 @pytest.fixture
 def EN():
-    return English(pos_tag=False)
+    return English(tag=False)
 def test_tweebo_challenge(EN):
     text = u""":o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ...."""

View File

@@ -2,7 +2,7 @@ from __future__ import unicode_literals
 import pytest
-from spacy.en import EN
+from spacy.en import English
 #def test_hyphen():
@@ -11,7 +11,8 @@ from spacy.en import EN
 def test_period():
-    tokens = EN.tokenize('best.Known')
+    EN = English()
+    tokens = EN('best.Known')
     assert len(tokens) == 3
-    tokens = EN.tokenize('zombo.com')
+    tokens = EN('zombo.com')
     assert len(tokens) == 1

View File

@@ -4,7 +4,7 @@ from spacy.en import English
 @pytest.fixture
 def EN():
-    return English(pos_tag=True)
+    return English(tag=True)
 def test_range_iter(EN):
     for i in range(len(EN.vocab)):

View File

@@ -3,12 +3,12 @@ from __future__ import unicode_literals
 import pytest
 from spacy.en import English
-from spacy.en.attrs import IS_ALPHA, IS_DIGIT
+from spacy.en.attrs import *
 @pytest.fixture
 def EN():
-    return English(pos_tag=False)
+    return English(tag=False)
 def test_is_alpha(EN):

View File

@@ -8,7 +8,7 @@ from spacy.en import English
 @pytest.fixture
 def EN():
-    return English(pos_tag=True, parse=False)
+    return English(tag=True, parse=False)
 @pytest.fixture
@@ -18,8 +18,8 @@ def morph_exc():
     }
 def test_load_exc(EN, morph_exc):
-    EN.pos_tagger.morphologizer.load_exceptions(morph_exc)
-    tokens = EN('I like his style.', pos_tag=True)
+    EN.tagger.morphologizer.load_exceptions(morph_exc)
+    tokens = EN('I like his style.', tag=True)
     his = tokens[2]
     assert his.pos == 'PRP$'
     assert his.lemma == '-PRP-'

View File

@@ -1,11 +1,13 @@
 from __future__ import unicode_literals
 import pytest
-from spacy.en import EN
+from spacy.en import English
 def test_only_pre1():
-    assert len(EN.tokenize("(")) == 1
+    EN = English()
+    assert len(EN("(")) == 1
 def test_only_pre2():
-    assert len(EN.tokenize("((")) == 2
+    EN = English()
+    assert len(EN("((")) == 2

View File

@@ -1,6 +1,6 @@
 from __future__ import unicode_literals
-from spacy.en import EN
+from spacy.en import English
 import pytest
@@ -10,38 +10,43 @@ def close_puncts():
     return [')', ']', '}', '*']
-def test_close(close_puncts):
+@pytest.fixture
+def EN():
+    return English()
+def test_close(close_puncts, EN):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 2
         assert tokens[1].string == p
         assert tokens[0].string == word_str
-def test_two_different_close(close_puncts):
+def test_two_different_close(close_puncts, EN):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + "'"
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 3
         assert tokens[0].string == word_str
         assert tokens[1].string == p
         assert tokens[2].string == "'"
-def test_three_same_close(close_puncts):
+def test_three_same_close(close_puncts, EN):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + p + p
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 4
         assert tokens[0].string == word_str
         assert tokens[1].string == p
-def test_double_end_quote():
-    assert len(EN.tokenize("Hello''")) == 2
-    assert len(EN.tokenize("''")) == 1
+def test_double_end_quote(EN):
+    assert len(EN("Hello''")) == 2
+    assert len(EN("''")) == 1

View File

@@ -1,6 +1,6 @@
 from __future__ import unicode_literals
-from spacy.en import EN
+from spacy.en import English
 import pytest
@@ -10,39 +10,44 @@ def open_puncts():
     return ['(', '[', '{', '*']
-def test_open(open_puncts):
+@pytest.fixture
+def EN():
+    return English()
+def test_open(open_puncts, EN):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + word_str
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 2
         assert tokens[0].string == p
         assert tokens[1].string == word_str
-def test_two_different_open(open_puncts):
+def test_two_different_open(open_puncts, EN):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + "`" + word_str
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 3
         assert tokens[0].string == p
         assert tokens[1].string == "`"
         assert tokens[2].string == word_str
-def test_three_same_open(open_puncts):
+def test_three_same_open(open_puncts, EN):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + p + p + word_str
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 4
         assert tokens[0].string == p
         assert tokens[3].string == word_str
-def test_open_appostrophe():
+def test_open_appostrophe(EN):
     string = "'The"
-    tokens = EN.tokenize(string)
+    tokens = EN(string)
     assert len(tokens) == 2
     assert tokens[0].string == "'"

View File

@@ -3,43 +3,48 @@ and suffix punctuation."""
 from __future__ import unicode_literals
 import pytest
-from spacy.en import EN
+from spacy.en import English
-def test_no_special():
-    assert len(EN.tokenize("(can)")) == 3
-def test_no_punct():
-    assert len(EN.tokenize("can't")) == 2
-def test_prefix():
-    assert len(EN.tokenize("(can't")) == 3
+@pytest.fixture
+def EN():
+    return English()
-def test_suffix():
-    assert len(EN.tokenize("can't)")) == 3
+def test_no_special(EN):
+    assert len(EN("(can)")) == 3
+def test_no_punct(EN):
+    assert len(EN("can't")) == 2
+def test_prefix(EN):
+    assert len(EN("(can't")) == 3
-def test_wrap():
-    assert len(EN.tokenize("(can't)")) == 4
+def test_suffix(EN):
+    assert len(EN("can't)")) == 3
-def test_uneven_wrap():
-    assert len(EN.tokenize("(can't?)")) == 5
+def test_wrap(EN):
+    assert len(EN("(can't)")) == 4
-def test_prefix_interact():
-    assert len(EN.tokenize("U.S.")) == 1
-    assert len(EN.tokenize("us.")) == 2
-    assert len(EN.tokenize("(U.S.")) == 2
+def test_uneven_wrap(EN):
+    assert len(EN("(can't?)")) == 5
-def test_suffix_interact():
-    assert len(EN.tokenize("U.S.)")) == 2
+def test_prefix_interact(EN):
+    assert len(EN("U.S.")) == 1
+    assert len(EN("us.")) == 2
+    assert len(EN("(U.S.")) == 2
-def test_even_wrap_interact():
-    assert len(EN.tokenize("(U.S.)")) == 3
+def test_suffix_interact(EN):
+    assert len(EN("U.S.)")) == 2
-def test_uneven_wrap_interact():
-    assert len(EN.tokenize("(U.S.?)")) == 4
+def test_even_wrap_interact(EN):
+    assert len(EN("(U.S.)")) == 3
+def test_uneven_wrap_interact(EN):
+    assert len(EN("(U.S.?)")) == 4

View File

@@ -3,13 +3,18 @@ from __future__ import unicode_literals
 import pytest
-from spacy.en import EN
+from spacy.en import English
-def test_one():
-    tokens = EN.tokenize('Betty Botter bought a pound of butter.')
+@pytest.fixture
+def EN():
+    return English()
+def test_one(EN):
+    tokens = EN('Betty Botter bought a pound of butter.')
     assert tokens[0].string == 'Betty'
-    tokens2 = EN.tokenize('Betty also bought a pound of butter.')
+    tokens2 = EN('Betty also bought a pound of butter.')
     assert tokens2[0].string == 'Betty'

View File

@@ -1,6 +1,6 @@
 from __future__ import unicode_literals
-from spacy.en import EN
+from spacy.en import English
 import pytest
@@ -10,22 +10,27 @@ def paired_puncts():
     return [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]
-def test_token(paired_puncts):
+@pytest.fixture
+def EN():
+    return English()
+def test_token(paired_puncts, EN):
     word_str = 'Hello'
     for open_, close_ in paired_puncts:
         string = open_ + word_str + close_
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 3
         assert tokens[0].string == open_
         assert tokens[1].string == word_str
         assert tokens[2].string == close_
-def test_two_different(paired_puncts):
+def test_two_different(paired_puncts, EN):
     word_str = 'Hello'
     for open_, close_ in paired_puncts:
         string = "`" + open_ + word_str + close_ + "'"
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 5
         assert tokens[0].string == "`"
         assert tokens[1].string == open_

View File

@@ -8,7 +8,7 @@ from spacy.en import English
 @pytest.fixture
 def EN():
-    return English(pos_tag=False, parse=False)
+    return English(tag=False, parse=False)
 def test_single_word(EN):
     tokens = EN(u'hello')

View File

@@ -1,9 +1,16 @@
 from __future__ import unicode_literals
 import pytest
-from spacy.en import EN
+from spacy.en import English
-def test1():
+@pytest.fixture
+def EN():
+    return English()
+def test1(EN):
     words = ['JAPAN', 'GET', 'LUCKY']
-    tokens = EN.tokens_from_list(words)
+    tokens = EN.tokenizer.tokens_from_list(words)
     assert len(tokens) == 3
     assert tokens[0].string == 'JAPAN'

View File

@@ -1,23 +1,29 @@
 from __future__ import unicode_literals
 import pytest
-from spacy.en import EN
+from spacy.en import English
-def test_neq():
-    addr = EN.lexicon['Hello']
-    assert EN.lexicon['bye']['sic'] != addr['sic']
+@pytest.fixture
+def EN():
+    return English()
-def test_eq():
-    addr = EN.lexicon['Hello']
-    assert EN.lexicon['Hello']['sic'] == addr['sic']
+def test_neq(EN):
+    addr = EN.vocab['Hello']
+    assert EN.vocab['bye']['sic'] != addr['sic']
-def test_case_neq():
-    addr = EN.lexicon['Hello']
-    assert EN.lexicon['hello']['sic'] != addr['sic']
+def test_eq(EN):
+    addr = EN.vocab['Hello']
+    assert EN.vocab['Hello']['sic'] == addr['sic']
-def test_punct_neq():
-    addr = EN.lexicon['Hello']
-    assert EN.lexicon['Hello,']['sic'] != addr['sic']
+def test_case_neq(EN):
+    addr = EN.vocab['Hello']
+    assert EN.vocab['hello']['sic'] != addr['sic']
+def test_punct_neq(EN):
+    addr = EN.vocab['Hello']
+    assert EN.vocab['Hello,']['sic'] != addr['sic']

View File

@@ -1,38 +1,43 @@
 """Test that tokens are created correctly for whitespace."""
 from __future__ import unicode_literals
-from spacy.en import EN
+from spacy.en import English
 import pytest
-def test_single_space():
-    tokens = EN.tokenize('hello possums')
+@pytest.fixture
+def EN():
+    return English(tag=False)
+def test_single_space(EN):
+    tokens = EN('hello possums')
     assert len(tokens) == 2
-def test_double_space():
-    tokens = EN.tokenize('hello  possums')
+def test_double_space(EN):
+    tokens = EN('hello  possums')
     assert len(tokens) == 3
     assert tokens[1].string == ' '
-def test_newline():
-    tokens = EN.tokenize('hello\npossums')
+def test_newline(EN):
+    tokens = EN('hello\npossums')
     assert len(tokens) == 3
-def test_newline_space():
-    tokens = EN.tokenize('hello \npossums')
+def test_newline_space(EN):
+    tokens = EN('hello \npossums')
     assert len(tokens) == 3
-def test_newline_double_space():
-    tokens = EN.tokenize('hello  \npossums')
+def test_newline_double_space(EN):
+    tokens = EN('hello  \npossums')
     assert len(tokens) == 3
-def test_newline_space_wrap():
-    tokens = EN.tokenize('hello \n possums')
+def test_newline_space_wrap(EN):
+    tokens = EN('hello \n possums')
    assert len(tokens) == 3