* All tests now passing for reorg

This commit is contained in:
Matthew Honnibal 2014-12-23 13:18:59 +11:00
parent 73f200436f
commit b00bc01d8c
10 changed files with 91 additions and 144 deletions

View File

@ -1,9 +1,21 @@
from preshed.maps cimport PreshMapArray
from ..tagger cimport Tagger
from ..morphology cimport Morphologizer
from ..strings cimport StringStore
from ..structs cimport TokenC, Lexeme, Morphology, PosTag
from ..typedefs cimport univ_tag_t
from .lemmatizer import Lemmatizer
# Declarations for the English POS tagger: its attributes and C-level methods.
cdef class EnPosTagger(Tagger):
cdef readonly StringStore strings
# NOTE(review): `tags` is declared twice in this listing (as a StringStore
# here and as a PosTag* below); presumably only one declaration is meant to
# remain -- confirm against the final file.
cdef readonly StringStore tags
cdef readonly Morphologizer morphologizer
# Python-level lemmatizer object; `public` makes it readable and writable
# from Python code.
cdef public object lemmatizer
cdef PreshMapArray _morph_cache
cdef PosTag* tags
cdef readonly object tag_names
cdef readonly object tag_map
# C-level methods; `except -1` reserves -1 as the error return value.
cdef int set_morph(self, const int i, TokenC* tokens) except -1
cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1

View File

@ -3,10 +3,13 @@ import json
from thinc.typedefs cimport atom_t
from ..typedefs cimport univ_tag_t
from ..typedefs cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
from ..typedefs cimport X, PUNCT, EOL
from ..structs cimport TokenC, Morphology
from ..typedefs cimport id_t
from ..structs cimport TokenC, Morphology, Lexeme
from ..tokens cimport Tokens
from ..morphology cimport set_morph_from_dict
from .lemmatizer import Lemmatizer
@ -194,29 +197,39 @@ POS_TEMPLATES = (
)
# Cache entry pairing a Morphology value with an integer lemma.
cdef struct _CachedMorph:
Morphology morph
int lemma
cdef class EnPosTagger(Tagger):
# Set up the tagger from files under `data_dir`: the model directory 'pos',
# its config.json, the 'wordnet' directory for the Lemmatizer and, when it
# exists, morphs.json.
def __init__(self, StringStore strings, data_dir):
model_dir = path.join(data_dir, 'pos')
Tagger.__init__(self, path.join(model_dir))
self.strings = strings
# NOTE(review): the file object opened here is never explicitly closed.
cfg = json.load(open(path.join(data_dir, 'pos', 'config.json')))
# NOTE(review): `self.tags` is assigned a StringStore here and a PosTag array
# further down, and a Morphologizer is still constructed; this listing
# appears to carry both the superseded and the current statements -- confirm
# which ones belong to the final file.
self.tags = StringStore()
for tag in sorted(cfg['tag_names']):
_ = self.tags[tag]
self.morphologizer = Morphologizer(self.strings, cfg['tag_names'],
cfg['tag_map'],
Lemmatizer(path.join(data_dir, 'wordnet'),
NOUN, VERB, ADJ))
self.tag_names = sorted(cfg['tag_names'])
self.tag_map = cfg['tag_map']
# One slot more than the number of tag names.
cdef int n_tags = len(self.tag_names) + 1
self._morph_cache = PreshMapArray(n_tags)
self.tags = <PosTag*>self.mem.alloc(n_tags, sizeof(PosTag))
# Fill one PosTag per tag name, in sorted order, so that a tag's id equals
# its index in `self.tag_names`.
for i, tag in enumerate(sorted(self.tag_names)):
pos, props = self.tag_map[tag]
self.tags[i].id = i
self.tags[i].pos = pos
set_morph_from_dict(&self.tags[i].morph, props)
# morphs.json is optional; when present its contents are passed to
# load_morph_exceptions. NOTE(review): this file object is not closed either.
if path.exists(path.join(data_dir, 'morphs.json')):
self.load_morph_exceptions(json.load(open(path.join(data_dir, 'morphs.json'))))
self.lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ)
# Tag every token in `tokens` in place: predict a POS id from the token's
# context, then set its morphology.
def __call__(self, Tokens tokens):
cdef int i
cdef atom_t[N_CONTEXT_FIELDS] context
cdef TokenC* t = tokens.data
# NOTE(review): this assert and the `self.morphologizer.set_morph` call below
# sit next to the equivalent `self.set_morph` call; presumably they are the
# superseded lines of this listing -- confirm.
assert self.morphologizer is not None
for i in range(tokens.length):
fill_context(context, i, t)
t[i].pos = self.predict(context)
self.morphologizer.set_morph(i, t)
self.set_morph(i, t)
def train(self, Tokens tokens, golds):
cdef int i
@ -226,10 +239,53 @@ cdef class EnPosTagger(Tagger):
for i in range(tokens.length):
fill_context(context, i, t)
t[i].pos = self.predict(context, [golds[i]])
self.morphologizer.set_morph(i, t)
self.set_morph(i, t)
c += t[i].pos == golds[i]
return c
cdef int set_morph(self, const int i, TokenC* tokens) except -1:
    # Write the lemma and morphology for tokens[i], based on the POS id
    # already stored on that token. Results are memoised in
    # self._morph_cache under the pair (tag id, lex.sic).
    cdef TokenC* token = &tokens[i]
    cdef const Lexeme* lex = token.lex
    cdef const PosTag* tag = &self.tags[token.pos]
    cdef _CachedMorph* entry = <_CachedMorph*>self._morph_cache.get(tag.id, lex.sic)
    if entry is NULL:
        # Cache miss: build a new entry from the tag's morphology and the
        # lemmatizer output, then store it for later lookups.
        entry = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
        entry.lemma = self.lemmatize(tag.pos, lex)
        entry.morph = tag.morph
        self._morph_cache.set(tag.id, lex.sic, <void*>entry)
    token.lemma = entry.lemma
    token.morph = entry.morph
cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
    # Return the lemma id for `lex` under the universal tag `pos`.
    #
    # `lex.sic` is returned unchanged when no lemmatizer is set or when
    # `pos` is not NOUN, VERB or ADJ. Otherwise the lemmatizer is called
    # with the word's string and `pos`, the first candidate in sorted order
    # is interned in self.strings, and the `.i` field of the interned entry
    # is returned.
    cdef bytes py_string
    cdef set lemma_strings
    cdef bytes lemma_string
    if self.lemmatizer is None:
        return lex.sic
    if pos != NOUN and pos != VERB and pos != ADJ:
        return lex.sic
    # Look the string up only after both guards have passed, so tokens that
    # are never lemmatized do not pay for a string-store lookup.
    py_string = self.strings[lex.sic]
    lemma_strings = self.lemmatizer(py_string, pos)
    # sorted(...)[0] gives a deterministic choice among the candidates.
    lemma_string = sorted(lemma_strings)[0]
    return self.strings.intern(lemma_string, len(lemma_string)).i
def load_morph_exceptions(self, dict exc):
    """Pre-populate the morphology cache from a dict of exceptions.

    `exc` maps a tag name to a dict of word forms, each mapped to a dict of
    properties. The optional 'L' property gives the lemma string and
    defaults to the form itself. Each entry is stored under the pair
    (index of the tag name in self.tag_names, string id of the form).

    Raises ValueError (from list.index) if a tag name is not in
    self.tag_names.
    """
    cdef unicode tag_str
    cdef unicode surface
    cdef unicode lemma_text
    cdef dict forms
    cdef dict attrs
    cdef id_t form_id
    cdef int tag_id
    cdef _CachedMorph* entry
    for tag_str, forms in exc.items():
        tag_id = self.tag_names.index(tag_str)
        for surface, attrs in forms.items():
            lemma_text = attrs.get('L', surface)
            # The form is looked up in the string store before the lemma,
            # as in the original statement order.
            form_id = self.strings[surface]
            entry = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
            entry.lemma = self.strings[lemma_text]
            set_morph_from_dict(&entry.morph, attrs)
            self._morph_cache.set(tag_id, form_id, <void*>entry)
cdef int fill_context(atom_t* context, const int i, const TokenC* tokens) except -1:
_fill_from_token(&context[P2_sic], &tokens[i-2])

View File

@ -1,21 +1,4 @@
from cymem.cymem cimport Pool
from preshed.maps cimport PreshMapArray
from .structs cimport TokenC, Lexeme, Morphology, PosTag
from .strings cimport StringStore
from .typedefs cimport id_t, univ_tag_t
# Declarations for Morphologizer: its attributes and C-level methods.
cdef class Morphologizer:
cdef Pool mem
cdef StringStore strings
cdef object lemmatizer
cdef PosTag* tags
cdef readonly list tag_names
cdef PreshMapArray _cache
# C-level methods; `except -1` reserves -1 as the error return value.
cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1
cdef int set_morph(self, const int i, TokenC* tokens) except -1
cdef int set_morph_from_dict(Morphology* morph, dict props) except -1

View File

@ -1,106 +1,6 @@
# cython: profile=True
# cython: embedsignature=True
from os import path
import json
from .typedefs cimport id_t, univ_tag_t
from .typedefs cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT
from .typedefs cimport VERB, X, PUNCT, EOL
from . import util
# Maps tag-name strings to the tag constants cimported from .typedefs.
# Two keys differ from their constant's name: 'NULL' maps to NO_TAG and '.'
# maps to PUNCT.
UNIV_TAGS = {
'NULL': NO_TAG,
'ADJ': ADJ,
'ADV': ADV,
'ADP': ADP,
'CONJ': CONJ,
'DET': DET,
'NOUN': NOUN,
'NUM': NUM,
'PRON': PRON,
'PRT': PRT,
'VERB': VERB,
'X': X,
'.': PUNCT,
'EOL': EOL
}
# Cache entry pairing a Morphology value with an integer lemma.
cdef struct _Cached:
Morphology morph
int lemma
cdef class Morphologizer:
    """Assign a lemma and a morphological analysis to a token, given its
    POS tag and its Lexeme.
    """
    def __init__(self, StringStore strings, object tag_names, object tag_map,
                 object lemmatizer, irregulars=None):
        cdef int n_tags
        cdef PosTag* slot
        self.mem = Pool()
        self.strings = strings
        self.lemmatizer = lemmatizer
        # One slot more than the number of tag names.
        n_tags = len(tag_names) + 1
        self._cache = PreshMapArray(n_tags)
        self.tags = <PosTag*>self.mem.alloc(n_tags, sizeof(PosTag))
        # Tags are numbered by their position in the sorted tag names.
        for i, tag in enumerate(sorted(tag_names)):
            univ_pos, features = tag_map[tag]
            slot = &self.tags[i]
            slot.id = i
            slot.pos = univ_pos
            # Any feature missing from the tag map defaults to 0.
            slot.morph.number = features.get('number', 0)
            slot.morph.tenspect = features.get('tenspect', 0)
            slot.morph.mood = features.get('mood', 0)
            slot.morph.gender = features.get('gender', 0)
            slot.morph.person = features.get('person', 0)
            slot.morph.case = features.get('case', 0)
            slot.morph.misc = features.get('misc', 0)
        if irregulars is not None:
            self.load_exceptions(irregulars)

    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
        # Return lex.sic unchanged unless a lemmatizer is set and `pos` is
        # NOUN, VERB or ADJ; otherwise intern the first lemma candidate in
        # sorted order and return the `.i` field of the interned entry.
        cdef bytes py_string
        cdef set lemma_strings
        cdef bytes lemma_string
        if self.lemmatizer is None or (pos != NOUN and pos != VERB and pos != ADJ):
            return lex.sic
        py_string = self.strings[lex.sic]
        lemma_strings = self.lemmatizer(py_string, pos)
        lemma_string = sorted(lemma_strings)[0]
        return self.strings.intern(lemma_string, len(lemma_string)).i

    cdef int set_morph(self, const int i, TokenC* tokens) except -1:
        # Write the lemma and morphology for tokens[i], memoising the result
        # in self._cache under the pair (tag id, lex.sic).
        cdef TokenC* token = &tokens[i]
        cdef const Lexeme* lex = token.lex
        cdef const PosTag* tag = &self.tags[token.pos]
        cdef _Cached* entry = <_Cached*>self._cache.get(tag.id, lex.sic)
        if entry is NULL:
            entry = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
            entry.lemma = self.lemmatize(tag.pos, lex)
            entry.morph = tag.morph
            self._cache.set(tag.id, lex.sic, <void*>entry)
        token.lemma = entry.lemma
        token.morph = entry.morph

    def load_exceptions(self, dict exc):
        """Pre-populate the cache from a dict of exceptions.

        `exc` maps a tag string to a dict of word forms, each mapped to a
        dict of properties. The optional 'L' property gives the lemma string
        and defaults to the form itself.
        """
        cdef unicode tag_str
        cdef unicode surface
        cdef unicode lemma_text
        cdef dict forms
        cdef dict attrs
        cdef id_t form_id
        cdef int tag_id
        cdef _Cached* entry
        for tag_str, forms in exc.items():
            tag_id = self.strings.pos_tags[tag_str]
            assert tag_id < len(self.strings.pos_tags)
            for surface, attrs in forms.items():
                lemma_text = attrs.get('L', surface)
                # The form is looked up in the string store before the lemma,
                # as in the original statement order.
                form_id = self.strings[surface]
                entry = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
                entry.lemma = self.strings[lemma_text]
                set_morph_from_dict(&entry.morph, attrs)
                self._cache.set(tag_id, form_id, <void*>entry)
cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
morph.number = props.get('number', 0)

View File

@ -10,11 +10,6 @@ def EN():
# Runs EN with tag=True over a sample sentence and returns the tokens.
def tagged(EN):
string = u'Bananas in pyjamas are geese.'
tokens = EN(string, tag=True)
assert EN.tagger.tags[tokens[0].pos] == 'NNP'
assert EN.tagger.tags[tokens[1].pos] == 'IN'
assert EN.tagger.tags[tokens[2].pos] == 'NNS'
# NOTE(review): the next two assertions expect different tags ('VBP' and
# 'NNS') for the same token, tokens[3]; the second was presumably meant to
# index tokens[4] -- confirm.
assert EN.tagger.tags[tokens[3].pos] == 'VBP'
assert EN.tagger.tags[tokens[3].pos] == 'NNS'
return tokens

View File

@ -21,7 +21,7 @@ def test_read_exc():
# Fixture building a Lemmatizer from the 'wordnet' directory under DATA_DIR.
@pytest.fixture
def lemmatizer():
# NOTE(review): two consecutive return statements are listed; as written only
# the first can execute. The second passes three extra positional arguments
# (0, 0, 0) and is presumably the current form -- confirm.
return Lemmatizer(path.join(DATA_DIR, 'wordnet'))
return Lemmatizer(path.join(DATA_DIR, 'wordnet'), 0, 0, 0)
def test_noun_lemmas(lemmatizer):

View File

@ -18,8 +18,8 @@ def morph_exc():
}
# Loads the morphology exceptions into the tagger, tags a sentence, and
# checks the tag and lemma of the third token ('his').
def test_load_exc(EN, morph_exc):
# NOTE(review): the exceptions are loaded through two different entry points
# and the tag is asserted in two different ways below; presumably one of each
# pair is the superseded line of this listing -- confirm.
EN.tagger.morphologizer.load_exceptions(morph_exc)
EN.tagger.load_morph_exceptions(morph_exc)
tokens = EN('I like his style.', tag=True)
his = tokens[2]
assert his.pos == 'PRP$'
assert EN.tagger.tag_names[his.pos] == 'PRP$'
assert his.lemma == '-PRP-'

View File

@ -4,10 +4,10 @@ import pytest
from spacy.en import English
# A single "(" is expected to produce exactly one token.
def test_only_pre1():
# NOTE(review): EN is assigned twice; the second assignment, which passes
# tag=False and parse=False, is the one in effect for the assertion.
EN = English()
EN = English(tag=False, parse=False)
assert len(EN("(")) == 1
# The string "((" is expected to produce exactly two tokens.
def test_only_pre2():
# NOTE(review): EN is assigned twice; the second assignment, which passes
# tag=False and parse=False, is the one in effect for the assertion.
EN = English()
EN = English(tag=False, parse=False)
assert len(EN("((")) == 2

View File

@ -12,7 +12,7 @@ def open_puncts():
# Fixture providing an English instance to the tests.
@pytest.fixture
def EN():
# NOTE(review): two consecutive return statements are listed; as written only
# the first can execute. The second passes tag=False and parse=False and is
# presumably the current form -- confirm.
return English()
return English(tag=False, parse=False)
def test_open(open_puncts, EN):

View File

@ -1,6 +1,6 @@
from __future__ import unicode_literals
from spacy.en import EN
from spacy.en import English
from spacy.util import utf8open
import pytest
@ -18,6 +18,7 @@ def sun_txt():
# Smoke test: checks the sample text is non-empty and runs it through the
# pipeline; the final `assert True` always passes.
def test_tokenize(sun_txt):
nlp = English(tag=False, parse=False)
assert len(sun_txt) != 0
# NOTE(review): `tokens` is assigned twice, first via EN.tokenize and then via
# nlp(...); the first call is presumably the superseded line -- confirm.
tokens = EN.tokenize(sun_txt)
tokens = nlp(sun_txt)
assert True