* All tests now passing for reorg

Matthew Honnibal 2014-12-23 13:18:59 +11:00
parent 73f200436f
commit b00bc01d8c
10 changed files with 91 additions and 144 deletions

View File

@@ -1,9 +1,21 @@
+from preshed.maps cimport PreshMapArray
+
 from ..tagger cimport Tagger
-from ..morphology cimport Morphologizer
 from ..strings cimport StringStore
+from ..structs cimport TokenC, Lexeme, Morphology, PosTag
+from ..typedefs cimport univ_tag_t
+
+from .lemmatizer import Lemmatizer
 
 
 cdef class EnPosTagger(Tagger):
     cdef readonly StringStore strings
-    cdef readonly StringStore tags
-    cdef readonly Morphologizer morphologizer
+    cdef public object lemmatizer
+    cdef PreshMapArray _morph_cache
+
+    cdef PosTag* tags
+    cdef readonly object tag_names
+    cdef readonly object tag_map
+
+    cdef int set_morph(self, const int i, TokenC* tokens) except -1
+    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1
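
This header change folds the former Morphologizer's state (the lemmatizer, the PosTag table, and the per-(tag, word) morphology cache) directly into EnPosTagger. A rough plain-Python sketch of the consolidated layout, with a dict standing in for the PreshMapArray and no Cython types (the real class also inherits Tagger's model state):

    class EnPosTagger:                     # sketch only; the real class extends Tagger
        def __init__(self, strings, tag_names, tag_map, lemmatizer):
            self.strings = strings         # shared string <-> id table
            self.lemmatizer = lemmatizer   # WordNet-backed lemmatizer, may be None
            self.tag_names = sorted(tag_names)
            self.tag_map = tag_map         # tag name -> (universal POS, morph props)
            self._morph_cache = {}         # (tag id, word id) -> cached lemma + morph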

View File

@@ -3,10 +3,13 @@ import json
 from thinc.typedefs cimport atom_t
 
+from ..typedefs cimport univ_tag_t
 from ..typedefs cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
 from ..typedefs cimport X, PUNCT, EOL
-from ..structs cimport TokenC, Morphology
+from ..typedefs cimport id_t
+from ..structs cimport TokenC, Morphology, Lexeme
 from ..tokens cimport Tokens
+from ..morphology cimport set_morph_from_dict
 
 from .lemmatizer import Lemmatizer
 
 
@@ -194,29 +197,39 @@ POS_TEMPLATES = (
 )
 
 
+cdef struct _CachedMorph:
+    Morphology morph
+    int lemma
+
+
 cdef class EnPosTagger(Tagger):
     def __init__(self, StringStore strings, data_dir):
         model_dir = path.join(data_dir, 'pos')
         Tagger.__init__(self, path.join(model_dir))
         self.strings = strings
         cfg = json.load(open(path.join(data_dir, 'pos', 'config.json')))
-        self.tags = StringStore()
-        for tag in sorted(cfg['tag_names']):
-            _ = self.tags[tag]
-        self.morphologizer = Morphologizer(self.strings, cfg['tag_names'],
-                                           cfg['tag_map'],
-                                           Lemmatizer(path.join(data_dir, 'wordnet'),
-                                                      NOUN, VERB, ADJ))
+        self.tag_names = sorted(cfg['tag_names'])
+        self.tag_map = cfg['tag_map']
+        cdef int n_tags = len(self.tag_names) + 1
+        self._morph_cache = PreshMapArray(n_tags)
+        self.tags = <PosTag*>self.mem.alloc(n_tags, sizeof(PosTag))
+        for i, tag in enumerate(sorted(self.tag_names)):
+            pos, props = self.tag_map[tag]
+            self.tags[i].id = i
+            self.tags[i].pos = pos
+            set_morph_from_dict(&self.tags[i].morph, props)
+        if path.exists(path.join(data_dir, 'morphs.json')):
+            self.load_morph_exceptions(json.load(open(path.join(data_dir, 'morphs.json'))))
+        self.lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ)
 
     def __call__(self, Tokens tokens):
         cdef int i
         cdef atom_t[N_CONTEXT_FIELDS] context
         cdef TokenC* t = tokens.data
-        assert self.morphologizer is not None
         for i in range(tokens.length):
             fill_context(context, i, t)
             t[i].pos = self.predict(context)
-            self.morphologizer.set_morph(i, t)
+            self.set_morph(i, t)
 
     def train(self, Tokens tokens, golds):
         cdef int i
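
__init__ now pulls both the tag inventory and the tag-to-morphology table from pos/config.json. Judging from the unpacking pos, props = self.tag_map[tag] above and the fields set_morph_from_dict reads, the file plausibly has the shape sketched below; the tag ids and property values are illustrative assumptions, not the shipped data:

    # Hypothetical shape of pos/config.json, shown as a Python dict:
    cfg = {
        "tag_names": ["NN", "NNS", "VBP"],           # Penn Treebank tag inventory
        "tag_map": {
            # tag -> [universal POS id, morphological properties]
            "NN":  [5, {"number": 1}],
            "NNS": [5, {"number": 2}],
            "VBP": [10, {"tenspect": 1, "person": 2}],
        },
    }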
@@ -226,10 +239,53 @@ cdef class EnPosTagger(Tagger):
         for i in range(tokens.length):
             fill_context(context, i, t)
             t[i].pos = self.predict(context, [golds[i]])
-            self.morphologizer.set_morph(i, t)
+            self.set_morph(i, t)
             c += t[i].pos == golds[i]
         return c
 
+    cdef int set_morph(self, const int i, TokenC* tokens) except -1:
+        cdef const PosTag* tag = &self.tags[tokens[i].pos]
+        cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.sic)
+        if cached is NULL:
+            cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
+            cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)
+            cached.morph = tag.morph
+            self._morph_cache.set(tag.id, tokens[i].lex.sic, <void*>cached)
+        tokens[i].lemma = cached.lemma
+        tokens[i].morph = cached.morph
+
+    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
+        if self.lemmatizer is None:
+            return lex.sic
+        cdef bytes py_string = self.strings[lex.sic]
+        if pos != NOUN and pos != VERB and pos != ADJ:
+            return lex.sic
+        cdef set lemma_strings
+        cdef bytes lemma_string
+        lemma_strings = self.lemmatizer(py_string, pos)
+        lemma_string = sorted(lemma_strings)[0]
+        lemma = self.strings.intern(lemma_string, len(lemma_string)).i
+        return lemma
+
+    def load_morph_exceptions(self, dict exc):
+        cdef unicode pos_str
+        cdef unicode form_str
+        cdef unicode lemma_str
+        cdef dict entries
+        cdef dict props
+        cdef int lemma
+        cdef id_t sic
+        cdef int pos
+        for pos_str, entries in exc.items():
+            pos = self.tag_names.index(pos_str)
+            for form_str, props in entries.items():
+                lemma_str = props.get('L', form_str)
+                sic = self.strings[form_str]
+                cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
+                cached.lemma = self.strings[lemma_str]
+                set_morph_from_dict(&cached.morph, props)
+                self._morph_cache.set(pos, sic, <void*>cached)
+
 
 cdef int fill_context(atom_t* context, const int i, const TokenC* tokens) except -1:
     _fill_from_token(&context[P2_sic], &tokens[i-2])
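
set_morph is a memoized lookup: the first time a (tag, word) pair is seen, the lemma and morphology are computed and stored; every later token with the same pair hits the cache. A plain-Python model of the same control flow, with a dict standing in for the preshed PreshMapArray and illustrative field names:

    _morph_cache = {}

    def lemmatize(pos, sic):
        return sic                                  # stand-in: the real code asks WordNet

    def set_morph(token, tag):
        key = (tag["id"], token["sic"])             # sic: id of the word's string
        cached = _morph_cache.get(key)
        if cached is None:                          # first time this (tag, word) pair
            cached = (lemmatize(tag["pos"], token["sic"]), tag["morph"])
            _morph_cache[key] = cached
        token["lemma"], token["morph"] = cached     # cheap struct copy on later hits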

View File

@@ -1,21 +1,4 @@
-from cymem.cymem cimport Pool
-from preshed.maps cimport PreshMapArray
-
 from .structs cimport TokenC, Lexeme, Morphology, PosTag
-from .strings cimport StringStore
-from .typedefs cimport id_t, univ_tag_t
-
-
-cdef class Morphologizer:
-    cdef Pool mem
-    cdef StringStore strings
-    cdef object lemmatizer
-    cdef PosTag* tags
-    cdef readonly list tag_names
-    cdef PreshMapArray _cache
-
-    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1
-    cdef int set_morph(self, const int i, TokenC* tokens) except -1
 
 
 cdef int set_morph_from_dict(Morphology* morph, dict props) except -1

View File

@@ -1,106 +1,6 @@
 # cython: profile=True
 # cython: embedsignature=True
-from os import path
-import json
-
-from .typedefs cimport id_t, univ_tag_t
-from .typedefs cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT
-from .typedefs cimport VERB, X, PUNCT, EOL
-
-from . import util
-
-
-UNIV_TAGS = {
-    'NULL': NO_TAG,
-    'ADJ': ADJ,
-    'ADV': ADV,
-    'ADP': ADP,
-    'CONJ': CONJ,
-    'DET': DET,
-    'NOUN': NOUN,
-    'NUM': NUM,
-    'PRON': PRON,
-    'PRT': PRT,
-    'VERB': VERB,
-    'X': X,
-    '.': PUNCT,
-    'EOL': EOL
-}
-
-
-cdef struct _Cached:
-    Morphology morph
-    int lemma
-
-
-cdef class Morphologizer:
-    """Given a POS tag and a Lexeme, find its lemma and morphological analysis.
-    """
-    def __init__(self, StringStore strings, object tag_names, object tag_map,
-                 object lemmatizer, irregulars=None):
-        self.mem = Pool()
-        self.strings = strings
-        self.lemmatizer = lemmatizer
-        cdef int n_tags = len(tag_names) + 1
-        self._cache = PreshMapArray(n_tags)
-        self.tags = <PosTag*>self.mem.alloc(n_tags, sizeof(PosTag))
-        for i, tag in enumerate(sorted(tag_names)):
-            pos, props = tag_map[tag]
-            self.tags[i].id = i
-            self.tags[i].pos = pos
-            self.tags[i].morph.number = props.get('number', 0)
-            self.tags[i].morph.tenspect = props.get('tenspect', 0)
-            self.tags[i].morph.mood = props.get('mood', 0)
-            self.tags[i].morph.gender = props.get('gender', 0)
-            self.tags[i].morph.person = props.get('person', 0)
-            self.tags[i].morph.case = props.get('case', 0)
-            self.tags[i].morph.misc = props.get('misc', 0)
-        if irregulars is not None:
-            self.load_exceptions(irregulars)
-
-    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
-        if self.lemmatizer is None:
-            return lex.sic
-        if pos != NOUN and pos != VERB and pos != ADJ:
-            return lex.sic
-        cdef bytes py_string = self.strings[lex.sic]
-        cdef set lemma_strings
-        cdef bytes lemma_string
-        lemma_strings = self.lemmatizer(py_string, pos)
-        lemma_string = sorted(lemma_strings)[0]
-        lemma = self.strings.intern(lemma_string, len(lemma_string)).i
-        return lemma
-
-    cdef int set_morph(self, const int i, TokenC* tokens) except -1:
-        cdef const PosTag* tag = &self.tags[tokens[i].pos]
-        cached = <_Cached*>self._cache.get(tag.id, tokens[i].lex.sic)
-        if cached is NULL:
-            cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
-            cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)
-            cached.morph = tag.morph
-            self._cache.set(tag.id, tokens[i].lex.sic, <void*>cached)
-        tokens[i].lemma = cached.lemma
-        tokens[i].morph = cached.morph
-
-    def load_exceptions(self, dict exc):
-        cdef unicode pos_str
-        cdef unicode form_str
-        cdef unicode lemma_str
-        cdef dict entries
-        cdef dict props
-        cdef int lemma
-        cdef id_t sic
-        cdef int pos
-        for pos_str, entries in exc.items():
-            pos = self.strings.pos_tags[pos_str]
-            assert pos < len(self.strings.pos_tags)
-            for form_str, props in entries.items():
-                lemma_str = props.get('L', form_str)
-                sic = self.strings[form_str]
-                cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
-                cached.lemma = self.strings[lemma_str]
-                set_morph_from_dict(&cached.morph, props)
-                self._cache.set(pos, sic, <void*>cached)
-
-
 cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
     morph.number = props.get('number', 0)
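
With the Morphologizer class gone, this module keeps only the free function set_morph_from_dict. From the deleted code above, the Morphology struct's fields are number, tenspect, mood, gender, person, case, and misc; in plain Python (assuming a struct-like object with settable attributes) the function amounts to:

    MORPH_FIELDS = ("number", "tenspect", "mood", "gender", "person", "case", "misc")

    def set_morph_from_dict(morph, props):
        # copy each known field from the props dict, defaulting to 0 (= unset)
        for field in MORPH_FIELDS:
            setattr(morph, field, props.get(field, 0))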

View File

@@ -10,11 +10,6 @@ def EN():
 def tagged(EN):
     string = u'Bananas in pyjamas are geese.'
     tokens = EN(string, tag=True)
-    assert EN.tagger.tags[tokens[0].pos] == 'NNP'
-    assert EN.tagger.tags[tokens[1].pos] == 'IN'
-    assert EN.tagger.tags[tokens[2].pos] == 'NNS'
-    assert EN.tagger.tags[tokens[3].pos] == 'VBP'
-    assert EN.tagger.tags[tokens[3].pos] == 'NNS'
     return tokens
 
 

View File

@@ -21,7 +21,7 @@ def test_read_exc():
 
 @pytest.fixture
 def lemmatizer():
-    return Lemmatizer(path.join(DATA_DIR, 'wordnet'))
+    return Lemmatizer(path.join(DATA_DIR, 'wordnet'), 0, 0, 0)
 
 
 def test_noun_lemmas(lemmatizer):

View File

@@ -18,8 +18,8 @@ def morph_exc():
     }
 
 def test_load_exc(EN, morph_exc):
-    EN.tagger.morphologizer.load_exceptions(morph_exc)
+    EN.tagger.load_morph_exceptions(morph_exc)
     tokens = EN('I like his style.', tag=True)
     his = tokens[2]
-    assert his.pos == 'PRP$'
+    assert EN.tagger.tag_names[his.pos] == 'PRP$'
     assert his.lemma == '-PRP-'
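
load_morph_exceptions expects a two-level dict: tag name -> surface form -> properties, with 'L' overriding the lemma. Only the 'PRP$'/'-PRP-' entry is confirmed by the assertions above; a fixture of roughly this shape would satisfy the test (the extra property is illustrative):

    morph_exc = {
        'PRP$': {
            'his': {'L': '-PRP-', 'person': 3},   # 'L' = lemma override
        },
    }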

View File

@@ -4,10 +4,10 @@ import pytest
 from spacy.en import English
 
 def test_only_pre1():
-    EN = English()
+    EN = English(tag=False, parse=False)
     assert len(EN("(")) == 1
 
 
 def test_only_pre2():
-    EN = English()
+    EN = English(tag=False, parse=False)
     assert len(EN("((")) == 2
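
The punctuation tests only exercise the tokenizer, so they now construct English with tagging and parsing switched off, which presumably keeps the statistical models from loading. A minimal sketch of the pattern, assuming only these two keyword flags:

    from spacy.en import English

    # Build a tokenizer-only pipeline; no tagger or parser models are needed.
    EN = English(tag=False, parse=False)

    def count_tokens(text):
        return len(EN(text))

    assert count_tokens("((") == 2   # each "(" is its own token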

View File

@@ -12,7 +12,7 @@ def open_puncts():
 
 @pytest.fixture
 def EN():
-    return English()
+    return English(tag=False, parse=False)
 
 
 def test_open(open_puncts, EN):

View File

@@ -1,6 +1,6 @@
 from __future__ import unicode_literals
 
-from spacy.en import EN
+from spacy.en import English
 from spacy.util import utf8open
 
 import pytest
@@ -18,6 +18,7 @@ def sun_txt():
 
 
 def test_tokenize(sun_txt):
+    nlp = English(tag=False, parse=False)
     assert len(sun_txt) != 0
-    tokens = EN.tokenize(sun_txt)
+    tokens = nlp(sun_txt)
     assert True
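
This last test also captures the move away from the module-level EN singleton: tokenization now goes through an English instance called directly on the text. Roughly, under the same assumptions as above (the sample string is a stand-in for the sun_txt fixture's file contents):

    from spacy.en import English   # the module-level EN singleton is gone

    nlp = English(tag=False, parse=False)
    text = u"The sun is a mass of incandescent gas."   # stand-in for sun_txt
    tokens = nlp(text)             # calling the instance replaces EN.tokenize(...)
    assert len(tokens) > 0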