mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 01:46:28 +03:00
* All tests now passing for reorg
This commit is contained in:
parent
73f200436f
commit
b00bc01d8c
|
@ -1,9 +1,21 @@
|
|||
from preshed.maps cimport PreshMapArray
|
||||
|
||||
from ..tagger cimport Tagger
|
||||
from ..morphology cimport Morphologizer
|
||||
from ..strings cimport StringStore
|
||||
from ..structs cimport TokenC, Lexeme, Morphology, PosTag
|
||||
from ..typedefs cimport univ_tag_t
|
||||
from .lemmatizer import Lemmatizer
|
||||
|
||||
|
||||
cdef class EnPosTagger(Tagger):
|
||||
cdef readonly StringStore strings
|
||||
cdef readonly StringStore tags
|
||||
cdef readonly Morphologizer morphologizer
|
||||
cdef public object lemmatizer
|
||||
cdef PreshMapArray _morph_cache
|
||||
|
||||
cdef PosTag* tags
|
||||
cdef readonly object tag_names
|
||||
cdef readonly object tag_map
|
||||
|
||||
cdef int set_morph(self, const int i, TokenC* tokens) except -1
|
||||
cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1
|
||||
|
||||
|
|
|
@ -3,10 +3,13 @@ import json
|
|||
|
||||
from thinc.typedefs cimport atom_t
|
||||
|
||||
from ..typedefs cimport univ_tag_t
|
||||
from ..typedefs cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
|
||||
from ..typedefs cimport X, PUNCT, EOL
|
||||
from ..structs cimport TokenC, Morphology
|
||||
from ..typedefs cimport id_t
|
||||
from ..structs cimport TokenC, Morphology, Lexeme
|
||||
from ..tokens cimport Tokens
|
||||
from ..morphology cimport set_morph_from_dict
|
||||
from .lemmatizer import Lemmatizer
|
||||
|
||||
|
||||
|
@ -194,29 +197,39 @@ POS_TEMPLATES = (
|
|||
)
|
||||
|
||||
|
||||
# One memoised (tag, word) analysis. set_morph copies both fields onto the
# token; load_morph_exceptions stores hand-written entries of the same shape.
cdef struct _CachedMorph:
    # Morphological features; copied from the tag's PosTag.morph or filled
    # in by set_morph_from_dict.
    Morphology morph
    # Integer id of the lemma, as produced by lemmatize() or looked up in
    # self.strings.
    int lemma
|
||||
|
||||
|
||||
cdef class EnPosTagger(Tagger):
|
||||
def __init__(self, StringStore strings, data_dir):
|
||||
model_dir = path.join(data_dir, 'pos')
|
||||
Tagger.__init__(self, path.join(model_dir))
|
||||
self.strings = strings
|
||||
cfg = json.load(open(path.join(data_dir, 'pos', 'config.json')))
|
||||
self.tags = StringStore()
|
||||
for tag in sorted(cfg['tag_names']):
|
||||
_ = self.tags[tag]
|
||||
self.morphologizer = Morphologizer(self.strings, cfg['tag_names'],
|
||||
cfg['tag_map'],
|
||||
Lemmatizer(path.join(data_dir, 'wordnet'),
|
||||
NOUN, VERB, ADJ))
|
||||
self.tag_names = sorted(cfg['tag_names'])
|
||||
self.tag_map = cfg['tag_map']
|
||||
cdef int n_tags = len(self.tag_names) + 1
|
||||
self._morph_cache = PreshMapArray(n_tags)
|
||||
self.tags = <PosTag*>self.mem.alloc(n_tags, sizeof(PosTag))
|
||||
for i, tag in enumerate(sorted(self.tag_names)):
|
||||
pos, props = self.tag_map[tag]
|
||||
self.tags[i].id = i
|
||||
self.tags[i].pos = pos
|
||||
set_morph_from_dict(&self.tags[i].morph, props)
|
||||
if path.exists(path.join(data_dir, 'morphs.json')):
|
||||
self.load_morph_exceptions(json.load(open(path.join(data_dir, 'morphs.json'))))
|
||||
self.lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ)
|
||||
|
||||
def __call__(self, Tokens tokens):
|
||||
cdef int i
|
||||
cdef atom_t[N_CONTEXT_FIELDS] context
|
||||
cdef TokenC* t = tokens.data
|
||||
assert self.morphologizer is not None
|
||||
for i in range(tokens.length):
|
||||
fill_context(context, i, t)
|
||||
t[i].pos = self.predict(context)
|
||||
self.morphologizer.set_morph(i, t)
|
||||
self.set_morph(i, t)
|
||||
|
||||
def train(self, Tokens tokens, golds):
|
||||
cdef int i
|
||||
|
@ -226,10 +239,53 @@ cdef class EnPosTagger(Tagger):
|
|||
for i in range(tokens.length):
|
||||
fill_context(context, i, t)
|
||||
t[i].pos = self.predict(context, [golds[i]])
|
||||
self.morphologizer.set_morph(i, t)
|
||||
self.set_morph(i, t)
|
||||
c += t[i].pos == golds[i]
|
||||
return c
|
||||
|
||||
cdef int set_morph(self, const int i, TokenC* tokens) except -1:
    """Write the lemma and morphology for ``tokens[i]`` based on its tag.

    ``tokens[i].pos`` selects an entry of ``self.tags``. The analysis is
    looked up in ``self._morph_cache`` under (tag id, ``lex.sic``). An
    existing entry (for example one stored by ``load_morph_exceptions``)
    is used as-is; on a miss a new entry is allocated from ``self.mem``,
    filled from ``self.lemmatize`` and the tag's morphology, and stored
    for later calls.
    """
    cdef TokenC* token = &tokens[i]
    cdef const PosTag* tag = &self.tags[token.pos]
    cdef _CachedMorph* entry = <_CachedMorph*>self._morph_cache.get(tag.id, token.lex.sic)
    if entry is NULL:
        # First time this (tag, word) pair is seen: compute and memoise.
        entry = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
        entry.lemma = self.lemmatize(tag.pos, token.lex)
        entry.morph = tag.morph
        self._morph_cache.set(tag.id, token.lex.sic, <void*>entry)
    token.lemma = entry.lemma
    token.morph = entry.morph
|
||||
|
||||
cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
    """Return the id of the lemma of ``lex`` when tagged as ``pos``.

    ``lex.sic`` is returned unchanged when ``self.lemmatizer`` is None or
    when ``pos`` is not NOUN, VERB or ADJ. Otherwise the lemmatizer is
    called with the word's string and ``pos``; of the candidate strings it
    returns, the one that sorts first is interned in ``self.strings`` and
    its id is returned.
    """
    cdef bytes py_string
    cdef set lemma_strings
    cdef bytes lemma_string
    if self.lemmatizer is None:
        return lex.sic
    if pos != NOUN and pos != VERB and pos != ADJ:
        return lex.sic
    # Look the string up only after the guards: tokens with any other tag
    # never need it, so fetching it earlier was wasted work.
    py_string = self.strings[lex.sic]
    lemma_strings = self.lemmatizer(py_string, pos)
    # Sorting makes the choice among several candidates deterministic.
    lemma_string = sorted(lemma_strings)[0]
    lemma = self.strings.intern(lemma_string, len(lemma_string)).i
    return lemma
|
||||
|
||||
def load_morph_exceptions(self, dict exc):
    """Store hand-specified lemma/morphology entries in the morph cache.

    ``exc`` maps a tag name to a dict of ``{word form: props}``. The tag
    name is resolved with ``self.tag_names.index``, so it must be one of
    ``self.tag_names`` (presumably a ValueError otherwise, as ``tag_names``
    is built as a sorted list). ``props`` may carry the lemma under the
    key ``'L'``; when absent the form itself is used as the lemma. The
    whole ``props`` dict is handed to ``set_morph_from_dict`` to fill in
    the morphology.

    Entries are written to the same cache that ``set_morph`` consults, under
    the key (tag index, id of the form in ``self.strings``).
    """
    cdef unicode pos_str
    cdef unicode form_str
    cdef unicode lemma_str
    cdef dict entries
    cdef dict props
    cdef id_t sic
    cdef int pos
    cdef _CachedMorph* cached
    for pos_str, entries in exc.items():
        pos = self.tag_names.index(pos_str)
        for form_str, props in entries.items():
            lemma_str = props.get('L', form_str)
            sic = self.strings[form_str]
            cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
            cached.lemma = self.strings[lemma_str]
            set_morph_from_dict(&cached.morph, props)
            self._morph_cache.set(pos, sic, <void*>cached)
|
||||
|
||||
|
||||
cdef int fill_context(atom_t* context, const int i, const TokenC* tokens) except -1:
|
||||
_fill_from_token(&context[P2_sic], &tokens[i-2])
|
||||
|
|
|
@ -1,21 +1,4 @@
|
|||
from cymem.cymem cimport Pool
|
||||
from preshed.maps cimport PreshMapArray
|
||||
|
||||
from .structs cimport TokenC, Lexeme, Morphology, PosTag
|
||||
from .strings cimport StringStore
|
||||
from .typedefs cimport id_t, univ_tag_t
|
||||
|
||||
|
||||
cdef class Morphologizer:
|
||||
cdef Pool mem
|
||||
cdef StringStore strings
|
||||
cdef object lemmatizer
|
||||
cdef PosTag* tags
|
||||
cdef readonly list tag_names
|
||||
|
||||
cdef PreshMapArray _cache
|
||||
cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1
|
||||
cdef int set_morph(self, const int i, TokenC* tokens) except -1
|
||||
|
||||
|
||||
cdef int set_morph_from_dict(Morphology* morph, dict props) except -1
|
||||
|
|
|
@ -1,105 +1,5 @@
|
|||
# cython: profile=True
|
||||
# cython: embedsignature=True
|
||||
from os import path
|
||||
import json
|
||||
|
||||
from .typedefs cimport id_t, univ_tag_t
|
||||
from .typedefs cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT
|
||||
from .typedefs cimport VERB, X, PUNCT, EOL
|
||||
from . import util
|
||||
|
||||
|
||||
UNIV_TAGS = {
|
||||
'NULL': NO_TAG,
|
||||
'ADJ': ADJ,
|
||||
'ADV': ADV,
|
||||
'ADP': ADP,
|
||||
'CONJ': CONJ,
|
||||
'DET': DET,
|
||||
'NOUN': NOUN,
|
||||
'NUM': NUM,
|
||||
'PRON': PRON,
|
||||
'PRT': PRT,
|
||||
'VERB': VERB,
|
||||
'X': X,
|
||||
'.': PUNCT,
|
||||
'EOL': EOL
|
||||
}
|
||||
|
||||
|
||||
cdef struct _Cached:
|
||||
Morphology morph
|
||||
int lemma
|
||||
|
||||
|
||||
cdef class Morphologizer:
|
||||
"""Given a POS tag and a Lexeme, find its lemma and morphological analysis.
|
||||
"""
|
||||
def __init__(self, StringStore strings, object tag_names, object tag_map,
|
||||
object lemmatizer, irregulars=None):
|
||||
self.mem = Pool()
|
||||
self.strings = strings
|
||||
self.lemmatizer = lemmatizer
|
||||
cdef int n_tags = len(tag_names) + 1
|
||||
self._cache = PreshMapArray(n_tags)
|
||||
self.tags = <PosTag*>self.mem.alloc(n_tags, sizeof(PosTag))
|
||||
for i, tag in enumerate(sorted(tag_names)):
|
||||
pos, props = tag_map[tag]
|
||||
self.tags[i].id = i
|
||||
self.tags[i].pos = pos
|
||||
self.tags[i].morph.number = props.get('number', 0)
|
||||
self.tags[i].morph.tenspect = props.get('tenspect', 0)
|
||||
self.tags[i].morph.mood = props.get('mood', 0)
|
||||
self.tags[i].morph.gender = props.get('gender', 0)
|
||||
self.tags[i].morph.person = props.get('person', 0)
|
||||
self.tags[i].morph.case = props.get('case', 0)
|
||||
self.tags[i].morph.misc = props.get('misc', 0)
|
||||
if irregulars is not None:
|
||||
self.load_exceptions(irregulars)
|
||||
|
||||
cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
|
||||
if self.lemmatizer is None:
|
||||
return lex.sic
|
||||
if pos != NOUN and pos != VERB and pos != ADJ:
|
||||
return lex.sic
|
||||
cdef bytes py_string = self.strings[lex.sic]
|
||||
cdef set lemma_strings
|
||||
cdef bytes lemma_string
|
||||
lemma_strings = self.lemmatizer(py_string, pos)
|
||||
lemma_string = sorted(lemma_strings)[0]
|
||||
lemma = self.strings.intern(lemma_string, len(lemma_string)).i
|
||||
return lemma
|
||||
|
||||
cdef int set_morph(self, const int i, TokenC* tokens) except -1:
|
||||
cdef const PosTag* tag = &self.tags[tokens[i].pos]
|
||||
cached = <_Cached*>self._cache.get(tag.id, tokens[i].lex.sic)
|
||||
if cached is NULL:
|
||||
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
|
||||
cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)
|
||||
cached.morph = tag.morph
|
||||
self._cache.set(tag.id, tokens[i].lex.sic, <void*>cached)
|
||||
tokens[i].lemma = cached.lemma
|
||||
tokens[i].morph = cached.morph
|
||||
|
||||
def load_exceptions(self, dict exc):
|
||||
cdef unicode pos_str
|
||||
cdef unicode form_str
|
||||
cdef unicode lemma_str
|
||||
cdef dict entries
|
||||
cdef dict props
|
||||
cdef int lemma
|
||||
cdef id_t sic
|
||||
cdef int pos
|
||||
for pos_str, entries in exc.items():
|
||||
pos = self.strings.pos_tags[pos_str]
|
||||
assert pos < len(self.strings.pos_tags)
|
||||
for form_str, props in entries.items():
|
||||
lemma_str = props.get('L', form_str)
|
||||
sic = self.strings[form_str]
|
||||
cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
|
||||
cached.lemma = self.strings[lemma_str]
|
||||
set_morph_from_dict(&cached.morph, props)
|
||||
self._cache.set(pos, sic, <void*>cached)
|
||||
|
||||
|
||||
cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
|
||||
|
|
|
@ -10,11 +10,6 @@ def EN():
|
|||
def tagged(EN):
|
||||
string = u'Bananas in pyjamas are geese.'
|
||||
tokens = EN(string, tag=True)
|
||||
assert EN.tagger.tags[tokens[0].pos] == 'NNP'
|
||||
assert EN.tagger.tags[tokens[1].pos] == 'IN'
|
||||
assert EN.tagger.tags[tokens[2].pos] == 'NNS'
|
||||
assert EN.tagger.tags[tokens[3].pos] == 'VBP'
|
||||
assert EN.tagger.tags[tokens[3].pos] == 'NNS'
|
||||
return tokens
|
||||
|
||||
|
||||
|
|
|
@ -21,7 +21,7 @@ def test_read_exc():
|
|||
|
||||
@pytest.fixture
|
||||
def lemmatizer():
|
||||
return Lemmatizer(path.join(DATA_DIR, 'wordnet'))
|
||||
return Lemmatizer(path.join(DATA_DIR, 'wordnet'), 0, 0, 0)
|
||||
|
||||
|
||||
def test_noun_lemmas(lemmatizer):
|
||||
|
|
|
@ -18,8 +18,8 @@ def morph_exc():
|
|||
}
|
||||
|
||||
def test_load_exc(EN, morph_exc):
|
||||
EN.tagger.morphologizer.load_exceptions(morph_exc)
|
||||
EN.tagger.load_morph_exceptions(morph_exc)
|
||||
tokens = EN('I like his style.', tag=True)
|
||||
his = tokens[2]
|
||||
assert his.pos == 'PRP$'
|
||||
assert EN.tagger.tag_names[his.pos] == 'PRP$'
|
||||
assert his.lemma == '-PRP-'
|
||||
|
|
|
@ -4,10 +4,10 @@ import pytest
|
|||
from spacy.en import English
|
||||
|
||||
def test_only_pre1():
|
||||
EN = English()
|
||||
EN = English(tag=False, parse=False)
|
||||
assert len(EN("(")) == 1
|
||||
|
||||
|
||||
def test_only_pre2():
|
||||
EN = English()
|
||||
EN = English(tag=False, parse=False)
|
||||
assert len(EN("((")) == 2
|
||||
|
|
|
@ -12,7 +12,7 @@ def open_puncts():
|
|||
|
||||
@pytest.fixture
|
||||
def EN():
|
||||
return English()
|
||||
return English(tag=False, parse=False)
|
||||
|
||||
|
||||
def test_open(open_puncts, EN):
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
from __future__ import unicode_literals
|
||||
|
||||
from spacy.en import EN
|
||||
from spacy.en import English
|
||||
from spacy.util import utf8open
|
||||
|
||||
import pytest
|
||||
|
@ -18,6 +18,7 @@ def sun_txt():
|
|||
|
||||
|
||||
def test_tokenize(sun_txt):
|
||||
nlp = English(tag=False, parse=False)
|
||||
assert len(sun_txt) != 0
|
||||
tokens = EN.tokenize(sun_txt)
|
||||
tokens = nlp(sun_txt)
|
||||
assert True
|
||||
|
|
Loading…
Reference in New Issue
Block a user