Mirror of https://github.com/explosion/spaCy.git (synced 2025-05-03 23:33:40 +03:00)
* Move morphological analysis into its own module, morphology.pyx

Lemmatization and the Google universal tag set move out of the Language and Tagger classes into a new Morphologizer class in morphology.pyx: the tagger now only predicts a tag id per token, and the Morphologizer assigns the lemma and morphological features.

parent b962fe73d7
commit 6b34a2f34b
spacy/en.pyx | 12 ++++++------

@@ -35,8 +35,8 @@ from __future__ import unicode_literals
 cimport lang
 from .typedefs cimport flags_t
 import orth
-from .tagger cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
-from .tagger cimport X, PUNCT, EOL
+from .morphology cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
+from .morphology cimport X, PUNCT, EOL
 
 from .tokens cimport Morphology
 
@@ -154,8 +154,8 @@ cdef class English(Language):
         for i in range(tokens.length):
             fill_pos_context(context, i, t)
             t[i].pos = self.pos_tagger.predict(context)
-            _merge_morph(&t[i].morph, &self.pos_tagger.tags[t[i].pos].morph)
-            t[i].lemma = self.lemmatize(self.pos_tagger.tags[t[i].pos].pos, t[i].lex)
+            if self.morphologizer:
+                self.morphologizer.set_morph(i, t)
 
     def train_pos(self, Tokens tokens, golds):
         cdef int i
@@ -165,8 +165,8 @@ cdef class English(Language):
         for i in range(tokens.length):
             fill_pos_context(context, i, t)
             t[i].pos = self.pos_tagger.predict(context, [golds[i]])
-            _merge_morph(&t[i].morph, &self.pos_tagger.tags[t[i].pos].morph)
-            t[i].lemma = self.lemmatize(self.pos_tagger.tags[t[i].pos].pos, t[i].lex)
+            if self.morphologizer:
+                self.morphologizer.set_morph(i, t)
             c += t[i].pos == golds[i]
         return c
 
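The change above re-routes lemma and morphology assignment: English still predicts a POS tag per token, but everything beyond the tag id is now delegated to an optional Morphologizer. A minimal pure-Python sketch of that control flow (the stub classes below are illustrative stand-ins, not spaCy's API):

    # Sketch of the new tagging loop; stubs stand in for the Cython types.
    class StubTagger:
        def predict(self, context):
            return 0                           # always tag id 0 in this sketch

    class StubMorphologizer:
        def set_morph(self, i, tokens):
            tokens[i]["lemma"] = tokens[i]["text"].lower()   # placeholder analysis

    def set_pos(tokens, pos_tagger, morphologizer):
        for i in range(len(tokens)):
            context = tokens[i]["text"]        # stands in for fill_pos_context()
            tokens[i]["pos"] = pos_tagger.predict(context)
            if morphologizer is not None:      # tagging still works without it
                morphologizer.set_morph(i, tokens)

    tokens = [{"text": "Dogs"}, {"text": "bark"}]
    set_pos(tokens, StubTagger(), StubMorphologizer())
    print(tokens)   # each token now carries "pos" and "lemma"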
@ -2,15 +2,15 @@ from libcpp.vector cimport vector
|
||||||
|
|
||||||
from cpython cimport Py_UNICODE_ISSPACE, Py_UNICODE_ISALPHA, Py_UNICODE_ISUPPER
|
from cpython cimport Py_UNICODE_ISSPACE, Py_UNICODE_ISALPHA, Py_UNICODE_ISUPPER
|
||||||
|
|
||||||
from preshed.maps cimport PreshMap, PreshMapArray
|
from preshed.maps cimport PreshMap
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
|
|
||||||
from .typedefs cimport hash_t
|
from .typedefs cimport hash_t
|
||||||
from .tokens cimport Tokens, TokenC
|
from .tokens cimport Tokens, TokenC
|
||||||
from .lexeme cimport Lexeme
|
from .lexeme cimport Lexeme
|
||||||
from .tagger cimport Tagger
|
from .tagger cimport Tagger
|
||||||
from .tagger cimport univ_tag_t
|
|
||||||
from .utf8string cimport StringStore, UniStr
|
from .utf8string cimport StringStore, UniStr
|
||||||
|
from .morphology cimport Morphologizer
|
||||||
|
|
||||||
|
|
||||||
cdef union LexemesOrTokens:
|
cdef union LexemesOrTokens:
|
||||||
|
@ -40,17 +40,14 @@ cdef class Language:
|
||||||
cdef readonly unicode name
|
cdef readonly unicode name
|
||||||
cdef PreshMap _cache
|
cdef PreshMap _cache
|
||||||
cdef PreshMap _specials
|
cdef PreshMap _specials
|
||||||
cdef PreshMapArray _lemmas
|
|
||||||
cpdef readonly Lexicon lexicon
|
cpdef readonly Lexicon lexicon
|
||||||
cpdef readonly Tagger pos_tagger
|
cpdef readonly Tagger pos_tagger
|
||||||
cpdef readonly object lemmatizer
|
cpdef readonly Morphologizer morphologizer
|
||||||
|
|
||||||
cdef object _prefix_re
|
cdef object _prefix_re
|
||||||
cdef object _suffix_re
|
cdef object _suffix_re
|
||||||
cdef object _infix_re
|
cdef object _infix_re
|
||||||
|
|
||||||
cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1
|
|
||||||
|
|
||||||
cpdef Tokens tokens_from_list(self, list strings)
|
cpdef Tokens tokens_from_list(self, list strings)
|
||||||
cpdef Tokens tokenize(self, unicode text)
|
cpdef Tokens tokenize(self, unicode text)
|
||||||
|
|
||||||
|
|
|
spacy/lang.pyx

@@ -14,7 +14,6 @@ from libc.stdio cimport fopen, fclose, fread, fwrite, FILE
 from cymem.cymem cimport Pool
 from murmurhash.mrmr cimport hash64
 from preshed.maps cimport PreshMap
-from .lemmatizer import Lemmatizer
 
 from .lexeme cimport Lexeme
 from .lexeme cimport EMPTY_LEXEME
@@ -26,8 +25,6 @@ from .utf8string cimport slice_unicode
 from . import util
 from .util import read_lang_data
 from .tokens import Tokens
 
-from .tagger cimport NOUN, VERB, ADJ, N_UNIV_TAGS
 from .tokens cimport Morphology
-
 
@@ -43,39 +40,16 @@ cdef class Language:
         self._infix_re = re.compile(infix)
         self.lexicon = Lexicon(self.get_props)
         self._load_special_tokenization(rules)
-        self._lemmas = PreshMapArray(N_UNIV_TAGS)
         self.pos_tagger = None
-        self.lemmatizer = None
+        self.morphologizer = None
 
     def load(self):
-        self.lemmatizer = Lemmatizer(path.join(util.DATA_DIR, 'wordnet'))
         self.lexicon.load(path.join(util.DATA_DIR, self.name, 'lexemes'))
         self.lexicon.strings.load(path.join(util.DATA_DIR, self.name, 'strings'))
         if path.exists(path.join(util.DATA_DIR, self.name, 'pos')):
             self.pos_tagger = Tagger(path.join(util.DATA_DIR, self.name, 'pos'))
+            self.morphologizer = Morphologizer(self.lexicon.strings,
+                                               path.join(util.DATA_DIR, self.name))
 
-    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
-        if self.lemmatizer is None:
-            return lex.sic
-        if pos != NOUN and pos != VERB and pos != ADJ:
-            return lex.sic
-        cdef int lemma = <int><size_t>self._lemmas.get(pos, lex.sic)
-        if lemma != 0:
-            return lemma
-        cdef bytes py_string = self.lexicon.strings[lex.sic]
-        cdef set lemma_strings
-        cdef bytes lemma_string
-        if pos == NOUN:
-            lemma_strings = self.lemmatizer.noun(py_string)
-        elif pos == VERB:
-            lemma_strings = self.lemmatizer.verb(py_string)
-        else:
-            assert pos == ADJ
-            lemma_strings = self.lemmatizer.adj(py_string)
-        lemma_string = sorted(lemma_strings)[0]
-        lemma = self.lexicon.strings.intern(lemma_string, len(lemma_string)).i
-        self._lemmas.set(pos, lex.sic, <void*>lemma)
-        return lemma
-
     cpdef Tokens tokens_from_list(self, list strings):
         cdef int length = sum([len(s) for s in strings])
spacy/morphology.pxd | 42 (new file)

@@ -0,0 +1,42 @@
+from .tokens cimport TokenC, Morphology
+from .lexeme cimport Lexeme
+from .utf8string cimport StringStore
+
+from preshed.maps cimport PreshMapArray
+from cymem.cymem cimport Pool
+
+# Google universal tag set
+cpdef enum univ_tag_t:
+    NO_TAG
+    ADJ
+    ADV
+    ADP
+    CONJ
+    DET
+    NOUN
+    NUM
+    PRON
+    PRT
+    VERB
+    X
+    PUNCT
+    EOL
+    N_UNIV_TAGS
+
+
+cdef struct PosTag:
+    Morphology morph
+    int id
+    univ_tag_t pos
+
+
+cdef class Morphologizer:
+    cdef Pool mem
+    cdef StringStore strings
+    cdef object lemmatizer
+    cdef PosTag* tags
+
+    cdef PreshMapArray _morph
+    cdef PreshMapArray _lemmas
+    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1
+    cdef int set_morph(self, const int i, TokenC* tokens) except -1
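To make the declared data layout concrete, here is a pure-Python rendering of univ_tag_t and PosTag (for illustration only; the real types above are C-level declarations):

    from dataclasses import dataclass, field
    from enum import IntEnum

    # Same member order as the cpdef enum above, so UnivTag.NOUN == 6
    # and UnivTag.VERB == 10.
    UnivTag = IntEnum("UnivTag", "NO_TAG ADJ ADV ADP CONJ DET NOUN NUM PRON "
                                 "PRT VERB X PUNCT EOL N_UNIV_TAGS", start=0)

    @dataclass
    class PosTag:
        id: int          # index into the fine-grained tag_names list
        pos: UnivTag     # coarse Google universal tag
        morph: dict = field(default_factory=dict)  # number, tenspect, mood, ...

    print(PosTag(id=3, pos=UnivTag.NOUN, morph={"number": 2}))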
spacy/morphology.pyx | 81 (new file)

@@ -0,0 +1,81 @@
+from os import path
+import json
+
+from .lemmatizer import Lemmatizer
+
+
+UNIV_TAGS = {
+    'NULL': NO_TAG,
+    'ADJ': ADJ,
+    'ADV': ADV,
+    'ADP': ADP,
+    'CONJ': CONJ,
+    'DET': DET,
+    'NOUN': NOUN,
+    'NUM': NUM,
+    'PRON': PRON,
+    'PRT': PRT,
+    'VERB': VERB,
+    'X': X,
+    '.': PUNCT,
+    'EOL': EOL
+}
+
+
+cdef class Morphologizer:
+    """Given a POS tag and a Lexeme, find its lemma and morphological analysis.
+    """
+    def __init__(self, StringStore strings, data_dir):
+        self.mem = Pool()
+        self.strings = strings
+        cfg = json.load(open(path.join(data_dir, 'pos', 'config.json')))
+        tag_map = cfg['tag_map']
+        tag_names = cfg['tag_names']
+        self.lemmatizer = Lemmatizer(path.join(data_dir, '..', 'wordnet'))
+        self._lemmas = PreshMapArray(N_UNIV_TAGS)
+        self._morph = PreshMapArray(len(tag_names))
+        self.tags = <PosTag*>self.mem.alloc(len(tag_names), sizeof(PosTag))
+        for i, tag in enumerate(tag_names):
+            pos, props = tag_map[tag]
+            self.tags[i].id = i
+            self.tags[i].pos = pos
+            self.tags[i].morph.number = props.get('number', 0)
+            self.tags[i].morph.tenspect = props.get('tenspect', 0)
+            self.tags[i].morph.mood = props.get('mood', 0)
+            self.tags[i].morph.gender = props.get('gender', 0)
+            self.tags[i].morph.person = props.get('person', 0)
+            self.tags[i].morph.case = props.get('case', 0)
+            self.tags[i].morph.misc = props.get('misc', 0)
+
+    cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
+        if self.lemmatizer is None:
+            return lex.sic
+        if pos != NOUN and pos != VERB and pos != ADJ:
+            return lex.sic
+        cdef int lemma = <int><size_t>self._lemmas.get(pos, lex.sic)
+        if lemma != 0:
+            return lemma
+        cdef bytes py_string = self.strings[lex.sic]
+        cdef set lemma_strings
+        cdef bytes lemma_string
+        if pos == NOUN:
+            lemma_strings = self.lemmatizer.noun(py_string)
+        elif pos == VERB:
+            lemma_strings = self.lemmatizer.verb(py_string)
+        else:
+            assert pos == ADJ
+            lemma_strings = self.lemmatizer.adj(py_string)
+        lemma_string = sorted(lemma_strings)[0]
+        lemma = self.strings.intern(lemma_string, len(lemma_string)).i
+        self._lemmas.set(pos, lex.sic, <void*>lemma)
+        return lemma
+
+    cdef int set_morph(self, const int i, TokenC* tokens) except -1:
+        cdef const PosTag* tag = &self.tags[tokens[i].pos]
+        tokens[i].lemma = self.lemmatize(tag.pos, tokens[i].lex)
+        morph = <Morphology*>self._morph.get(tag.id, tokens[i].lemma)
+        if morph is NULL:
+            self._morph.set(tag.id, tokens[i].lemma, <void*>&tag.morph)
+            tokens[i].morph = tag.morph
+        else:
+            tokens[i].morph = morph[0]
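__init__ above implies a config.json layout in which tag_map maps each fine-grained tag name to a [universal_pos, features] pair. A hypothetical fragment, shown as the structure json.load() would return; the direct assignment to self.tags[i].pos suggests the universal tag is stored as the integer value of univ_tag_t, and all concrete tags and feature values here are invented for illustration:

    cfg = {
        "tag_names": ["NN", "NNS", "VBD"],
        "tag_map": {
            "NN":  [6, {}],               # NOUN, no extra features
            "NNS": [6, {"number": 2}],    # NOUN, plural
            "VBD": [10, {"tenspect": 2}]  # VERB, past tense
        },
    }
    pos, props = cfg["tag_map"]["NNS"]
    assert pos == 6 and props.get("number", 0) == 2

lemmatize itself is a cache keyed by (universal POS, string id): return a previously interned lemma if one exists, otherwise query WordNet via the Lemmatizer and intern the alphabetically first candidate. The same logic in plain Python, minus the C-level casts and with a stand-in lemmatizer:

    NOUN, VERB, ADJ = 6, 10, 1   # integer values of univ_tag_t above

    class StubLemmatizer:
        # Stand-in for spacy.lemmatizer.Lemmatizer: returns candidate lemma sets.
        def noun(self, s): return {s[:-1]} if s.endswith("s") else {s}
        def verb(self, s): return {s}
        def adj(self, s):  return {s}

    _lemmas = {}   # plays the role of the PreshMapArray cache

    def lemmatize(pos, word, lemmatizer):
        if pos not in (NOUN, VERB, ADJ):
            return word                    # only open-class words are lemmatized
        if (pos, word) in _lemmas:
            return _lemmas[pos, word]      # cache hit
        analyse = {NOUN: lemmatizer.noun, VERB: lemmatizer.verb,
                   ADJ: lemmatizer.adj}[pos]
        lemma = sorted(analyse(word))[0]   # deterministic pick, as in the diff
        _lemmas[pos, word] = lemma         # cache for next lookup
        return lemma

    print(lemmatize(NOUN, "dogs", StubLemmatizer()))   # -> dog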
spacy/tagger.pxd

@@ -12,31 +12,6 @@ from .typedefs cimport hash_t
 from .tokens cimport Tokens, Morphology
 
 
-# Google universal tag set
-cdef enum univ_tag_t:
-    NO_TAG
-    ADJ
-    ADV
-    ADP
-    CONJ
-    DET
-    NOUN
-    NUM
-    PRON
-    PRT
-    VERB
-    X
-    PUNCT
-    EOL
-    N_UNIV_TAGS
-
-
-cdef struct PosTag:
-    Morphology morph
-    int id
-    univ_tag_t pos
-
-
 cdef class Tagger:
     cdef class_t predict(self, const atom_t* context, object golds=*) except *
 
@@ -45,5 +20,4 @@ cdef class Tagger:
     cpdef readonly LinearModel model
 
     cpdef readonly list tag_names
-    cdef PosTag* tags
     cdef dict tagdict
spacy/tagger.pyx

@@ -34,23 +34,10 @@ cdef class Tagger:
         self.mem = Pool()
         cfg = json.load(open(path.join(model_dir, 'config.json')))
         templates = cfg['templates']
-        tag_map = cfg['tag_map']
         univ_counts = {}
         cdef unicode tag
         cdef unicode univ_tag
         self.tag_names = cfg['tag_names']
-        self.tags = <PosTag*>self.mem.alloc(len(self.tag_names), sizeof(PosTag))
-        for i, tag in enumerate(self.tag_names):
-            pos, props = tag_map[tag]
-            self.tags[i].id = i
-            self.tags[i].pos = pos
-            self.tags[i].morph.number = props.get('number', 0)
-            self.tags[i].morph.tenspect = props.get('tenspect', 0)
-            self.tags[i].morph.mood = props.get('mood', 0)
-            self.tags[i].morph.gender = props.get('gender', 0)
-            self.tags[i].morph.person = props.get('person', 0)
-            self.tags[i].morph.case = props.get('case', 0)
-            self.tags[i].morph.misc = props.get('misc', 0)
         self.tagdict = _make_tag_dict(cfg['tag_counts'])
         self.extractor = Extractor(templates)
         self.model = LinearModel(len(self.tag_names), self.extractor.n_templ+2)
@@ -85,23 +72,6 @@ cdef class Tagger:
         return tag_id
 
 
-UNIV_TAGS = {
-    'NULL': NO_TAG,
-    'ADJ': ADJ,
-    'ADV': ADV,
-    'ADP': ADP,
-    'CONJ': CONJ,
-    'DET': DET,
-    'NOUN': NOUN,
-    'NUM': NUM,
-    'PRON': PRON,
-    'PRT': PRT,
-    'VERB': VERB,
-    'X': X,
-    '.': PUNCT,
-    'EOL': EOL
-}
-
 
 def _make_tag_dict(counts):
     freq_thresh = 50