Remove enums from morphology

This commit is contained in:
Matthew Honnibal 2019-03-07 17:14:57 +01:00
parent 932d7dde1c
commit fed0371db7
7 changed files with 487 additions and 1023 deletions

View File

@ -31,388 +31,3 @@ cdef class Morphology:
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1
cdef int _assign_tag_from_exceptions(self, TokenC* token, int tag_id) except -1
# Enumeration of morphological feature values.
#
# Layout, as visible in the member list below:
#   * NIL = 0 is the first member and the only one with an explicit value.
#   * Members are grouped per feature field. Each group is bracketed by a pair
#     of sentinel members, begin_<Field> and end_<Field>, and the members in
#     between are named <Field>_<value>.
#
# Because only NIL carries an explicit value, every other member takes the
# next consecutive integer in declaration order (C enum semantics), so
# inserting, removing or reordering members changes the numeric value of
# everything that follows.
#
# NOTE(review): the begin_<Field> sentinels appear to double as per-field
# offsets (the morphologizer looked up 'begin_%s' % field) -- confirm against
# the morphology implementation before relying on that.
# NOTE(review): the trailing comments on some members ("cz", "fi", "bq",
# "U", "U20", ...) look like tags for the tagset or language the value comes
# from; their exact meaning is not established here -- TODO confirm.
cdef enum univ_morph_t:
NIL = 0
begin_Abbr
Abbr_yes
end_Abbr
begin_AdpType
AdpType_circ
AdpType_comprep
AdpType_prep
AdpType_post
AdpType_voc
end_AdpType
begin_AdvType
AdvType_adadj
AdvType_cau
AdvType_deg
AdvType_ex
AdvType_loc
AdvType_man
AdvType_mod
AdvType_sta
AdvType_tim
end_AdvType
begin_Animacy
Animacy_anim
Animacy_hum
Animacy_inan
Animacy_nhum
end_Animacy
begin_Aspect
Aspect_freq
Aspect_imp
Aspect_mod
Aspect_none
Aspect_perf
end_Aspect
begin_Case
Case_abe
Case_abl
Case_abs
Case_acc
Case_ade
Case_all
Case_cau
Case_com
Case_dat
Case_del
Case_dis
Case_ela
Case_ess
Case_gen
Case_ill
Case_ine
Case_ins
Case_loc
Case_lat
Case_nom
Case_par
Case_sub
Case_sup
Case_tem
Case_ter
Case_tra
Case_voc
end_Case
begin_ConjType
ConjType_comp # cz, U
ConjType_oper # cz, U
end_ConjType
begin_Connegative
Connegative_yes # fi
end_Connegative
begin_Definite
Definite_cons # U20
Definite_def
Definite_ind
Definite_red
Definite_two
end_Definite
begin_Degree
Degree_abs
Degree_cmp
Degree_comp
Degree_none
Degree_pos
Degree_sup
Degree_com
Degree_dim # du
end_Degree
begin_Derivation
Derivation_minen # fi
Derivation_sti # fi
Derivation_inen # fi
Derivation_lainen # fi
Derivation_ja # fi
Derivation_ton # fi
Derivation_vs # fi
Derivation_ttain # fi
Derivation_ttaa # fi
end_Derivation
begin_Echo
Echo_rdp # U
Echo_ech # U
end_Echo
begin_Foreign
Foreign_foreign # cz, fi, U
Foreign_fscript # cz, fi, U
Foreign_tscript # cz, U
Foreign_yes # sl
end_Foreign
begin_Gender
Gender_com
Gender_fem
Gender_masc
Gender_neut
Gender_dat_masc # bq, U
Gender_dat_fem # bq, U
Gender_erg_masc # bq
Gender_erg_fem # bq
Gender_psor_masc # cz, sl, U
Gender_psor_fem # cz, sl, U
Gender_psor_neut # sl
end_Gender
begin_Hyph
Hyph_yes # cz, U
end_Hyph
begin_InfForm
InfForm_one # fi
InfForm_two # fi
InfForm_three # fi
end_InfForm
begin_Mood
Mood_cnd
Mood_imp
Mood_ind
Mood_n
Mood_pot
Mood_sub
Mood_opt
end_Mood
begin_NameType
NameType_geo # U, cz
NameType_prs # U, cz
NameType_giv # U, cz
NameType_sur # U, cz
NameType_nat # U, cz
NameType_com # U, cz
NameType_pro # U, cz
NameType_oth # U, cz
end_NameType
begin_Negative
Negative_neg
Negative_pos
Negative_yes
end_Negative
begin_NounType
NounType_com # U
NounType_prop # U
NounType_class # U
end_NounType
begin_Number
Number_com
Number_dual
Number_none
Number_plur
Number_sing
Number_ptan # bg
Number_count # bg
Number_abs_sing # bq, U
Number_abs_plur # bq, U
Number_dat_sing # bq, U
Number_dat_plur # bq, U
Number_erg_sing # bq, U
Number_erg_plur # bq, U
Number_psee_sing # U
Number_psee_plur # U
Number_psor_sing # cz, fi, sl, U
Number_psor_plur # cz, fi, sl, U
end_Number
begin_NumForm
NumForm_digit # cz, sl, U
NumForm_roman # cz, sl, U
NumForm_word # cz, sl, U
end_NumForm
begin_NumType
NumType_card
NumType_dist
NumType_frac
NumType_gen
NumType_mult
NumType_none
NumType_ord
NumType_sets
end_NumType
begin_NumValue
NumValue_one # cz, U
NumValue_two # cz, U
NumValue_three # cz, U
end_NumValue
begin_PartForm
PartForm_pres # fi
PartForm_past # fi
PartForm_agt # fi
PartForm_neg # fi
end_PartForm
begin_PartType
PartType_mod # U
PartType_emp # U
PartType_res # U
PartType_inf # U
PartType_vbp # U
end_PartType
begin_Person
Person_one
Person_two
Person_three
Person_none
Person_abs_one # bq, U
Person_abs_two # bq, U
Person_abs_three # bq, U
Person_dat_one # bq, U
Person_dat_two # bq, U
Person_dat_three # bq, U
Person_erg_one # bq, U
Person_erg_two # bq, U
Person_erg_three # bq, U
Person_psor_one # fi, U
Person_psor_two # fi, U
Person_psor_three # fi, U
end_Person
begin_Polarity
Polarity_neg # U20
Polarity_pos # U20
end_Polarity
begin_Polite
Polite_inf # bq, U
Polite_pol # bq, U
Polite_abs_inf # bq, U
Polite_abs_pol # bq, U
Polite_erg_inf # bq, U
Polite_erg_pol # bq, U
Polite_dat_inf # bq, U
Polite_dat_pol # bq, U
end_Polite
begin_Poss
Poss_yes
end_Poss
begin_Prefix
Prefix_yes # U
end_Prefix
begin_PrepCase
PrepCase_npr # cz
PrepCase_pre # U
end_PrepCase
begin_PronType
PronType_advPart
PronType_art
PronType_default
PronType_dem
PronType_ind
PronType_int
PronType_neg
PronType_prs
PronType_rcp
PronType_rel
PronType_tot
PronType_clit
PronType_exc # es, ca, it, fa
end_PronType
begin_PunctSide
PunctSide_ini # U
PunctSide_fin # U
end_PunctSide
begin_PunctType
PunctType_peri # U
PunctType_qest # U
PunctType_excl # U
PunctType_quot # U
PunctType_brck # U
PunctType_comm # U
PunctType_colo # U
PunctType_semi # U
PunctType_dash # U
end_PunctType
begin_Reflex
Reflex_yes
end_Reflex
begin_Style
Style_arch # cz, fi, U
Style_rare # cz, fi, U
Style_poet # cz, U
Style_norm # cz, U
Style_coll # cz, U
Style_vrnc # cz, U
Style_sing # cz, U
Style_expr # cz, U
Style_derg # cz, U
Style_vulg # cz, U
Style_yes # fi, U
end_Style
begin_StyleVariant
StyleVariant_styleShort # cz
StyleVariant_styleBound # cz, sl
end_StyleVariant
begin_Tense
Tense_fut
Tense_imp
Tense_past
Tense_pres
end_Tense
begin_Typo
Typo_yes
end_Typo
begin_VerbForm
VerbForm_fin
VerbForm_ger
VerbForm_inf
VerbForm_none
VerbForm_part
VerbForm_partFut
VerbForm_partPast
VerbForm_partPres
VerbForm_sup
VerbForm_trans
VerbForm_conv # U20
VerbForm_gdv # la
end_VerbForm
begin_VerbType
VerbType_aux # U
VerbType_cop # U
VerbType_mod # U
VerbType_light # U
end_VerbType
begin_Voice
Voice_act
Voice_cau
Voice_pass
Voice_mid # gkc
Voice_int # hb
end_Voice

File diff suppressed because it is too large. (Load Diff)

View File

@ -16,7 +16,7 @@ from ..compat import basestring_
from ..tokens.doc cimport Doc
from ..vocab cimport Vocab
from ..morphology cimport Morphology
from ..morphology import parse_feature, IDS, FIELDS, FIELD_SIZES, NAMES
from ..morphology import get_field_size, get_field_offset, parse_feature, FIELDS
class Morphologizer(Pipe):
@ -27,7 +27,7 @@ class Morphologizer(Pipe):
if cfg.get('pretrained_dims') and not cfg.get('pretrained_vectors'):
raise ValueError(TempErrors.T008)
if attr_nums is None:
attr_nums = list(FIELD_SIZES)
attr_nums = [get_field_size(name) for name in FIELDS]
return build_morphologizer_model(attr_nums, **cfg)
def __init__(self, vocab, model=True, **cfg):
@ -76,7 +76,7 @@ class Morphologizer(Pipe):
cdef Doc doc
cdef Vocab vocab = self.vocab
field_names = list(FIELDS)
offsets = [IDS['begin_%s' % field] for field in field_names]
offsets = [get_field_offset(field) for field in field_names]
for i, doc in enumerate(docs):
doc_scores = batch_scores[i]
doc_guesses = scores_to_guesses(doc_scores, self.model.softmax.out_sizes)

View File

@ -2,7 +2,6 @@ from libc.stdint cimport uint8_t, uint32_t, int32_t, uint64_t
from .typedefs cimport flags_t, attr_t, hash_t
from .parts_of_speech cimport univ_pos_t
from .morphology cimport univ_morph_t
cdef struct LexemeC:

View File

@ -69,7 +69,6 @@ def test_doc_retokenize_retokenizer_attrs(en_tokenizer):
assert doc[4].ent_type_ == "ORG"
@pytest.mark.xfail
def test_doc_retokenize_lex_attrs(en_tokenizer):
"""Test that lexical attributes can be changed (see #2390)."""
doc = en_tokenizer("WKRO played beach boys songs")

View File

@ -2,7 +2,7 @@ from __future__ import unicode_literals
import pytest
from ...morphology import Morphology
from ...strings import StringStore
from ...strings import StringStore, get_string_id
from ...lemmatizer import Lemmatizer
from ...morphology import *
@ -17,14 +17,14 @@ def test_add_morphology_with_string_names(morphology):
morphology.add({"Case_gen", "Number_sing"})
def test_add_morphology_with_int_ids(morphology):
morphology.add({Case_gen, Number_sing})
morphology.add({get_string_id("Case_gen"), get_string_id("Number_sing")})
def test_add_morphology_with_mix_strings_and_ints(morphology):
morphology.add({PunctSide_ini, 'VerbType_aux'})
morphology.add({get_string_id("PunctSide_ini"), 'VerbType_aux'})
def test_morphology_tags_hash_distinctly(morphology):
tag1 = morphology.add({PunctSide_ini, 'VerbType_aux'})
tag1 = morphology.add({"PunctSide_ini", 'VerbType_aux'})
tag2 = morphology.add({"Case_gen", 'Number_sing'})
assert tag1 != tag2

View File

@ -22,6 +22,7 @@ from ..compat import is_config
from ..errors import Errors, Warnings, user_warning, models_warning
from .. import util
from .underscore import Underscore, get_ext_args
from .morphanalysis cimport MorphAnalysis
cdef class Token:
@ -176,6 +177,10 @@ cdef class Token:
def __get__(self):
return self.c.morph
property morph:
def __get__(self):
return MorphAnalysis.from_id(self.vocab, self.c.morph)
property lex_id:
"""RETURNS (int): Sequential ID of the token's lexical type."""
def __get__(self):