From 3879d28457ecf66f37e1e9c0da8ec29661144e52 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 23 Aug 2015 02:40:35 +0200 Subject: [PATCH 01/48] * Fix https for url detection --- spacy/orth.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/orth.pyx b/spacy/orth.pyx index 6ffac839b..ca4bbd9ba 100644 --- a/spacy/orth.pyx +++ b/spacy/orth.pyx @@ -69,7 +69,7 @@ TLDs = set("com|org|edu|gov|net|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|mu cpdef bint like_url(unicode string): # We're looking for things that function in text like URLs. So, valid URL # or not, anything they say http:// is going to be good. - if string.startswith('http://'): + if string.startswith('http://') or string.startswith('https://'): return True elif string.startswith('www.') and len(string) >= 5: return True From 6f1743692add1507b76b30ac6b347c662467446f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 23 Aug 2015 20:49:18 +0200 Subject: [PATCH 02/48] * Work on language-independent refactoring --- spacy/en/__init__.py | 2 ++ spacy/lexeme.pxd | 33 ++++++++++++++++++++++++++-- spacy/lexeme.pyx | 9 +++----- spacy/matcher.pyx | 7 +++--- spacy/orth.pyx | 1 + spacy/strings.pyx | 2 ++ spacy/tokens/doc.pyx | 6 ++--- spacy/tokens/token.pyx | 27 ++++++++++++----------- spacy/vocab.pxd | 5 +++-- spacy/vocab.pyx | 50 +++++++++++++++++++++--------------------- 10 files changed, 88 insertions(+), 54 deletions(-) diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py index a04b615da..3d433e497 100644 --- a/spacy/en/__init__.py +++ b/spacy/en/__init__.py @@ -41,6 +41,8 @@ def get_lex_props(string, oov_prob=-30, is_oov=False): 'sentiment': 0 } +get_lex_attr = {} + if_model_present = -1 LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data') diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index 321f7c616..510840b2b 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -4,6 +4,7 @@ from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTE from .structs cimport LexemeC from .strings cimport StringStore +from .vocab cimport Vocab from numpy cimport ndarray @@ -15,7 +16,8 @@ cdef class Lexeme: cdef readonly Vocab vocab cdef readonly attr_t orth - cdef int set_struct_props(Vocab vocab, LexemeC* lex, dict props) except -1: + @staticmethod + cdef inline int set_struct_props(Vocab vocab, LexemeC* lex, dict props) except -1: lex.length = props['length'] lex.orth = vocab.strings[props['orth']] lex.lower = vocab.strings[props['lower']] @@ -29,7 +31,6 @@ cdef class Lexeme: lex.sentiment = props['sentiment'] lex.flags = props['flags'] - lex.repvec = empty_vec @staticmethod cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil: @@ -55,6 +56,34 @@ cdef class Lexeme: return lex.cluster else: return 0 + + @staticmethod + cdef inline void set_struct_attr(LexemeC* lex, attr_id_t name, attr_t value) nogil: + if name < (sizeof(flags_t) * 8): + Lexeme.set_flag(lex, name, value) + elif name == ID: + lex.id = value + elif name == LOWER: + lex.lower = value + elif name == NORM: + lex.norm = value + elif name == SHAPE: + lex.shape = value + elif name == PREFIX: + lex.prefix = value + elif name == SUFFIX: + lex.suffix = value + elif name == CLUSTER: + lex.cluster = value + @staticmethod cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil: return lexeme.flags & (1 << flag_id) + + @staticmethod + cdef inline bint set_flag(LexemeC* lex, attr_id_t flag_id, int value) nogil: + cdef flags_t one = 1 + if value: + lex.flags |= one << flag_id + 
else: + lex.flags &= ~(one << flag_id) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index f0b3303f1..4deec60c1 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -26,12 +26,9 @@ cdef class Lexeme: def __init__(self, Vocab vocab, int orth): self.vocab = vocab self.orth = orth - self.c = vocab.get_by_orth(orth) + self.c = vocab.get_by_orth(vocab.mem, orth) + assert self.c.orth == orth - property orth: - def __get__(self): - return self.c.orth - property lower: def __get__(self): return self.c.lower def __set__(self, int x): self.c.lower = x @@ -113,7 +110,7 @@ cdef class Lexeme: def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_URL, x) property like_num: - def __get__(self): return Lexeme.like_num(self.c, IKE_NUM) + def __get__(self): return Lexeme.check_flag(self.c, LIKE_NUM) def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, LIKE_NUM, x) property like_email: diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index 72473b073..9d1220648 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -103,20 +103,21 @@ cdef class Matcher: def __init__(self, vocab, patterns): self.mem = Pool() + self.vocab = vocab for entity_key, (etype, attrs, specs) in sorted(patterns.items()): self.add(entity_key, etype, attrs, specs) def add(self, entity_key, etype, attrs, specs): if isinstance(entity_key, basestring): - entity_key = vocab.strings[entity_key] + entity_key = self.vocab.strings[entity_key] if isinstance(etype, basestring): - etype = vocab.strings[etype] + etype = self.vocab.strings[etype] elif etype is None: etype = -1 # TODO: Do something more clever about multiple patterns for single # entity for spec in specs: - spec = _convert_strings(spec, vocab.strings) + spec = _convert_strings(spec, self.vocab.strings) self.patterns.push_back(init_pattern(self.mem, spec, etype)) @classmethod diff --git a/spacy/orth.pyx b/spacy/orth.pyx index ca4bbd9ba..df4e2dc32 100644 --- a/spacy/orth.pyx +++ b/spacy/orth.pyx @@ -92,6 +92,7 @@ cpdef bint like_url(unicode string): return False +# TODO: This should live in the language.orth NUM_WORDS = set('zero one two three four five six seven eight nine ten' 'eleven twelve thirteen fourteen fifteen sixteen seventeen' 'eighteen nineteen twenty thirty forty fifty sixty seventy' diff --git a/spacy/strings.pyx b/spacy/strings.pyx index c187a6aa6..a4a470158 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -142,6 +142,8 @@ cdef class StringStore: def load(self, loc): with codecs.open(loc, 'r', 'utf8') as file_: strings = file_.read().split(SEPARATOR) + if strings == ['']: + return None cdef unicode string cdef bytes byte_string for string in strings: diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 7994c97c3..0fa562dfb 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -12,8 +12,7 @@ from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUST from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE from ..parts_of_speech import UNIV_POS_NAMES from ..parts_of_speech cimport CONJ, PUNCT, NOUN -from ..lexeme cimport check_flag -from ..lexeme cimport get_attr as get_lex_attr +from ..lexeme cimport Lexeme from .spans cimport Span from .token cimport Token from ..serialize.bits cimport BitArray @@ -47,7 +46,7 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil: elif feat_name == ENT_TYPE: return token.ent_type else: - return get_lex_attr(token.lex, feat_name) + return Lexeme.get_struct_attr(token.lex, feat_name) cdef class Doc: @@ -218,6 +217,7 @@ cdef class 
Doc: t.idx = 0 else: t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy + assert t.lex.orth != 0 t.spacy = has_space self.length += 1 self._py_tokens.append(None) diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index f1f2696cb..04945ecd1 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -1,6 +1,5 @@ from libc.string cimport memcpy from cpython.mem cimport PyMem_Malloc, PyMem_Free -from ..lexeme cimport check_flag # Compiler crashes on memory view coercion without this. Should report bug. from cython.view cimport array as cvarray cimport numpy as np @@ -20,6 +19,8 @@ from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP from ..attrs cimport IS_OOV +from ..lexeme cimport Lexeme + cdef class Token: """An individual token --- i.e. a word, a punctuation symbol, etc. Created @@ -42,7 +43,7 @@ cdef class Token: return self.string cpdef bint check_flag(self, attr_id_t flag_id) except -1: - return check_flag(self.c.lex, flag_id) + return Lexeme.check_flag(self.c.lex, flag_id) def nbor(self, int i=1): return self.doc[self.i+i] @@ -286,37 +287,37 @@ cdef class Token: return self.vocab.strings[self.c.dep] property is_oov: - def __get__(self): return check_flag(self.c.lex, IS_OOV) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_OOV) property is_alpha: - def __get__(self): return check_flag(self.c.lex, IS_ALPHA) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_ALPHA) property is_ascii: - def __get__(self): return check_flag(self.c.lex, IS_ASCII) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_ASCII) property is_digit: - def __get__(self): return check_flag(self.c.lex, IS_DIGIT) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_DIGIT) property is_lower: - def __get__(self): return check_flag(self.c.lex, IS_LOWER) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_LOWER) property is_title: - def __get__(self): return check_flag(self.c.lex, IS_TITLE) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_TITLE) property is_punct: - def __get__(self): return check_flag(self.c.lex, IS_PUNCT) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_PUNCT) property is_space: - def __get__(self): return check_flag(self.c.lex, IS_SPACE) + def __get__(self): return Lexeme.check_flag(self.c.lex, IS_SPACE) property like_url: - def __get__(self): return check_flag(self.c.lex, LIKE_URL) + def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_URL) property like_num: - def __get__(self): return check_flag(self.c.lex, LIKE_NUM) + def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_NUM) property like_email: - def __get__(self): return check_flag(self.c.lex, LIKE_EMAIL) + def __get__(self): return Lexeme.check_flag(self.c.lex, LIKE_EMAIL) _pos_id_to_string = {id_: string for string, id_ in UNIV_POS_NAMES.items()} diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd index 2503cdcee..cf7a46388 100644 --- a/spacy/vocab.pxd +++ b/spacy/vocab.pxd @@ -27,15 +27,16 @@ cdef class Vocab: cpdef public lexeme_props_getter cdef Pool mem cpdef readonly StringStore strings - cdef readonly object pos_tags cdef readonly int length cdef public object _serializer cdef public object data_dir - cdef public float oov_prob + cdef public object get_lex_attr + cdef public object pos_tags cdef const LexemeC* get(self, Pool mem, unicode string) except NULL cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL + cdef 
const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1 cdef PreshMap _by_hash diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index dcb7d575c..4c35ea41c 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -12,7 +12,6 @@ import math import json from .lexeme cimport EMPTY_LEXEME -from .lexeme cimport set_lex_struct_props from .lexeme cimport Lexeme from .strings cimport hash_string from .orth cimport word_shape @@ -36,17 +35,15 @@ EMPTY_LEXEME.repvec = EMPTY_VEC cdef class Vocab: '''A map container for a language's LexemeC structs. ''' - def __init__(self, data_dir=None, get_lex_attr=None): + def __init__(self, data_dir=None, get_lex_attr=None, load_vectors=True, pos_tags=None): self.mem = Pool() self._by_hash = PreshMap() self._by_orth = PreshMap() self.strings = StringStore() - self.pos_tags = pos_tags if pos_tags is not None else {} - self.get_lex_attr = get_lex_attr self.repvec_length = 0 - self.length = 0 - self._add_lex_to_vocab(0, &EMPTY_LEXEME) + self.length = 1 + self.pos_tags = pos_tags if data_dir is not None: if not path.exists(data_dir): raise IOError("Directory %s not found -- cannot load Vocab." % data_dir) @@ -84,7 +81,10 @@ cdef class Vocab: cdef LexemeC* lex cdef hash_t key = hash_string(string) lex = self._by_hash.get(key) + cdef size_t addr if lex != NULL: + print string, lex.orth, self.strings[string] + assert lex.orth == self.strings[string] return lex else: return self._new_lexeme(mem, string) @@ -103,15 +103,24 @@ cdef class Vocab: return self._new_lexeme(mem, self.strings[orth]) cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL: + cdef hash_t key cdef bint is_oov = mem is not self.mem - if len(string) < 3: - mem = self.mem + mem = self.mem + #if len(string) < 3: + # mem = self.mem lex = mem.alloc(sizeof(LexemeC), 1) - for attr, func in self.lex_attr_getters.items(): - Lexeme.set_struct_attr(lex, attr, func(string)) + lex.orth = self.strings[string] + lex.id = self.length + if self.get_lex_attr is not None: + for attr, func in self.get_lex_attr.items(): + value = func(string) + if isinstance(value, unicode): + value = self.strings[value] + Lexeme.set_struct_attr(lex, attr, value) if is_oov: lex.id = 0 else: + key = hash_string(string) self._add_lex_to_vocab(key, lex) assert lex != NULL, string return lex @@ -119,13 +128,14 @@ cdef class Vocab: cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1: self._by_hash.set(key, lex) self._by_orth.set(lex.orth, lex) + print "Add lex", key, lex.orth, self.strings[lex.orth] self.length += 1 def __iter__(self): cdef attr_t orth cdef size_t addr for orth, addr in self._by_orth.items(): - yield Lexeme.from_ptr(addr, self.strings, self.repvec_length) + yield Lexeme(self, orth) def __getitem__(self, id_or_string): '''Retrieve a lexeme, given an int ID or a unicode string. If a previously @@ -142,22 +152,12 @@ cdef class Vocab: An instance of the Lexeme Python class, with data copied on instantiation. 
''' - cdef const LexemeC* lexeme cdef attr_t orth - if type(id_or_string) == int: - orth = id_or_string - lexeme = self._by_orth.get(orth) - if lexeme == NULL: - raise KeyError(id_or_string) - assert lexeme.orth == orth, ('%d vs %d' % (lexeme.orth, orth)) - elif type(id_or_string) == unicode: - lexeme = self.get(self.mem, id_or_string) - assert lexeme.orth == self.strings[id_or_string] + if type(id_or_string) == unicode: + orth = self.strings[id_or_string] else: - raise ValueError("Vocab unable to map type: " - "%s. Maps unicode --> Lexeme or " - "int --> Lexeme" % str(type(id_or_string))) - return Lexeme.from_ptr(lexeme, self.strings, self.repvec_length) + orth = id_or_string + return Lexeme(self, orth) def dump(self, loc): if path.exists(loc): From 5d5922dbfaf160ef40f9ec62743fe51db1f86700 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 24 Aug 2015 01:04:30 +0200 Subject: [PATCH 03/48] * Begin laying out morphological features --- spacy/morphology.pxd | 721 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 721 insertions(+) diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 5dfee4250..6914eb8d6 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -2,3 +2,724 @@ from .structs cimport TokenC, Morphology, PosTag cdef int set_morph_from_dict(Morphology* morph, dict props) except -1 + + +cdef enum Feature: + Abbr + AdpType + AdvType + ConjType + Connegative + Derivation + Echo + Foreign + Gender_dat + Gender_erg + Gender_psor + Hyph + InfForm + NameType + NounType + NumberAbs + NumberDat + NumberErg + NumberPsee + NumberPsor + NumForm + NumValue + PartForm + PartType + Person_abs + Person_dat + Person_psor + Polite + Polite_abs + Polite_dat + Prefix + PrepCase + PunctSide + PunctType + Style + Typo + Variant + VerbType +cpdef enum Animacy: + Anim + Inam + + +cpdef enum Aspect: + Freq + Imp + Mod + None_ + Perf + + +cpdef enum Case1: + Abe + Abl + Abs + Acc + Ade + All + Cau + Com + +cdef enum Case2: + Dat + Del + Dis + Ela + Ess + Gen + Ill + Ine + +cdef enum Case3: + Ins + Loc + Lat + Nom + Par + Sub + Sup + Tem + Ter + + +cdef enum Case4: + Tra + Voc + + +cpdef enum Definite: + Two + Def + Red + Ind + + +cpdef enum Degree: + Cmp + Comp + None_ + Pos + Sup + Abs + Com + Degree # du + + +cpdef enum Gender: + Com + Fem + Masc + Neut + + +cpdef enum Mood: + Cnd + Imp + Ind + N + Pot + Sub + Opt + + +cpdef enum Negative: + Neg + Pos + Yes + + +cpdef enum Number: + Com + Dual + None_ + Plur + Sing + Ptan # bg + Count # bg + + +cpdef enum NumType: + Card + Dist + Frac + Gen + Mult + None_ + Ord + Sets + + +cpdef enum Person: + One + Two + Three + None_ + + +cpdef enum Poss: + Yes + + +cpdef enum PronType1: + AdvPart + Art + Default + Dem + Ind + Int + Neg + +cpdef enum PronType2: + Prs + Rcp + Rel + Tot + Clit + Exc # es, ca, it, fa + Clit # it + + +cpdef enum Reflex: + Yes + + +cpdef enum Tense: + Fut + Imp + Past + Pres + +cpdef enum VerbForm1: + Fin + Ger + Inf + None_ + Part + PartFut + PartPast + +cpdef enum VerbForm2: + PartPres + Sup + Trans + Gdv # la + + +cpdef enum Voice: + Act + Cau + Pass + Mid # gkc + Int # hb + + +cpdef enum Abbr: + Yes # cz, fi, sl, U + +cpdef enum AdpType: + Prep # cz, U + Post # U + Voc # cz + Comprep # cz + Circ # U + Voc # U + + +cpdef enum AdvType1: + # U + Man + Loc + Tim + Deg + Cau + Mod + Sta + Ex + +cpdef enum AdvType2: + Adadj + +cpdef enum ConjType: + Oper # cz, U + Comp # cz, U + +cpdef enum Connegative: + Yes # fi + + +cpdef enum Derivation1: + Minen # fi + Sti # fi + Inen # fi + Lainen # fi + Ja # fi + Ton # 
fi + Vs # fi + Ttain # fi + +cpdef enum Derivation2: + Ttaa + + +cpdef enum Echo: + Rdp # U + Ech # U + + +cpdef enum Foreign: + Foreign # cz, fi, U + Fscript # cz, fi, U + Tscript # cz, U + Yes # sl + + +cpdef enum Gender_dat: + Masc # bq, U + Fem # bq, U + + +cpdef enum Gender_erg: + Masc # bq + Fem # bq + + +cpdef enum Gender_psor: + Masc # cz, sl, U + Fem # cz, sl, U + Neut # sl + + +cpdef enum Hyph: + Yes # cz, U + + +cpdef enum InfForm: + One # fi + Two # fi + Three # fi + + +cpdef enum NameType: + Geo # U, cz + Prs # U, cz + Giv # U, cz + Sur # U, cz + Nat # U, cz + Com # U, cz + Pro # U, cz + Oth # U, cz + + +cpdef enum NounType: + Com # U + Prop # U + Class # U + +cpdef enum Number_abs: + Sing # bq, U + Plur # bq, U + +cpdef enum Number_dat: + Sing # bq, U + Plur # bq, U + +cpdef enum Number_erg: + Sing # bq, U + Plur # bq, U + +cpdef enum Number_psee: + Sing # U + Plur # U + + +cpdef enum Number_psor: + Sing # cz, fi, sl, U + Plur # cz, fi, sl, U + + +cpdef enum NumForm: + Digit # cz, sl, U + Roman # cz, sl, U + Word # cz, sl, U + + +cpdef enum NumValue: + One # cz, U + Two # cz, U + Three # cz, U + + +cpdef enum PartForm: + Pres # fi + Past # fi + Agt # fi + Neg # fi + + +cpdef enum PartType: + Mod # U + Emp # U + Res # U + Inf # U + Vbp # U + +cpdef enum Person_abs: + One # bq, U + Two # bq, U + Three # bq, U + + +cpdef enum Person_dat: + One # bq, U + Two # bq, U + Three # bq, U + + +cpdef enum Person_erg: + One # bq, U + Two # bq, U + Three # bq, U + + +cpdef enum Person_psor: + One # fi, U + Two # fi, U + Three # fi, U + + +cpdef enum Polite: + Inf # bq, U + Pol # bq, U + + +cpdef enum Polite_abs: + Inf # bq, U + Pol # bq, U + + +cpdef enum Polite_erg: + Inf # bq, U + Pol # bq, U + + +cpdef enum Polite_dat: + Inf # bq, U + Pol # bq, U + + +cpdef enum Prefix: + Yes # U + + +cpdef enum PrepCase: + Npr # cz + Pre # U + + +cpdef enum PunctSide: + Ini # U + Fin # U + +cpdef enum PunctType1: + Peri # U + Qest # U + Excl # U + Quot # U + Brck # U + Comm # U + Colo # U + Semi # U + +cpdef enum PunctType2: + Dash # U + + +cpdef enum Style1: + Arch # cz, fi, U + Rare # cz, fi, U + Poet # cz, U + Norm # cz, U + Coll # cz, U + Vrnc # cz, U + Sing # cz, U + Expr # cz, U + + +cpdef enum Style2: + Derg # cz, U + Vulg # cz, U + + +cpdef enum Typo: + Yes # fi, U + + +cpdef enum Variant: + Short # cz + Bound # cz, sl + + +cpdef enum VerbType: + Aux # U + Cop # U + Mod # U + Light # U + + +cpdef enum FeatureValues: + Animacy_Anim + Animacy_Inam + Aspect_Freq + Aspect_Imp + Aspect_Mod + Aspect_None_ + Aspect_Perf + Case_Abe + Case_Abl + Case_Abs + Case_Acc + Case_Ade + Case_All + Case_Cau + Case_Com + Case_Dat + Case_Del + Case_Dis + Case_Ela + Case_Ess + Case_Gen + Case_Ill + Case_Ine + Case_Ins + Case_Loc + Case_Lat + Case_Nom + Case_Par + Case_Sub + Case_Sup + Case_Tem + Case_Ter + Case_Tra + Case_Voc + Definite_Two + Definite_Def + Definite_Red + Definite_Ind + Degree_Cmp + Degree_Comp + Degree_None + Degree_Pos + Degree_Sup + Degree_Abs + Degree_Com + Degree_Dim # du + Gender_Com + Gender_Fem + Gender_Masc + Gender_Neut + Mood_Cnd + Mood_Imp + Mood_Ind + Mood_N + Mood_Pot + Mood_Sub + Mood_Opt + Negative_Neg + Negative_Pos + Negative_Yes + Number_Com + Number_Dual + Number_None + Number_Plur + Number_Sing + Number_Ptan # bg + Number_Count # bg + NumType_Card + NumType_Dist + NumType_Frac + NumType_Gen + NumType_Mult + NumType_None + NumType_Ord + NumType_Sets + Person_One + Person_Two + Person_Three + Person_None + Poss_Yes + PronType_AdvPart + PronType_Art + PronType_Default + 
PronType_Dem + PronType_Ind + PronType_Int + PronType_Neg + PronType_Prs + PronType_Rcp + PronType_Rel + PronType_Tot + PronType_Clit + PronType_Exc # es, ca, it, fa + PronType_Clit # it + Reflex_Yes + Tense_Fut + Tense_Imp + Tense_Past + Tense_Pres + VerbForm_Fin + VerbForm_Ger + VerbForm_Inf + VerbForm_None + VerbForm_Part + VerbForm_PartFut + VerbForm_PartPast + VerbForm_PartPres + VerbForm_Sup + VerbForm_Trans + VerbForm_Gdv # la + Voice_Act + Voice_Cau + Voice_Pass + Voice_Mid # gkc + Voice_Int # hb + Abbr_Yes # cz, fi, sl, U + AdpType_Prep # cz, U + AdpType_Post # U + AdpType_Voc # cz + AdpType_Comprep # cz + AdpType_Circ # U + AdpType_Voc # U + AdvType_Man + AdvType_Loc + AdvType_Tim + AdvType_Deg + AdvType_Cau + AdvType_Mod + AdvType_Sta + AdvType_Ex + AdvType_Adadj + ConjType_Oper # cz, U + ConjType_Comp # cz, U + Connegative_Yes # fi + # fi + Derivation_Minen + Derivation_Sti + Derivation_Inen + Derivation_Lainen + Derivation_Ja + Derivation_Ton + Derivation_Vs + Derivation_Ttain + Derivation_Ttaa + Echo_Rdp # U + Echo_Ech # U + Foreign_Foreign # cz, fi, U + Foreign_Fscript # cz, fi, U + Foreign_Tscript # cz, U + Foreign_Yes # sl + Gender_dat_Masc # bq, U + Gender_dat_Fem # bq, U + Gender_erg_Masc # bq + Gender_erg_Fem # bq + Gender_psor_Masc # cz, sl, U + Gender_psor_Fem # cz, sl, U + Gender_psor_Neut # sl + Hyph_Yes # cz, U + InfForm_One # fi + InfForm_Two # fi + InfForm_Three # fi + NameType_Geo # U, cz + NameType_Prs # U, cz + NameType_Giv # U, cz + NameType_Sur # U, cz + NameType_Nat # U, cz + NameType_Com # U, cz + NameType_Pro # U, cz + NameType_Oth # U, cz + NounType_Com # U + NounType_Prop # U + NounType_Class # U + Number_abs_Sing # bq, U + Number_abs_Plur # bq, U + Number_dat_Sing # bq, U + Number_dat_Plur # bq, U + Number_erg_Sing # bq, U + Number_erg_Plur # bq, U + Number_psee_Sing # U + Number_psee_Plur # U + Number_psor_Sing # cz, fi, sl, U + Number_psor_Plur # cz, fi, sl, U + NumForm_Digit # cz, sl, U + NumForm_Roman # cz, sl, U + NumForm_Word # cz, sl, U + NumValue_One # cz, U + NumValue_Two # cz, U + NumValue_Three # cz, U + PartForm_Pres # fi + PartForm_Past # fi + PartForm_Agt # fi + PartForm_Neg # fi + PartType_Mod # U + PartType_Emp # U + PartType_Res # U + PartType_Inf # U + PartType_Vbp # U + Person_abs_One # bq, U + Person_abs_Two # bq, U + Person_abs_Three # bq, U + Person_dat_One # bq, U + Person_dat_Two # bq, U + Person_dat_Three # bq, U + Person_erg_One # bq, U + Person_erg_Two # bq, U + Person_erg_Three # bq, U + Person_psor_One # fi, U + Person_psor_Two # fi, U + Person_psor_Three # fi, U + Polite_Inf # bq, U + Polite_Pol # bq, U + Polite_abs_Inf # bq, U + Polite_abs_Pol # bq, U + Polite_erg_Inf # bq, U + Polite_erg_Pol # bq, U + Polite_dat_Inf # bq, U + Polite_dat_Pol # bq, U + Prefix_Yes # U + PrepCase_Npr # cz + PrepCase_Pre # U + PunctSide_Ini # U + PunctSide_Fin # U + PunctType_Peri # U + PunctType_Qest # U + PunctType_Excl # U + PunctType_Quot # U + PunctType_Brck # U + PunctType_Comm # U + PunctType_Colo # U + PunctType_Semi # U + PunctType_Dash # U + Style_Arch # cz, fi, U + Style_Rare # cz, fi, U + Style_Poet # cz, U + Style_Norm # cz, U + Style_Coll # cz, U + Style_Vrnc # cz, U + Style_Sing # cz, U + Style_Expr # cz, U + Style_Derg # cz, U + Style_Vulg # cz, U + Style_Yes # fi, U + StyleVariant_StyleShort # cz + StyleVariant_StyleBound # cz, sl + VerbType_Aux # U + VerbType_Cop # U + VerbType_Mod # U + VerbType_Light # U + + From bbf07ac253e12cdc2ec76dcdde46f5bc6c7dd51b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 24 Aug 2015 
01:05:20 +0200 Subject: [PATCH 04/48] * Cut down init_model to work on more languages --- bin/init_model.py | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/bin/init_model.py b/bin/init_model.py index 3307bffa8..9a635f296 100644 --- a/bin/init_model.py +++ b/bin/init_model.py @@ -29,8 +29,6 @@ from shutil import copytree import codecs from collections import defaultdict -from spacy.en import get_lex_props -from spacy.en.lemmatizer import Lemmatizer from spacy.vocab import Vocab from spacy.vocab import write_binary_vectors from spacy.strings import hash_string @@ -38,6 +36,11 @@ from preshed.counter import PreshCounter from spacy.parts_of_speech import NOUN, VERB, ADJ +import spacy.en +import spacy.de + + + def setup_tokenizer(lang_data_dir, tok_dir): if not tok_dir.exists(): @@ -139,7 +142,7 @@ def _read_senses(loc): return lexicon -def setup_vocab(src_dir, dst_dir): +def setup_vocab(get_lex_attr, src_dir, dst_dir): if not dst_dir.exists(): dst_dir.mkdir() @@ -148,13 +151,13 @@ def setup_vocab(src_dir, dst_dir): write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin')) else: print("Warning: Word vectors file not found") - vocab = Vocab(data_dir=None, get_lex_props=get_lex_props) + vocab = Vocab(data_dir=None, get_lex_attr=get_lex_attr) clusters = _read_clusters(src_dir / 'clusters.txt') probs, oov_prob = _read_probs(src_dir / 'words.sgt.prob') if not probs: probs, oov_prob = _read_freqs(src_dir / 'freqs.txt') if not probs: - oov_prob = 0.0 + oov_prob = -20 else: oov_prob = min(probs.values()) for word in clusters: @@ -163,23 +166,30 @@ def setup_vocab(src_dir, dst_dir): lexicon = [] for word, prob in reversed(sorted(list(probs.items()), key=lambda item: item[1])): - entry = get_lex_props(word) - entry['prob'] = float(prob) - cluster = clusters.get(word, '0') + lexeme = vocab[word] + lexeme.prob = prob + lexeme.is_oov = False # Decode as a little-endian string, so that we can do & 15 to get # the first 4 bits. 
See _parse_features.pyx - entry['cluster'] = int(cluster[::-1], 2) - vocab[word] = entry + if word in clusters: + lexeme.cluster = int(clusters[word][::-1], 2) + else: + lexeme.cluster = 0 vocab.dump(str(dst_dir / 'lexemes.bin')) vocab.strings.dump(str(dst_dir / 'strings.txt')) with (dst_dir / 'oov_prob').open('w') as file_: file_.write('%f' % oov_prob) -def main(lang_data_dir, corpora_dir, model_dir): +def main(lang_id, lang_data_dir, corpora_dir, model_dir): + languages = { + 'en': spacy.en.get_lex_attr, + 'de': spacy.en.get_lex_attr + } + model_dir = Path(model_dir) - lang_data_dir = Path(lang_data_dir) - corpora_dir = Path(corpora_dir) + lang_data_dir = Path(lang_data_dir) / lang_id + corpora_dir = Path(corpora_dir) / lang_id assert corpora_dir.exists() assert lang_data_dir.exists() @@ -188,12 +198,12 @@ def main(lang_data_dir, corpora_dir, model_dir): model_dir.mkdir() setup_tokenizer(lang_data_dir, model_dir / 'tokenizer') - setup_vocab(corpora_dir, model_dir / 'vocab') + setup_vocab(languages[lang_id], corpora_dir, model_dir / 'vocab') if (lang_data_dir / 'gazetteer.json').exists(): copyfile(str(lang_data_dir / 'gazetteer.json'), str(model_dir / 'vocab' / 'gazetteer.json')) - if not (model_dir / 'wordnet').exists(): + if not (model_dir / 'wordnet').exists() and (corpora_dir / 'wordnet').exists(): copytree(str(corpora_dir / 'wordnet' / 'dict'), str(model_dir / 'wordnet')) From 5dd76be4460b2d08ab9384c7142452e84f797ee3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 24 Aug 2015 05:25:55 +0200 Subject: [PATCH 05/48] * Split EnPosTagger up into base class and subclass --- setup.py | 2 +- spacy/en/pos.pxd | 27 +------- spacy/en/pos.pyx | 169 +++++++---------------------------------------- spacy/tagger.pxd | 27 ++++++++ spacy/tagger.pyx | 144 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 199 insertions(+), 170 deletions(-) create mode 100644 spacy/tagger.pxd create mode 100644 spacy/tagger.pyx diff --git a/setup.py b/setup.py index 218272504..fe55d0d5a 100644 --- a/setup.py +++ b/setup.py @@ -153,7 +153,7 @@ def main(modules, is_pypy): MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings', 'spacy.lexeme', 'spacy.vocab', 'spacy.attrs', - 'spacy.morphology', + 'spacy.morphology', 'spacy.tagger', 'spacy.syntax.stateclass', 'spacy._ml', 'spacy._theano', 'spacy.tokenizer', 'spacy.en.attrs', diff --git a/spacy/en/pos.pxd b/spacy/en/pos.pxd index 2fc7b4ac7..213752cf5 100644 --- a/spacy/en/pos.pxd +++ b/spacy/en/pos.pxd @@ -1,26 +1,5 @@ -from preshed.maps cimport PreshMapArray -from preshed.counter cimport PreshCounter -from cymem.cymem cimport Pool - -from .._ml cimport Model -from ..strings cimport StringStore -from ..structs cimport TokenC, LexemeC, Morphology, PosTag -from ..parts_of_speech cimport univ_pos_t -from .lemmatizer import Lemmatizer +from ..tagger cimport Tagger -cdef class EnPosTagger: - cdef readonly Pool mem - cdef readonly StringStore strings - cdef readonly Model model - cdef public object lemmatizer - cdef PreshMapArray _morph_cache - cdef public dict freqs - - cdef PosTag* tags - cdef readonly object tag_names - cdef readonly object tag_map - cdef readonly int n_tags - - cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1 - cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1 +cdef class EnPosTagger(Tagger): + pass diff --git a/spacy/en/pos.pyx b/spacy/en/pos.pyx index 569b209fc..703d7198c 100644 --- a/spacy/en/pos.pyx +++ b/spacy/en/pos.pyx @@ -218,155 +218,34 @@ POS_TEMPLATES = ( ) -cdef struct 
_CachedMorph: - Morphology morph - int lemma - - -def setup_model_dir(tag_names, tag_map, templates, model_dir): - if path.exists(model_dir): - shutil.rmtree(model_dir) - os.mkdir(model_dir) - config = { - 'templates': templates, - 'tag_names': tag_names, - 'tag_map': tag_map - } - with open(path.join(model_dir, 'config.json'), 'w') as file_: - json.dump(config, file_) - - -cdef class EnPosTagger: +cdef class EnPosTagger(Tagger): """A part-of-speech tagger for English""" - def __init__(self, StringStore strings, data_dir): - self.mem = Pool() - model_dir = path.join(data_dir, 'pos') - self.strings = strings - cfg = json.load(open(path.join(data_dir, 'pos', 'config.json'))) - self.tag_names = sorted(cfg['tag_names']) - assert self.tag_names - self.n_tags = len(self.tag_names) - self.tag_map = cfg['tag_map'] - cdef int n_tags = len(self.tag_names) + 1 - - self.model = Model(n_tags, cfg['templates'], model_dir) - self._morph_cache = PreshMapArray(n_tags) - self.tags = self.mem.alloc(n_tags, sizeof(PosTag)) - for i, tag in enumerate(sorted(self.tag_names)): - pos, props = self.tag_map[tag] - self.tags[i].id = i - self.tags[i].pos = pos - set_morph_from_dict(&self.tags[i].morph, props) - if path.exists(path.join(data_dir, 'tokenizer', 'morphs.json')): - self.load_morph_exceptions(json.load(open(path.join(data_dir, 'tokenizer', - 'morphs.json')))) - self.lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ) - self.freqs = {TAG: defaultdict(int)} - for tag in self.tag_names: - self.freqs[TAG][self.strings[tag]] = 1 - self.freqs[TAG][0] = 1 - - def __call__(self, Doc tokens): - """Apply the tagger, setting the POS tags onto the Doc object. - - Args: - tokens (Doc): The tokens to be tagged. - """ - if tokens.length == 0: - return 0 - cdef int i + def make_lemmatizer(self, data_dir): + return Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ) + + cdef int predict(self, int i, const TokenC* tokens) except -1: cdef atom_t[N_CONTEXT_FIELDS] context - cdef const weight_t* scores - for i in range(tokens.length): - if tokens.data[i].pos == 0: - fill_context(context, i, tokens.data) - scores = self.model.score(context) - guess = arg_max(scores, self.model.n_classes) - tokens.data[i].tag = self.strings[self.tag_names[guess]] - self.set_morph(i, &self.tags[guess], tokens.data) + _fill_from_token(&context[P2_orth], &tokens[i-2]) + _fill_from_token(&context[P1_orth], &tokens[i-1]) + _fill_from_token(&context[W_orth], &tokens[i]) + _fill_from_token(&context[N1_orth], &tokens[i+1]) + _fill_from_token(&context[N2_orth], &tokens[i+2]) + scores = self.model.score(context) + return arg_max(scores, self.model.n_classes) - tokens.is_tagged = True - tokens._py_tokens = [None] * tokens.length - - def tag_from_strings(self, Doc tokens, object tag_strs): - cdef int i - for i in range(tokens.length): - tokens.data[i].tag = self.strings[tag_strs[i]] - self.set_morph(i, &self.tags[self.tag_names.index(tag_strs[i])], - tokens.data) - tokens.is_tagged = True - tokens._py_tokens = [None] * tokens.length - - def train(self, Doc tokens, object gold_tag_strs): - cdef int i - cdef int loss + cdef int update(self, int i, const TokenC* tokens, int gold) except -1: cdef atom_t[N_CONTEXT_FIELDS] context - cdef const weight_t* scores - golds = [self.tag_names.index(g) if g is not None else -1 - for g in gold_tag_strs] - correct = 0 - for i in range(tokens.length): - fill_context(context, i, tokens.data) - scores = self.model.score(context) - guess = arg_max(scores, self.model.n_classes) - loss = guess != 
golds[i] if golds[i] != -1 else 0 - self.model.update(context, guess, golds[i], loss) - tokens.data[i].tag = self.strings[self.tag_names[guess]] - self.set_morph(i, &self.tags[guess], tokens.data) - correct += loss == 0 - self.freqs[TAG][tokens.data[i].tag] += 1 - return correct - - cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1: - tokens[i].pos = tag.pos - cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.orth) - if cached is NULL: - cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph)) - cached.lemma = self.lemmatize(tag.pos, tokens[i].lex) - cached.morph = tag.morph - self._morph_cache.set(tag.id, tokens[i].lex.orth, cached) - tokens[i].lemma = cached.lemma - tokens[i].morph = cached.morph - - cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1: - if self.lemmatizer is None: - return lex.orth - cdef unicode py_string = self.strings[lex.orth] - if pos != NOUN and pos != VERB and pos != ADJ: - return lex.orth - cdef set lemma_strings - cdef unicode lemma_string - lemma_strings = self.lemmatizer(py_string, pos) - lemma_string = sorted(lemma_strings)[0] - lemma = self.strings[lemma_string] - return lemma - - def load_morph_exceptions(self, dict exc): - cdef unicode pos_str - cdef unicode form_str - cdef unicode lemma_str - cdef dict entries - cdef dict props - cdef int lemma - cdef attr_t orth - cdef int pos - for pos_str, entries in exc.items(): - pos = self.tag_names.index(pos_str) - for form_str, props in entries.items(): - lemma_str = props.get('L', form_str) - orth = self.strings[form_str] - cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph)) - cached.lemma = self.strings[lemma_str] - set_morph_from_dict(&cached.morph, props) - self._morph_cache.set(pos, orth, cached) - - -cdef int fill_context(atom_t* context, const int i, const TokenC* tokens) except -1: - _fill_from_token(&context[P2_orth], &tokens[i-2]) - _fill_from_token(&context[P1_orth], &tokens[i-1]) - _fill_from_token(&context[W_orth], &tokens[i]) - _fill_from_token(&context[N1_orth], &tokens[i+1]) - _fill_from_token(&context[N2_orth], &tokens[i+2]) + _fill_from_token(&context[P2_orth], &tokens[i-2]) + _fill_from_token(&context[P1_orth], &tokens[i-1]) + _fill_from_token(&context[W_orth], &tokens[i]) + _fill_from_token(&context[N1_orth], &tokens[i+1]) + _fill_from_token(&context[N2_orth], &tokens[i+2]) + scores = self.model.score(context) + guess = arg_max(scores, self.model.n_classes) + loss = guess != gold if gold != -1 else 0 + self.model.update(context, guess, gold, loss) + return guess + cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil: diff --git a/spacy/tagger.pxd b/spacy/tagger.pxd new file mode 100644 index 000000000..4aa9acc43 --- /dev/null +++ b/spacy/tagger.pxd @@ -0,0 +1,27 @@ +from preshed.maps cimport PreshMapArray +from preshed.counter cimport PreshCounter +from cymem.cymem cimport Pool + +from ._ml cimport Model +from .strings cimport StringStore +from .structs cimport TokenC, LexemeC, Morphology, PosTag +from .parts_of_speech cimport univ_pos_t + + +cdef class Tagger: + cdef readonly Pool mem + cdef readonly StringStore strings + cdef readonly Model model + cdef public object lemmatizer + cdef PreshMapArray _morph_cache + cdef public dict freqs + + cdef PosTag* tags + cdef readonly object tag_names + cdef readonly object tag_map + cdef readonly int n_tags + + cdef int predict(self, int i, const TokenC* tokens) except -1 + cdef int update(self, int i, const TokenC* tokens, int gold) 
except -1 + cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1 + cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1 diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx new file mode 100644 index 000000000..ccb40fd22 --- /dev/null +++ b/spacy/tagger.pyx @@ -0,0 +1,144 @@ +import json +from os import path +from collections import defaultdict + +from thinc.typedefs cimport atom_t, weight_t + +from .typedefs cimport attr_t +from .tokens.doc cimport Doc +from .morphology cimport set_morph_from_dict +from .attrs cimport TAG +from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON +from .parts_of_speech cimport PRT, VERB, X, PUNCT, EOL, SPACE + + +cdef struct _CachedMorph: + Morphology morph + int lemma + + +cdef class Tagger: + """A part-of-speech tagger for English""" + def make_lemmatizer(self): + return None + + def __init__(self, StringStore strings, data_dir): + self.mem = Pool() + model_dir = path.join(data_dir, 'pos') + self.strings = strings + cfg = json.load(open(path.join(data_dir, 'pos', 'config.json'))) + self.tag_names = sorted(cfg['tag_names']) + assert self.tag_names + self.n_tags = len(self.tag_names) + self.tag_map = cfg['tag_map'] + cdef int n_tags = len(self.tag_names) + 1 + + self.model = Model(n_tags, cfg['templates'], model_dir) + self._morph_cache = PreshMapArray(n_tags) + self.tags = self.mem.alloc(n_tags, sizeof(PosTag)) + for i, tag in enumerate(sorted(self.tag_names)): + pos, props = self.tag_map[tag] + self.tags[i].id = i + self.tags[i].pos = pos + set_morph_from_dict(&self.tags[i].morph, props) + if path.exists(path.join(data_dir, 'tokenizer', 'morphs.json')): + self.load_morph_exceptions(json.load(open(path.join(data_dir, 'tokenizer', + 'morphs.json')))) + self.lemmatizer = self.make_lemmatizer(data_dir) + self.freqs = {TAG: defaultdict(int)} + for tag in self.tag_names: + self.freqs[TAG][self.strings[tag]] = 1 + self.freqs[TAG][0] = 1 + + def __call__(self, Doc tokens): + """Apply the tagger, setting the POS tags onto the Doc object. + + Args: + tokens (Doc): The tokens to be tagged. 
+ """ + if tokens.length == 0: + return 0 + cdef int i + cdef const weight_t* scores + for i in range(tokens.length): + if tokens.data[i].pos == 0: + guess = self.predict(i, tokens.data) + tokens.data[i].tag = self.strings[self.tag_names[guess]] + self.set_morph(i, &self.tags[guess], tokens.data) + + tokens.is_tagged = True + tokens._py_tokens = [None] * tokens.length + + def tag_from_strings(self, Doc tokens, object tag_strs): + cdef int i + for i in range(tokens.length): + tokens.data[i].tag = self.strings[tag_strs[i]] + self.set_morph(i, &self.tags[self.tag_names.index(tag_strs[i])], + tokens.data) + tokens.is_tagged = True + tokens._py_tokens = [None] * tokens.length + + def train(self, Doc tokens, object gold_tag_strs): + cdef int i + cdef int loss + cdef const weight_t* scores + golds = [self.tag_names.index(g) if g is not None else -1 + for g in gold_tag_strs] + correct = 0 + for i in range(tokens.length): + guess = self.update(i, tokens.data, golds[i]) + loss = golds[i] != -1 and guess != golds[i] + tokens.data[i].tag = self.strings[self.tag_names[guess]] + self.set_morph(i, &self.tags[guess], tokens.data) + correct += loss == 0 + self.freqs[TAG][tokens.data[i].tag] += 1 + return correct + + cdef int predict(self, int i, const TokenC* tokens) except -1: + raise NotImplementedError + + cdef int update(self, int i, const TokenC* tokens, int gold) except -1: + raise NotImplementedError + + cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1: + tokens[i].pos = tag.pos + cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.orth) + if cached is NULL: + cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph)) + cached.lemma = self.lemmatize(tag.pos, tokens[i].lex) + cached.morph = tag.morph + self._morph_cache.set(tag.id, tokens[i].lex.orth, cached) + tokens[i].lemma = cached.lemma + tokens[i].morph = cached.morph + + cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1: + if self.lemmatizer is None: + return lex.orth + cdef unicode py_string = self.strings[lex.orth] + if pos != NOUN and pos != VERB and pos != ADJ: + return lex.orth + cdef set lemma_strings + cdef unicode lemma_string + lemma_strings = self.lemmatizer(py_string, pos) + lemma_string = sorted(lemma_strings)[0] + lemma = self.strings[lemma_string] + return lemma + + def load_morph_exceptions(self, dict exc): + cdef unicode pos_str + cdef unicode form_str + cdef unicode lemma_str + cdef dict entries + cdef dict props + cdef int lemma + cdef attr_t orth + cdef int pos + for pos_str, entries in exc.items(): + pos = self.tag_names.index(pos_str) + for form_str, props in entries.items(): + lemma_str = props.get('L', form_str) + orth = self.strings[form_str] + cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph)) + cached.lemma = self.strings[lemma_str] + set_morph_from_dict(&cached.morph, props) + self._morph_cache.set(pos, orth, cached) From f2f699ac186e6fcd69d79e5cdca87d8a489a3614 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 25 Aug 2015 15:37:17 +0200 Subject: [PATCH 06/48] * Add language base class --- spacy/language.py | 195 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 195 insertions(+) create mode 100644 spacy/language.py diff --git a/spacy/language.py b/spacy/language.py new file mode 100644 index 000000000..fca52277b --- /dev/null +++ b/spacy/language.py @@ -0,0 +1,195 @@ +class Language(object): + @staticmethod + def lower(string): + return string.lower() + + @staticmethod + def norm(string): + return 
string + + @staticmethod + def shape(string): + return orth.word_shape(string) + + @staticmethod + def prefix(string): + return string[0] + + @staticmethod + def suffix(string): + return string[-3:] + + @staticmethod + def prob(string): + return self.oov_prob + + @staticmethod + def cluster(string): + return 0 + + @staticmethod + def is_alpha(string): + return orths.is_alpha(string) + + @staticmethod + def is_lower(string): + return orths.is_lower(string) + + @staticmethod + def is_upper(string): + return orths.is_upper(string) + + @staticmethod + def like_url(string): + return orths.like_url(string) + + @staticmethod + def like_number(string): + return orths.like_number(string) + + @staticmethod + def like_email(string): + return orths.like_email(string) + + def default_lex_attrs(cls, data_dir): + return { + attrs.LOWER: cls.lower, + attrs.NORM: cls.norm, + attrs.SHAPE: cls.shape, + attrs.PREFIX: cls.prefix, + attrs.SUFFIX: cls.suffix, + attrs.CLUSTER: cls.cluster, + attrs.PROB: cls.prob, + + attrs.IS_ALPHA: cls.is_alpha, + attrs.IS_ASCII: cls.is_ascii, + attrs.IS_DIGIT: cls.is_digit, + attrs.IS_LOWER: cls.is_lower, + attrs.IS_UPPER: cls.is_upper, + attrs.LIKE_URL: cls.like_url, + attrs.LIKE_NUM: cls.like_number, + attrs.LIKE_EMAIL: cls.like_email, + attrs.IS_STOP: lambda string: False, + attrs.IS_OOV: lambda string: True + } + + @classmethod + def default_data_dir(cls): + return path.join(path.dirname(__file__), 'data') + + @classmethod + def default_vocab(cls, get_lex_attr=None, vectors=None, morphology=None, data_dir=None): + if data_dir is None: + data_dir = cls.default_data_dir() + if vectors is None: + vectors = cls.default_vectors(data_dir) + if get_lex_attr is None: + get_lex_attr = cls.default_lex_attrs(data_dir) + if morphology is None: + morphology = cls.default_morphology(data_dir) + return vocab = Vocab.from_dir(data_dir, get_lex_attr, vectors, morphology) + + @classmethod + def default_tokenizer(cls, vocab, data_dir=None): + if data_dir is None: + data_dir = cls.default_data_dir() + return Tokenizer.from_dir(data_dir, vocab) + + @classmethod + def default_tagger(cls, vocab, data_dir=None): + return Tagger.from_dir(data_dir, vocab) + + @classmethod + def default_parser(cls, vocab, transition_system=None, data_dir=None): + if transition_system is None: + transition_system = ArcEager() + return Parser.from_dir(data_dir, vocab, transition_system) + + @classmethod + def default_entity(cls, vocab, transition_system=None, data_dir=None): + if transition_system is None: + transition_system = BiluoPushDown() + return Parser.from_dir(data_dir, vocab, transition_system) + + @classmethod + def default_matcher(cls, vocab, data_dir=None): + if data_dir is None: + data_dir = cls.default_data_dir() + return Matcher(data_dir, vocab) + + @classmethod + def default_serializer(cls, vocab, data_dir=None): + if data_dir is None: + data_dir = cls.default_data_dir() + return Packer(data_dir, vocab) + + def __init__(self, vocab=None, tokenizer=None, tagger=None, parser=None, + entity=None, matcher=None, serializer=None): + if data_dir is None: + data_dir = self.default_data_dir() + if vocab is None: + vocab = self.default_vocab(data_dir) + if tokenizer is None: + tokenizer = self.default_tokenizer(vocab, data_dir) + if tagger is None: + tagger = self.default_tagger(vocab, data_dir) + if entity is None: + entity = self.default_entity(vocab, data_dir) + if parser is None: + parser = self.default_parser(vocab, data_dir) + if matcher is None: + matcher = self.default_matcher(vocab, data_dir) + if 
serializer is None: + serializer = self.default_serializer(vocab, data_dir) + self.vocab = vocab + self.tokenizer = tokenizer + self.tagger = tagger + self.parser = parser + self.entity = entity + self.matcher = matcher + self.serializer = serializer + + def __call__(self, text, tag=True, parse=True, entity=True): + """Apply the pipeline to some text. The text can span multiple sentences, + and can contain arbtrary whitespace. Alignment into the original string + is preserved. + + Args: + text (unicode): The text to be processed. + + Returns: + tokens (spacy.tokens.Doc): + + >>> from spacy.en import English + >>> nlp = English() + >>> tokens = nlp('An example sentence. Another example sentence.') + >>> tokens[0].orth_, tokens[0].head.tag_ + ('An', 'NN') + """ + tokens = self.tokenizer(text) + if self.tagger and tag: + self.tagger(tokens) + if self.matcher and entity: + self.matcher(tokens) + if self.parser and parse: + self.parser(tokens) + if self.entity and entity: + self.entity(tokens) + return tokens + + def end_training(self, data_dir=None): + if data_dir is None: + data_dir = self.data_dir + self.parser.model.end_training() + self.entity.model.end_training() + self.tagger.model.end_training() + self.vocab.strings.dump(path.join(data_dir, 'vocab', 'strings.txt')) + + with open(path.join(data_dir, 'vocab', 'serializer.json'), 'w') as file_: + file_.write( + json.dumps([ + (TAG, list(self.tagger.freqs[TAG].items())), + (DEP, list(self.parser.moves.freqs[DEP].items())), + (ENT_IOB, list(self.entity.moves.freqs[ENT_IOB].items())), + (ENT_TYPE, list(self.entity.moves.freqs[ENT_TYPE].items())), + (HEAD, list(self.parser.moves.freqs[HEAD].items()))])) From 8083a07c3e5093349180a93314db11c7e15b108e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 25 Aug 2015 15:37:30 +0200 Subject: [PATCH 07/48] * Use language base class --- spacy/en/__init__.py | 186 ++----------------------------------------- 1 file changed, 7 insertions(+), 179 deletions(-) diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py index 3d433e497..ca19fb084 100644 --- a/spacy/en/__init__.py +++ b/spacy/en/__init__.py @@ -1,183 +1,11 @@ -from __future__ import unicode_literals +from __future__ import unicode_literals, print_function + from os import path -import re -import struct -import json -from .. import orth -from ..vocab import Vocab -from ..tokenizer import Tokenizer -from ..syntax.arc_eager import ArcEager -from ..syntax.ner import BiluoPushDown -from ..syntax.parser import ParserFactory -from ..serialize.bits import BitArray -from ..matcher import Matcher - -from ..tokens import Doc -from ..multi_words import RegexMerger - -from .pos import EnPosTagger -from .pos import POS_TAGS -from .attrs import get_flags -from . import regexes - -from ..util import read_lang_data - -from ..attrs import TAG, HEAD, DEP, ENT_TYPE, ENT_IOB +from ..language import Language -def get_lex_props(string, oov_prob=-30, is_oov=False): - return { - 'flags': get_flags(string, is_oov=is_oov), - 'length': len(string), - 'orth': string, - 'lower': string.lower(), - 'norm': string, - 'shape': orth.word_shape(string), - 'prefix': string[0], - 'suffix': string[-3:], - 'cluster': 0, - 'prob': oov_prob, - 'sentiment': 0 - } - -get_lex_attr = {} - -if_model_present = -1 -LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data') - - -class English(object): - """The English NLP pipeline. 
- - Example: - - Load data from default directory: - - >>> nlp = English() - >>> nlp = English(data_dir=u'') - - Load data from specified directory: - - >>> nlp = English(data_dir=u'path/to/data_directory') - - Disable (and avoid loading) parts of the processing pipeline: - - >>> nlp = English(vectors=False, parser=False, tagger=False, entity=False) - - Start with nothing loaded: - - >>> nlp = English(data_dir=None) - """ - ParserTransitionSystem = ArcEager - EntityTransitionSystem = BiluoPushDown - - def __init__(self, - data_dir=LOCAL_DATA_DIR, - Tokenizer=Tokenizer.from_dir, - Tagger=EnPosTagger, - Parser=ParserFactory(ParserTransitionSystem), - Entity=ParserFactory(EntityTransitionSystem), - Matcher=Matcher.from_dir, - Packer=None, - load_vectors=True - ): - self.data_dir = data_dir - - if path.exists(path.join(data_dir, 'vocab', 'oov_prob')): - oov_prob = float(open(path.join(data_dir, 'vocab', 'oov_prob')).read()) - else: - oov_prob = None - - self.vocab = Vocab(data_dir=path.join(data_dir, 'vocab') if data_dir else None, - get_lex_props=get_lex_props, load_vectors=load_vectors, - pos_tags=POS_TAGS, - oov_prob=oov_prob) - if Tagger is True: - Tagger = EnPosTagger - if Parser is True: - transition_system = self.ParserTransitionSystem - Parser = lambda s, d: parser.Parser(s, d, transition_system) - if Entity is True: - transition_system = self.EntityTransitionSystem - Entity = lambda s, d: parser.Parser(s, d, transition_system) - - self.tokenizer = Tokenizer(self.vocab, path.join(data_dir, 'tokenizer')) - - if Tagger and path.exists(path.join(data_dir, 'pos')): - self.tagger = Tagger(self.vocab.strings, data_dir) - else: - self.tagger = None - if Parser and path.exists(path.join(data_dir, 'deps')): - self.parser = Parser(self.vocab.strings, path.join(data_dir, 'deps')) - else: - self.parser = None - if Entity and path.exists(path.join(data_dir, 'ner')): - self.entity = Entity(self.vocab.strings, path.join(data_dir, 'ner')) - else: - self.entity = None - if Matcher: - self.matcher = Matcher(self.vocab, data_dir) - else: - self.matcher = None - if Packer: - self.packer = Packer(self.vocab, data_dir) - else: - self.packer = None - self.mwe_merger = RegexMerger([ - ('IN', 'O', regexes.MW_PREPOSITIONS_RE), - ('CD', 'TIME', regexes.TIME_RE), - ('NNP', 'DATE', regexes.DAYS_RE), - ('CD', 'MONEY', regexes.MONEY_RE)]) - - def __call__(self, text, tag=True, parse=True, entity=True, merge_mwes=False): - """Apply the pipeline to some text. The text can span multiple sentences, - and can contain arbtrary whitespace. Alignment into the original string - is preserved. - - Args: - text (unicode): The text to be processed. - - Returns: - tokens (spacy.tokens.Doc): - - >>> from spacy.en import English - >>> nlp = English() - >>> tokens = nlp('An example sentence. 
Another example sentence.') - >>> tokens[0].orth_, tokens[0].head.tag_ - ('An', 'NN') - """ - tokens = self.tokenizer(text) - if self.tagger and tag: - self.tagger(tokens) - if self.matcher and entity: - self.matcher(tokens) - if self.parser and parse: - self.parser(tokens) - if self.entity and entity: - self.entity(tokens) - if merge_mwes and self.mwe_merger is not None: - self.mwe_merger(tokens) - return tokens - - def end_training(self, data_dir=None): - if data_dir is None: - data_dir = self.data_dir - self.parser.model.end_training() - self.entity.model.end_training() - self.tagger.model.end_training() - self.vocab.strings.dump(path.join(data_dir, 'vocab', 'strings.txt')) - - with open(path.join(data_dir, 'vocab', 'serializer.json'), 'w') as file_: - file_.write( - json.dumps([ - (TAG, list(self.tagger.freqs[TAG].items())), - (DEP, list(self.parser.moves.freqs[DEP].items())), - (ENT_IOB, list(self.entity.moves.freqs[ENT_IOB].items())), - (ENT_TYPE, list(self.entity.moves.freqs[ENT_TYPE].items())), - (HEAD, list(self.parser.moves.freqs[HEAD].items()))])) - - @property - def tags(self): - """Deprecated. List of part-of-speech tag names.""" - return self.tagger.tag_names +class English(Language): + @classmethod + def default_data_dir(cls): + return path.join(path.dirname(__file__), 'data') From 82217c6ec6de0f1f948deb74b6e98e095d45e0da Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 25 Aug 2015 15:46:19 +0200 Subject: [PATCH 08/48] * Generalize lemmatizer --- spacy/en/lemmatizer.py | 41 +++++------------------------------------ 1 file changed, 5 insertions(+), 36 deletions(-) diff --git a/spacy/en/lemmatizer.py b/spacy/en/lemmatizer.py index 5883e12c8..660a16eb9 100644 --- a/spacy/en/lemmatizer.py +++ b/spacy/en/lemmatizer.py @@ -3,39 +3,6 @@ from os import path import codecs -NOUN_RULES = ( - ('s', ''), - ('ses', 's'), - ('ves', 'f'), - ('xes', 'x'), - ('zes', 'z'), - ('ches', 'ch'), - ('shes', 'sh'), - ('men', 'man'), - ('ies', 'y') -) - - -VERB_RULES = ( - ("s", ""), - ("ies", "y"), - ("es", "e"), - ("es", ""), - ("ed", "e"), - ("ed", ""), - ("ing", "e"), - ("ing", "") -) - - -ADJ_RULES = ( - ("er", ""), - ("est", ""), - ("er", "e"), - ("est", "e") -) - - class Lemmatizer(object): def __init__(self, wn_dict_dir, noun_id, verb_id, adj_id): self.noun_id = noun_id @@ -48,6 +15,8 @@ class Lemmatizer(object): self.exc[pos] = read_exc(path.join(wn_dict_dir, '%s.exc' % pos)) def __call__(self, string, pos): + + return lemmatize(string, self.index[pos], self.exc[pos], self.rules[pos]) if pos == self.noun_id: return self.noun(string) elif pos == self.verb_id: @@ -58,13 +27,13 @@ class Lemmatizer(object): raise Exception("Cannot lemmatize with unknown pos: %s" % pos) def noun(self, string): - return lemmatize(string, self.index['noun'], self.exc['noun'], NOUN_RULES) + return self(string, 'noun') def verb(self, string): - return lemmatize(string, self.index['verb'], self.exc['verb'], VERB_RULES) + return self(string, 'verb') def adj(self, string): - return lemmatize(string, self.index['adj'], self.exc['adj'], ADJ_RULES) + return self(string, 'adj') def lemmatize(string, index, exceptions, rules): From c5a27d1821d2bccdeec75c8740a442f74d66358d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 25 Aug 2015 15:47:08 +0200 Subject: [PATCH 09/48] * Move lemmatizer to spacy --- spacy/{en => }/lemmatizer.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename spacy/{en => }/lemmatizer.py (100%) diff --git a/spacy/en/lemmatizer.py b/spacy/lemmatizer.py similarity index 100% rename 
from spacy/en/lemmatizer.py rename to spacy/lemmatizer.py From 494da25872d6250068029df9a843baebb85a1a02 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:13:50 +0200 Subject: [PATCH 10/48] * Refactor for more universal spacy --- lang_data/de/infix.txt | 3 + lang_data/de/morphs.json | 0 lang_data/de/prefix.txt | 21 ++++++ lang_data/de/sample.txt | 3 + lang_data/de/specials.json | 149 +++++++++++++++++++++++++++++++++++++ lang_data/de/suffix.txt | 26 +++++++ lang_data/de/tag_map.json | 56 ++++++++++++++ lang_data/en/tag_map.json | 51 +++++++++++++ 8 files changed, 309 insertions(+) create mode 100644 lang_data/de/infix.txt create mode 100644 lang_data/de/morphs.json create mode 100644 lang_data/de/prefix.txt create mode 100644 lang_data/de/sample.txt create mode 100644 lang_data/de/specials.json create mode 100644 lang_data/de/suffix.txt create mode 100644 lang_data/de/tag_map.json create mode 100644 lang_data/en/tag_map.json diff --git a/lang_data/de/infix.txt b/lang_data/de/infix.txt new file mode 100644 index 000000000..37eca7350 --- /dev/null +++ b/lang_data/de/infix.txt @@ -0,0 +1,3 @@ +\.\.\. +(?<=[a-z])\.(?=[A-Z]) +(?<=[a-zA-Z])-(?=[a-zA-z]) diff --git a/lang_data/de/morphs.json b/lang_data/de/morphs.json new file mode 100644 index 000000000..e69de29bb diff --git a/lang_data/de/prefix.txt b/lang_data/de/prefix.txt new file mode 100644 index 000000000..48c4fc549 --- /dev/null +++ b/lang_data/de/prefix.txt @@ -0,0 +1,21 @@ +, +" +( +[ +{ +* +< +$ +£ +“ +' +`` +` +# +US$ +C$ +A$ +a- +‘ +.... +... diff --git a/lang_data/de/sample.txt b/lang_data/de/sample.txt new file mode 100644 index 000000000..12c0bb787 --- /dev/null +++ b/lang_data/de/sample.txt @@ -0,0 +1,3 @@ +Biografie: Ein Spiel ist ein Theaterstück des Schweizer Schriftstellers Max Frisch, das 1967 entstand und am 1. Februar 1968 im Schauspielhaus Zürich uraufgeführt wurde. 1984 legte Frisch eine überarbeitete Neufassung vor. Das von Frisch als Komödie bezeichnete Stück greift eines seiner zentralen Themen auf: die Möglichkeit oder Unmöglichkeit des Menschen, seine Identität zu verändern. + +Mit Biografie: Ein Spiel wandte sich Frisch von der Parabelform seiner Erfolgsstücke Biedermann und die Brandstifter und Andorra ab und postulierte eine „Dramaturgie der Permutation“. Darin sollte nicht, wie im klassischen Theater, Sinn und Schicksal im Mittelpunkt stehen, sondern die Zufälligkeit von Ereignissen und die Möglichkeit ihrer Variation. Dennoch handelt Biografie: Ein Spiel gerade von der Unmöglichkeit seines Protagonisten, seinen Lebenslauf grundlegend zu verändern. Frisch empfand die Wirkung des Stücks im Nachhinein als zu fatalistisch und die Umsetzung seiner theoretischen Absichten als nicht geglückt. Obwohl das Stück 1968 als unpolitisch und nicht zeitgemäß kritisiert wurde und auch später eine geteilte Rezeption erfuhr, gehört es an deutschsprachigen Bühnen zu den häufiger aufgeführten Stücken Frischs. 
diff --git a/lang_data/de/specials.json b/lang_data/de/specials.json new file mode 100644 index 000000000..0e0986339 --- /dev/null +++ b/lang_data/de/specials.json @@ -0,0 +1,149 @@ +{ +"a.m.": [{"F": "a.m."}], +"p.m.": [{"F": "p.m."}], + +"1a.m.": [{"F": "1"}, {"F": "a.m."}], +"2a.m.": [{"F": "2"}, {"F": "a.m."}], +"3a.m.": [{"F": "3"}, {"F": "a.m."}], +"4a.m.": [{"F": "4"}, {"F": "a.m."}], +"5a.m.": [{"F": "5"}, {"F": "a.m."}], +"6a.m.": [{"F": "6"}, {"F": "a.m."}], +"7a.m.": [{"F": "7"}, {"F": "a.m."}], +"8a.m.": [{"F": "8"}, {"F": "a.m."}], +"9a.m.": [{"F": "9"}, {"F": "a.m."}], +"10a.m.": [{"F": "10"}, {"F": "a.m."}], +"11a.m.": [{"F": "11"}, {"F": "a.m."}], +"12a.m.": [{"F": "12"}, {"F": "a.m."}], +"1am": [{"F": "1"}, {"F": "am", "L": "a.m."}], +"2am": [{"F": "2"}, {"F": "am", "L": "a.m."}], +"3am": [{"F": "3"}, {"F": "am", "L": "a.m."}], +"4am": [{"F": "4"}, {"F": "am", "L": "a.m."}], +"5am": [{"F": "5"}, {"F": "am", "L": "a.m."}], +"6am": [{"F": "6"}, {"F": "am", "L": "a.m."}], +"7am": [{"F": "7"}, {"F": "am", "L": "a.m."}], +"8am": [{"F": "8"}, {"F": "am", "L": "a.m."}], +"9am": [{"F": "9"}, {"F": "am", "L": "a.m."}], +"10am": [{"F": "10"}, {"F": "am", "L": "a.m."}], +"11am": [{"F": "11"}, {"F": "am", "L": "a.m."}], +"12am": [{"F": "12"}, {"F": "am", "L": "a.m."}], + + +"1p.m.": [{"F": "1"}, {"F": "p.m."}], +"2p.m.": [{"F": "2"}, {"F": "p.m."}], +"3p.m.": [{"F": "3"}, {"F": "p.m."}], +"4p.m.": [{"F": "4"}, {"F": "p.m."}], +"5p.m.": [{"F": "5"}, {"F": "p.m."}], +"6p.m.": [{"F": "6"}, {"F": "p.m."}], +"7p.m.": [{"F": "7"}, {"F": "p.m."}], +"8p.m.": [{"F": "8"}, {"F": "p.m."}], +"9p.m.": [{"F": "9"}, {"F": "p.m."}], +"10p.m.": [{"F": "10"}, {"F": "p.m."}], +"11p.m.": [{"F": "11"}, {"F": "p.m."}], +"12p.m.": [{"F": "12"}, {"F": "p.m."}], +"1pm": [{"F": "1"}, {"F": "pm", "L": "p.m."}], +"2pm": [{"F": "2"}, {"F": "pm", "L": "p.m."}], +"3pm": [{"F": "3"}, {"F": "pm", "L": "p.m."}], +"4pm": [{"F": "4"}, {"F": "pm", "L": "p.m."}], +"5pm": [{"F": "5"}, {"F": "pm", "L": "p.m."}], +"6pm": [{"F": "6"}, {"F": "pm", "L": "p.m."}], +"7pm": [{"F": "7"}, {"F": "pm", "L": "p.m."}], +"8pm": [{"F": "8"}, {"F": "pm", "L": "p.m."}], +"9pm": [{"F": "9"}, {"F": "pm", "L": "p.m."}], +"10pm": [{"F": "10"}, {"F": "pm", "L": "p.m."}], +"11pm": [{"F": "11"}, {"F": "pm", "L": "p.m."}], +"12pm": [{"F": "12"}, {"F": "pm", "L": "p.m."}], + +"Jan.": [{"F": "Jan.", "L": "Januar"}], +"Feb.": [{"F": "Feb.", "L": "Februar"}], +"Mär.": [{"F": "Mär.", "L": "März"}], +"Apr.": [{"F": "Apr.", "L": "April"}], +"Mai.": [{"F": "Mai.", "L": "Mai"}], +"Jun.": [{"F": "Jun.", "L": "Juni"}], +"Jul.": [{"F": "Jul.", "L": "Juli"}], +"Aug.": [{"F": "Aug.", "L": "August"}], +"Sep.": [{"F": "Sep.", "L": "September"}], +"Sept.": [{"F": "Sept.", "L": "September"}], +"Okt.": [{"F": "Okt.", "L": "Oktober"}], +"Nov.": [{"F": "Nov.", "L": "November"}], +"Dez.": [{"F": "Dez.", "L": "Dezember"}], + +":)": [{"F": ":)"}], +"<3": [{"F": "<3"}], +";)": [{"F": ";)"}], +"(:": [{"F": "(:"}], +":(": [{"F": ":("}], +"-_-": [{"F": "-_-"}], +"=)": [{"F": "=)"}], +":/": [{"F": ":/"}], +":>": [{"F": ":>"}], +";-)": [{"F": ";-)"}], +":Y": [{"F": ":Y"}], +":P": [{"F": ":P"}], +":-P": [{"F": ":-P"}], +":3": [{"F": ":3"}], +"=3": [{"F": "=3"}], +"xD": [{"F": "xD"}], +"^_^": [{"F": "^_^"}], +"=]": [{"F": "=]"}], +"=D": [{"F": "=D"}], +"<333": [{"F": "<333"}], +":))": [{"F": ":))"}], +":0": [{"F": ":0"}], +"-__-": [{"F": "-__-"}], +"xDD": [{"F": "xDD"}], +"o_o": [{"F": "o_o"}], +"o_O": [{"F": "o_O"}], +"V_V": [{"F": "V_V"}], +"=[[": [{"F": "=[["}], +"<33": [{"F": 
"<33"}], +";p": [{"F": ";p"}], +";D": [{"F": ";D"}], +";-p": [{"F": ";-p"}], +";(": [{"F": ";("}], +":p": [{"F": ":p"}], +":]": [{"F": ":]"}], +":O": [{"F": ":O"}], +":-/": [{"F": ":-/"}], +":-)": [{"F": ":-)"}], +":(((": [{"F": ":((("}], +":((": [{"F": ":(("}], +":')": [{"F": ":')"}], +"(^_^)": [{"F": "(^_^)"}], +"(=": [{"F": "(="}], +"o.O": [{"F": "o.O"}], +"\")": [{"F": "\")"}], +"a.": [{"F": "a."}], +"b.": [{"F": "b."}], +"c.": [{"F": "c."}], +"d.": [{"F": "d."}], +"e.": [{"F": "e."}], +"f.": [{"F": "f."}], +"g.": [{"F": "g."}], +"h.": [{"F": "h."}], +"i.": [{"F": "i."}], +"j.": [{"F": "j."}], +"k.": [{"F": "k."}], +"l.": [{"F": "l."}], +"m.": [{"F": "m."}], +"n.": [{"F": "n."}], +"o.": [{"F": "o."}], +"p.": [{"F": "p."}], +"q.": [{"F": "q."}], +"s.": [{"F": "s."}], +"t.": [{"F": "t."}], +"u.": [{"F": "u."}], +"v.": [{"F": "v."}], +"w.": [{"F": "w."}], +"x.": [{"F": "x."}], +"y.": [{"F": "y."}], +"z.": [{"F": "z."}], + +"z.b.": [{"F": "z.b."}], +"e.h.": [{"F": "I.e."}], +"o.ä.": [{"F": "I.E."}], +"bzw.": [{"F": "bzw."}], +"usw.": [{"F": "usw."}], +"\n": [{"F": "\n", "pos": "SP"}], +"\t": [{"F": "\t", "pos": "SP"}], +" ": [{"F": " ", "pos": "SP"}] +} diff --git a/lang_data/de/suffix.txt b/lang_data/de/suffix.txt new file mode 100644 index 000000000..d8c6bc2c2 --- /dev/null +++ b/lang_data/de/suffix.txt @@ -0,0 +1,26 @@ +, +\" +\) +\] +\} +\* +\! +\? +% +\$ +> +: +; +' +” +'' +'s +'S +’s +’S +’ +\.\. +\.\.\. +\.\.\.\. +(?<=[a-z0-9)\]"'%\)])\. +(?<=[0-9])km diff --git a/lang_data/de/tag_map.json b/lang_data/de/tag_map.json new file mode 100644 index 000000000..ee1bb1b81 --- /dev/null +++ b/lang_data/de/tag_map.json @@ -0,0 +1,56 @@ +{ +"$(": {"pos": "PUNCT", "PunctType": "Brck"}, +"$,": {"pos": "PUNCT", "PunctType": "Comm"}, +"$.": {"pos": "PUNCT", "PunctType": "Peri"}, +"ADJA": {"pos": "ADJ"}, +"ADJD": {"pos": "ADJ", "Variant": "Short"}, +"ADV": {"pos": "ADV"}, +"APPO": {"pos": "ADP", "AdpType": "Post"}, +"APPR": {"pos": "ADP", "AdpType": "Prep"}, +"APPRART": {"pos": "ADP", "AdpType": "Prep", "PronType": "Art"}, +"APZR": {"pos": "ADP", "AdpType": "Circ"}, +"ART": {"pos": "DET", "PronType": "Art"}, +"CARD": {"pos": "NUM", "NumType": "Card"}, +"FM": {"pos": "X", "Foreign": "Yes"}, +"ITJ": {"pos": "INTJ"}, +"KOKOM": {"pos": "CONJ", "ConjType": "Comp"}, +"KON": {"pos": "CONJ"}, +"KOUI": {"pos": "SCONJ"}, +"KOUS": {"pos": "SCONJ"}, +"NE": {"pos": "PROPN"}, +"NN": {"pos": "NOUN"}, +"PAV": {"pos": "ADV", "PronType": "Dem"}, +"PDAT": {"pos": "DET", "PronType": "Dem"}, +"PDS": {"pos": "PRON", "PronType": "Dem"}, +"PIAT": {"pos": "DET", "PronType": "Ind,Neg,Tot"}, +"PIDAT": {"pos": "DET", "AdjType": "Pdt", "PronType": "Ind,Neg,Tot"}, +"PIS": {"pos": "PRON", "PronType": "Ind,Neg,Tot"}, +"PPER": {"pos": "PRON", "PronType": "Prs"}, +"PPOSAT": {"pos": "DET", "Poss": "Yes", "PronType": "Prs"}, +"PPOSS": {"pos": "PRON", "Poss": "Yes", "PronType": "Prs"}, +"PRELAT": {"pos": "DET", "PronType": "Rel"}, +"PRELS": {"pos": "PRON", "PronType": "Rel"}, +"PRF": {"pos": "PRON", "PronType": "Prs", "Reflex": "Yes"}, +"PTKA": {"pos": "PART"}, +"PTKANT": {"pos": "PART", "PartType": "Res"}, +"PTKNEG": {"pos": "PART", "Negative": "Neg"}, +"PTKVZ": {"pos": "PART", "PartType": "Vbp"}, +"PTKZU": {"pos": "PART", "PartType": "Inf"}, +"PWAT": {"pos": "DET", "PronType": "Int"}, +"PWAV": {"pos": "ADV", "PronType": "Int"}, +"PWS": {"pos": "PRON", "PronType": "Int"}, +"TRUNC": {"pos": "X", "Hyph": "Yes"}, +"VAFIN": {"pos": "AUX", "Mood": "Ind", "VerbForm": "Fin"}, +"VAIMP": {"pos": "AUX", "Mood": "Imp", "VerbForm": "Fin"}, 
+"VAINF": {"pos": "AUX", "VerbForm": "Inf"}, +"VAPP": {"pos": "AUX", "Aspect": "Perf", "VerbForm": "Part"}, +"VMFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin", "VerbType": "Mod"}, +"VMINF": {"pos": "VERB", "VerbForm": "Inf", "VerbType": "Mod"}, +"VMPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part", "VerbType": "Mod"}, +"VVFIN": {"pos": "VERB", "Mood": "Ind", "VerbForm": "Fin"}, +"VVIMP": {"pos": "VERB", "Mood": "Imp", "VerbForm": "Fin"}, +"VVINF": {"pos": "VERB", "VerbForm": "Inf"}, +"VVIZU": {"pos": "VERB", "VerbForm": "Inf"}, +"VVPP": {"pos": "VERB", "Aspect": "Perf", "VerbForm": "Part"}, +"XY": {"pos": "X"} +} diff --git a/lang_data/en/tag_map.json b/lang_data/en/tag_map.json new file mode 100644 index 000000000..57d3eedee --- /dev/null +++ b/lang_data/en/tag_map.json @@ -0,0 +1,51 @@ +{ +".": {"pos": "punc", "punctype": "peri"}, +",": {"pos": "punc", "punctype": "comm"}, +"-LRB-": {"pos": "punc", "punctype": "brck", "puncside": "ini"}, +"-RRB-": {"pos": "punc", "punctype": "brck", "puncside": "fin"}, +"``": {"pos": "punc", "punctype": "quot", "puncside": "ini"}, +"\"\"": {"pos": "punc", "punctype": "quot", "puncside": "fin"}, +":": {"pos": "punc"}, +"$": {"pos": "sym", "other": {"symtype": "currency"}}, +"#": {"pos": "sym", "other": {"symtype": "numbersign"}}, +"AFX": {"pos": "adj", "hyph": "hyph"}, +"CC": {"pos": "conj", "conjtype": "coor"}, +"CD": {"pos": "num", "numtype": "card"}, +"DT": {"pos": "adj", "prontype": "prn"}, +"EX": {"pos": "adv", "advtype": "ex"}, +"FW": {"foreign": "foreign"}, +"HYPH": {"pos": "punc", "punctype": "dash"}, +"IN": {"pos": "adp"}, +"JJ": {"pos": "adj", "degree": "pos"}, +"JJR": {"pos": "adj", "degree": "comp"}, +"JJS": {"pos": "adj", "degree": "sup"}, +"LS": {"pos": "punc", "numtype": "ord"}, +"MD": {"pos": "verb", "verbtype": "mod"}, +"NIL": {}, +"NN": {"pos": "noun", "number": "sing"}, +"NNP": {"pos": "noun", "nountype": "prop", "number": "sing"}, +"NNPS": {"pos": "noun", "nountype": "prop", "number": "plur"}, +"NNS": {"pos": "noun", "number": "plur"}, +"PDT": {"pos": "adj", "adjtype": "pdt", "prontype": "prn"}, +"POS": {"pos": "part", "poss": "poss"}, +"PRP": {"pos": "noun", "prontype": "prs"}, +"PRP$": {"pos": "adj", "prontype": "prs", "poss": "poss"}, +"RB": {"pos": "adv", "degree": "pos"}, +"RBR": {"pos": "adv", "degree": "comp"}, +"RBS": {"pos": "adv", "degree": "sup"}, +"RP": {"pos": "part"}, +"SYM": {"pos": "sym"}, +"TO": {"pos": "part", "parttype": "inf", "verbform": "inf"}, +"UH": {"pos": "int"}, +"VB": {"pos": "verb", "verbform": "inf"}, +"VBD": {"pos": "verb", "verbform": "fin", "tense": "past"}, +"VBG": {"pos": "verb", "verbform": "part", "tense": "pres", "aspect": "prog"}, +"VBN": {"pos": "verb", "verbform": "part", "tense": "past", "aspect": "perf"}, +"VBP": {"pos": "verb", "verbform": "fin", "tense": "pres"}, +"VBZ": {"pos": "verb", "verbform": "fin", "tense": "pres", "number": "sing", "person": 3}, +"WDT": {"pos": "adj", "prontype": "int|rel"}, +"WP": {"pos": "noun", "prontype": "int|rel"}, +"WP$": {"pos": "adj", "poss": "poss", "prontype": "int|rel"}, +"WRB": {"pos": "adv", "prontype": "int|rel"}, +"SP": {"pos": "space"} +} From dc13edd7cb78e751d0954059173c09bc0ebf7394 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:14:05 +0200 Subject: [PATCH 11/48] * Refactor init_model to accomodate other languages --- bin/init_model.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/init_model.py b/bin/init_model.py index 9a635f296..0badf71fc 100644 --- a/bin/init_model.py +++ 
b/bin/init_model.py @@ -151,7 +151,7 @@ def setup_vocab(get_lex_attr, src_dir, dst_dir): write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin')) else: print("Warning: Word vectors file not found") - vocab = Vocab(data_dir=None, get_lex_attr=get_lex_attr) + vocab = Vocab(get_lex_attr=get_lex_attr) clusters = _read_clusters(src_dir / 'clusters.txt') probs, oov_prob = _read_probs(src_dir / 'words.sgt.prob') if not probs: @@ -183,8 +183,8 @@ def setup_vocab(get_lex_attr, src_dir, dst_dir): def main(lang_id, lang_data_dir, corpora_dir, model_dir): languages = { - 'en': spacy.en.get_lex_attr, - 'de': spacy.en.get_lex_attr + 'en': spacy.en.English.default_lex_attrs(), + 'de': spacy.de.Deutsch.default_lex_attrs() } model_dir = Path(model_dir) From c2d8edd0bdae9a6a2a0ac59e8ee37a09524d7674 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:14:19 +0200 Subject: [PATCH 12/48] * Add PROB attribute in attrs.pxd --- spacy/attrs.pxd | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/attrs.pxd b/spacy/attrs.pxd index d2ace1cff..c810762ef 100644 --- a/spacy/attrs.pxd +++ b/spacy/attrs.pxd @@ -84,3 +84,4 @@ cpdef enum attr_id_t: ENT_TYPE HEAD SPACY + PROB From c4d87543857c6f40e521c8ea93b0c5ebf920e565 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:15:07 +0200 Subject: [PATCH 13/48] * Specify LOCAL_DATA_DIR global in spacy.en.__init__.py --- spacy/en/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py index ca19fb084..f68ff196e 100644 --- a/spacy/en/__init__.py +++ b/spacy/en/__init__.py @@ -4,8 +4,9 @@ from os import path from ..language import Language +LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data') class English(Language): @classmethod def default_data_dir(cls): - return path.join(path.dirname(__file__), 'data') + return LOCAL_DATA_DIR From e2ef78b29cee72790d0bf20983d64fd6be32c7da Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:15:42 +0200 Subject: [PATCH 14/48] * Gut pos.pyx module, since functionality moved to spacy/tagger.pyx --- spacy/en/pos.pyx | 261 +---------------------------------------------- 1 file changed, 2 insertions(+), 259 deletions(-) diff --git a/spacy/en/pos.pyx b/spacy/en/pos.pyx index 703d7198c..8e034eadf 100644 --- a/spacy/en/pos.pyx +++ b/spacy/en/pos.pyx @@ -1,268 +1,11 @@ from os import path -import json -import os -import shutil -from libc.string cimport memset +from ..parts_of_speech cimport NOUN, VERB, ADJ -from cymem.cymem cimport Address -from thinc.typedefs cimport atom_t, weight_t -from collections import defaultdict - -from ..parts_of_speech cimport univ_pos_t -from ..parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON - -from ..parts_of_speech cimport PRT, VERB, X, PUNCT, EOL, SPACE -from ..structs cimport TokenC, Morphology, LexemeC -from ..tokens.doc cimport Doc -from ..morphology cimport set_morph_from_dict -from .._ml cimport arg_max - -from .attrs cimport TAG, IS_ALPHA, IS_PUNCT, LIKE_NUM, LIKE_URL -from ..typedefs cimport attr_t - -from .lemmatizer import Lemmatizer - - -cpdef enum en_person_t: - NO_PERSON - FIRST - SECOND - THIRD - NON_THIRD - - -cpdef enum en_number_t: - NO_NUMBER - SINGULAR - PLURAL - MASS - - -cpdef enum en_gender_t: - NO_GENDER - MASCULINE - FEMININE - NEUTER - - -cpdef enum en_case_t: - NO_CASE - NOMINATIVE - GENITIVE - ACCUSATIVE - REFLEXIVE - DEMONYM - - -cpdef enum en_tenspect_t: - NO_TENSE - BASE_VERB - PRESENT - PAST - PASSIVE - ING - MODAL - - 
-cpdef enum misc_t: - NO_MISC - COMPARATIVE - SUPERLATIVE - RELATIVE - NAME - - -cpdef enum: - P2_orth - P2_cluster - P2_shape - P2_prefix - P2_suffix - P2_pos - P2_lemma - P2_flags - - P1_orth - P1_cluster - P1_shape - P1_prefix - P1_suffix - P1_pos - P1_lemma - P1_flags - - W_orth - W_cluster - W_shape - W_prefix - W_suffix - W_pos - W_lemma - W_flags - - N1_orth - N1_cluster - N1_shape - N1_prefix - N1_suffix - N1_pos - N1_lemma - N1_flags - - N2_orth - N2_cluster - N2_shape - N2_prefix - N2_suffix - N2_pos - N2_lemma - N2_flags - - N_CONTEXT_FIELDS - - -POS_TAGS = { - 'NULL': (NO_TAG, {}), - 'EOL': (EOL, {}), - 'CC': (CONJ, {}), - 'CD': (NUM, {}), - 'DT': (DET, {}), - 'EX': (DET, {}), - 'FW': (X, {}), - 'IN': (ADP, {}), - 'JJ': (ADJ, {}), - 'JJR': (ADJ, {'misc': COMPARATIVE}), - 'JJS': (ADJ, {'misc': SUPERLATIVE}), - 'LS': (X, {}), - 'MD': (VERB, {'tenspect': MODAL}), - 'NN': (NOUN, {}), - 'NNS': (NOUN, {'number': PLURAL}), - 'NNP': (NOUN, {'misc': NAME}), - 'NNPS': (NOUN, {'misc': NAME, 'number': PLURAL}), - 'PDT': (DET, {}), - 'POS': (PRT, {'case': GENITIVE}), - 'PRP': (PRON, {}), - 'PRP$': (PRON, {'case': GENITIVE}), - 'RB': (ADV, {}), - 'RBR': (ADV, {'misc': COMPARATIVE}), - 'RBS': (ADV, {'misc': SUPERLATIVE}), - 'RP': (PRT, {}), - 'SYM': (X, {}), - 'TO': (PRT, {}), - 'UH': (X, {}), - 'VB': (VERB, {}), - 'VBD': (VERB, {'tenspect': PAST}), - 'VBG': (VERB, {'tenspect': ING}), - 'VBN': (VERB, {'tenspect': PASSIVE}), - 'VBP': (VERB, {'tenspect': PRESENT}), - 'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}), - 'WDT': (DET, {'misc': RELATIVE}), - 'WP': (PRON, {'misc': RELATIVE}), - 'WP$': (PRON, {'misc': RELATIVE, 'case': GENITIVE}), - 'WRB': (ADV, {'misc': RELATIVE}), - '!': (PUNCT, {}), - '#': (PUNCT, {}), - '$': (PUNCT, {}), - "''": (PUNCT, {}), - "(": (PUNCT, {}), - ")": (PUNCT, {}), - "-LRB-": (PUNCT, {}), - "-RRB-": (PUNCT, {}), - ".": (PUNCT, {}), - ",": (PUNCT, {}), - "``": (PUNCT, {}), - ":": (PUNCT, {}), - "?": (PUNCT, {}), - "ADD": (X, {}), - "NFP": (PUNCT, {}), - "GW": (X, {}), - "AFX": (X, {}), - "HYPH": (PUNCT, {}), - "XX": (X, {}), - "BES": (VERB, {'tenspect': PRESENT, 'person': THIRD}), - "HVS": (VERB, {'tenspect': PRESENT, 'person': THIRD}), - "SP": (SPACE, {}) -} - - -POS_TEMPLATES = ( - (W_orth,), - (P1_lemma, P1_pos), - (P2_lemma, P2_pos), - (N1_orth,), - (N2_orth,), - - (W_suffix,), - (W_prefix,), - - (P1_pos,), - (P2_pos,), - (P1_pos, P2_pos), - (P1_pos, W_orth), - (P1_suffix,), - (N1_suffix,), - - (W_shape,), - (W_cluster,), - (N1_cluster,), - (N2_cluster,), - (P1_cluster,), - (P2_cluster,), - - (W_flags,), - (N1_flags,), - (N2_flags,), - (P1_flags,), - (P2_flags,), -) +from ..lemmatizer import Lemmatizer cdef class EnPosTagger(Tagger): """A part-of-speech tagger for English""" def make_lemmatizer(self, data_dir): return Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ) - - cdef int predict(self, int i, const TokenC* tokens) except -1: - cdef atom_t[N_CONTEXT_FIELDS] context - _fill_from_token(&context[P2_orth], &tokens[i-2]) - _fill_from_token(&context[P1_orth], &tokens[i-1]) - _fill_from_token(&context[W_orth], &tokens[i]) - _fill_from_token(&context[N1_orth], &tokens[i+1]) - _fill_from_token(&context[N2_orth], &tokens[i+2]) - scores = self.model.score(context) - return arg_max(scores, self.model.n_classes) - - cdef int update(self, int i, const TokenC* tokens, int gold) except -1: - cdef atom_t[N_CONTEXT_FIELDS] context - _fill_from_token(&context[P2_orth], &tokens[i-2]) - _fill_from_token(&context[P1_orth], &tokens[i-1]) - 
_fill_from_token(&context[W_orth], &tokens[i]) - _fill_from_token(&context[N1_orth], &tokens[i+1]) - _fill_from_token(&context[N2_orth], &tokens[i+2]) - scores = self.model.score(context) - guess = arg_max(scores, self.model.n_classes) - loss = guess != gold if gold != -1 else 0 - self.model.update(context, guess, gold, loss) - return guess - - - -cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil: - context[0] = t.lex.lower - context[1] = t.lex.cluster - context[2] = t.lex.shape - context[3] = t.lex.prefix - context[4] = t.lex.suffix - context[5] = t.tag - context[6] = t.lemma - if t.lex.flags & (1 << IS_ALPHA): - context[7] = 1 - elif t.lex.flags & (1 << IS_PUNCT): - context[7] = 2 - elif t.lex.flags & (1 << LIKE_URL): - context[7] = 3 - elif t.lex.flags & (1 << LIKE_NUM): - context[7] = 4 - else: - context[7] = 0 From 76996f414515765709499c642199892f160d244d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:16:09 +0200 Subject: [PATCH 15/48] * Hack on generic Language class. Still needs work for morphology, defaults, etc --- spacy/language.py | 153 +++++++++++++++++++++++++++++++++------------- 1 file changed, 109 insertions(+), 44 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index fca52277b..706df34a5 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1,3 +1,19 @@ +from os import path + +from .tokenizer import Tokenizer +from .morphology import Morphology +from .vocab import Vocab +from .syntax.parser import Parser +from .tagger import Tagger +from .matcher import Matcher +from .serialize.packer import Packer +from ._ml import Model +from . import attrs +from . import orth +from .syntax.ner import BiluoPushDown +from .syntax.arc_eager import ArcEager + + class Language(object): @staticmethod def lower(string): @@ -21,7 +37,7 @@ class Language(object): @staticmethod def prob(string): - return self.oov_prob + return -30 @staticmethod def cluster(string): @@ -29,29 +45,50 @@ class Language(object): @staticmethod def is_alpha(string): - return orths.is_alpha(string) + return orth.is_alpha(string) + + @staticmethod + def is_ascii(string): + return orth.is_ascii(string) + + @staticmethod + def is_digit(string): + return string.isdigit() @staticmethod def is_lower(string): - return orths.is_lower(string) + return orth.is_lower(string) + + @staticmethod + def is_punct(string): + return orth.is_punct(string) + + @staticmethod + def is_space(string): + return string.isspace() + + @staticmethod + def is_title(string): + return orth.is_title(string) @staticmethod def is_upper(string): - return orths.is_upper(string) + return orth.is_upper(string) @staticmethod def like_url(string): - return orths.like_url(string) + return orth.like_url(string) @staticmethod def like_number(string): - return orths.like_number(string) + return orth.like_number(string) @staticmethod def like_email(string): - return orths.like_email(string) + return orth.like_email(string) - def default_lex_attrs(cls, data_dir): + @classmethod + def default_lex_attrs(cls, data_dir=None): return { attrs.LOWER: cls.lower, attrs.NORM: cls.norm, @@ -59,12 +96,15 @@ class Language(object): attrs.PREFIX: cls.prefix, attrs.SUFFIX: cls.suffix, attrs.CLUSTER: cls.cluster, - attrs.PROB: cls.prob, + attrs.PROB: lambda string: -10.0, attrs.IS_ALPHA: cls.is_alpha, attrs.IS_ASCII: cls.is_ascii, attrs.IS_DIGIT: cls.is_digit, attrs.IS_LOWER: cls.is_lower, + attrs.IS_PUNCT: cls.is_punct, + attrs.IS_SPACE: cls.is_space, + attrs.IS_TITLE: cls.is_title, attrs.IS_UPPER: 
cls.is_upper, attrs.LIKE_URL: cls.like_url, attrs.LIKE_NUM: cls.like_number, @@ -73,12 +113,36 @@ class Language(object): attrs.IS_OOV: lambda string: True } + @classmethod + def default_dep_templates(cls): + return [] + + @classmethod + def default_ner_templates(cls): + return [] + + @classmethod + def default_dep_labels(cls): + return {0: {'ROOT': True}} + + @classmethod + def default_ner_labels(cls): + return {0: {'PER': True, 'LOC': True, 'ORG': True, 'MISC': True}} + @classmethod def default_data_dir(cls): return path.join(path.dirname(__file__), 'data') @classmethod - def default_vocab(cls, get_lex_attr=None, vectors=None, morphology=None, data_dir=None): + def default_morphology(cls, data_dir): + return Morphology.from_dir(data_dir) + + @classmethod + def default_vectors(cls, data_dir): + return None + + @classmethod + def default_vocab(cls, data_dir=None, get_lex_attr=None, vectors=None, morphology=None): if data_dir is None: data_dir = cls.default_data_dir() if vectors is None: @@ -86,70 +150,71 @@ class Language(object): if get_lex_attr is None: get_lex_attr = cls.default_lex_attrs(data_dir) if morphology is None: - morphology = cls.default_morphology(data_dir) - return vocab = Vocab.from_dir(data_dir, get_lex_attr, vectors, morphology) + morphology = cls.default_morphology(path.join(data_dir, 'vocab')) + return Vocab.from_dir( + path.join(data_dir, 'vocab'), + get_lex_attr=get_lex_attr, + vectors=vectors, + morphology=morphology) @classmethod - def default_tokenizer(cls, vocab, data_dir=None): - if data_dir is None: - data_dir = cls.default_data_dir() - return Tokenizer.from_dir(data_dir, vocab) + def default_tokenizer(cls, vocab, data_dir): + if path.exists(data_dir): + return Tokenizer.from_dir(vocab, data_dir) + else: + return Tokenizer(vocab, {}, None, None, None) @classmethod - def default_tagger(cls, vocab, data_dir=None): - return Tagger.from_dir(data_dir, vocab) + def default_tagger(cls, vocab, data_dir): + if path.exists(data_dir): + return Tagger.from_dir(data_dir, vocab) + else: + return None @classmethod - def default_parser(cls, vocab, transition_system=None, data_dir=None): - if transition_system is None: - transition_system = ArcEager() - return Parser.from_dir(data_dir, vocab, transition_system) + def default_parser(cls, vocab, data_dir): + if path.exists(data_dir): + return Parser.from_dir(data_dir, vocab.strings, ArcEager) + else: + return None @classmethod - def default_entity(cls, vocab, transition_system=None, data_dir=None): - if transition_system is None: - transition_system = BiluoPushDown() - return Parser.from_dir(data_dir, vocab, transition_system) + def default_entity(cls, vocab, data_dir): + if path.exists(data_dir): + return Parser.from_dir(data_dir, vocab.strings, BiluoPushDown) + else: + return None @classmethod def default_matcher(cls, vocab, data_dir=None): if data_dir is None: data_dir = cls.default_data_dir() - return Matcher(data_dir, vocab) + return Matcher.from_dir(data_dir, vocab) - @classmethod - def default_serializer(cls, vocab, data_dir=None): - if data_dir is None: - data_dir = cls.default_data_dir() - return Packer(data_dir, vocab) - - def __init__(self, vocab=None, tokenizer=None, tagger=None, parser=None, - entity=None, matcher=None, serializer=None): + def __init__(self, data_dir=None, vocab=None, tokenizer=None, tagger=None, + parser=None, entity=None, matcher=None, serializer=None): if data_dir is None: data_dir = self.default_data_dir() if vocab is None: vocab = self.default_vocab(data_dir) if tokenizer is None: - tokenizer = 
self.default_tokenizer(vocab, data_dir) + tokenizer = self.default_tokenizer(vocab, data_dir=path.join(data_dir, 'tokenizer')) if tagger is None: - tagger = self.default_tagger(vocab, data_dir) + tagger = self.default_tagger(vocab, data_dir=path.join(data_dir, 'pos')) if entity is None: - entity = self.default_entity(vocab, data_dir) + entity = self.default_entity(vocab, data_dir=path.join(data_dir, 'ner')) if parser is None: - parser = self.default_parser(vocab, data_dir) + parser = self.default_parser(vocab, data_dir=path.join(data_dir, 'deps')) if matcher is None: - matcher = self.default_matcher(vocab, data_dir) - if serializer is None: - serializer = self.default_serializer(vocab, data_dir) + matcher = self.default_matcher(vocab, data_dir=data_dir) self.vocab = vocab self.tokenizer = tokenizer self.tagger = tagger self.parser = parser self.entity = entity self.matcher = matcher - self.serializer = serializer - def __call__(self, text, tag=True, parse=True, entity=True): + def __call__(self, text, tag=True, parse=True, entity=True, merge_mwes=False): """Apply the pipeline to some text. The text can span multiple sentences, and can contain arbtrary whitespace. Alignment into the original string is preserved. From 3acf60df06d2f1bf2afed1049ff87f7402c6b285 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:16:28 +0200 Subject: [PATCH 16/48] * Add missing properties in Lexeme class --- spacy/lexeme.pyx | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 4deec60c1..e99bcfa7c 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -29,6 +29,10 @@ cdef class Lexeme: self.c = vocab.get_by_orth(vocab.mem, orth) assert self.c.orth == orth + property orth_: + def __get__(self): + return self.vocab.strings[self.c.orth] + property lower: def __get__(self): return self.c.lower def __set__(self, int x): self.c.lower = x @@ -49,9 +53,13 @@ cdef class Lexeme: def __get__(self): return self.c.suffix def __set__(self, int x): self.c.suffix = x - property orth_: - def __get__(self): - return self.vocab.strings[self.c.orth] + property cluster: + def __get__(self): return self.c.suffix + def __set__(self, int x): self.c.suffix = x + + property prob: + def __get__(self): return self.c.suffix + def __set__(self, int x): self.c.suffix = x property lower_: def __get__(self): return self.vocab.strings[self.c.lower] @@ -73,6 +81,10 @@ cdef class Lexeme: def __get__(self): return self.c.suffix def __set__(self, unicode x): self.c.suffix = self.vocab.strings[x] + property flags: + def __get__(self): return self.c.flags + def __set__(self, flags_t x): self.c.flags = x + property is_oov: def __get__(self): return Lexeme.check_flag(self.c, IS_OOV) def __set__(self, attr_id_t x): Lexeme.set_flag(self.c, IS_OOV, x) From 430affc347423c8312130e5963da93fd471ff3dc Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:17:02 +0200 Subject: [PATCH 17/48] * Fix missing n_patterns property in Matcher class. 
Fix from_dir method --- spacy/matcher.pyx | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index 9d1220648..2cc91a368 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -99,7 +99,7 @@ def map_attr_name(attr): cdef class Matcher: cdef Pool mem cdef vector[Pattern*] patterns - cdef readonly int n_patterns + cdef readonly Vocab vocab def __init__(self, vocab, patterns): self.mem = Pool() @@ -107,6 +107,19 @@ cdef class Matcher: for entity_key, (etype, attrs, specs) in sorted(patterns.items()): self.add(entity_key, etype, attrs, specs) + @classmethod + def from_dir(cls, data_dir, Vocab vocab): + patterns_loc = path.join(data_dir, 'vocab', 'gazetteer.json') + if path.exists(patterns_loc): + patterns_data = open(patterns_loc).read() + patterns = json.loads(patterns_data) + return cls(vocab, patterns) + else: + return cls(vocab, {}) + + property n_patterns: + def __get__(self): return self.patterns.size() + def add(self, entity_key, etype, attrs, specs): if isinstance(entity_key, basestring): entity_key = self.vocab.strings[entity_key] @@ -120,16 +133,6 @@ cdef class Matcher: spec = _convert_strings(spec, self.vocab.strings) self.patterns.push_back(init_pattern(self.mem, spec, etype)) - @classmethod - def from_dir(cls, vocab, data_dir): - patterns_loc = path.join(data_dir, 'vocab', 'gazetteer.json') - if path.exists(patterns_loc): - patterns_data = open(patterns_loc).read() - patterns = json.loads(patterns_data) - return cls(vocab, patterns) - else: - return cls(vocab, {}) - def __call__(self, Doc doc): cdef vector[Pattern*] partials cdef int n_partials = 0 From 378729f81af5025f6f45e68a95ca7f5eef24a1a2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:17:21 +0200 Subject: [PATCH 18/48] * Hack Morphology class towards usability --- spacy/morphology.pyx | 136 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 127 insertions(+), 9 deletions(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 96a4ba884..f32009351 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -1,11 +1,129 @@ -# cython: embedsignature=True +from os import path + +try: + import ujson as json +except ImportError: + import json + +from spacy.parts_of_speech import UNIV_POS_NAMES + + +cdef class Morphology: + def __init__(self, tag_map, fused_tokens, lemmatizer): + self.tag_map = tag_map + self.n_tags = len(tag_map) + self.tag_names = tuple(sorted(tag_map.keys())) + self.tag_ids = {} + for i, tag_str in enumerate(self.tag_names): + self.tag_ids[tag_str] = i + + @classmethod + def from_dir(cls, data_dir): + tag_map = json.load(open(path.join(data_dir, 'tag_map.json'))) + return cls(tag_map, {}, None) + + cdef int assign_tag(self, TokenC* token, int tag) except -1: + props = self.tag_map[self.tag_names[tag]] + token.pos = UNIV_POS_NAMES[props['pos'].upper()] + token.tag = tag + #token.inflection = # TODO + + cdef int assign_from_dict(self, TokenC* token, props) except -1: + pass + + def load_morph_exceptions(self, dict exc): + pass + # Map (form, pos) to (lemma, inflection) + #cdef unicode pos_str + #cdef unicode form_str + #cdef unicode lemma_str + #cdef dict entries + #cdef dict props + #cdef int lemma + #cdef attr_t orth + #cdef int pos + #for pos_str, entries in exc.items(): + # pos = self.tag_names.index(pos_str) + # for form_str, props in entries.items(): + # lemma_str = props.get('L', form_str) + # orth = self.strings[form_str] + # cached = self.mem.alloc(1, sizeof(InflectedLemma)) 
+ # cached.lemma = self.strings[lemma_str] + # set_morph_from_dict(&cached.morph, props) + # self._morph_cache.set(pos, orth, cached) -cdef int set_morph_from_dict(Morphology* morph, dict props) except -1: - morph.number = props.get('number', 0) - morph.tenspect = props.get('tenspect', 0) - morph.mood = props.get('mood', 0) - morph.gender = props.get('gender', 0) - morph.person = props.get('person', 0) - morph.case = props.get('case', 0) - morph.misc = props.get('misc', 0) +#cdef int set_morph_from_dict(Morphology* morph, dict props) except -1: +# morph.number = props.get('number', 0) +# morph.tenspect = props.get('tenspect', 0) +# morph.mood = props.get('mood', 0) +# morph.gender = props.get('gender', 0) +# morph.person = props.get('person', 0) +# morph.case = props.get('case', 0) +# morph.misc = props.get('misc', 0) +# +# +#cdef class Morphology: +# cdef Pool mem +# cdef PreshMap table +# +# def __init__(self, tags, exceptions): +# pass +# +# def __getitem__(self, hash_t id_): +# pass +# +# cdef const InflectionC* get(self, hash_t key) except NULL: +# pass +# +# cdef MorphAnalysis analyse(const TokenC* token) except -1: +# cdef struct MorphAnalysis morphology +# tokens[i].pos = tag.pos +# cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.orth) +# if cached is NULL: +# cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph)) +# cached.lemma = self.lemmatize(tag.pos, tokens[i].lex) +# cached.morph = tag.morph +# self._morph_cache.set(tag.id, tokens[i].lex.orth, cached) +# tokens[i].lemma = cached.lemma +# tokens[i].morph = cached.morph +# +# cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1: +# if self.lemmatizer is None: +# return lex.orth +# cdef unicode py_string = self.strings[lex.orth] +# if pos != NOUN and pos != VERB and pos != ADJ: +# return lex.orth +# cdef set lemma_strings +# cdef unicode lemma_string +# lemma_strings = self.lemmatizer(py_string, pos) +# lemma_string = sorted(lemma_strings)[0] +# lemma = self.strings[lemma_string] +# return lemma +# +# +#cdef class Inflection: +# cdef InflectionC* c +# +# def __init__(self, container, id_): +# self.c = container[id_] +# self.container = container +# +# for i, feat_id in enumerate(feat_ids): +# feature, value = parse_id(feat_id) +# self.add_value(feature, value, True) +# +# def has(self, Value_t feat_value_id): +# part = feat_value_id % 64 +# bit = feat_value_id / 64 +# if self.value_set[part] & bit: +# return True +# else: +# return False +# +# property pos: def __get__(self): return self.c.pos +# +# property id: def __get__(self): return self.c.id +# +# property features: +# pass From 008b02b03572c1687bc9e9a004adef26920abb5d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:17:35 +0200 Subject: [PATCH 19/48] * Comment out enums in Morpohlogy for now --- spacy/morphology.pxd | 982 ++++++++++++++++++++++--------------------- 1 file changed, 493 insertions(+), 489 deletions(-) diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 6914eb8d6..7f2ebe34b 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -1,478 +1,487 @@ -from .structs cimport TokenC, Morphology, PosTag - - -cdef int set_morph_from_dict(Morphology* morph, dict props) except -1 - - -cdef enum Feature: - Abbr - AdpType - AdvType - ConjType - Connegative - Derivation - Echo - Foreign - Gender_dat - Gender_erg - Gender_psor - Hyph - InfForm - NameType - NounType - NumberAbs - NumberDat - NumberErg - NumberPsee - NumberPsor - NumForm - NumValue - PartForm - PartType - 
Person_abs - Person_dat - Person_psor - Polite - Polite_abs - Polite_dat - Prefix - PrepCase - PunctSide - PunctType - Style - Typo - Variant - VerbType -cpdef enum Animacy: - Anim - Inam - - -cpdef enum Aspect: - Freq - Imp - Mod - None_ - Perf - - -cpdef enum Case1: - Abe - Abl - Abs - Acc - Ade - All - Cau - Com - -cdef enum Case2: - Dat - Del - Dis - Ela - Ess - Gen - Ill - Ine - -cdef enum Case3: - Ins - Loc - Lat - Nom - Par - Sub - Sup - Tem - Ter - - -cdef enum Case4: - Tra - Voc - - -cpdef enum Definite: - Two - Def - Red - Ind - - -cpdef enum Degree: - Cmp - Comp - None_ - Pos - Sup - Abs - Com - Degree # du - - -cpdef enum Gender: - Com - Fem - Masc - Neut - - -cpdef enum Mood: - Cnd - Imp - Ind - N - Pot - Sub - Opt - - -cpdef enum Negative: - Neg - Pos - Yes - - -cpdef enum Number: - Com - Dual - None_ - Plur - Sing - Ptan # bg - Count # bg - - -cpdef enum NumType: - Card - Dist - Frac - Gen - Mult - None_ - Ord - Sets - - -cpdef enum Person: - One - Two - Three - None_ - - -cpdef enum Poss: - Yes - - -cpdef enum PronType1: - AdvPart - Art - Default - Dem - Ind - Int - Neg - -cpdef enum PronType2: - Prs - Rcp - Rel - Tot - Clit - Exc # es, ca, it, fa - Clit # it - - -cpdef enum Reflex: - Yes - - -cpdef enum Tense: - Fut - Imp - Past - Pres - -cpdef enum VerbForm1: - Fin - Ger - Inf - None_ - Part - PartFut - PartPast - -cpdef enum VerbForm2: - PartPres - Sup - Trans - Gdv # la - - -cpdef enum Voice: - Act - Cau - Pass - Mid # gkc - Int # hb - - -cpdef enum Abbr: - Yes # cz, fi, sl, U - -cpdef enum AdpType: - Prep # cz, U - Post # U - Voc # cz - Comprep # cz - Circ # U - Voc # U - - -cpdef enum AdvType1: - # U - Man - Loc - Tim - Deg - Cau - Mod - Sta - Ex - -cpdef enum AdvType2: - Adadj - -cpdef enum ConjType: - Oper # cz, U - Comp # cz, U - -cpdef enum Connegative: - Yes # fi - - -cpdef enum Derivation1: - Minen # fi - Sti # fi - Inen # fi - Lainen # fi - Ja # fi - Ton # fi - Vs # fi - Ttain # fi - -cpdef enum Derivation2: - Ttaa - - -cpdef enum Echo: - Rdp # U - Ech # U - - -cpdef enum Foreign: - Foreign # cz, fi, U - Fscript # cz, fi, U - Tscript # cz, U - Yes # sl - - -cpdef enum Gender_dat: - Masc # bq, U - Fem # bq, U - - -cpdef enum Gender_erg: - Masc # bq - Fem # bq - - -cpdef enum Gender_psor: - Masc # cz, sl, U - Fem # cz, sl, U - Neut # sl - - -cpdef enum Hyph: - Yes # cz, U - - -cpdef enum InfForm: - One # fi - Two # fi - Three # fi - - -cpdef enum NameType: - Geo # U, cz - Prs # U, cz - Giv # U, cz - Sur # U, cz - Nat # U, cz - Com # U, cz - Pro # U, cz - Oth # U, cz - - -cpdef enum NounType: - Com # U - Prop # U - Class # U - -cpdef enum Number_abs: - Sing # bq, U - Plur # bq, U - -cpdef enum Number_dat: - Sing # bq, U - Plur # bq, U - -cpdef enum Number_erg: - Sing # bq, U - Plur # bq, U - -cpdef enum Number_psee: - Sing # U - Plur # U - - -cpdef enum Number_psor: - Sing # cz, fi, sl, U - Plur # cz, fi, sl, U - - -cpdef enum NumForm: - Digit # cz, sl, U - Roman # cz, sl, U - Word # cz, sl, U - - -cpdef enum NumValue: - One # cz, U - Two # cz, U - Three # cz, U - - -cpdef enum PartForm: - Pres # fi - Past # fi - Agt # fi - Neg # fi - - -cpdef enum PartType: - Mod # U - Emp # U - Res # U - Inf # U - Vbp # U - -cpdef enum Person_abs: - One # bq, U - Two # bq, U - Three # bq, U - - -cpdef enum Person_dat: - One # bq, U - Two # bq, U - Three # bq, U - - -cpdef enum Person_erg: - One # bq, U - Two # bq, U - Three # bq, U - - -cpdef enum Person_psor: - One # fi, U - Two # fi, U - Three # fi, U - - -cpdef enum Polite: - Inf # bq, U - Pol # bq, U - - -cpdef enum 
Polite_abs: - Inf # bq, U - Pol # bq, U - - -cpdef enum Polite_erg: - Inf # bq, U - Pol # bq, U - - -cpdef enum Polite_dat: - Inf # bq, U - Pol # bq, U - - -cpdef enum Prefix: - Yes # U - - -cpdef enum PrepCase: - Npr # cz - Pre # U - - -cpdef enum PunctSide: - Ini # U - Fin # U - -cpdef enum PunctType1: - Peri # U - Qest # U - Excl # U - Quot # U - Brck # U - Comm # U - Colo # U - Semi # U - -cpdef enum PunctType2: - Dash # U - - -cpdef enum Style1: - Arch # cz, fi, U - Rare # cz, fi, U - Poet # cz, U - Norm # cz, U - Coll # cz, U - Vrnc # cz, U - Sing # cz, U - Expr # cz, U - - -cpdef enum Style2: - Derg # cz, U - Vulg # cz, U - - -cpdef enum Typo: - Yes # fi, U - - -cpdef enum Variant: - Short # cz - Bound # cz, sl - - -cpdef enum VerbType: - Aux # U - Cop # U - Mod # U - Light # U - - -cpdef enum FeatureValues: +from .structs cimport TokenC + + +cdef class Morphology: + cdef public object tag_map + cdef public object tag_names + cdef public object tag_ids + cdef public int n_tags + + cdef int assign_tag(self, TokenC* token, int tag) except -1 + + cdef int assign_from_dict(self, TokenC* token, props) except -1 + +# +#cpdef enum Feature_t: +# Abbr +# AdpType +# AdvType +# ConjType +# Connegative +# Derivation +# Echo +# Foreign +# Gender_dat +# Gender_erg +# Gender_psor +# Hyph +# InfForm +# NameType +# NounType +# NumberAbs +# NumberDat +# NumberErg +# NumberPsee +# NumberPsor +# NumForm +# NumValue +# PartForm +# PartType +# Person_abs +# Person_dat +# Person_psor +# Polite +# Polite_abs +# Polite_dat +# Prefix +# PrepCase +# PunctSide +# PunctType +# Style +# Typo +# Variant +# VerbType +# +# +#cpdef enum Animacy: +# Anim +# Inam +# +# +#cpdef enum Aspect: +# Freq +# Imp +# Mod +# None_ +# Perf +# +# +#cpdef enum Case1: +# Nom +# Gen +# Acc +# Dat +# Voc +# Abl +# +#cdef enum Case2: +# Abe +# Abs +# Ade +# All +# Cau +# Com +# Del +# Dis +# +#cdef enum Case3: +# Ela +# Ess +# Ill +# Ine +# Ins +# Loc +# Lat +# Par +# +#cdef enum Case4: +# Sub +# Sup +# Tem +# Ter +# Tra +# +# +#cpdef enum Definite: +# Two +# Def +# Red +# Ind +# +# +#cpdef enum Degree: +# Cmp +# Comp +# None_ +# Pos +# Sup +# Abs +# Com +# Degree # du +# +# +#cpdef enum Gender: +# Com +# Fem +# Masc +# Neut +# +# +#cpdef enum Mood: +# Cnd +# Imp +# Ind +# N +# Pot +# Sub +# Opt +# +# +#cpdef enum Negative: +# Neg +# Pos +# Yes +# +# +#cpdef enum Number: +# Com +# Dual +# None_ +# Plur +# Sing +# Ptan # bg +# Count # bg +# +# +#cpdef enum NumType: +# Card +# Dist +# Frac +# Gen +# Mult +# None_ +# Ord +# Sets +# +# +#cpdef enum Person: +# One +# Two +# Three +# None_ +# +# +#cpdef enum Poss: +# Yes +# +# +#cpdef enum PronType1: +# AdvPart +# Art +# Default +# Dem +# Ind +# Int +# Neg +# +#cpdef enum PronType2: +# Prs +# Rcp +# Rel +# Tot +# Clit +# Exc # es, ca, it, fa +# Clit # it +# +# +#cpdef enum Reflex: +# Yes +# +# +#cpdef enum Tense: +# Fut +# Imp +# Past +# Pres +# +#cpdef enum VerbForm1: +# Fin +# Ger +# Inf +# None_ +# Part +# PartFut +# PartPast +# +#cpdef enum VerbForm2: +# PartPres +# Sup +# Trans +# Gdv # la +# +# +#cpdef enum Voice: +# Act +# Cau +# Pass +# Mid # gkc +# Int # hb +# +# +#cpdef enum Abbr: +# Yes # cz, fi, sl, U +# +#cpdef enum AdpType: +# Prep # cz, U +# Post # U +# Voc # cz +# Comprep # cz +# Circ # U +# Voc # U +# +# +#cpdef enum AdvType1: +# # U +# Man +# Loc +# Tim +# Deg +# Cau +# Mod +# Sta +# Ex +# +#cpdef enum AdvType2: +# Adadj +# +#cpdef enum ConjType: +# Oper # cz, U +# Comp # cz, U +# +#cpdef enum Connegative: +# Yes # fi +# +# +#cpdef enum Derivation1: +# Minen # fi +# Sti # 
fi +# Inen # fi +# Lainen # fi +# Ja # fi +# Ton # fi +# Vs # fi +# Ttain # fi +# +#cpdef enum Derivation2: +# Ttaa +# +# +#cpdef enum Echo: +# Rdp # U +# Ech # U +# +# +#cpdef enum Foreign: +# Foreign # cz, fi, U +# Fscript # cz, fi, U +# Tscript # cz, U +# Yes # sl +# +# +#cpdef enum Gender_dat: +# Masc # bq, U +# Fem # bq, U +# +# +#cpdef enum Gender_erg: +# Masc # bq +# Fem # bq +# +# +#cpdef enum Gender_psor: +# Masc # cz, sl, U +# Fem # cz, sl, U +# Neut # sl +# +# +#cpdef enum Hyph: +# Yes # cz, U +# +# +#cpdef enum InfForm: +# One # fi +# Two # fi +# Three # fi +# +# +#cpdef enum NameType: +# Geo # U, cz +# Prs # U, cz +# Giv # U, cz +# Sur # U, cz +# Nat # U, cz +# Com # U, cz +# Pro # U, cz +# Oth # U, cz +# +# +#cpdef enum NounType: +# Com # U +# Prop # U +# Class # U +# +#cpdef enum Number_abs: +# Sing # bq, U +# Plur # bq, U +# +#cpdef enum Number_dat: +# Sing # bq, U +# Plur # bq, U +# +#cpdef enum Number_erg: +# Sing # bq, U +# Plur # bq, U +# +#cpdef enum Number_psee: +# Sing # U +# Plur # U +# +# +#cpdef enum Number_psor: +# Sing # cz, fi, sl, U +# Plur # cz, fi, sl, U +# +# +#cpdef enum NumForm: +# Digit # cz, sl, U +# Roman # cz, sl, U +# Word # cz, sl, U +# +# +#cpdef enum NumValue: +# One # cz, U +# Two # cz, U +# Three # cz, U +# +# +#cpdef enum PartForm: +# Pres # fi +# Past # fi +# Agt # fi +# Neg # fi +# +# +#cpdef enum PartType: +# Mod # U +# Emp # U +# Res # U +# Inf # U +# Vbp # U +# +#cpdef enum Person_abs: +# One # bq, U +# Two # bq, U +# Three # bq, U +# +# +#cpdef enum Person_dat: +# One # bq, U +# Two # bq, U +# Three # bq, U +# +# +#cpdef enum Person_erg: +# One # bq, U +# Two # bq, U +# Three # bq, U +# +# +#cpdef enum Person_psor: +# One # fi, U +# Two # fi, U +# Three # fi, U +# +# +#cpdef enum Polite: +# Inf # bq, U +# Pol # bq, U +# +# +#cpdef enum Polite_abs: +# Inf # bq, U +# Pol # bq, U +# +# +#cpdef enum Polite_erg: +# Inf # bq, U +# Pol # bq, U +# +# +#cpdef enum Polite_dat: +# Inf # bq, U +# Pol # bq, U +# +# +#cpdef enum Prefix: +# Yes # U +# +# +#cpdef enum PrepCase: +# Npr # cz +# Pre # U +# +# +#cpdef enum PunctSide: +# Ini # U +# Fin # U +# +#cpdef enum PunctType1: +# Peri # U +# Qest # U +# Excl # U +# Quot # U +# Brck # U +# Comm # U +# Colo # U +# Semi # U +# +#cpdef enum PunctType2: +# Dash # U +# +# +#cpdef enum Style1: +# Arch # cz, fi, U +# Rare # cz, fi, U +# Poet # cz, U +# Norm # cz, U +# Coll # cz, U +# Vrnc # cz, U +# Sing # cz, U +# Expr # cz, U +# +# +#cpdef enum Style2: +# Derg # cz, U +# Vulg # cz, U +# +# +#cpdef enum Typo: +# Yes # fi, U +# +# +#cpdef enum Variant: +# Short # cz +# Bound # cz, sl +# +# +#cpdef enum VerbType: +# Aux # U +# Cop # U +# Mod # U +# Light # U +# + +cpdef enum Value_t: Animacy_Anim Animacy_Inam Aspect_Freq @@ -566,7 +575,6 @@ cpdef enum FeatureValues: PronType_Tot PronType_Clit PronType_Exc # es, ca, it, fa - PronType_Clit # it Reflex_Yes Tense_Fut Tense_Imp @@ -594,7 +602,6 @@ cpdef enum FeatureValues: AdpType_Voc # cz AdpType_Comprep # cz AdpType_Circ # U - AdpType_Voc # U AdvType_Man AdvType_Loc AdvType_Tim @@ -607,16 +614,15 @@ cpdef enum FeatureValues: ConjType_Oper # cz, U ConjType_Comp # cz, U Connegative_Yes # fi - # fi - Derivation_Minen - Derivation_Sti - Derivation_Inen - Derivation_Lainen - Derivation_Ja - Derivation_Ton - Derivation_Vs - Derivation_Ttain - Derivation_Ttaa + Derivation_Minen # fi + Derivation_Sti # fi + Derivation_Inen # fi + Derivation_Lainen # fi + Derivation_Ja # fi + Derivation_Ton # fi + Derivation_Vs # fi + Derivation_Ttain # fi + Derivation_Ttaa # fi Echo_Rdp # 
U Echo_Ech # U Foreign_Foreign # cz, fi, U @@ -721,5 +727,3 @@ cpdef enum FeatureValues: VerbType_Cop # U VerbType_Mod # U VerbType_Light # U - - From f8f2f4e545f752e25160dae93691828a01d8dce5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:18:19 +0200 Subject: [PATCH 20/48] * Temporarily add PUNC name to parts_of_specch dictionary, until better solution --- spacy/parts_of_speech.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/parts_of_speech.pyx b/spacy/parts_of_speech.pyx index 994a48eba..7081cfab9 100644 --- a/spacy/parts_of_speech.pyx +++ b/spacy/parts_of_speech.pyx @@ -15,6 +15,7 @@ UNIV_POS_NAMES = { "VERB": VERB, "X": X, "PUNCT": PUNCT, + "PUNC": PUNCT, "SPACE": SPACE, "EOL": EOL } From 1d7f2d3abc91480d53c8886786435e8a08b5def4 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:18:36 +0200 Subject: [PATCH 21/48] * Hack on morphology structs --- spacy/structs.pxd | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/spacy/structs.pxd b/spacy/structs.pxd index f3095df51..f150fa312 100644 --- a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -1,4 +1,4 @@ -from libc.stdint cimport uint8_t, uint32_t, int32_t +from libc.stdint cimport uint8_t, uint32_t, int32_t, uint64_t from .typedefs cimport flags_t, attr_t, hash_t from .parts_of_speech cimport univ_pos_t @@ -25,21 +25,16 @@ cdef struct LexemeC: float sentiment float l2_norm - -cdef struct Morphology: - uint8_t number - uint8_t tenspect # Tense/aspect/voice - uint8_t mood - uint8_t gender - uint8_t person - uint8_t case - uint8_t misc +cdef struct MorphFeatC: + int name + int value -cdef struct PosTag: - Morphology morph - int id +cdef struct MorphologyC: + uint64_t[4] feature_set + MorphFeatC* features univ_pos_t pos + int n cdef struct Entity: @@ -59,7 +54,7 @@ cdef struct Constituent: cdef struct TokenC: const LexemeC* lex - Morphology morph + const MorphologyC* morph const Constituent* ctnt univ_pos_t pos bint spacy From a3d5e6c0dde9a844823ff1eefc7938d134179003 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:19:01 +0200 Subject: [PATCH 22/48] * Reform constructor and save/load workflow in parser model --- spacy/syntax/parser.pxd | 1 - spacy/syntax/parser.pyx | 18 ++++++++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/spacy/syntax/parser.pxd b/spacy/syntax/parser.pxd index 4ee30341a..70a0229c2 100644 --- a/spacy/syntax/parser.pxd +++ b/spacy/syntax/parser.pxd @@ -11,7 +11,6 @@ from .stateclass cimport StateClass cdef class Parser: - cdef readonly object cfg cdef readonly Model model cdef readonly TransitionSystem moves diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 6282339bd..7987547fa 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -67,16 +67,22 @@ def ParserFactory(transition_system): cdef class Parser: - def __init__(self, StringStore strings, model_dir, transition_system): + def __init__(self, StringStore strings, transition_system, model): + self.moves = transition_system + self.model = model + + @classmethod + def from_dir(cls, model_dir, strings, transition_system): if not os.path.exists(model_dir): print >> sys.stderr, "Warning: No model found at", model_dir elif not os.path.isdir(model_dir): print >> sys.stderr, "Warning: model path:", model_dir, "is not a directory" - else: - self.cfg = Config.read(model_dir, 'config') - self.moves = transition_system(strings, self.cfg.labels) - templates = get_templates(self.cfg.features) - self.model = 
Model(self.moves.n_moves, templates, model_dir) + cfg = Config.read(model_dir, 'config') + moves = transition_system(strings, cfg.labels) + templates = get_templates(cfg.features) + model = Model(moves.n_moves, templates, model_dir) + return cls(strings, moves, model) + def __call__(self, Doc tokens): cdef StateClass stcls = StateClass.init(tokens.data, tokens.length) From b4faf551f545c7ef47f73d0f9efaad8374fa0f65 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:19:21 +0200 Subject: [PATCH 23/48] * Refactor language-independent tagger class --- spacy/tagger.pxd | 11 ++- spacy/tagger.pyx | 223 +++++++++++++++++++++++++++++++---------------- 2 files changed, 151 insertions(+), 83 deletions(-) diff --git a/spacy/tagger.pxd b/spacy/tagger.pxd index 4aa9acc43..213781047 100644 --- a/spacy/tagger.pxd +++ b/spacy/tagger.pxd @@ -4,24 +4,23 @@ from cymem.cymem cimport Pool from ._ml cimport Model from .strings cimport StringStore -from .structs cimport TokenC, LexemeC, Morphology, PosTag +from .structs cimport TokenC, LexemeC from .parts_of_speech cimport univ_pos_t +from .vocab cimport Vocab cdef class Tagger: cdef readonly Pool mem cdef readonly StringStore strings cdef readonly Model model + cdef readonly Vocab vocab cdef public object lemmatizer cdef PreshMapArray _morph_cache cdef public dict freqs - cdef PosTag* tags - cdef readonly object tag_names - cdef readonly object tag_map cdef readonly int n_tags cdef int predict(self, int i, const TokenC* tokens) except -1 cdef int update(self, int i, const TokenC* tokens, int gold) except -1 - cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1 - cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1 + #cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1 + #cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1 diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index ccb40fd22..5d015b6cc 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -6,50 +6,129 @@ from thinc.typedefs cimport atom_t, weight_t from .typedefs cimport attr_t from .tokens.doc cimport Doc -from .morphology cimport set_morph_from_dict from .attrs cimport TAG from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON from .parts_of_speech cimport PRT, VERB, X, PUNCT, EOL, SPACE +from .attrs cimport * +from ._ml cimport arg_max + -cdef struct _CachedMorph: - Morphology morph - int lemma +cpdef enum: + P2_orth + P2_cluster + P2_shape + P2_prefix + P2_suffix + P2_pos + P2_lemma + P2_flags + + P1_orth + P1_cluster + P1_shape + P1_prefix + P1_suffix + P1_pos + P1_lemma + P1_flags + + W_orth + W_cluster + W_shape + W_prefix + W_suffix + W_pos + W_lemma + W_flags + + N1_orth + N1_cluster + N1_shape + N1_prefix + N1_suffix + N1_pos + N1_lemma + N1_flags + + N2_orth + N2_cluster + N2_shape + N2_prefix + N2_suffix + N2_pos + N2_lemma + N2_flags + + N_CONTEXT_FIELDS cdef class Tagger: """A part-of-speech tagger for English""" + @classmethod + def read_config(cls, data_dir): + return json.load(open(path.join(data_dir, 'pos', 'config.json'))) + + @classmethod + def default_templates(cls): + return ( + (W_orth,), + (P1_lemma, P1_pos), + (P2_lemma, P2_pos), + (N1_orth,), + (N2_orth,), + + (W_suffix,), + (W_prefix,), + + (P1_pos,), + (P2_pos,), + (P1_pos, P2_pos), + (P1_pos, W_orth), + (P1_suffix,), + (N1_suffix,), + + (W_shape,), + (W_cluster,), + (N1_cluster,), + (N2_cluster,), + (P1_cluster,), + (P2_cluster,), + + (W_flags,), + (N1_flags,), + 
(N2_flags,), + (P1_flags,), + (P2_flags,), + ) + def make_lemmatizer(self): return None - def __init__(self, StringStore strings, data_dir): + def __init__(self, Vocab vocab, templates): self.mem = Pool() - model_dir = path.join(data_dir, 'pos') - self.strings = strings - cfg = json.load(open(path.join(data_dir, 'pos', 'config.json'))) - self.tag_names = sorted(cfg['tag_names']) - assert self.tag_names - self.n_tags = len(self.tag_names) - self.tag_map = cfg['tag_map'] - cdef int n_tags = len(self.tag_names) + 1 + self.vocab = vocab + + cdef int n_tags = self.vocab.morphology.n_tags + 1 - self.model = Model(n_tags, cfg['templates'], model_dir) - self._morph_cache = PreshMapArray(n_tags) - self.tags = self.mem.alloc(n_tags, sizeof(PosTag)) - for i, tag in enumerate(sorted(self.tag_names)): - pos, props = self.tag_map[tag] - self.tags[i].id = i - self.tags[i].pos = pos - set_morph_from_dict(&self.tags[i].morph, props) - if path.exists(path.join(data_dir, 'tokenizer', 'morphs.json')): - self.load_morph_exceptions(json.load(open(path.join(data_dir, 'tokenizer', - 'morphs.json')))) - self.lemmatizer = self.make_lemmatizer(data_dir) + self.model = Model(n_tags, templates) self.freqs = {TAG: defaultdict(int)} for tag in self.tag_names: - self.freqs[TAG][self.strings[tag]] = 1 + self.freqs[TAG][self.vocab.strings[tag]] = 1 self.freqs[TAG][0] = 1 + @property + def tag_names(self): + return tuple(sorted(self.vocab.morphology.tag_map.keys())) + + @classmethod + def from_dir(cls, data_dir, vocab): + if path.exists(path.join(data_dir, 'templates.json')): + templates = json.loads(open(path.join(data_dir, 'templates.json'))) + else: + templates = cls.default_templates() + return cls(vocab, templates) + def __call__(self, Doc tokens): """Apply the tagger, setting the POS tags onto the Doc object. 
@@ -63,18 +142,14 @@ cdef class Tagger: for i in range(tokens.length): if tokens.data[i].pos == 0: guess = self.predict(i, tokens.data) - tokens.data[i].tag = self.strings[self.tag_names[guess]] - self.set_morph(i, &self.tags[guess], tokens.data) - + self.vocab.morphology.assign_tag(&tokens.data[i], guess) tokens.is_tagged = True tokens._py_tokens = [None] * tokens.length def tag_from_strings(self, Doc tokens, object tag_strs): cdef int i for i in range(tokens.length): - tokens.data[i].tag = self.strings[tag_strs[i]] - self.set_morph(i, &self.tags[self.tag_names.index(tag_strs[i])], - tokens.data) + self.vocab.morphology.assign_tag(&tokens.data[i], tag_strs[i]) tokens.is_tagged = True tokens._py_tokens = [None] * tokens.length @@ -88,57 +163,51 @@ cdef class Tagger: for i in range(tokens.length): guess = self.update(i, tokens.data, golds[i]) loss = golds[i] != -1 and guess != golds[i] - tokens.data[i].tag = self.strings[self.tag_names[guess]] - self.set_morph(i, &self.tags[guess], tokens.data) + + self.vocab.morphology.assign_tag(&tokens.data[i], guess) correct += loss == 0 self.freqs[TAG][tokens.data[i].tag] += 1 return correct cdef int predict(self, int i, const TokenC* tokens) except -1: - raise NotImplementedError + cdef atom_t[N_CONTEXT_FIELDS] context + _fill_from_token(&context[P2_orth], &tokens[i-2]) + _fill_from_token(&context[P1_orth], &tokens[i-1]) + _fill_from_token(&context[W_orth], &tokens[i]) + _fill_from_token(&context[N1_orth], &tokens[i+1]) + _fill_from_token(&context[N2_orth], &tokens[i+2]) + scores = self.model.score(context) + return arg_max(scores, self.model.n_classes) cdef int update(self, int i, const TokenC* tokens, int gold) except -1: - raise NotImplementedError + cdef atom_t[N_CONTEXT_FIELDS] context + _fill_from_token(&context[P2_orth], &tokens[i-2]) + _fill_from_token(&context[P1_orth], &tokens[i-1]) + _fill_from_token(&context[W_orth], &tokens[i]) + _fill_from_token(&context[N1_orth], &tokens[i+1]) + _fill_from_token(&context[N2_orth], &tokens[i+2]) + scores = self.model.score(context) + guess = arg_max(scores, self.model.n_classes) + loss = guess != gold if gold != -1 else 0 + self.model.update(context, guess, gold, loss) + return guess - cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1: - tokens[i].pos = tag.pos - cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.orth) - if cached is NULL: - cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph)) - cached.lemma = self.lemmatize(tag.pos, tokens[i].lex) - cached.morph = tag.morph - self._morph_cache.set(tag.id, tokens[i].lex.orth, cached) - tokens[i].lemma = cached.lemma - tokens[i].morph = cached.morph - cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1: - if self.lemmatizer is None: - return lex.orth - cdef unicode py_string = self.strings[lex.orth] - if pos != NOUN and pos != VERB and pos != ADJ: - return lex.orth - cdef set lemma_strings - cdef unicode lemma_string - lemma_strings = self.lemmatizer(py_string, pos) - lemma_string = sorted(lemma_strings)[0] - lemma = self.strings[lemma_string] - return lemma - - def load_morph_exceptions(self, dict exc): - cdef unicode pos_str - cdef unicode form_str - cdef unicode lemma_str - cdef dict entries - cdef dict props - cdef int lemma - cdef attr_t orth - cdef int pos - for pos_str, entries in exc.items(): - pos = self.tag_names.index(pos_str) - for form_str, props in entries.items(): - lemma_str = props.get('L', form_str) - orth = self.strings[form_str] - cached = 
<_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph)) - cached.lemma = self.strings[lemma_str] - set_morph_from_dict(&cached.morph, props) - self._morph_cache.set(pos, orth, cached) +cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil: + context[0] = t.lex.lower + context[1] = t.lex.cluster + context[2] = t.lex.shape + context[3] = t.lex.prefix + context[4] = t.lex.suffix + context[5] = t.tag + context[6] = t.lemma + if t.lex.flags & (1 << IS_ALPHA): + context[7] = 1 + elif t.lex.flags & (1 << IS_PUNCT): + context[7] = 2 + elif t.lex.flags & (1 << LIKE_URL): + context[7] = 3 + elif t.lex.flags & (1 << LIKE_NUM): + context[7] = 4 + else: + context[7] = 0 From 119c0f8c3fae12dc33d3e52e282072c54d306738 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:20:11 +0200 Subject: [PATCH 24/48] * Hack out morphology stuff from tokenizer, while morphology being reimplemented. --- spacy/tokenizer.pxd | 2 +- spacy/tokenizer.pyx | 45 +++++++++++++++++++++++++++------------------ 2 files changed, 28 insertions(+), 19 deletions(-) diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index a7f69c5aa..19b8aa026 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -4,7 +4,7 @@ from preshed.maps cimport PreshMap from cymem.cymem cimport Pool from .typedefs cimport hash_t -from .structs cimport LexemeC, TokenC, Morphology +from .structs cimport LexemeC, TokenC from .strings cimport StringStore from .tokens.doc cimport Doc from .vocab cimport Vocab, _Cached diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 1e857aefc..38daf1c5a 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -11,7 +11,6 @@ from cpython cimport Py_UNICODE_ISSPACE from cymem.cymem cimport Pool from preshed.maps cimport PreshMap -from .morphology cimport set_morph_from_dict from .strings cimport hash_string cimport cython @@ -29,7 +28,7 @@ cdef class Tokenizer: self._suffix_re = suffix_re self._infix_re = infix_re self.vocab = vocab - self._load_special_tokenization(rules, self.vocab.pos_tags) + self._load_special_tokenization(rules) @classmethod def from_dir(cls, Vocab vocab, data_dir): @@ -242,7 +241,7 @@ cdef class Tokenizer: match = self._suffix_re.search(string) return (match.end() - match.start()) if match is not None else 0 - def _load_special_tokenization(self, object rules, object tag_map): + def _load_special_tokenization(self, special_cases): '''Add a special-case tokenization rule. ''' cdef int i @@ -253,25 +252,15 @@ cdef class Tokenizer: cdef dict props cdef LexemeC** lexemes cdef hash_t hashed - for chunk, substrings in sorted(rules.items()): + for chunk, substrings in sorted(special_cases.items()): tokens = self.mem.alloc(len(substrings) + 1, sizeof(TokenC)) for i, props in enumerate(substrings): form = props['F'] - lemma = props.get("L", None) tokens[i].lex = self.vocab.get(self.vocab.mem, form) - if lemma is not None: - tokens[i].lemma = self.vocab.strings[lemma] - else: - tokens[i].lemma = 0 - if 'pos' in props: - tokens[i].tag = self.vocab.strings[props['pos']] - tokens[i].pos = tag_map[props['pos']][0] - # These are defaults, which can be over-ridden by the - # token-specific props. 
- set_morph_from_dict(&tokens[i].morph, tag_map[props['pos']][1]) - if tokens[i].lemma == 0: - tokens[i].lemma = tokens[i].lex.orth - set_morph_from_dict(&tokens[i].morph, props) + lemma = props.get('L', form) + tokens[i].lemma = self.vocab.strings[lemma] + #TODO + #self.vocab.morphology.assign_from_dict(&tokens[i], props) cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached)) cached.length = len(substrings) cached.is_lex = False @@ -279,3 +268,23 @@ cdef class Tokenizer: hashed = hash_string(chunk) self._specials.set(hashed, cached) self._cache.set(hashed, cached) + + +#if lemma is not None: +# tokens[i].lemma = self.vocab.strings[lemma] +#else: +# tokens[i].lemma = 0 +#if 'pos' in props: +# inflection = self.vocab.morphology.get(props['pos']) +# inflection.assign(&tokens[i]) +# # These are defaults, which can be over-ridden by the +# # token-specific props. +# #pos, morph_features = self.vocab.morphology.tag_map[props['pos']] +# #tokens[i].pos = pos +# ## These are defaults, which can be over-ridden by the +# ## token-specific props. +# #set_morph_from_dict(&tokens[i].morph, morph_features) +# #if tokens[i].lemma == 0: +# # tokens[i].lemma = tokens[i].lex.orth +##set_morph_from_dict(&tokens[i].morph, props) + From d30029979ed7d24cb56ed74f2ec3f2b910550173 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:20:46 +0200 Subject: [PATCH 25/48] * Avoid import of morphology in spans --- spacy/tokens/spans.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tokens/spans.pyx b/spacy/tokens/spans.pyx index f1c19f308..e2aa1a7f9 100644 --- a/spacy/tokens/spans.pyx +++ b/spacy/tokens/spans.pyx @@ -1,7 +1,7 @@ from __future__ import unicode_literals from collections import defaultdict -from ..structs cimport Morphology, TokenC, LexemeC +from ..structs cimport TokenC, LexemeC from ..typedefs cimport flags_t, attr_t from ..attrs cimport attr_id_t from ..parts_of_speech cimport univ_pos_t From 2d521768a30a3d8c64cf30987932d3b448ed08fa Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:21:03 +0200 Subject: [PATCH 26/48] * Store Morphology class in Vocab --- spacy/vocab.pxd | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd index cf7a46388..5c88dca68 100644 --- a/spacy/vocab.pxd +++ b/spacy/vocab.pxd @@ -7,6 +7,7 @@ from murmurhash.mrmr cimport hash64 from .structs cimport LexemeC, TokenC from .typedefs cimport utf8_t, attr_t, hash_t from .strings cimport StringStore +from .morphology cimport Morphology cdef LexemeC EMPTY_LEXEME @@ -27,6 +28,7 @@ cdef class Vocab: cpdef public lexeme_props_getter cdef Pool mem cpdef readonly StringStore strings + cpdef readonly Morphology morphology cdef readonly int length cdef public object _serializer cdef public object data_dir From 1302d35dff09e64b4863a4b43df8cf37254e5c2b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:21:46 +0200 Subject: [PATCH 27/48] * Rework interfaces in vocab --- spacy/vocab.pyx | 53 ++++++++++++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 4c35ea41c..fa196166e 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -21,6 +21,7 @@ from .cfile cimport CFile from cymem.cymem cimport Address from . import util from .serialize.packer cimport Packer +from .attrs cimport PROB DEF MAX_VEC_SIZE = 100000 @@ -35,27 +36,37 @@ EMPTY_LEXEME.repvec = EMPTY_VEC cdef class Vocab: '''A map container for a language's LexemeC structs. 
''' - def __init__(self, data_dir=None, get_lex_attr=None, load_vectors=True, pos_tags=None): + @classmethod + def default_morphology(cls): + return Morphology({'VBZ': ['VERB', {}]}, [], None) + + def __init__(self, get_lex_attr=None, morphology=None, vectors=None): + self.get_lex_attr = get_lex_attr + if morphology is None: + morphology = self.default_morphology() + self.morphology = morphology + self.mem = Pool() self._by_hash = PreshMap() self._by_orth = PreshMap() self.strings = StringStore() - self.get_lex_attr = get_lex_attr - self.repvec_length = 0 + self.length = 1 - self.pos_tags = pos_tags - if data_dir is not None: - if not path.exists(data_dir): - raise IOError("Directory %s not found -- cannot load Vocab." % data_dir) - if not path.isdir(data_dir): - raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir) - self.load_lexemes(path.join(data_dir, 'strings.txt'), - path.join(data_dir, 'lexemes.bin')) - if load_vectors and path.exists(path.join(data_dir, 'vec.bin')): - self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin')) - self._serializer = None - self.data_dir = data_dir + + @classmethod + def from_dir(cls, data_dir, get_lex_attr=None, morphology=None, vectors=None): + if not path.exists(data_dir): + raise IOError("Directory %s not found -- cannot load Vocab." % data_dir) + if not path.isdir(data_dir): + raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir) + cdef Vocab self = cls(get_lex_attr=get_lex_attr, vectors=vectors, + morphology=morphology) + self.load_lexemes(path.join(data_dir, 'strings.txt'), + path.join(data_dir, 'lexemes.bin')) + if vectors is None and path.exists(path.join(data_dir, 'vec.bin')): + self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin')) + return self property serializer: def __get__(self): @@ -83,7 +94,6 @@ cdef class Vocab: lex = self._by_hash.get(key) cdef size_t addr if lex != NULL: - print string, lex.orth, self.strings[string] assert lex.orth == self.strings[string] return lex else: @@ -106,17 +116,21 @@ cdef class Vocab: cdef hash_t key cdef bint is_oov = mem is not self.mem mem = self.mem - #if len(string) < 3: - # mem = self.mem + if len(string) < 3: + mem = self.mem lex = mem.alloc(sizeof(LexemeC), 1) lex.orth = self.strings[string] + lex.length = len(string) lex.id = self.length if self.get_lex_attr is not None: for attr, func in self.get_lex_attr.items(): value = func(string) if isinstance(value, unicode): value = self.strings[value] - Lexeme.set_struct_attr(lex, attr, value) + if attr == PROB: + lex.prob = value + else: + Lexeme.set_struct_attr(lex, attr, value) if is_oov: lex.id = 0 else: @@ -128,7 +142,6 @@ cdef class Vocab: cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1: self._by_hash.set(key, lex) self._by_orth.set(lex.orth, lex) - print "Add lex", key, lex.orth, self.strings[lex.orth] self.length += 1 def __iter__(self): From 658c4a39305edf7ebdb6da0c090ee09343d26644 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:22:06 +0200 Subject: [PATCH 28/48] * Mark test_inital as requiring models --- tests/parser/test_initial_actions_parse.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/parser/test_initial_actions_parse.py b/tests/parser/test_initial_actions_parse.py index c1603cd93..cdaf25f91 100644 --- a/tests/parser/test_initial_actions_parse.py +++ b/tests/parser/test_initial_actions_parse.py @@ -1,6 +1,7 @@ import pytest +@pytest.mark.models def test_initial(EN): doc = EN.tokenizer(u'I 
ate the pizza with anchovies.') EN.tagger(doc) From ff9db9f3ae6655eb4e6c6b7ebd739228b09c3ca1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:22:26 +0200 Subject: [PATCH 29/48] * Fix serializer tests for new attr scheme --- tests/serialize/test_codecs.py | 21 +++------------------ tests/serialize/test_packer.py | 27 ++++++--------------------- 2 files changed, 9 insertions(+), 39 deletions(-) diff --git a/tests/serialize/test_codecs.py b/tests/serialize/test_codecs.py index ad9012068..00177f21a 100644 --- a/tests/serialize/test_codecs.py +++ b/tests/serialize/test_codecs.py @@ -41,25 +41,10 @@ def test_attribute(): def test_vocab_codec(): - def get_lex_props(string, prob): - return { - 'flags': 0, - 'length': len(string), - 'orth': string, - 'lower': string, - 'norm': string, - 'shape': string, - 'prefix': string[0], - 'suffix': string[-3:], - 'cluster': 0, - 'prob': prob, - 'sentiment': 0 - } - vocab = Vocab() - vocab['dog'] = get_lex_props('dog', 0.001) - vocab['the'] = get_lex_props('the', 0.05) - vocab['jumped'] = get_lex_props('jumped', 0.005) + lex = vocab['dog'] + lex = vocab['the'] + lex = vocab['jumped'] codec = HuffmanCodec([(lex.orth, lex.prob) for lex in vocab]) diff --git a/tests/serialize/test_packer.py b/tests/serialize/test_packer.py index 5770a8938..6ec583d08 100644 --- a/tests/serialize/test_packer.py +++ b/tests/serialize/test_packer.py @@ -5,6 +5,7 @@ import re import pytest import numpy +from spacy.language import Language from spacy.vocab import Vocab from spacy.tokens.doc import Doc from spacy.tokenizer import Tokenizer @@ -17,30 +18,14 @@ from spacy.serialize.packer import Packer from spacy.serialize.bits import BitArray -def get_lex_props(string, prob=-22, is_oov=False): - return { - 'flags': 0, - 'length': len(string), - 'orth': string, - 'lower': string, - 'norm': string, - 'shape': string, - 'prefix': string[0], - 'suffix': string[-3:], - 'cluster': 0, - 'prob': prob, - 'sentiment': 0 - } - - @pytest.fixture def vocab(): - vocab = Vocab(get_lex_props=get_lex_props) - vocab['dog'] = get_lex_props('dog', 0.001) + vocab = Vocab(Language.default_lex_attrs()) + lex = vocab['dog'] assert vocab[vocab.strings['dog']].orth_ == 'dog' - vocab['the'] = get_lex_props('the', 0.01) - vocab['quick'] = get_lex_props('quick', 0.005) - vocab['jumped'] = get_lex_props('jumped', 0.007) + lex = vocab['the'] + lex = vocab['quick'] + lex = vocab['jumped'] return vocab From 884251801ea4e3c26e4f9f606f6ee2c091fd488c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:22:50 +0200 Subject: [PATCH 30/48] * Mark space tests as requiring model --- tests/tagger/test_spaces.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/tagger/test_spaces.py b/tests/tagger/test_spaces.py index c3052160e..0ef05637b 100644 --- a/tests/tagger/test_spaces.py +++ b/tests/tagger/test_spaces.py @@ -14,6 +14,7 @@ def tagged(EN): tokens = EN(string, tag=True, parse=False) return tokens +@pytest.mark.models def test_spaces(tagged): assert tagged[0].pos != SPACE assert tagged[0].pos_ != 'SPACE' From c07eea8563c0c361842caee304ec0007d40629e6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 26 Aug 2015 19:23:04 +0200 Subject: [PATCH 31/48] * Comment out old doc tests for now --- tests/test_docs.py | 155 +++++++++++++++++++++++---------------------- 1 file changed, 78 insertions(+), 77 deletions(-) diff --git a/tests/test_docs.py b/tests/test_docs.py index 70c8b8c63..4b0831dfd 100644 --- a/tests/test_docs.py +++ b/tests/test_docs.py @@ -1,80 +1,81 @@ # -*- 
coding: utf-8 -*- """Sphinx doctest is just too hard. Manually paste doctest examples here""" +import pytest -@pytest.mark.models -def test_1(): - import spacy.en - from spacy.parts_of_speech import ADV - # Load the pipeline, and call it with some text. - nlp = spacy.en.English() - tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’", - tag=True, parse=False) - o = u''.join(tok.string.upper() if tok.pos == ADV else tok.string for tok in tokens) - assert u"‘Give it BACK,’ he pleaded ABJECTLY, ‘it’s mine.’" - - o = nlp.vocab[u'back'].prob - assert o == -7.033305644989014 - o = nlp.vocab[u'not'].prob - assert o == -5.332601070404053 - o = nlp.vocab[u'quietly'].prob - assert o == -11.994928359985352 - - -@pytest.mark.models -def test2(): - import spacy.en - from spacy.parts_of_speech import ADV - nlp = spacy.en.English() - # Find log probability of Nth most frequent word - probs = [lex.prob for lex in nlp.vocab] - probs.sort() - is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000] - tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’") - o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens) - o == u'‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’' - -@pytest.mark.models -def test3(): - import spacy.en - from spacy.parts_of_speech import ADV - nlp = spacy.en.English() - # Find log probability of Nth most frequent word - probs = [lex.prob for lex in nlp.vocab] - probs.sort() - is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000] - tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’") - o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens) - assert o == u'‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’' - - pleaded = tokens[7] - assert pleaded.repvec.shape == (300,) - o = pleaded.repvec[:5] - assert sum(o) != 0 - from numpy import dot - from numpy.linalg import norm - - cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2)) - words = [w for w in nlp.vocab if w.is_lower and w.has_repvec] - words.sort(key=lambda w: cosine(w.repvec, pleaded.repvec)) - words.reverse() - o = [w.orth_ for w in words[0:20]] - assert o == [u'pleaded', u'pled', u'plead', u'confessed', u'interceded', - u'pleads', u'testified', u'conspired', u'motioned', u'demurred', - u'countersued', u'remonstrated', u'begged', u'apologised', - u'consented', u'acquiesced', u'petitioned', u'quarreled', - u'appealed', u'pleading'] - o = [w.orth_ for w in words[50:60]] - assert o == [u'martialed', u'counselled', u'bragged', - u'backtracked', u'caucused', u'refiled', u'dueled', u'mused', - u'dissented', u'yearned'] - o = [w.orth_ for w in words[100:110]] - assert o == [u'acquits', u'cabled', u'ducked', u'sentenced', - u'gaoled', u'perjured', u'absconded', u'bargained', u'overstayed', - u'clerked'] - - #o = [w.orth_ for w in words[1000:1010]] - #assert o == [u'scorned', u'baled', u'righted', u'requested', u'swindled', - # u'posited', u'firebombed', u'slimed', u'deferred', u'sagged'] - #o = [w.orth_ for w in words[50000:50010]] - #assert o == [u'fb', u'ford', u'systems', u'puck', u'anglers', u'ik', u'tabloid', - # u'dirty', u'rims', u'artists'] +#@pytest.mark.models +#def test_1(): +# import spacy.en +# from spacy.parts_of_speech import ADV +# # Load the pipeline, and call it with some text. 
+# nlp = spacy.en.English() +# tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’", +# tag=True, parse=False) +# o = u''.join(tok.string.upper() if tok.pos == ADV else tok.string for tok in tokens) +# assert u"‘Give it BACK,’ he pleaded ABJECTLY, ‘it’s mine.’" +# +# o = nlp.vocab[u'back'].prob +# assert o == -7.033305644989014 +# o = nlp.vocab[u'not'].prob +# assert o == -5.332601070404053 +# o = nlp.vocab[u'quietly'].prob +# assert o == -11.994928359985352 +# +# +#@pytest.mark.m +#def test2(): +# import spacy.en +# from spacy.parts_of_speech import ADV +# nlp = spacy.en.English() +# # Find log probability of Nth most frequent word +# probs = [lex.prob for lex in nlp.vocab] +# probs.sort() +# is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000] +# tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’") +# o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens) +# o == u'‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’' +# +#@pytest.mark.models +#def test3(): +# import spacy.en +# from spacy.parts_of_speech import ADV +# nlp = spacy.en.English() +# # Find log probability of Nth most frequent word +# probs = [lex.prob for lex in nlp.vocab] +# probs.sort() +# is_adverb = lambda tok: tok.pos == ADV and tok.prob < probs[-1000] +# tokens = nlp(u"‘Give it back,’ he pleaded abjectly, ‘it’s mine.’") +# o = u''.join(tok.string.upper() if is_adverb(tok) else tok.string for tok in tokens) +# assert o == u'‘Give it back,’ he pleaded ABJECTLY, ‘it’s mine.’' +# +# pleaded = tokens[7] +# assert pleaded.repvec.shape == (300,) +# o = pleaded.repvec[:5] +# assert sum(o) != 0 +# from numpy import dot +# from numpy.linalg import norm +# +# cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2)) +# words = [w for w in nlp.vocab if w.is_lower and w.has_repvec] +# words.sort(key=lambda w: cosine(w.repvec, pleaded.repvec)) +# words.reverse() +# o = [w.orth_ for w in words[0:20]] +# assert o == [u'pleaded', u'pled', u'plead', u'confessed', u'interceded', +# u'pleads', u'testified', u'conspired', u'motioned', u'demurred', +# u'countersued', u'remonstrated', u'begged', u'apologised', +# u'consented', u'acquiesced', u'petitioned', u'quarreled', +# u'appealed', u'pleading'] +# o = [w.orth_ for w in words[50:60]] +# assert o == [u'martialed', u'counselled', u'bragged', +# u'backtracked', u'caucused', u'refiled', u'dueled', u'mused', +# u'dissented', u'yearned'] +# o = [w.orth_ for w in words[100:110]] +# assert o == [u'acquits', u'cabled', u'ducked', u'sentenced', +# u'gaoled', u'perjured', u'absconded', u'bargained', u'overstayed', +# u'clerked'] +# +# #o = [w.orth_ for w in words[1000:1010]] +# #assert o == [u'scorned', u'baled', u'righted', u'requested', u'swindled', +# # u'posited', u'firebombed', u'slimed', u'deferred', u'sagged'] +# #o = [w.orth_ for w in words[50000:50010]] +# #assert o == [u'fb', u'ford', u'systems', u'puck', u'anglers', u'ik', u'tabloid', +# # u'dirty', u'rims', u'artists'] From 320ced276a4da0f2db54594c9fb4f7e59084c86e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 27 Aug 2015 09:15:41 +0200 Subject: [PATCH 32/48] * Add tagger training script --- bin/tagger/train.py | 175 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 175 insertions(+) create mode 100755 bin/tagger/train.py diff --git a/bin/tagger/train.py b/bin/tagger/train.py new file mode 100755 index 000000000..9cd8cc011 --- /dev/null +++ b/bin/tagger/train.py @@ -0,0 +1,175 @@ +#!/usr/bin/env python +from __future__ import division +from 
__future__ import unicode_literals +from __future__ import print_function + +import os +from os import path +import shutil +import codecs +import random + +import plac +import re + +import spacy.util +from spacy.en import English + +from spacy.tagger import Tagger + +from spacy.syntax.util import Config +from spacy.gold import read_json_file +from spacy.gold import GoldParse + +from spacy.scorer import Scorer + + +def score_model(scorer, nlp, raw_text, annot_tuples): + if raw_text is None: + tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) + else: + tokens = nlp.tokenizer(raw_text) + nlp.tagger(tokens) + gold = GoldParse(tokens, annot_tuples) + scorer.score(tokens, gold) + + +def _merge_sents(sents): + m_deps = [[], [], [], [], [], []] + m_brackets = [] + i = 0 + for (ids, words, tags, heads, labels, ner), brackets in sents: + m_deps[0].extend(id_ + i for id_ in ids) + m_deps[1].extend(words) + m_deps[2].extend(tags) + m_deps[3].extend(head + i for head in heads) + m_deps[4].extend(labels) + m_deps[5].extend(ner) + m_brackets.extend((b['first'] + i, b['last'] + i, b['label']) for b in brackets) + i += len(ids) + return [(m_deps, m_brackets)] + + +def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', + seed=0, gold_preproc=False, n_sents=0, corruption_level=0, + beam_width=1, verbose=False, + use_orig_arc_eager=False): + if n_sents > 0: + gold_tuples = gold_tuples[:n_sents] + + templates = Tagger.default_templates() + nlp = Language(data_dir=model_dir, tagger=False) + nlp.tagger = Tagger.blank(nlp.vocab, templates) + + print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %") + for itn in range(n_iter): + scorer = Scorer() + loss = 0 + for raw_text, sents in gold_tuples: + if gold_preproc: + raw_text = None + else: + sents = _merge_sents(sents) + for annot_tuples, ctnt in sents: + words = annot_tuples[1] + gold_tags = annot_tuples[2] + score_model(scorer, nlp, raw_text, annot_tuples) + if raw_text is None: + tokens = nlp.tokenizer.tokens_from_list(words) + else: + tokens = nlp.tokenizer(raw_text) + loss += nlp.tagger.train(tokens, gold_tags) + random.shuffle(gold_tuples) + print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f, + scorer.tags_acc, + scorer.token_acc)) + nlp.end_training(model_dir) + +def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False, + beam_width=None): + nlp = Language(data_dir=model_dir) + if beam_width is not None: + nlp.parser.cfg.beam_width = beam_width + scorer = Scorer() + for raw_text, sents in gold_tuples: + if gold_preproc: + raw_text = None + else: + sents = _merge_sents(sents) + for annot_tuples, brackets in sents: + if raw_text is None: + tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) + nlp.tagger(tokens) + nlp.entity(tokens) + nlp.parser(tokens) + else: + tokens = nlp(raw_text, merge_mwes=False) + gold = GoldParse(tokens, annot_tuples) + scorer.score(tokens, gold, verbose=verbose) + return scorer + + +def write_parses(Language, dev_loc, model_dir, out_loc, beam_width=None): + nlp = Language(data_dir=model_dir) + if beam_width is not None: + nlp.parser.cfg.beam_width = beam_width + gold_tuples = read_json_file(dev_loc) + scorer = Scorer() + out_file = codecs.open(out_loc, 'w', 'utf8') + for raw_text, sents in gold_tuples: + sents = _merge_sents(sents) + for annot_tuples, brackets in sents: + if raw_text is None: + tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) + nlp.tagger(tokens) + nlp.entity(tokens) + nlp.parser(tokens) + else: + tokens = nlp(raw_text, 
merge_mwes=False) + gold = GoldParse(tokens, annot_tuples) + scorer.score(tokens, gold, verbose=False) + for t in tokens: + out_file.write( + '%s\t%s\t%s\t%s\n' % (t.orth_, t.tag_, t.head.orth_, t.dep_) + ) + return scorer + + +@plac.annotations( + train_loc=("Location of training file or directory"), + dev_loc=("Location of development file or directory"), + model_dir=("Location of output model directory",), + eval_only=("Skip training, and only evaluate", "flag", "e", bool), + corruption_level=("Amount of noise to add to training data", "option", "c", float), + gold_preproc=("Use gold-standard sentence boundaries in training?", "flag", "g", bool), + out_loc=("Out location", "option", "o", str), + n_sents=("Number of training sentences", "option", "n", int), + n_iter=("Number of training iterations", "option", "i", int), + verbose=("Verbose error reporting", "flag", "v", bool), + debug=("Debug mode", "flag", "d", bool), +) +def main(train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False, + debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False): + if not eval_only: + gold_train = list(read_json_file(train_loc)) + train(English, gold_train, model_dir, + feat_set='basic' if not debug else 'debug', + gold_preproc=gold_preproc, n_sents=n_sents, + corruption_level=corruption_level, n_iter=n_iter, + verbose=verbose) + #if out_loc: + # write_parses(English, dev_loc, model_dir, out_loc, beam_width=beam_width) + scorer = evaluate(English, list(read_json_file(dev_loc)), + model_dir, gold_preproc=gold_preproc, verbose=verbose) + print('TOK', scorer.token_acc) + print('POS', scorer.tags_acc) + print('UAS', scorer.uas) + print('LAS', scorer.las) + + print('NER P', scorer.ents_p) + print('NER R', scorer.ents_r) + print('NER F', scorer.ents_f) + + +if __name__ == '__main__': + plac.call(main) From 0af139e18376cb2286c8a53a0233fea79130c738 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 27 Aug 2015 09:16:11 +0200 Subject: [PATCH 33/48] * Tagger training now working. Still need to test load/save of model. Morphology still broken. 
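
Training is driven from bin/tagger/train.py (added in the previous patch): build a blank Tagger over the vocab's tag map, feed Tagger.train() one tokenized sentence at a time, then dump the model with Language.end_training(). A minimal sketch of that loop, assuming an English data directory is installed locally; model_dir and the toy example sentence below are illustrative only:

    import random
    from spacy.en import English
    from spacy.tagger import Tagger

    model_dir = 'models/en'   # illustrative path, not a real location
    nlp = English(data_dir=model_dir, tagger=False)   # tokenizer + vocab only
    nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates())

    examples = [([u'I', u'like', u'pizza', u'.'], [u'PRP', u'VBP', u'NN', u'.'])]
    for itn in range(10):
        for words, gold_tags in examples:
            # gold tags must come from the tag map, or train() raises ValueError
            tokens = nlp.tokenizer.tokens_from_list(words)
            nlp.tagger.train(tokens, gold_tags)
        random.shuffle(examples)
    nlp.end_training(model_dir)
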
--- lang_data/en/tag_map.json | 35 ++++++++++++++++---------- spacy/_ml.pyx | 6 +++-- spacy/language.py | 30 +++++++++++----------- spacy/lemmatizer.py | 46 +++++++++++++++++++++------------- spacy/morphology.pxd | 5 +++- spacy/morphology.pyx | 21 ++++++++++------ spacy/parts_of_speech.pxd | 11 ++++++--- spacy/parts_of_speech.pyx | 16 +++++++----- spacy/tagger.pxd | 18 ++------------ spacy/tagger.pyx | 52 +++++++++++++++++++-------------------- 10 files changed, 134 insertions(+), 106 deletions(-) diff --git a/lang_data/en/tag_map.json b/lang_data/en/tag_map.json index 57d3eedee..b9f8269f7 100644 --- a/lang_data/en/tag_map.json +++ b/lang_data/en/tag_map.json @@ -1,11 +1,12 @@ { -".": {"pos": "punc", "punctype": "peri"}, -",": {"pos": "punc", "punctype": "comm"}, -"-LRB-": {"pos": "punc", "punctype": "brck", "puncside": "ini"}, -"-RRB-": {"pos": "punc", "punctype": "brck", "puncside": "fin"}, -"``": {"pos": "punc", "punctype": "quot", "puncside": "ini"}, -"\"\"": {"pos": "punc", "punctype": "quot", "puncside": "fin"}, -":": {"pos": "punc"}, +".": {"pos": "punct", "puncttype": "peri"}, +",": {"pos": "punct", "puncttype": "comm"}, +"-LRB-": {"pos": "punct", "puncttype": "brck", "punctside": "ini"}, +"-RRB-": {"pos": "punct", "puncttype": "brck", "punctside": "fin"}, +"``": {"pos": "punct", "puncttype": "quot", "punctside": "ini"}, +"\"\"": {"pos": "punct", "puncttype": "quot", "punctside": "fin"}, +"''": {"pos": "punct", "puncttype": "quot", "punctside": "fin"}, +":": {"pos": "punct"}, "$": {"pos": "sym", "other": {"symtype": "currency"}}, "#": {"pos": "sym", "other": {"symtype": "numbersign"}}, "AFX": {"pos": "adj", "hyph": "hyph"}, @@ -13,15 +14,15 @@ "CD": {"pos": "num", "numtype": "card"}, "DT": {"pos": "adj", "prontype": "prn"}, "EX": {"pos": "adv", "advtype": "ex"}, -"FW": {"foreign": "foreign"}, -"HYPH": {"pos": "punc", "punctype": "dash"}, +"FW": {"pos": "x", "foreign": "foreign"}, +"HYPH": {"pos": "punct", "puncttype": "dash"}, "IN": {"pos": "adp"}, "JJ": {"pos": "adj", "degree": "pos"}, "JJR": {"pos": "adj", "degree": "comp"}, "JJS": {"pos": "adj", "degree": "sup"}, -"LS": {"pos": "punc", "numtype": "ord"}, +"LS": {"pos": "punct", "numtype": "ord"}, "MD": {"pos": "verb", "verbtype": "mod"}, -"NIL": {}, +"NIL": {"pos": "no_tag"}, "NN": {"pos": "noun", "number": "sing"}, "NNP": {"pos": "noun", "nountype": "prop", "number": "sing"}, "NNPS": {"pos": "noun", "nountype": "prop", "number": "plur"}, @@ -36,7 +37,7 @@ "RP": {"pos": "part"}, "SYM": {"pos": "sym"}, "TO": {"pos": "part", "parttype": "inf", "verbform": "inf"}, -"UH": {"pos": "int"}, +"UH": {"pos": "intJ"}, "VB": {"pos": "verb", "verbform": "inf"}, "VBD": {"pos": "verb", "verbform": "fin", "tense": "past"}, "VBG": {"pos": "verb", "verbform": "part", "tense": "pres", "aspect": "prog"}, @@ -47,5 +48,13 @@ "WP": {"pos": "noun", "prontype": "int|rel"}, "WP$": {"pos": "adj", "poss": "poss", "prontype": "int|rel"}, "WRB": {"pos": "adv", "prontype": "int|rel"}, -"SP": {"pos": "space"} +"SP": {"pos": "space"}, +"ADD": {"pos": "x"}, +"NFP": {"pos": "punct"}, +"GW": {"pos": "x"}, +"AFX": {"pos": "x"}, +"HYPH": {"pos": "punct"}, +"XX": {"pos": "x"}, +"BES": {"pos": "verb"}, +"HVS": {"pos": "verb"}, } diff --git a/spacy/_ml.pyx b/spacy/_ml.pyx index 18908e89e..56c080fa6 100644 --- a/spacy/_ml.pyx +++ b/spacy/_ml.pyx @@ -91,6 +91,8 @@ cdef class Model: count_feats(counts[guess], feats, n_feats, -cost) self._model.update(counts) - def end_training(self): + def end_training(self, model_loc=None): + if model_loc is None: + model_loc = 
self.model_loc self._model.end_training() - self._model.dump(self.model_loc, freq_thresh=0) + self._model.dump(model_loc, freq_thresh=0) diff --git a/spacy/language.py b/spacy/language.py index 706df34a5..2a07d1f5f 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1,5 +1,10 @@ from os import path +try: + import ujson as json +except ImportError: + import json + from .tokenizer import Tokenizer from .morphology import Morphology from .vocab import Vocab @@ -13,6 +18,8 @@ from . import orth from .syntax.ner import BiluoPushDown from .syntax.arc_eager import ArcEager +from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD + class Language(object): @staticmethod @@ -113,14 +120,6 @@ class Language(object): attrs.IS_OOV: lambda string: True } - @classmethod - def default_dep_templates(cls): - return [] - - @classmethod - def default_ner_templates(cls): - return [] - @classmethod def default_dep_labels(cls): return {0: {'ROOT': True}} @@ -186,10 +185,11 @@ class Language(object): return None @classmethod - def default_matcher(cls, vocab, data_dir=None): - if data_dir is None: - data_dir = cls.default_data_dir() - return Matcher.from_dir(data_dir, vocab) + def default_matcher(cls, vocab, data_dir): + if path.exists(data_dir): + return Matcher.from_dir(data_dir, vocab) + else: + return None def __init__(self, data_dir=None, vocab=None, tokenizer=None, tagger=None, parser=None, entity=None, matcher=None, serializer=None): @@ -245,9 +245,9 @@ class Language(object): def end_training(self, data_dir=None): if data_dir is None: data_dir = self.data_dir - self.parser.model.end_training() - self.entity.model.end_training() - self.tagger.model.end_training() + self.parser.model.end_training(path.join(data_dir, 'deps', 'model')) + self.entity.model.end_training(path.join(data_dir, 'ner', 'model')) + self.tagger.model.end_training(path.join(data_dir, 'pos', 'model')) self.vocab.strings.dump(path.join(data_dir, 'vocab', 'strings.txt')) with open(path.join(data_dir, 'vocab', 'serializer.json'), 'w') as file_: diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 660a16eb9..05029391b 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -2,29 +2,41 @@ from __future__ import unicode_literals from os import path import codecs +try: + import ujson as json +except ImportError: + import json + +from .parts_of_speech import NOUN, VERB, ADJ + class Lemmatizer(object): - def __init__(self, wn_dict_dir, noun_id, verb_id, adj_id): - self.noun_id = noun_id - self.verb_id = verb_id - self.adj_id = adj_id - self.index = {} - self.exc = {} + @classmethod + def from_dir(cls, data_dir): + index = {} + exc = {} for pos in ['adj', 'adv', 'noun', 'verb']: - self.index[pos] = read_index(path.join(wn_dict_dir, 'index.%s' % pos)) - self.exc[pos] = read_exc(path.join(wn_dict_dir, '%s.exc' % pos)) + index[pos] = read_index(path.join(data_dir, 'index.%s' % pos)) + exc[pos] = read_exc(path.join(data_dir, '%s.exc' % pos)) + rules = json.load(open(path.join(data_dir, 'lemma_rules.json'))) + return cls(index, exc, rules) + + def __init__(self, index, exceptions, rules): + self.index = index + self.exc = exceptions + self.rules = rules def __call__(self, string, pos): - - return lemmatize(string, self.index[pos], self.exc[pos], self.rules[pos]) - if pos == self.noun_id: - return self.noun(string) - elif pos == self.verb_id: - return self.verb(string) - elif pos == self.adj_id: - return self.adj(string) + if pos == NOUN: + pos = 'noun' + elif pos == VERB: + pos = 'verb' + elif pos == ADJ: + pos = 'adj' else: - raise 
Exception("Cannot lemmatize with unknown pos: %s" % pos) + return string + lemmas = lemmatize(string, self.index[pos], self.exc[pos], self.rules[pos]) + return min(lemmas) def noun(self, string): return self(string, 'noun') diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 7f2ebe34b..e0f85f96f 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -1,13 +1,16 @@ from .structs cimport TokenC +from .strings cimport StringStore cdef class Morphology: + cdef readonly object strings + cdef public object lemmatizer cdef public object tag_map cdef public object tag_names cdef public object tag_ids cdef public int n_tags - cdef int assign_tag(self, TokenC* token, int tag) except -1 + cdef int assign_tag(self, StringStore strings, TokenC* token, int tag) except -1 cdef int assign_from_dict(self, TokenC* token, props) except -1 diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index f32009351..2b8fa3960 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -1,4 +1,5 @@ from os import path +from .lemmatizer import Lemmatizer try: import ujson as json @@ -9,7 +10,15 @@ from spacy.parts_of_speech import UNIV_POS_NAMES cdef class Morphology: + @classmethod + def from_dir(cls, data_dir, lemmatizer=None): + tag_map = json.load(open(path.join(data_dir, 'tag_map.json'))) + if lemmatizer is None: + lemmatizer = Lemmatizer.from_dir(data_dir) + return cls(tag_map, {}, lemmatizer) + def __init__(self, tag_map, fused_tokens, lemmatizer): + self.lemmatizer = lemmatizer self.tag_map = tag_map self.n_tags = len(tag_map) self.tag_names = tuple(sorted(tag_map.keys())) @@ -17,15 +26,13 @@ cdef class Morphology: for i, tag_str in enumerate(self.tag_names): self.tag_ids[tag_str] = i - @classmethod - def from_dir(cls, data_dir): - tag_map = json.load(open(path.join(data_dir, 'tag_map.json'))) - return cls(tag_map, {}, None) - - cdef int assign_tag(self, TokenC* token, int tag) except -1: + cdef int assign_tag(self, StringStore strings, TokenC* token, int tag) except -1: + # TODO Caching props = self.tag_map[self.tag_names[tag]] token.pos = UNIV_POS_NAMES[props['pos'].upper()] - token.tag = tag + token.tag = strings[self.tag_names[tag]] + lemma = self.lemmatizer(strings[token.lex.orth], token.pos) + token.lemma = strings[lemma] #token.inflection = # TODO cdef int assign_from_dict(self, TokenC* token, props) except -1: diff --git a/spacy/parts_of_speech.pxd b/spacy/parts_of_speech.pxd index b915b9dde..e410c6971 100644 --- a/spacy/parts_of_speech.pxd +++ b/spacy/parts_of_speech.pxd @@ -2,17 +2,22 @@ cpdef enum univ_pos_t: NO_TAG ADJ - ADV ADP + ADV + AUX CONJ DET + INTJ NOUN NUM + PART PRON - PRT + PROPN + PUNCT + SCONJ + SYM VERB X - PUNCT EOL SPACE N_UNIV_TAGS diff --git a/spacy/parts_of_speech.pyx b/spacy/parts_of_speech.pyx index 7081cfab9..8c2348a47 100644 --- a/spacy/parts_of_speech.pyx +++ b/spacy/parts_of_speech.pyx @@ -4,18 +4,22 @@ from __future__ import unicode_literals UNIV_POS_NAMES = { "NO_TAG": NO_TAG, "ADJ": ADJ, - "ADV": ADV, "ADP": ADP, + "ADV": ADV, + "AUX": AUX, "CONJ": CONJ, "DET": DET, + "INTJ": INTJ, "NOUN": NOUN, "NUM": NUM, + "PART": PART, "PRON": PRON, - "PRT": PRT, + "PROPN": PROPN, + "PUNCT": PUNCT, + "SCONJ": SCONJ, + "SYM": SYM, "VERB": VERB, "X": X, - "PUNCT": PUNCT, - "PUNC": PUNCT, - "SPACE": SPACE, - "EOL": EOL + "EOL": EOL, + "SPACE": SPACE } diff --git a/spacy/tagger.pxd b/spacy/tagger.pxd index 213781047..28d7fc711 100644 --- a/spacy/tagger.pxd +++ b/spacy/tagger.pxd @@ -1,26 +1,12 @@ -from preshed.maps cimport PreshMapArray -from preshed.counter 
cimport PreshCounter -from cymem.cymem cimport Pool - from ._ml cimport Model -from .strings cimport StringStore -from .structs cimport TokenC, LexemeC -from .parts_of_speech cimport univ_pos_t +from .structs cimport TokenC from .vocab cimport Vocab cdef class Tagger: - cdef readonly Pool mem - cdef readonly StringStore strings - cdef readonly Model model cdef readonly Vocab vocab - cdef public object lemmatizer - cdef PreshMapArray _morph_cache + cdef readonly Model model cdef public dict freqs - cdef readonly int n_tags - cdef int predict(self, int i, const TokenC* tokens) except -1 cdef int update(self, int i, const TokenC* tokens, int gold) except -1 - #cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1 - #cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1 diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 5d015b6cc..7b638c724 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -8,7 +8,7 @@ from .typedefs cimport attr_t from .tokens.doc cimport Doc from .attrs cimport TAG from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON -from .parts_of_speech cimport PRT, VERB, X, PUNCT, EOL, SPACE +from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE from .attrs cimport * from ._ml cimport arg_max @@ -102,24 +102,10 @@ cdef class Tagger: (P2_flags,), ) - def make_lemmatizer(self): - return None - - def __init__(self, Vocab vocab, templates): - self.mem = Pool() - self.vocab = vocab - - cdef int n_tags = self.vocab.morphology.n_tags + 1 - - self.model = Model(n_tags, templates) - self.freqs = {TAG: defaultdict(int)} - for tag in self.tag_names: - self.freqs[TAG][self.vocab.strings[tag]] = 1 - self.freqs[TAG][0] = 1 - - @property - def tag_names(self): - return tuple(sorted(self.vocab.morphology.tag_map.keys())) + @classmethod + def blank(cls, vocab, templates): + model = Model(vocab.morphology.n_tags, templates, model_loc=None) + return cls(vocab, model) @classmethod def from_dir(cls, data_dir, vocab): @@ -127,7 +113,22 @@ cdef class Tagger: templates = json.loads(open(path.join(data_dir, 'templates.json'))) else: templates = cls.default_templates() - return cls(vocab, templates) + model = Model(vocab.morphology.n_tags, templates, data_dir) + return cls(vocab, model) + + def __init__(self, Vocab vocab, model): + self.vocab = vocab + self.model = model + + # TODO: Move this to tag map + self.freqs = {TAG: defaultdict(int)} + for tag in self.tag_names: + self.freqs[TAG][self.vocab.strings[tag]] = 1 + self.freqs[TAG][0] = 1 + + @property + def tag_names(self): + return self.vocab.morphology.tag_names def __call__(self, Doc tokens): """Apply the tagger, setting the POS tags onto the Doc object. 
@@ -142,29 +143,28 @@ cdef class Tagger: for i in range(tokens.length): if tokens.data[i].pos == 0: guess = self.predict(i, tokens.data) - self.vocab.morphology.assign_tag(&tokens.data[i], guess) + self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], guess) tokens.is_tagged = True tokens._py_tokens = [None] * tokens.length def tag_from_strings(self, Doc tokens, object tag_strs): cdef int i for i in range(tokens.length): - self.vocab.morphology.assign_tag(&tokens.data[i], tag_strs[i]) + self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], tag_strs[i]) tokens.is_tagged = True tokens._py_tokens = [None] * tokens.length def train(self, Doc tokens, object gold_tag_strs): + assert len(tokens) == len(gold_tag_strs) cdef int i cdef int loss cdef const weight_t* scores - golds = [self.tag_names.index(g) if g is not None else -1 - for g in gold_tag_strs] + golds = [self.tag_names.index(g) if g is not None else -1 for g in gold_tag_strs] correct = 0 for i in range(tokens.length): guess = self.update(i, tokens.data, golds[i]) loss = golds[i] != -1 and guess != golds[i] - - self.vocab.morphology.assign_tag(&tokens.data[i], guess) + self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], guess) correct += loss == 0 self.freqs[TAG][tokens.data[i].tag] += 1 return correct From b6b1e1aa1296f7f8a3fb0a669c290ef12853073d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 27 Aug 2015 10:26:02 +0200 Subject: [PATCH 34/48] * Add link for Finnish model --- bin/init_model.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/bin/init_model.py b/bin/init_model.py index 0badf71fc..e81d668aa 100644 --- a/bin/init_model.py +++ b/bin/init_model.py @@ -38,6 +38,7 @@ from spacy.parts_of_speech import NOUN, VERB, ADJ import spacy.en import spacy.de +import spacy.fi @@ -184,7 +185,8 @@ def setup_vocab(get_lex_attr, src_dir, dst_dir): def main(lang_id, lang_data_dir, corpora_dir, model_dir): languages = { 'en': spacy.en.English.default_lex_attrs(), - 'de': spacy.de.Deutsch.default_lex_attrs() + 'de': spacy.de.Deutsch.default_lex_attrs(), + 'fi': spacy.fi.Finnish.default_lex_attrs() } model_dir = Path(model_dir) @@ -203,6 +205,11 @@ def main(lang_id, lang_data_dir, corpora_dir, model_dir): if (lang_data_dir / 'gazetteer.json').exists(): copyfile(str(lang_data_dir / 'gazetteer.json'), str(model_dir / 'vocab' / 'gazetteer.json')) + + if (lang_data_dir / 'lemma_rules.json').exists(): + copyfile(str(lang_data_dir / 'lemma_rules.json'), + str(model_dir / 'vocab' / 'lemma_rules.json')) + if not (model_dir / 'wordnet').exists() and (corpora_dir / 'wordnet').exists(): copytree(str(corpora_dir / 'wordnet' / 'dict'), str(model_dir / 'wordnet')) From f0a7c99554db884aa602120d3a709f6f77419639 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 27 Aug 2015 10:26:19 +0200 Subject: [PATCH 35/48] * Relax rule-requirement in lemmatizer --- spacy/lemmatizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 05029391b..5e08e80a4 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -35,7 +35,7 @@ class Lemmatizer(object): pos = 'adj' else: return string - lemmas = lemmatize(string, self.index[pos], self.exc[pos], self.rules[pos]) + lemmas = lemmatize(string, self.index[pos], self.exc[pos], self.rules.get(pos, [])) return min(lemmas) def noun(self, string): From 5b89e2454c0386eba8f0a7e1e6fff901dee45dbd Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 27 Aug 2015 10:26:36 +0200 
Subject: [PATCH 36/48] * Improve error-reporting in tagger --- spacy/tagger.pyx | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 7b638c724..dff96e6ea 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -159,7 +159,11 @@ cdef class Tagger: cdef int i cdef int loss cdef const weight_t* scores - golds = [self.tag_names.index(g) if g is not None else -1 for g in gold_tag_strs] + try: + golds = [self.tag_names.index(g) if g is not None else -1 for g in gold_tag_strs] + except ValueError: + raise ValueError( + [g for g in gold_tag_strs if g is not None and g not in self.tag_names]) correct = 0 for i in range(tokens.length): guess = self.update(i, tokens.data, golds[i]) From 86c4a8e3e29b756888883d32d9e2c5f5229424c8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 27 Aug 2015 23:11:51 +0200 Subject: [PATCH 37/48] * Work on new morphology organization --- spacy/morphology.pyx | 95 +++++++++++++++++++++++++++++++------------- 1 file changed, 68 insertions(+), 27 deletions(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 2b8fa3960..7f6afa016 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -7,6 +7,12 @@ except ImportError: import json from spacy.parts_of_speech import UNIV_POS_NAMES + + +cdef struct MorphAnalysisC: + uint64_t[4] features + attr_t lemma + attr_t pos cdef class Morphology: @@ -25,39 +31,74 @@ cdef class Morphology: self.tag_ids = {} for i, tag_str in enumerate(self.tag_names): self.tag_ids[tag_str] = i + self._cache = PreshMapArray() - cdef int assign_tag(self, StringStore strings, TokenC* token, int tag) except -1: - # TODO Caching - props = self.tag_map[self.tag_names[tag]] - token.pos = UNIV_POS_NAMES[props['pos'].upper()] - token.tag = strings[self.tag_names[tag]] - lemma = self.lemmatizer(strings[token.lex.orth], token.pos) - token.lemma = strings[lemma] - #token.inflection = # TODO + cdef int assign_tag(self, TokenC* token, tag) except -1: + analysis = self._cache.get(tag, token.lex.orth) + if analysis is NULL: + analysis = self.mem.alloc(1, sizeof(MorphAnalysisC)) + cached = self.decode_tag(tag) + cached.lemma = self.lemmatize(token.pos, token.lex) + token.lemma = analysis.lemma + token.pos = analysis.pos + token.tag = analysis.tag + token.morph = analysis.features - cdef int assign_from_dict(self, TokenC* token, props) except -1: + cdef int assign_feature(self, TokenC* token, feature, value) except -1: pass def load_morph_exceptions(self, dict exc): - pass # Map (form, pos) to (lemma, inflection) - #cdef unicode pos_str - #cdef unicode form_str - #cdef unicode lemma_str - #cdef dict entries - #cdef dict props - #cdef int lemma - #cdef attr_t orth - #cdef int pos - #for pos_str, entries in exc.items(): - # pos = self.tag_names.index(pos_str) - # for form_str, props in entries.items(): - # lemma_str = props.get('L', form_str) - # orth = self.strings[form_str] - # cached = self.mem.alloc(1, sizeof(InflectedLemma)) - # cached.lemma = self.strings[lemma_str] - # set_morph_from_dict(&cached.morph, props) - # self._morph_cache.set(pos, orth, cached) + cdef unicode pos_str + cdef unicode form_str + cdef unicode lemma_str + cdef dict entries + cdef dict props + cdef int lemma + cdef attr_t orth + cdef int pos + for pos_str, entries in exc.items(): + pos = self.tag_names.index(pos_str) + for form_str, props in entries.items(): + lemma_str = props.get('L', form_str) + orth = self.strings[form_str] + cached = self.mem.alloc(1, sizeof(MorphAnalysisC)) + cached.lemma = 
self.strings[lemma_str] + self.set_features(cached, props) + self._cache.set(pos, orth, cached) + + def _load_special_tokenization(self, special_cases): + '''Add a special-case tokenization rule. + ''' + cdef int i + cdef list substrings + cdef unicode chunk + cdef unicode form + cdef unicode lemma + cdef dict props + cdef LexemeC** lexemes + cdef hash_t hashed + for chunk, substrings in sorted(special_cases.items()): + tokens = self.mem.alloc(len(substrings) + 1, sizeof(TokenC)) + for i, props in enumerate(substrings): + # Set the special tokens up to have morphology and lemmas if + # specified, otherwise use the part-of-speech tag (if specified) + form = props['F'] + tokens[i].lex = self.vocab.get(self.vocab.mem, form) + morphology = self.vocab.morphology.decode_dict(props) + tokens[i].lemma = morph_analysis.lemma + tokens[i].pos = morph_analysis.pos + tokens[i].tag = morph_analysis.tag + tokens[i].morph = morph_analysis.morph + cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached)) + cached.length = len(substrings) + cached.is_lex = False + cached.data.tokens = tokens + hashed = hash_string(chunk) + self._specials.set(hashed, cached) + self._cache.set(hashed, cached) + + #cdef int set_morph_from_dict(Morphology* morph, dict props) except -1: From c2307fa9ee11e883a89086de26a877382a64f343 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 28 Aug 2015 02:02:33 +0200 Subject: [PATCH 38/48] * More work on language-generic parsing --- spacy/fi/__init__.py | 11 +++ spacy/language.py | 5 +- spacy/morphology.pxd | 33 ++++++-- spacy/morphology.pyx | 181 +++++++++++-------------------------------- spacy/structs.pxd | 13 +--- spacy/tagger.pyx | 15 ++-- spacy/tokenizer.pxd | 7 +- spacy/tokenizer.pyx | 40 ++-------- spacy/tokens/doc.pxd | 4 +- spacy/tokens/doc.pyx | 2 +- spacy/vocab.pxd | 3 +- spacy/vocab.pyx | 37 +++++---- 12 files changed, 129 insertions(+), 222 deletions(-) create mode 100644 spacy/fi/__init__.py diff --git a/spacy/fi/__init__.py b/spacy/fi/__init__.py new file mode 100644 index 000000000..8e7173767 --- /dev/null +++ b/spacy/fi/__init__.py @@ -0,0 +1,11 @@ +from __future__ import unicode_literals, print_function + +from os import path + +from ..language import Language + + +class Finnish(Language): + @classmethod + def default_data_dir(cls): + return path.join(path.dirname(__file__), 'data') diff --git a/spacy/language.py b/spacy/language.py index 2a07d1f5f..36ca5c636 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -148,13 +148,10 @@ class Language(object): vectors = cls.default_vectors(data_dir) if get_lex_attr is None: get_lex_attr = cls.default_lex_attrs(data_dir) - if morphology is None: - morphology = cls.default_morphology(path.join(data_dir, 'vocab')) return Vocab.from_dir( path.join(data_dir, 'vocab'), get_lex_attr=get_lex_attr, - vectors=vectors, - morphology=morphology) + vectors=vectors) @classmethod def default_tokenizer(cls, vocab, data_dir): diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index e0f85f96f..eb2bb97f5 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -1,18 +1,41 @@ +from cymem.cymem cimport Pool +from preshed.maps cimport PreshMapArray +from libc.stdint cimport uint64_t + from .structs cimport TokenC from .strings cimport StringStore +from .typedefs cimport attr_t +from .parts_of_speech cimport univ_pos_t + + +cdef struct RichTagC: + uint64_t morph + int id + univ_pos_t pos + attr_t name + + +cdef struct MorphAnalysisC: + RichTagC tag + attr_t lemma cdef class Morphology: + cdef readonly Pool mem cdef readonly 
object strings cdef public object lemmatizer - cdef public object tag_map + cdef public object n_tags + cdef public object reverse_index cdef public object tag_names - cdef public object tag_ids - cdef public int n_tags - cdef int assign_tag(self, StringStore strings, TokenC* token, int tag) except -1 + cdef RichTagC* rich_tags + cdef PreshMapArray _cache + + cdef int assign_tag(self, TokenC* token, tag) except -1 + + cdef int assign_feature(self, uint64_t* morph, feature, value) except -1 + - cdef int assign_from_dict(self, TokenC* token, props) except -1 # #cpdef enum Feature_t: diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 7f6afa016..acca5eb9e 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -6,15 +6,10 @@ try: except ImportError: import json -from spacy.parts_of_speech import UNIV_POS_NAMES +from .parts_of_speech import UNIV_POS_NAMES +from .parts_of_speech cimport ADJ, VERB, NOUN -cdef struct MorphAnalysisC: - uint64_t[4] features - attr_t lemma - attr_t pos - - cdef class Morphology: @classmethod def from_dir(cls, data_dir, lemmatizer=None): @@ -23,32 +18,37 @@ cdef class Morphology: lemmatizer = Lemmatizer.from_dir(data_dir) return cls(tag_map, {}, lemmatizer) - def __init__(self, tag_map, fused_tokens, lemmatizer): + def __init__(self, string_store, tag_map, lemmatizer): + self.mem = Pool() + self.strings = string_store self.lemmatizer = lemmatizer - self.tag_map = tag_map self.n_tags = len(tag_map) self.tag_names = tuple(sorted(tag_map.keys())) - self.tag_ids = {} - for i, tag_str in enumerate(self.tag_names): - self.tag_ids[tag_str] = i - self._cache = PreshMapArray() + self.reverse_index = {} + for i, (tag_str, props) in enumerate(sorted(tag_map.items())): + self.rich_tags[i].id = i + self.rich_tags[i].name = self.strings[tag_str] + self.rich_tags[i].morph = 0 + self.reverse_index[self.rich_tags[i].name] = i + self._cache = PreshMapArray(self.n_tags) cdef int assign_tag(self, TokenC* token, tag) except -1: - analysis = self._cache.get(tag, token.lex.orth) + cdef int tag_id = self.strings[tag] if isinstance(tag, basestring) else tag + analysis = self._cache.get(tag_id, token.lex.orth) if analysis is NULL: analysis = self.mem.alloc(1, sizeof(MorphAnalysisC)) - cached = self.decode_tag(tag) - cached.lemma = self.lemmatize(token.pos, token.lex) + analysis.tag = self.rich_tags[tag_id] + analysis.lemma = self.lemmatize(tag, token.lex.orth) token.lemma = analysis.lemma - token.pos = analysis.pos - token.tag = analysis.tag - token.morph = analysis.features + token.pos = analysis.tag.pos + token.tag = analysis.tag.name + token.morph = analysis.tag.morph - cdef int assign_feature(self, TokenC* token, feature, value) except -1: + cdef int assign_feature(self, uint64_t* morph, feature, value) except -1: pass def load_morph_exceptions(self, dict exc): - # Map (form, pos) to (lemma, inflection) + # Map (form, pos) to (lemma, rich tag) cdef unicode pos_str cdef unicode form_str cdef unicode lemma_str @@ -57,121 +57,30 @@ cdef class Morphology: cdef int lemma cdef attr_t orth cdef int pos - for pos_str, entries in exc.items(): - pos = self.tag_names.index(pos_str) + for tag_str, entries in exc.items(): + tag = self.strings[tag_str] + rich_tag = self.rich_tags[self.reverse_index[tag]] for form_str, props in entries.items(): - lemma_str = props.get('L', form_str) - orth = self.strings[form_str] cached = self.mem.alloc(1, sizeof(MorphAnalysisC)) - cached.lemma = self.strings[lemma_str] - self.set_features(cached, props) - self._cache.set(pos, orth, cached) + orth = 
self.strings[form_str] + for name_str, value_str in props.items(): + if name_str == 'L': + cached.lemma = self.strings[value_str] + else: + self.assign_feature(&cached.tag.morph, name_str, value_str) + if cached.lemma == 0: + cached.lemma = self.lemmatize(rich_tag.pos, orth) + self._cache.set(rich_tag.pos, orth, cached) - def _load_special_tokenization(self, special_cases): - '''Add a special-case tokenization rule. - ''' - cdef int i - cdef list substrings - cdef unicode chunk - cdef unicode form - cdef unicode lemma - cdef dict props - cdef LexemeC** lexemes - cdef hash_t hashed - for chunk, substrings in sorted(special_cases.items()): - tokens = self.mem.alloc(len(substrings) + 1, sizeof(TokenC)) - for i, props in enumerate(substrings): - # Set the special tokens up to have morphology and lemmas if - # specified, otherwise use the part-of-speech tag (if specified) - form = props['F'] - tokens[i].lex = self.vocab.get(self.vocab.mem, form) - morphology = self.vocab.morphology.decode_dict(props) - tokens[i].lemma = morph_analysis.lemma - tokens[i].pos = morph_analysis.pos - tokens[i].tag = morph_analysis.tag - tokens[i].morph = morph_analysis.morph - cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached)) - cached.length = len(substrings) - cached.is_lex = False - cached.data.tokens = tokens - hashed = hash_string(chunk) - self._specials.set(hashed, cached) - self._cache.set(hashed, cached) - - - - -#cdef int set_morph_from_dict(Morphology* morph, dict props) except -1: -# morph.number = props.get('number', 0) -# morph.tenspect = props.get('tenspect', 0) -# morph.mood = props.get('mood', 0) -# morph.gender = props.get('gender', 0) -# morph.person = props.get('person', 0) -# morph.case = props.get('case', 0) -# morph.misc = props.get('misc', 0) -# -# -#cdef class Morphology: -# cdef Pool mem -# cdef PreshMap table -# -# def __init__(self, tags, exceptions): -# pass -# -# def __getitem__(self, hash_t id_): -# pass -# -# cdef const InflectionC* get(self, hash_t key) except NULL: -# pass -# -# cdef MorphAnalysis analyse(const TokenC* token) except -1: -# cdef struct MorphAnalysis morphology -# tokens[i].pos = tag.pos -# cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.orth) -# if cached is NULL: -# cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph)) -# cached.lemma = self.lemmatize(tag.pos, tokens[i].lex) -# cached.morph = tag.morph -# self._morph_cache.set(tag.id, tokens[i].lex.orth, cached) -# tokens[i].lemma = cached.lemma -# tokens[i].morph = cached.morph -# -# cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1: -# if self.lemmatizer is None: -# return lex.orth -# cdef unicode py_string = self.strings[lex.orth] -# if pos != NOUN and pos != VERB and pos != ADJ: -# return lex.orth -# cdef set lemma_strings -# cdef unicode lemma_string -# lemma_strings = self.lemmatizer(py_string, pos) -# lemma_string = sorted(lemma_strings)[0] -# lemma = self.strings[lemma_string] -# return lemma -# -# -#cdef class Inflection: -# cdef InflectionC* c -# -# def __init__(self, container, id_): -# self.c = container[id_] -# self.container = container -# -# for i, feat_id in enumerate(feat_ids): -# feature, value = parse_id(feat_id) -# self.add_value(feature, value, True) -# -# def has(self, Value_t feat_value_id): -# part = feat_value_id % 64 -# bit = feat_value_id / 64 -# if self.value_set[part] & bit: -# return True -# else: -# return False -# -# property pos: def __get__(self): return self.c.pos -# -# property id: def __get__(self): return self.c.id -# -# 
property features: -# pass + def lemmatize(self, const univ_pos_t pos, attr_t orth): + if self.lemmatizer is None: + return orth + cdef unicode py_string = self.strings[orth] + if pos != NOUN and pos != VERB and pos != ADJ: + return orth + cdef set lemma_strings + cdef unicode lemma_string + lemma_strings = self.lemmatizer(py_string, pos) + lemma_string = sorted(lemma_strings)[0] + lemma = self.strings[lemma_string] + return lemma diff --git a/spacy/structs.pxd b/spacy/structs.pxd index f150fa312..a0a3d65a3 100644 --- a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -25,17 +25,6 @@ cdef struct LexemeC: float sentiment float l2_norm -cdef struct MorphFeatC: - int name - int value - - -cdef struct MorphologyC: - uint64_t[4] feature_set - MorphFeatC* features - univ_pos_t pos - int n - cdef struct Entity: int start @@ -54,8 +43,8 @@ cdef struct Constituent: cdef struct TokenC: const LexemeC* lex - const MorphologyC* morph const Constituent* ctnt + uint64_t morph univ_pos_t pos bint spacy int tag diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index dff96e6ea..6fea4af88 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -104,7 +104,7 @@ cdef class Tagger: @classmethod def blank(cls, vocab, templates): - model = Model(vocab.morphology.n_tags, templates, model_loc=None) + model = Model(vocab.n_tags, templates, model_loc=None) return cls(vocab, model) @classmethod @@ -113,7 +113,7 @@ cdef class Tagger: templates = json.loads(open(path.join(data_dir, 'templates.json'))) else: templates = cls.default_templates() - model = Model(vocab.morphology.n_tags, templates, data_dir) + model = Model(vocab.n_tags, templates, data_dir) return cls(vocab, model) def __init__(self, Vocab vocab, model): @@ -128,7 +128,7 @@ cdef class Tagger: @property def tag_names(self): - return self.vocab.morphology.tag_names + return self.vocab.tag_names def __call__(self, Doc tokens): """Apply the tagger, setting the POS tags onto the Doc object. 
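
Note on the lemmatize helper added above: it takes and returns string-store ids rather than unicode objects, only consults the lemmatizer for open-class parts of speech, and sorts the candidate set so the chosen lemma is deterministic across runs. A rough usage sketch, assuming a loaded English vocab whose Morphology was built with a Lemmatizer (the example strings are illustrative):

    # Illustrative only: `vocab` is assumed to be a loaded English Vocab.
    from spacy.parts_of_speech import NOUN

    orth = vocab.strings[u'ducks']                   # surface form -> string-store id
    lemma = vocab.morphology.lemmatize(NOUN, orth)   # id in, id out
    print(vocab.strings[lemma])                      # expected: u'duck'
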
@@ -143,14 +143,15 @@ cdef class Tagger: for i in range(tokens.length): if tokens.data[i].pos == 0: guess = self.predict(i, tokens.data) - self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], guess) + self.vocab.morphology.assign_tag(&tokens.data[i], guess) + tokens.is_tagged = True tokens._py_tokens = [None] * tokens.length def tag_from_strings(self, Doc tokens, object tag_strs): cdef int i for i in range(tokens.length): - self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], tag_strs[i]) + self.vocab.morphology.assign_tag(&tokens.data[i], tag_strs[i]) tokens.is_tagged = True tokens._py_tokens = [None] * tokens.length @@ -168,7 +169,9 @@ cdef class Tagger: for i in range(tokens.length): guess = self.update(i, tokens.data, golds[i]) loss = golds[i] != -1 and guess != golds[i] - self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], guess) + + self.vocab.morphology.assign_tag(&tokens.data[i], guess) + correct += loss == 0 self.freqs[TAG][tokens.data[i].tag] += 1 return correct diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index 19b8aa026..9d60d2a6e 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -7,12 +7,7 @@ from .typedefs cimport hash_t from .structs cimport LexemeC, TokenC from .strings cimport StringStore from .tokens.doc cimport Doc -from .vocab cimport Vocab, _Cached - - -cdef union LexemesOrTokens: - const LexemeC* const* lexemes - TokenC* tokens +from .vocab cimport Vocab, LexemesOrTokens, _Cached cdef class Tokenizer: diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 38daf1c5a..d54770d2b 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -192,9 +192,7 @@ cdef class Tokenizer: tokens.push_back(prefixes[0][i], False) if string: cache_hit = self._try_cache(hash_string(string), tokens) - if cache_hit: - pass - else: + if not cache_hit: match = self.find_infix(string) if match is None: tokens.push_back(self.vocab.get(tokens.mem, string), False) @@ -253,38 +251,10 @@ cdef class Tokenizer: cdef LexemeC** lexemes cdef hash_t hashed for chunk, substrings in sorted(special_cases.items()): - tokens = self.mem.alloc(len(substrings) + 1, sizeof(TokenC)) - for i, props in enumerate(substrings): - form = props['F'] - tokens[i].lex = self.vocab.get(self.vocab.mem, form) - lemma = props.get('L', form) - tokens[i].lemma = self.vocab.strings[lemma] - #TODO - #self.vocab.morphology.assign_from_dict(&tokens[i], props) cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached)) cached.length = len(substrings) cached.is_lex = False - cached.data.tokens = tokens - hashed = hash_string(chunk) - self._specials.set(hashed, cached) - self._cache.set(hashed, cached) - - -#if lemma is not None: -# tokens[i].lemma = self.vocab.strings[lemma] -#else: -# tokens[i].lemma = 0 -#if 'pos' in props: -# inflection = self.vocab.morphology.get(props['pos']) -# inflection.assign(&tokens[i]) -# # These are defaults, which can be over-ridden by the -# # token-specific props. -# #pos, morph_features = self.vocab.morphology.tag_map[props['pos']] -# #tokens[i].pos = pos -# ## These are defaults, which can be over-ridden by the -# ## token-specific props. 
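
For context, the special-case data consumed by this path maps a raw chunk to a list of per-token property dicts: 'F' is the surface form, 'L' an optional lemma, 'pos' an optional tag-map key, and 'morph' optional extra features. A made-up example of such an entry (real entries live in the language's lang_data files and may differ):

    # Illustrative only; not taken from lang_data.
    special_cases = {
        u"don't": [
            {'F': u"do"},
            {'F': u"n't", 'L': u"not", 'pos': u"RB"},
        ],
    }
    # The tokenizer asks the vocab for one fused analysis per chunk and
    # caches it, roughly:
    # cached.data.tokens = self.vocab.make_fused_token(special_cases[u"don't"])
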
-# #set_morph_from_dict(&tokens[i].morph, morph_features) -# #if tokens[i].lemma == 0: -# # tokens[i].lemma = tokens[i].lex.orth -##set_morph_from_dict(&tokens[i].morph, props) - + cached.data.tokens = self.vocab.make_fused_token(substrings) + key = hash_string(chunk) + self._specials.set(key, cached) + self._cache.set(key, cached) diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index 121018770..a13858175 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -12,11 +12,11 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil ctypedef const LexemeC* const_Lexeme_ptr -ctypedef TokenC* TokenC_ptr +ctypedef const TokenC* const_TokenC_ptr ctypedef fused LexemeOrToken: const_Lexeme_ptr - TokenC_ptr + const_TokenC_ptr cdef class Doc: diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 0fa562dfb..80facc8db 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -209,7 +209,7 @@ cdef class Doc: if self.length == self.max_length: self._realloc(self.length * 2) cdef TokenC* t = &self.data[self.length] - if LexemeOrToken is TokenC_ptr: + if LexemeOrToken is const_TokenC_ptr: t[0] = lex_or_tok[0] else: t.lex = lex_or_tok diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd index 5c88dca68..d9bf32582 100644 --- a/spacy/vocab.pxd +++ b/spacy/vocab.pxd @@ -15,7 +15,7 @@ cdef LexemeC EMPTY_LEXEME cdef union LexemesOrTokens: const LexemeC* const* lexemes - TokenC* tokens + const TokenC* tokens cdef struct _Cached: @@ -37,6 +37,7 @@ cdef class Vocab: cdef const LexemeC* get(self, Pool mem, unicode string) except NULL cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL + cdef const TokenC* make_fused_token(self, substrings) except NULL cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1 diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index fa196166e..085fb38f9 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -17,6 +17,7 @@ from .strings cimport hash_string from .orth cimport word_shape from .typedefs cimport attr_t from .cfile cimport CFile +from .lemmatizer import Lemmatizer from cymem.cymem cimport Address from . import util @@ -36,20 +37,13 @@ EMPTY_LEXEME.repvec = EMPTY_VEC cdef class Vocab: '''A map container for a language's LexemeC structs. ''' - @classmethod - def default_morphology(cls): - return Morphology({'VBZ': ['VERB', {}]}, [], None) - - def __init__(self, get_lex_attr=None, morphology=None, vectors=None): - self.get_lex_attr = get_lex_attr - if morphology is None: - morphology = self.default_morphology() - self.morphology = morphology - + def __init__(self, get_lex_attr=None, tag_map=None, vectors=None): self.mem = Pool() self._by_hash = PreshMap() self._by_orth = PreshMap() self.strings = StringStore() + self.get_lex_attr = get_lex_attr + self.morphology = Morphology(self.strings, tag_map, Lemmatizer({}, {}, {})) self.length = 1 self._serializer = None @@ -60,10 +54,9 @@ cdef class Vocab: raise IOError("Directory %s not found -- cannot load Vocab." % data_dir) if not path.isdir(data_dir): raise IOError("Path %s is a file, not a dir -- cannot load Vocab." 
% data_dir) - cdef Vocab self = cls(get_lex_attr=get_lex_attr, vectors=vectors, - morphology=morphology) - self.load_lexemes(path.join(data_dir, 'strings.txt'), - path.join(data_dir, 'lexemes.bin')) + tag_map = json.load(open(path.join(data_dir, 'tag_map.json'))) + cdef Vocab self = cls(get_lex_attr=get_lex_attr, vectors=vectors, tag_map=tag_map) + self.load_lexemes(path.join(data_dir, 'strings.txt'), path.join(data_dir, 'lexemes.bin')) if vectors is None and path.exists(path.join(data_dir, 'vec.bin')): self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin')) return self @@ -172,6 +165,22 @@ cdef class Vocab: orth = id_or_string return Lexeme(self, orth) + cdef const TokenC* make_fused_token(self, substrings) except NULL: + cdef int i + tokens = self.mem.alloc(len(substrings) + 1, sizeof(TokenC)) + for i, props in enumerate(substrings): + token = &tokens[i] + # Set the special tokens up to have morphology and lemmas if + # specified, otherwise use the part-of-speech tag (if specified) + token.lex = self.get(self.mem, props['F']) + if 'pos' in props: + self.morphology.assign_tag(token, props['pos']) + if 'L' in props: + tokens[i].lemma = self.strings[props['L']] + for feature, value in props.get('morph', {}).items(): + self.morphology.assign_feature(&token.morph, feature, value) + return tokens + def dump(self, loc): if path.exists(loc): assert not path.isdir(loc) From 534e3dda3cbd4f8677fc30f75879bfc3225a6b2b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 28 Aug 2015 03:44:54 +0200 Subject: [PATCH 39/48] * More work on language independent parsing --- spacy/language.py | 7 +------ spacy/morphology.pxd | 2 +- spacy/morphology.pyx | 21 ++++++++++----------- spacy/tagger.pyx | 6 +++--- spacy/vocab.pyx | 4 +++- 5 files changed, 18 insertions(+), 22 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 36ca5c636..881df7d1a 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -6,7 +6,6 @@ except ImportError: import json from .tokenizer import Tokenizer -from .morphology import Morphology from .vocab import Vocab from .syntax.parser import Parser from .tagger import Tagger @@ -132,16 +131,12 @@ class Language(object): def default_data_dir(cls): return path.join(path.dirname(__file__), 'data') - @classmethod - def default_morphology(cls, data_dir): - return Morphology.from_dir(data_dir) - @classmethod def default_vectors(cls, data_dir): return None @classmethod - def default_vocab(cls, data_dir=None, get_lex_attr=None, vectors=None, morphology=None): + def default_vocab(cls, data_dir=None, get_lex_attr=None, vectors=None): if data_dir is None: data_dir = cls.default_data_dir() if vectors is None: diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index eb2bb97f5..2229da0ad 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -22,7 +22,7 @@ cdef struct MorphAnalysisC: cdef class Morphology: cdef readonly Pool mem - cdef readonly object strings + cdef readonly StringStore strings cdef public object lemmatizer cdef public object n_tags cdef public object reverse_index diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index acca5eb9e..12d435c7d 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -11,20 +11,15 @@ from .parts_of_speech cimport ADJ, VERB, NOUN cdef class Morphology: - @classmethod - def from_dir(cls, data_dir, lemmatizer=None): - tag_map = json.load(open(path.join(data_dir, 'tag_map.json'))) - if lemmatizer is None: - lemmatizer = Lemmatizer.from_dir(data_dir) - return cls(tag_map, {}, lemmatizer) - - 
def __init__(self, string_store, tag_map, lemmatizer): + def __init__(self, StringStore string_store, tag_map, lemmatizer): self.mem = Pool() self.strings = string_store self.lemmatizer = lemmatizer - self.n_tags = len(tag_map) + self.n_tags = len(tag_map) + 1 self.tag_names = tuple(sorted(tag_map.keys())) self.reverse_index = {} + + self.rich_tags = self.mem.alloc(self.n_tags, sizeof(RichTagC)) for i, (tag_str, props) in enumerate(sorted(tag_map.items())): self.rich_tags[i].id = i self.rich_tags[i].name = self.strings[tag_str] @@ -33,12 +28,16 @@ cdef class Morphology: self._cache = PreshMapArray(self.n_tags) cdef int assign_tag(self, TokenC* token, tag) except -1: - cdef int tag_id = self.strings[tag] if isinstance(tag, basestring) else tag + cdef int tag_id + if isinstance(tag, basestring): + tag_id = self.reverse_index[self.strings[tag]] + else: + tag_id = tag analysis = self._cache.get(tag_id, token.lex.orth) if analysis is NULL: analysis = self.mem.alloc(1, sizeof(MorphAnalysisC)) analysis.tag = self.rich_tags[tag_id] - analysis.lemma = self.lemmatize(tag, token.lex.orth) + analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth) token.lemma = analysis.lemma token.pos = analysis.tag.pos token.tag = analysis.tag.name diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 6fea4af88..756bb7ea4 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -104,7 +104,7 @@ cdef class Tagger: @classmethod def blank(cls, vocab, templates): - model = Model(vocab.n_tags, templates, model_loc=None) + model = Model(vocab.morphology.n_tags, templates, model_loc=None) return cls(vocab, model) @classmethod @@ -113,7 +113,7 @@ cdef class Tagger: templates = json.loads(open(path.join(data_dir, 'templates.json'))) else: templates = cls.default_templates() - model = Model(vocab.n_tags, templates, data_dir) + model = Model(vocab.morphology.n_tags, templates, data_dir) return cls(vocab, model) def __init__(self, Vocab vocab, model): @@ -128,7 +128,7 @@ cdef class Tagger: @property def tag_names(self): - return self.vocab.tag_names + return self.vocab.morphology.tag_names def __call__(self, Doc tokens): """Apply the tagger, setting the POS tags onto the Doc object. diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 085fb38f9..596570a98 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -49,13 +49,15 @@ cdef class Vocab: self._serializer = None @classmethod - def from_dir(cls, data_dir, get_lex_attr=None, morphology=None, vectors=None): + def from_dir(cls, data_dir, get_lex_attr=None, vectors=None): if not path.exists(data_dir): raise IOError("Directory %s not found -- cannot load Vocab." % data_dir) if not path.isdir(data_dir): raise IOError("Path %s is a file, not a dir -- cannot load Vocab." 
% data_dir) + tag_map = json.load(open(path.join(data_dir, 'tag_map.json'))) cdef Vocab self = cls(get_lex_attr=get_lex_attr, vectors=vectors, tag_map=tag_map) + self.load_lexemes(path.join(data_dir, 'strings.txt'), path.join(data_dir, 'lexemes.bin')) if vectors is None and path.exists(path.join(data_dir, 'vec.bin')): self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin')) From 950ce3666084aae00c5f8300515db8004c86198c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Sep 2015 17:51:30 +0200 Subject: [PATCH 40/48] * Update init model --- bin/init_model.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/bin/init_model.py b/bin/init_model.py index e81d668aa..cffd9df96 100644 --- a/bin/init_model.py +++ b/bin/init_model.py @@ -20,6 +20,7 @@ from __future__ import unicode_literals from ast import literal_eval import math import gzip +import json import plac from pathlib import Path @@ -39,6 +40,7 @@ from spacy.parts_of_speech import NOUN, VERB, ADJ import spacy.en import spacy.de import spacy.fi +import spacy.it @@ -143,7 +145,7 @@ def _read_senses(loc): return lexicon -def setup_vocab(get_lex_attr, src_dir, dst_dir): +def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir): if not dst_dir.exists(): dst_dir.mkdir() @@ -152,7 +154,7 @@ def setup_vocab(get_lex_attr, src_dir, dst_dir): write_binary_vectors(str(vectors_src), str(dst_dir / 'vec.bin')) else: print("Warning: Word vectors file not found") - vocab = Vocab(get_lex_attr=get_lex_attr) + vocab = Vocab(get_lex_attr=get_lex_attr, tag_map=tag_map) clusters = _read_clusters(src_dir / 'clusters.txt') probs, oov_prob = _read_probs(src_dir / 'words.sgt.prob') if not probs: @@ -186,7 +188,8 @@ def main(lang_id, lang_data_dir, corpora_dir, model_dir): languages = { 'en': spacy.en.English.default_lex_attrs(), 'de': spacy.de.Deutsch.default_lex_attrs(), - 'fi': spacy.fi.Finnish.default_lex_attrs() + 'fi': spacy.fi.Finnish.default_lex_attrs(), + 'it': spacy.it.Italian.default_lex_attrs(), } model_dir = Path(model_dir) @@ -199,8 +202,9 @@ def main(lang_id, lang_data_dir, corpora_dir, model_dir): if not model_dir.exists(): model_dir.mkdir() + tag_map = json.load((lang_data_dir / 'tag_map.json').open()) setup_tokenizer(lang_data_dir, model_dir / 'tokenizer') - setup_vocab(languages[lang_id], corpora_dir, model_dir / 'vocab') + setup_vocab(languages[lang_id], tag_map, corpora_dir, model_dir / 'vocab') if (lang_data_dir / 'gazetteer.json').exists(): copyfile(str(lang_data_dir / 'gazetteer.json'), From d1eea2d865b0b42d02195143788343ea3eb620b3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Sep 2015 17:51:48 +0200 Subject: [PATCH 41/48] * Update train.py for language-generic spaCy --- bin/parser/train.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/bin/parser/train.py b/bin/parser/train.py index 68217fcb3..abd5eb16e 100755 --- a/bin/parser/train.py +++ b/bin/parser/train.py @@ -14,7 +14,6 @@ import re import spacy.util from spacy.en import English -from spacy.en.pos import POS_TEMPLATES, POS_TAGS, setup_model_dir from spacy.syntax.util import Config from spacy.gold import read_json_file @@ -22,6 +21,11 @@ from spacy.gold import GoldParse from spacy.scorer import Scorer +from spacy.syntax.arc_eager import ArcEager +from spacy.syntax.ner import BiluoPushDown +from spacy.tagger import Tagger +from spacy.syntax.parser import Parser + def _corrupt(c, noise_level): if random.random() >= noise_level: @@ -80,32 +84,28 @@ def train(Language, 
gold_tuples, model_dir, n_iter=15, feat_set=u'basic', beam_width=1, verbose=False, use_orig_arc_eager=False): dep_model_dir = path.join(model_dir, 'deps') - pos_model_dir = path.join(model_dir, 'pos') ner_model_dir = path.join(model_dir, 'ner') if path.exists(dep_model_dir): shutil.rmtree(dep_model_dir) - if path.exists(pos_model_dir): - shutil.rmtree(pos_model_dir) if path.exists(ner_model_dir): shutil.rmtree(ner_model_dir) os.mkdir(dep_model_dir) - os.mkdir(pos_model_dir) os.mkdir(ner_model_dir) - setup_model_dir(sorted(POS_TAGS.keys()), POS_TAGS, POS_TEMPLATES, pos_model_dir) - Config.write(dep_model_dir, 'config', features=feat_set, seed=seed, - labels=Language.ParserTransitionSystem.get_labels(gold_tuples), + labels=ArcEager.get_labels(gold_tuples), beam_width=beam_width) Config.write(ner_model_dir, 'config', features='ner', seed=seed, - labels=Language.EntityTransitionSystem.get_labels(gold_tuples), + labels=BiluoPushDown.get_labels(gold_tuples), beam_width=0) if n_sents > 0: gold_tuples = gold_tuples[:n_sents] - nlp = Language(data_dir=model_dir) - + nlp = Language(data_dir=model_dir, tagger=False, parser=False, entity=False) + nlp.tagger = Tagger.blank(nlp.vocab, Tagger.default_templates()) + nlp.parser = Parser.from_dir(dep_model_dir, nlp.vocab.strings, ArcEager) + nlp.entity = Parser.from_dir(ner_model_dir, nlp.vocab.strings, BiluoPushDown) print("Itn.\tP.Loss\tUAS\tNER F.\tTag %\tToken %") for itn in range(n_iter): scorer = Scorer() @@ -140,7 +140,7 @@ def train(Language, gold_tuples, model_dir, n_iter=15, feat_set=u'basic', print('%d:\t%d\t%.3f\t%.3f\t%.3f\t%.3f' % (itn, loss, scorer.uas, scorer.ents_f, scorer.tags_acc, scorer.token_acc)) - nlp.end_training() + nlp.end_training(model_dir) def evaluate(Language, gold_tuples, model_dir, gold_preproc=False, verbose=False, beam_width=None): From e35bb36be75eb90daf9ff5ef0d79cfb14940a281 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Sep 2015 17:52:32 +0200 Subject: [PATCH 42/48] * Ensure Lexeme.check_flag returns a boolean value --- spacy/lexeme.pxd | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index 510840b2b..6f333829f 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -35,7 +35,10 @@ cdef class Lexeme: @staticmethod cdef inline attr_t get_struct_attr(const LexemeC* lex, attr_id_t feat_name) nogil: if feat_name < (sizeof(flags_t) * 8): - return Lexeme.check_flag(lex, feat_name) + if Lexeme.check_flag(lex, feat_name): + return 1 + else: + return 0 elif feat_name == ID: return lex.id elif feat_name == ORTH: @@ -78,7 +81,10 @@ cdef class Lexeme: @staticmethod cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil: - return lexeme.flags & (1 << flag_id) + if lexeme.flags & (1 << flag_id): + return True + else: + return False @staticmethod cdef inline bint set_flag(LexemeC* lex, attr_id_t flag_id, int value) nogil: From 7cc56ada6eaa5d662f044cab3c12ece1035c5274 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Sep 2015 17:52:51 +0200 Subject: [PATCH 43/48] * Temporarily add py_set_flag attribute in Lexeme --- spacy/lexeme.pyx | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index e99bcfa7c..2c69a527c 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -29,6 +29,12 @@ cdef class Lexeme: self.c = vocab.get_by_orth(vocab.mem, orth) assert self.c.orth == orth + def py_set_flag(self, attr_id_t flag_id): + Lexeme.set_flag(self.c, flag_id, True) + + def py_check_flag(self, 
attr_id_t flag_id): + return True if Lexeme.check_flag(self.c, flag_id) else False + property orth_: def __get__(self): return self.vocab.strings[self.c.orth] From 6427a3fcac3035e78a6f004f8e36e401a68acc1c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Sep 2015 17:53:12 +0200 Subject: [PATCH 44/48] * Temporarily import flag attributes in matcher --- spacy/matcher.pyx | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index 2cc91a368..f6f1ad3ba 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -8,6 +8,7 @@ from cymem.cymem cimport Pool from libcpp.vector cimport vector from .attrs cimport LENGTH, ENT_TYPE, ORTH, NORM, LEMMA, LOWER, SHAPE +from .attrs cimport FLAG13, FLAG14, FLAG15, FLAG16, FLAG17, FLAG18, FLAG19, FLAG20, FLAG21, FLAG22, FLAG23, FLAG24, FLAG25 from .tokens.doc cimport get_token_attr from .tokens.doc cimport Doc from .vocab cimport Vocab @@ -53,6 +54,8 @@ cdef int match(const Pattern* pattern, const TokenC* token) except -1: cdef int i for i in range(pattern.length): if get_token_attr(token, pattern.spec[i].attr) != pattern.spec[i].value: + print "Pattern fail", pattern.spec[i].attr, pattern.spec[i].value + print get_token_attr(token, pattern.spec[i].attr) return False return True @@ -76,7 +79,10 @@ def _convert_strings(token_specs, string_store): attr = map_attr_name(attr) if isinstance(value, basestring): value = string_store[value] + if isinstance(value, bool): + value = int(value) converted[-1].append((attr, value)) + print "Converted", converted[-1] return converted @@ -92,6 +98,32 @@ def map_attr_name(attr): return SHAPE elif attr == 'NORM': return NORM + elif attr == 'FLAG13': + return FLAG13 + elif attr == 'FLAG14': + return FLAG14 + elif attr == 'FLAG15': + return FLAG15 + elif attr == 'FLAG16': + return FLAG16 + elif attr == 'FLAG17': + return FLAG17 + elif attr == 'FLAG18': + return FLAG18 + elif attr == 'FLAG19': + return FLAG19 + elif attr == 'FLAG20': + return FLAG20 + elif attr == 'FLAG21': + return FLAG21 + elif attr == 'FLAG22': + return FLAG22 + elif attr == 'FLAG23': + return FLAG23 + elif attr == 'FLAG24': + return FLAG24 + elif attr == 'FLAG25': + return FLAG25 else: raise Exception("TODO: Finish supporting attr mapping %s" % attr) @@ -130,6 +162,7 @@ cdef class Matcher: # TODO: Do something more clever about multiple patterns for single # entity for spec in specs: + assert len(spec) >= 1 spec = _convert_strings(spec, self.vocab.strings) self.patterns.push_back(init_pattern(self.mem, spec, etype)) @@ -142,11 +175,13 @@ cdef class Matcher: cdef Pattern* state matches = [] for token_i in range(doc.length): + print 'check', doc[token_i].orth_ token = &doc.data[token_i] q = 0 for i in range(partials.size()): state = partials.at(i) if match(state, token): + print 'match!' if is_final(state): matches.append(get_entity(state, token, token_i)) else: @@ -156,6 +191,7 @@ cdef class Matcher: for i in range(self.n_patterns): state = self.patterns[i] if match(state, token): + print 'match!' 
if is_final(state): matches.append(get_entity(state, token, token_i)) else: From 9eae9837c4b97a681fdfe52f6380ab56fe7b6065 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Sep 2015 17:53:39 +0200 Subject: [PATCH 45/48] * Fix morphology look up --- spacy/morphology.pyx | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 12d435c7d..fc6a4936b 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -30,7 +30,11 @@ cdef class Morphology: cdef int assign_tag(self, TokenC* token, tag) except -1: cdef int tag_id if isinstance(tag, basestring): - tag_id = self.reverse_index[self.strings[tag]] + try: + tag_id = self.reverse_index[self.strings[tag]] + except KeyError: + print tag + raise else: tag_id = tag analysis = self._cache.get(tag_id, token.lex.orth) From c9f2082e3cb09484cad34ea30505b8dc5dd2bf41 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Sep 2015 17:54:51 +0200 Subject: [PATCH 46/48] * Fix compilation error in en/tag_map.json --- lang_data/en/tag_map.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lang_data/en/tag_map.json b/lang_data/en/tag_map.json index b9f8269f7..8678e5afe 100644 --- a/lang_data/en/tag_map.json +++ b/lang_data/en/tag_map.json @@ -56,5 +56,5 @@ "HYPH": {"pos": "punct"}, "XX": {"pos": "x"}, "BES": {"pos": "verb"}, -"HVS": {"pos": "verb"}, +"HVS": {"pos": "verb"} } From 238b2f533ba3512266c470061a3c393f6eaa9b63 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Sep 2015 17:55:53 +0200 Subject: [PATCH 47/48] * Add lemma rules --- lang_data/de/lemma_rules.json | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 lang_data/de/lemma_rules.json diff --git a/lang_data/de/lemma_rules.json b/lang_data/de/lemma_rules.json new file mode 100644 index 000000000..e69de29bb From b3703836f9c58af6c3bb0c1c6698e31322feeb1f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 6 Sep 2015 17:56:11 +0200 Subject: [PATCH 48/48] * Add en lemma rules --- lang_data/en/lemma_rules.json | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 lang_data/en/lemma_rules.json diff --git a/lang_data/en/lemma_rules.json b/lang_data/en/lemma_rules.json new file mode 100644 index 000000000..c45eb1df6 --- /dev/null +++ b/lang_data/en/lemma_rules.json @@ -0,0 +1,31 @@ +{ + "noun": [ + ["s", ""], + ["ses", "s"], + ["ves", "f"], + ["xes", "x"], + ["zes", "z"], + ["ches", "ch"], + ["shes", "sh"], + ["men", "man"], + ["ies", "y"] + ], + + "verb": [ + ["s", ""], + ["ies", "y"], + ["es", "e"], + ["es", ""], + ["ed", "e"], + ["ed", ""], + ["ing", "e"], + ["ing", ""] + ], + + "adj": [ + ["er", ""], + ["est", ""], + ["er", "e"], + ["est", "e"] + ] +}
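
The English lemma_rules.json entries are (old-suffix, new-suffix) pairs grouped by coarse part of speech. A minimal sketch of the suffix-rewrite step they drive, assuming the full Lemmatizer (constructed above as Lemmatizer({}, {}, {})) also consults the other two dicts it is built with, presumably an index and an exception table, before settling on a form:

    import json

    def apply_rules(string, rules):
        # Collect every rewrite whose old suffix matches; fall back to the
        # original string if nothing applies.
        forms = set()
        for old, new in rules:
            if string.endswith(old):
                forms.add(string[:len(string) - len(old)] + new)
        return forms or set([string])

    lemma_rules = json.load(open('lang_data/en/lemma_rules.json'))
    print(apply_rules(u'studies', lemma_rules['noun']))  # set with u'study' and u'studie'
    print(apply_rules(u'wolves', lemma_rules['noun']))   # set with u'wolf' and u'wolve'
    # The Morphology.lemmatize method shown earlier then picks one candidate
    # deterministically with sorted(...)[0].
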