* More work on language-generic parsing

2025-08-25 06:24:55 +03:00 · 2015-08-28 02:02:33 +02:00 · 2015-08-28 02:02:33 +02:00 · c2307fa9ee
commit c2307fa9ee
parent 86c4a8e3e2
12 changed files with 129 additions and 222 deletions
--- a/spacy/fi/__init__.py
+++ b/spacy/fi/__init__.py
@@ -0,0 +1,11 @@
+from __future__ import unicode_literals, print_function
+
+from os import path
+
+from ..language import Language
+
+
+class Finnish(Language):
+    @classmethod
+    def default_data_dir(cls):
+        return path.join(path.dirname(__file__), 'data')
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -148,13 +148,10 @@ class Language(object):
            vectors = cls.default_vectors(data_dir)
        if get_lex_attr is None:
            get_lex_attr = cls.default_lex_attrs(data_dir)
-        if morphology is None:
-            morphology = cls.default_morphology(path.join(data_dir, 'vocab'))
        return Vocab.from_dir(
                path.join(data_dir, 'vocab'),
                get_lex_attr=get_lex_attr,
-                vectors=vectors,
-                morphology=morphology)
+                vectors=vectors)

    @classmethod
    def default_tokenizer(cls, vocab, data_dir):
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@@ -1,18 +1,41 @@
+from cymem.cymem cimport Pool
+from preshed.maps cimport PreshMapArray
+from libc.stdint cimport uint64_t
+
 from .structs cimport TokenC
 from .strings cimport StringStore
+from .typedefs cimport attr_t
+from .parts_of_speech cimport univ_pos_t
+
+
+cdef struct RichTagC:
+    uint64_t morph
+    int id
+    univ_pos_t pos
+    attr_t name
+
+
+cdef struct MorphAnalysisC:
+    RichTagC tag
+    attr_t lemma


 cdef class Morphology:
+    cdef readonly Pool mem
    cdef readonly object strings
    cdef public object lemmatizer
-    cdef public object tag_map
+    cdef public object n_tags
+    cdef public object reverse_index
    cdef public object tag_names
-    cdef public object tag_ids
-    cdef public int n_tags

-    cdef int assign_tag(self, StringStore strings, TokenC* token, int tag) except -1
+    cdef RichTagC* rich_tags
+    cdef PreshMapArray _cache
+
+    cdef int assign_tag(self, TokenC* token, tag) except -1
+
+    cdef int assign_feature(self, uint64_t* morph, feature, value) except -1
+

-    cdef int assign_from_dict(self, TokenC* token, props) except -1

 #
 #cpdef enum Feature_t:
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -6,15 +6,10 @@ try:
 except ImportError:
    import json

-from spacy.parts_of_speech import UNIV_POS_NAMES
+from .parts_of_speech import UNIV_POS_NAMES
+from .parts_of_speech cimport ADJ, VERB, NOUN


-cdef struct MorphAnalysisC:
-    uint64_t[4] features
-    attr_t lemma
-    attr_t pos
-    
-
 cdef class Morphology:
    @classmethod
    def from_dir(cls, data_dir, lemmatizer=None):
@@ -23,32 +18,37 @@ cdef class Morphology:
            lemmatizer = Lemmatizer.from_dir(data_dir)
        return cls(tag_map, {}, lemmatizer)

-    def __init__(self, tag_map, fused_tokens, lemmatizer):
+    def __init__(self, string_store, tag_map, lemmatizer):
+        self.mem = Pool()
+        self.strings = string_store
        self.lemmatizer = lemmatizer
-        self.tag_map = tag_map
        self.n_tags = len(tag_map)
        self.tag_names = tuple(sorted(tag_map.keys()))
-        self.tag_ids = {}
-        for i, tag_str in enumerate(self.tag_names):
-            self.tag_ids[tag_str] = i
-        self._cache = PreshMapArray()
+        self.reverse_index = {}
+        for i, (tag_str, props) in enumerate(sorted(tag_map.items())):
+            self.rich_tags[i].id = i
+            self.rich_tags[i].name = self.strings[tag_str]
+            self.rich_tags[i].morph = 0
+            self.reverse_index[self.rich_tags[i].name] = i
+        self._cache = PreshMapArray(self.n_tags)

    cdef int assign_tag(self, TokenC* token, tag) except -1:
-        analysis = <MorphAnalysisC*>self._cache.get(tag, token.lex.orth)
+        cdef int tag_id = self.strings[tag] if isinstance(tag, basestring) else tag
+        analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
        if analysis is NULL:
            analysis = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
-            cached = self.decode_tag(tag)
-            cached.lemma = self.lemmatize(token.pos, token.lex)
+            analysis.tag = self.rich_tags[tag_id]
+            analysis.lemma = self.lemmatize(tag, token.lex.orth)
        token.lemma = analysis.lemma
-        token.pos = analysis.pos
-        token.tag = analysis.tag
-        token.morph = analysis.features
+        token.pos = analysis.tag.pos
+        token.tag = analysis.tag.name
+        token.morph = analysis.tag.morph

-    cdef int assign_feature(self, TokenC* token, feature, value) except -1:
+    cdef int assign_feature(self, uint64_t* morph, feature, value) except -1:
        pass

    def load_morph_exceptions(self, dict exc):
-        # Map (form, pos) to (lemma, inflection)
+        # Map (form, pos) to (lemma, rich tag)
        cdef unicode pos_str
        cdef unicode form_str
        cdef unicode lemma_str
@@ -57,121 +57,30 @@ cdef class Morphology:
        cdef int lemma
        cdef attr_t orth
        cdef int pos
-        for pos_str, entries in exc.items():
-            pos = self.tag_names.index(pos_str)
+        for tag_str, entries in exc.items():
+            tag = self.strings[tag_str]
+            rich_tag = self.rich_tags[self.reverse_index[tag]]
            for form_str, props in entries.items():
-                lemma_str = props.get('L', form_str)
-                orth = self.strings[form_str]
                cached = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
-                cached.lemma = self.strings[lemma_str]
-                self.set_features(cached, props)
-                self._cache.set(pos, orth, <void*>cached)
+                orth = self.strings[form_str]
+                for name_str, value_str in props.items():
+                    if name_str == 'L':
+                        cached.lemma = self.strings[value_str]
+                    else:
+                        self.assign_feature(&cached.tag.morph, name_str, value_str)
+                if cached.lemma == 0:
+                    cached.lemma = self.lemmatize(rich_tag.pos, orth)
+                self._cache.set(rich_tag.pos, orth, <void*>cached)

-    def _load_special_tokenization(self, special_cases):
-        '''Add a special-case tokenization rule.
-        '''
-        cdef int i
-        cdef list substrings
-        cdef unicode chunk
-        cdef unicode form
-        cdef unicode lemma
-        cdef dict props
-        cdef LexemeC** lexemes
-        cdef hash_t hashed
-        for chunk, substrings in sorted(special_cases.items()):
-            tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
-            for i, props in enumerate(substrings):
-                # Set the special tokens up to have morphology and lemmas if
-                # specified, otherwise use the part-of-speech tag (if specified)
-                form = props['F']
-                tokens[i].lex = <LexemeC*>self.vocab.get(self.vocab.mem, form)
-                morphology = self.vocab.morphology.decode_dict(props)
-                tokens[i].lemma = morph_analysis.lemma
-                tokens[i].pos = morph_analysis.pos
-                tokens[i].tag = morph_analysis.tag
-                tokens[i].morph = morph_analysis.morph
-            cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
-            cached.length = len(substrings)
-            cached.is_lex = False
-            cached.data.tokens = tokens
-            hashed = hash_string(chunk)
-            self._specials.set(hashed, cached)
-            self._cache.set(hashed, cached)
-
-
-
-
-#cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
-#    morph.number = props.get('number', 0)
-#    morph.tenspect = props.get('tenspect', 0)
-#    morph.mood = props.get('mood', 0)
-#    morph.gender = props.get('gender', 0)
-#    morph.person = props.get('person', 0)
-#    morph.case = props.get('case', 0)
-#    morph.misc = props.get('misc', 0)
-#
-#
-#cdef class Morphology:
-#    cdef Pool mem
-#    cdef PreshMap table
-#
-#    def __init__(self, tags, exceptions):
-#        pass
-#
-#    def __getitem__(self, hash_t id_):
-#        pass
-#
-#    cdef const InflectionC* get(self, hash_t key) except NULL:
-#        pass
-#
-#    cdef MorphAnalysis analyse(const TokenC* token) except -1:
-#        cdef struct MorphAnalysis morphology
-#        tokens[i].pos = tag.pos
-#        cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.orth)
-#        if cached is NULL:
-#            cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
-#            cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)
-#            cached.morph = tag.morph
-#            self._morph_cache.set(tag.id, tokens[i].lex.orth, <void*>cached)
-#        tokens[i].lemma = cached.lemma
-#        tokens[i].morph = cached.morph
-#        
-#    cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1:
-#        if self.lemmatizer is None:
-#            return lex.orth
-#        cdef unicode py_string = self.strings[lex.orth]
-#        if pos != NOUN and pos != VERB and pos != ADJ:
-#            return lex.orth
-#        cdef set lemma_strings
-#        cdef unicode lemma_string
-#        lemma_strings = self.lemmatizer(py_string, pos)
-#        lemma_string = sorted(lemma_strings)[0]
-#        lemma = self.strings[lemma_string]
-#        return lemma
-#        
-#
-#cdef class Inflection:
-#    cdef InflectionC* c
-#
-#    def __init__(self, container, id_):
-#        self.c = container[id_]
-#        self.container = container
-#        
-#        for i, feat_id in enumerate(feat_ids):
-#            feature, value = parse_id(feat_id)
-#            self.add_value(feature, value, True)
-#
-#    def has(self, Value_t feat_value_id):
-#        part = feat_value_id % 64
-#        bit = feat_value_id / 64
-#        if self.value_set[part] & bit:
-#            return True
-#        else:
-#            return False
-#
-#    property pos: def __get__(self): return self.c.pos
-#
-#    property id: def __get__(self): return self.c.id
-#
-#    property features:
-#        pass
+    def lemmatize(self, const univ_pos_t pos, attr_t orth):
+        if self.lemmatizer is None:
+            return orth
+        cdef unicode py_string = self.strings[orth]
+        if pos != NOUN and pos != VERB and pos != ADJ:
+            return orth
+        cdef set lemma_strings
+        cdef unicode lemma_string
+        lemma_strings = self.lemmatizer(py_string, pos)
+        lemma_string = sorted(lemma_strings)[0]
+        lemma = self.strings[lemma_string]
+        return lemma
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@@ -25,17 +25,6 @@ cdef struct LexemeC:
    float sentiment
    float l2_norm

-cdef struct MorphFeatC:
-    int name
-    int value
-
-
-cdef struct MorphologyC:
-    uint64_t[4] feature_set
-    MorphFeatC* features
-    univ_pos_t pos
-    int n
-

 cdef struct Entity:
    int start
@@ -54,8 +43,8 @@ cdef struct Constituent:

 cdef struct TokenC:
    const LexemeC* lex
-    const MorphologyC* morph
    const Constituent* ctnt
+    uint64_t morph
    univ_pos_t pos
    bint spacy
    int tag
--- a/spacy/tagger.pyx
+++ b/spacy/tagger.pyx
@@ -104,7 +104,7 @@ cdef class Tagger:

    @classmethod
    def blank(cls, vocab, templates):
-        model = Model(vocab.morphology.n_tags, templates, model_loc=None)
+        model = Model(vocab.n_tags, templates, model_loc=None)
        return cls(vocab, model)

    @classmethod
@@ -113,7 +113,7 @@ cdef class Tagger:
            templates = json.loads(open(path.join(data_dir, 'templates.json')))
        else:
            templates = cls.default_templates()
-        model = Model(vocab.morphology.n_tags, templates, data_dir)
+        model = Model(vocab.n_tags, templates, data_dir)
        return cls(vocab, model)

    def __init__(self, Vocab vocab, model):
@@ -128,7 +128,7 @@ cdef class Tagger:

    @property
    def tag_names(self):
-        return self.vocab.morphology.tag_names
+        return self.vocab.tag_names

    def __call__(self, Doc tokens):
        """Apply the tagger, setting the POS tags onto the Doc object.
@@ -143,14 +143,15 @@ cdef class Tagger:
        for i in range(tokens.length):
            if tokens.data[i].pos == 0:
                guess = self.predict(i, tokens.data)
-                self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], guess)
+                self.vocab.morphology.assign_tag(&tokens.data[i], guess)
+
        tokens.is_tagged = True
        tokens._py_tokens = [None] * tokens.length

    def tag_from_strings(self, Doc tokens, object tag_strs):
        cdef int i
        for i in range(tokens.length):
-            self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], tag_strs[i])
+            self.vocab.morphology.assign_tag(&tokens.data[i], tag_strs[i])
        tokens.is_tagged = True
        tokens._py_tokens = [None] * tokens.length

@@ -168,7 +169,9 @@ cdef class Tagger:
        for i in range(tokens.length):
            guess = self.update(i, tokens.data, golds[i])
            loss = golds[i] != -1 and guess != golds[i]
-            self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], guess)
+
+            self.vocab.morphology.assign_tag(&tokens.data[i], guess)
+            
            correct += loss == 0
            self.freqs[TAG][tokens.data[i].tag] += 1
        return correct
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -7,12 +7,7 @@ from .typedefs cimport hash_t
 from .structs cimport LexemeC, TokenC
 from .strings cimport StringStore
 from .tokens.doc cimport Doc
-from .vocab cimport Vocab, _Cached
-
-
-cdef union LexemesOrTokens:
-    const LexemeC* const* lexemes
-    TokenC* tokens
+from .vocab cimport Vocab, LexemesOrTokens, _Cached


 cdef class Tokenizer:
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -192,9 +192,7 @@ cdef class Tokenizer:
                tokens.push_back(prefixes[0][i], False)
        if string:
            cache_hit = self._try_cache(hash_string(string), tokens)
-            if cache_hit:
-                pass
-            else:
+            if not cache_hit:
                match = self.find_infix(string)
                if match is None:
                    tokens.push_back(self.vocab.get(tokens.mem, string), False)
@@ -253,38 +251,10 @@ cdef class Tokenizer:
        cdef LexemeC** lexemes
        cdef hash_t hashed
        for chunk, substrings in sorted(special_cases.items()):
-            tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
-            for i, props in enumerate(substrings):
-                form = props['F']
-                tokens[i].lex = <LexemeC*>self.vocab.get(self.vocab.mem, form)
-                lemma = props.get('L', form)
-                tokens[i].lemma = self.vocab.strings[lemma]
-                #TODO
-                #self.vocab.morphology.assign_from_dict(&tokens[i], props)
            cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
            cached.length = len(substrings)
            cached.is_lex = False
-            cached.data.tokens = tokens
-            hashed = hash_string(chunk)
-            self._specials.set(hashed, cached)
-            self._cache.set(hashed, cached)
-
-
-#if lemma is not None:
-#    tokens[i].lemma = self.vocab.strings[lemma]
-#else:
-#    tokens[i].lemma = 0
-#if 'pos' in props:
-#    inflection = self.vocab.morphology.get(props['pos'])
-#    inflection.assign(&tokens[i])
-#    # These are defaults, which can be over-ridden by the
-#    # token-specific props.
-#    #pos, morph_features = self.vocab.morphology.tag_map[props['pos']]
-#    #tokens[i].pos = pos
-#    ## These are defaults, which can be over-ridden by the
-#    ## token-specific props.
-#    #set_morph_from_dict(&tokens[i].morph, morph_features)
-#    #if tokens[i].lemma == 0:
-#    #    tokens[i].lemma = tokens[i].lex.orth
-##set_morph_from_dict(&tokens[i].morph, props)
-
+            cached.data.tokens = self.vocab.make_fused_token(substrings)
+            key = hash_string(chunk)
+            self._specials.set(key, cached)
+            self._cache.set(key, cached)
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@@ -12,11 +12,11 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil


 ctypedef const LexemeC* const_Lexeme_ptr
-ctypedef TokenC* TokenC_ptr
+ctypedef const TokenC* const_TokenC_ptr

 ctypedef fused LexemeOrToken:
    const_Lexeme_ptr
-    TokenC_ptr
+    const_TokenC_ptr


 cdef class Doc:
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -209,7 +209,7 @@ cdef class Doc:
        if self.length == self.max_length:
            self._realloc(self.length * 2)
        cdef TokenC* t = &self.data[self.length]
-        if LexemeOrToken is TokenC_ptr:
+        if LexemeOrToken is const_TokenC_ptr:
            t[0] = lex_or_tok[0]
        else:
            t.lex = lex_or_tok
--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@@ -15,7 +15,7 @@ cdef LexemeC EMPTY_LEXEME

 cdef union LexemesOrTokens:
    const LexemeC* const* lexemes
-    TokenC* tokens
+    const TokenC* tokens


 cdef struct _Cached:
@@ -37,6 +37,7 @@ cdef class Vocab:

    cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
    cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
+    cdef const TokenC* make_fused_token(self, substrings) except NULL
    
    cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL
    cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -17,6 +17,7 @@ from .strings cimport hash_string
 from .orth cimport word_shape
 from .typedefs cimport attr_t
 from .cfile cimport CFile
+from .lemmatizer import Lemmatizer

 from cymem.cymem cimport Address
 from . import util
@@ -36,20 +37,13 @@ EMPTY_LEXEME.repvec = EMPTY_VEC
 cdef class Vocab:
    '''A map container for a language's LexemeC structs.
    '''
-    @classmethod
-    def default_morphology(cls):
-        return Morphology({'VBZ': ['VERB', {}]}, [], None)
-
-    def __init__(self, get_lex_attr=None, morphology=None, vectors=None):
-        self.get_lex_attr = get_lex_attr
-        if morphology is None:
-            morphology = self.default_morphology()
-        self.morphology = morphology
-        
+    def __init__(self, get_lex_attr=None, tag_map=None, vectors=None):
        self.mem = Pool()
        self._by_hash = PreshMap()
        self._by_orth = PreshMap()
        self.strings = StringStore()
+        self.get_lex_attr = get_lex_attr
+        self.morphology = Morphology(self.strings, tag_map, Lemmatizer({}, {}, {}))
        
        self.length = 1
        self._serializer = None
@@ -60,10 +54,9 @@ cdef class Vocab:
            raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
        if not path.isdir(data_dir):
            raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
-        cdef Vocab self = cls(get_lex_attr=get_lex_attr, vectors=vectors,
-                              morphology=morphology)
-        self.load_lexemes(path.join(data_dir, 'strings.txt'),
-                          path.join(data_dir, 'lexemes.bin'))
+        tag_map = json.load(open(path.join(data_dir, 'tag_map.json')))
+        cdef Vocab self = cls(get_lex_attr=get_lex_attr, vectors=vectors, tag_map=tag_map)
+        self.load_lexemes(path.join(data_dir, 'strings.txt'), path.join(data_dir, 'lexemes.bin'))
        if vectors is None and path.exists(path.join(data_dir, 'vec.bin')):
            self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin'))
        return self
@@ -172,6 +165,22 @@ cdef class Vocab:
            orth = id_or_string
        return Lexeme(self, orth)

+    cdef const TokenC* make_fused_token(self, substrings) except NULL:
+        cdef int i
+        tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
+        for i, props in enumerate(substrings):
+            token = &tokens[i]
+            # Set the special tokens up to have morphology and lemmas if
+            # specified, otherwise use the part-of-speech tag (if specified)
+            token.lex = <LexemeC*>self.get(self.mem, props['F'])
+            if 'pos' in props:
+                self.morphology.assign_tag(token, props['pos'])
+            if 'L' in props:
+                tokens[i].lemma = self.strings[props['L']]
+            for feature, value in props.get('morph', {}).items():
+                self.morphology.assign_feature(&token.morph, feature, value)
+        return tokens
+    
    def dump(self, loc):
        if path.exists(loc):
            assert not path.isdir(loc)