* More work on language-generic parsing

Matthew Honnibal 2015-08-28 02:02:33 +02:00
parent 86c4a8e3e2
commit c2307fa9ee
12 changed files with 129 additions and 222 deletions

spacy/fi/__init__.py (new file)

@@ -0,0 +1,11 @@
+from __future__ import unicode_literals, print_function
+
+from os import path
+
+from ..language import Language
+
+
+class Finnish(Language):
+    @classmethod
+    def default_data_dir(cls):
+        return path.join(path.dirname(__file__), 'data')
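Because default_data_dir resolves against the module's own __file__, this three-line override makes spacy/fi/__init__.py point at spacy/fi/data rather than the base class's directory: that is the whole language-generic pattern. A sketch of how a caller might select a language class by code, assuming a hypothetical registry (the commit itself only adds the class):

    from spacy.language import Language
    from spacy.fi import Finnish

    # Hypothetical lookup table, not part of this commit.
    LANGUAGES = {'fi': Finnish}

    def get_language(code):
        # Fall back to the generic base class for unknown codes.
        return LANGUAGES.get(code, Language)

    print(get_language('fi').default_data_dir())  # .../spacy/fi/data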

spacy/language.py

@@ -148,13 +148,10 @@ class Language(object):
         vectors = cls.default_vectors(data_dir)
         if get_lex_attr is None:
             get_lex_attr = cls.default_lex_attrs(data_dir)
-        if morphology is None:
-            morphology = cls.default_morphology(path.join(data_dir, 'vocab'))
         return Vocab.from_dir(
             path.join(data_dir, 'vocab'),
             get_lex_attr=get_lex_attr,
-            vectors=vectors,
-            morphology=morphology)
+            vectors=vectors)
 
     @classmethod
     def default_tokenizer(cls, vocab, data_dir):

spacy/morphology.pxd

@@ -1,18 +1,41 @@
+from cymem.cymem cimport Pool
+from preshed.maps cimport PreshMapArray
+from libc.stdint cimport uint64_t
+
 from .structs cimport TokenC
 from .strings cimport StringStore
+from .typedefs cimport attr_t
+from .parts_of_speech cimport univ_pos_t
+
+
+cdef struct RichTagC:
+    uint64_t morph
+    int id
+    univ_pos_t pos
+    attr_t name
+
+
+cdef struct MorphAnalysisC:
+    RichTagC tag
+    attr_t lemma
+
 
 cdef class Morphology:
+    cdef readonly Pool mem
     cdef readonly object strings
     cdef public object lemmatizer
-    cdef public object tag_map
+    cdef public object n_tags
+    cdef public object reverse_index
     cdef public object tag_names
-    cdef public object tag_ids
-    cdef public int n_tags
 
-    cdef int assign_tag(self, StringStore strings, TokenC* token, int tag) except -1
+    cdef RichTagC* rich_tags
+    cdef PreshMapArray _cache
+
+    cdef int assign_tag(self, TokenC* token, tag) except -1
+    cdef int assign_feature(self, uint64_t* morph, feature, value) except -1
+    cdef int assign_from_dict(self, TokenC* token, props) except -1
 
 #
 #cpdef enum Feature_t:
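Read as data: there is one RichTagC per entry in the tag map (coarse POS, string-store id of the tag name, and a 64-bit feature set), and a MorphAnalysisC pairs such a tag with a lemma, cached per (tag, word form). The same layout in plain Python, purely as an illustration of the declarations above:

    class RichTag(object):
        def __init__(self, id_, name, pos=0, morph=0):
            self.id = id_        # index of the tag in the sorted tag map
            self.name = name     # string-store id of the tag string
            self.pos = pos       # coarse universal part-of-speech
            self.morph = morph   # 64-bit morphological feature set

    class MorphAnalysis(object):
        def __init__(self, tag, lemma):
            self.tag = tag       # the RichTag the analysis came from
            self.lemma = lemma   # string-store id of the lemma

    # The PreshMapArray cache declared above acts like:
    cache = {}  # (tag id, orth id) -> MorphAnalysis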

spacy/morphology.pyx

@@ -6,13 +6,8 @@ try:
 except ImportError:
     import json
 
-from spacy.parts_of_speech import UNIV_POS_NAMES
-
-
-cdef struct MorphAnalysisC:
-    uint64_t[4] features
-    attr_t lemma
-    attr_t pos
+from .parts_of_speech import UNIV_POS_NAMES
+from .parts_of_speech cimport ADJ, VERB, NOUN
 
 
 cdef class Morphology:
@@ -23,32 +18,37 @@ cdef class Morphology:
             lemmatizer = Lemmatizer.from_dir(data_dir)
         return cls(tag_map, {}, lemmatizer)
 
-    def __init__(self, tag_map, fused_tokens, lemmatizer):
+    def __init__(self, string_store, tag_map, lemmatizer):
+        self.mem = Pool()
+        self.strings = string_store
         self.lemmatizer = lemmatizer
-        self.tag_map = tag_map
         self.n_tags = len(tag_map)
         self.tag_names = tuple(sorted(tag_map.keys()))
-        self.tag_ids = {}
-        for i, tag_str in enumerate(self.tag_names):
-            self.tag_ids[tag_str] = i
-        self._cache = PreshMapArray()
+        self.reverse_index = {}
+        for i, (tag_str, props) in enumerate(sorted(tag_map.items())):
+            self.rich_tags[i].id = i
+            self.rich_tags[i].name = self.strings[tag_str]
+            self.rich_tags[i].morph = 0
+            self.reverse_index[self.rich_tags[i].name] = i
+        self._cache = PreshMapArray(self.n_tags)
 
     cdef int assign_tag(self, TokenC* token, tag) except -1:
-        analysis = <MorphAnalysisC*>self._cache.get(tag, token.lex.orth)
+        cdef int tag_id = self.strings[tag] if isinstance(tag, basestring) else tag
+        analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
         if analysis is NULL:
             analysis = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
-            cached = self.decode_tag(tag)
-            cached.lemma = self.lemmatize(token.pos, token.lex)
+            analysis.tag = self.rich_tags[tag_id]
+            analysis.lemma = self.lemmatize(tag, token.lex.orth)
         token.lemma = analysis.lemma
-        token.pos = analysis.pos
-        token.tag = analysis.tag
-        token.morph = analysis.features
+        token.pos = analysis.tag.pos
+        token.tag = analysis.tag.name
+        token.morph = analysis.tag.morph
 
-    cdef int assign_feature(self, TokenC* token, feature, value) except -1:
+    cdef int assign_feature(self, uint64_t* morph, feature, value) except -1:
         pass
 
     def load_morph_exceptions(self, dict exc):
-        # Map (form, pos) to (lemma, inflection)
+        # Map (form, pos) to (lemma, rich tag)
         cdef unicode pos_str
         cdef unicode form_str
         cdef unicode lemma_str
@@ -57,121 +57,30 @@ cdef class Morphology:
         cdef int lemma
         cdef attr_t orth
         cdef int pos
-        for pos_str, entries in exc.items():
-            pos = self.tag_names.index(pos_str)
+        for tag_str, entries in exc.items():
+            tag = self.strings[tag_str]
+            rich_tag = self.rich_tags[self.reverse_index[tag]]
             for form_str, props in entries.items():
-                lemma_str = props.get('L', form_str)
-                orth = self.strings[form_str]
                 cached = <MorphAnalysisC*>self.mem.alloc(1, sizeof(MorphAnalysisC))
-                cached.lemma = self.strings[lemma_str]
-                self.set_features(cached, props)
-                self._cache.set(pos, orth, <void*>cached)
+                orth = self.strings[form_str]
+                for name_str, value_str in props.items():
+                    if name_str == 'L':
+                        cached.lemma = self.strings[value_str]
+                    else:
+                        self.assign_feature(&cached.tag.morph, name_str, value_str)
+                if cached.lemma == 0:
+                    cached.lemma = self.lemmatize(rich_tag.pos, orth)
+                self._cache.set(rich_tag.pos, orth, <void*>cached)
 
-    def _load_special_tokenization(self, special_cases):
-        '''Add a special-case tokenization rule.
-        '''
-        cdef int i
-        cdef list substrings
-        cdef unicode chunk
-        cdef unicode form
-        cdef unicode lemma
-        cdef dict props
-        cdef LexemeC** lexemes
-        cdef hash_t hashed
-        for chunk, substrings in sorted(special_cases.items()):
-            tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
-            for i, props in enumerate(substrings):
-                # Set the special tokens up to have morphology and lemmas if
-                # specified, otherwise use the part-of-speech tag (if specified)
-                form = props['F']
-                tokens[i].lex = <LexemeC*>self.vocab.get(self.vocab.mem, form)
-                morphology = self.vocab.morphology.decode_dict(props)
-                tokens[i].lemma = morph_analysis.lemma
-                tokens[i].pos = morph_analysis.pos
-                tokens[i].tag = morph_analysis.tag
-                tokens[i].morph = morph_analysis.morph
-            cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
-            cached.length = len(substrings)
-            cached.is_lex = False
-            cached.data.tokens = tokens
-            hashed = hash_string(chunk)
-            self._specials.set(hashed, cached)
-            self._cache.set(hashed, cached)
+    def lemmatize(self, const univ_pos_t pos, attr_t orth):
+        if self.lemmatizer is None:
+            return orth
+        cdef unicode py_string = self.strings[orth]
+        if pos != NOUN and pos != VERB and pos != ADJ:
+            return orth
+        cdef set lemma_strings
+        cdef unicode lemma_string
+        lemma_strings = self.lemmatizer(py_string, pos)
+        lemma_string = sorted(lemma_strings)[0]
+        lemma = self.strings[lemma_string]
+        return lemma
 
-
-#cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
-#    morph.number = props.get('number', 0)
-#    morph.tenspect = props.get('tenspect', 0)
-#    morph.mood = props.get('mood', 0)
-#    morph.gender = props.get('gender', 0)
-#    morph.person = props.get('person', 0)
-#    morph.case = props.get('case', 0)
-#    morph.misc = props.get('misc', 0)
-#
-#
-#cdef class Morphology:
-#    cdef Pool mem
-#    cdef PreshMap table
-#
-#    def __init__(self, tags, exceptions):
-#        pass
-#
-#    def __getitem__(self, hash_t id_):
-#        pass
-#
-#    cdef const InflectionC* get(self, hash_t key) except NULL:
-#        pass
-#
-#    cdef MorphAnalysis analyse(const TokenC* token) except -1:
-#        cdef struct MorphAnalysis morphology
-#        tokens[i].pos = tag.pos
-#        cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.orth)
-#        if cached is NULL:
-#            cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
-#            cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)
-#            cached.morph = tag.morph
-#            self._morph_cache.set(tag.id, tokens[i].lex.orth, <void*>cached)
-#        tokens[i].lemma = cached.lemma
-#        tokens[i].morph = cached.morph
-#
-#    cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1:
-#        if self.lemmatizer is None:
-#            return lex.orth
-#        cdef unicode py_string = self.strings[lex.orth]
-#        if pos != NOUN and pos != VERB and pos != ADJ:
-#            return lex.orth
-#        cdef set lemma_strings
-#        cdef unicode lemma_string
-#        lemma_strings = self.lemmatizer(py_string, pos)
-#        lemma_string = sorted(lemma_strings)[0]
-#        lemma = self.strings[lemma_string]
-#        return lemma
-#
-#
-#cdef class Inflection:
-#    cdef InflectionC* c
-#
-#    def __init__(self, container, id_):
-#        self.c = container[id_]
-#        self.container = container
-#
-#        for i, feat_id in enumerate(feat_ids):
-#            feature, value = parse_id(feat_id)
-#            self.add_value(feature, value, True)
-#
-#    def has(self, Value_t feat_value_id):
-#        part = feat_value_id % 64
-#        bit = feat_value_id / 64
-#        if self.value_set[part] & bit:
-#            return True
-#        else:
-#            return False
-#
-#    property pos: def __get__(self): return self.c.pos
-#
-#    property id: def __get__(self): return self.c.id
-#
-#    property features:
-#        pass
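The rewritten assign_tag memoises one analysis per (tag, word form): the first token with a given pair pays for the lemmatiser call, every later one is a cache hit. A pure-Python model of that control flow, with a dict standing in for the PreshMapArray and dicts for the structs:

    def assign_tag(cache, rich_tags, lemmatize, token, tag_id):
        key = (tag_id, token['orth'])
        analysis = cache.get(key)
        if analysis is None:
            tag = rich_tags[tag_id]
            analysis = {'tag': tag, 'lemma': lemmatize(tag['pos'], token['orth'])}
            cache[key] = analysis
        # Copy the cached analysis onto the token, as the Cython code does.
        token['lemma'] = analysis['lemma']
        token['pos'] = analysis['tag']['pos']
        token['tag'] = analysis['tag']['name']
        token['morph'] = analysis['tag']['morph']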

spacy/structs.pxd

@@ -25,17 +25,6 @@ cdef struct LexemeC:
     float sentiment
     float l2_norm
 
-cdef struct MorphFeatC:
-    int name
-    int value
-
-
-cdef struct MorphologyC:
-    uint64_t[4] feature_set
-    MorphFeatC* features
-    univ_pos_t pos
-    int n
-
 
 cdef struct Entity:
     int start
@@ -54,8 +43,8 @@ cdef struct Constituent:
 
 cdef struct TokenC:
     const LexemeC* lex
-    const MorphologyC* morph
     const Constituent* ctnt
+    uint64_t morph
     univ_pos_t pos
     bint spacy
     int tag
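The struct change replaces a pointer to a heap-allocated MorphologyC with a single inline uint64_t per token, so a feature set is now a bit-set carried by value. How bits map to features is left open by this commit (assign_feature is still a stub); a sketch under the assumption of one bit per (feature, value) pair:

    # Hypothetical bit allocation -- the commit does not fix one yet.
    FEATURE_BITS = {('Number', 'sing'): 0, ('Tense', 'past'): 1}

    def with_feature(morph, feature, value):
        # Set the bit for this (feature, value) pair in the 64-bit set.
        return morph | (1 << FEATURE_BITS[(feature, value)])

    morph = 0
    morph = with_feature(morph, 'Number', 'sing')
    assert morph & (1 << FEATURE_BITS[('Number', 'sing')])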

spacy/tagger.pyx

@@ -104,7 +104,7 @@ cdef class Tagger:
 
     @classmethod
     def blank(cls, vocab, templates):
-        model = Model(vocab.morphology.n_tags, templates, model_loc=None)
+        model = Model(vocab.n_tags, templates, model_loc=None)
         return cls(vocab, model)
 
     @classmethod
@@ -113,7 +113,7 @@ cdef class Tagger:
             templates = json.loads(open(path.join(data_dir, 'templates.json')))
         else:
             templates = cls.default_templates()
-        model = Model(vocab.morphology.n_tags, templates, data_dir)
+        model = Model(vocab.n_tags, templates, data_dir)
         return cls(vocab, model)
 
     def __init__(self, Vocab vocab, model):
@@ -128,7 +128,7 @@ cdef class Tagger:
 
     @property
     def tag_names(self):
-        return self.vocab.morphology.tag_names
+        return self.vocab.tag_names
 
     def __call__(self, Doc tokens):
         """Apply the tagger, setting the POS tags onto the Doc object.
@@ -143,14 +143,15 @@ cdef class Tagger:
         for i in range(tokens.length):
             if tokens.data[i].pos == 0:
                 guess = self.predict(i, tokens.data)
-                self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], guess)
+                self.vocab.morphology.assign_tag(&tokens.data[i], guess)
         tokens.is_tagged = True
         tokens._py_tokens = [None] * tokens.length
 
     def tag_from_strings(self, Doc tokens, object tag_strs):
         cdef int i
         for i in range(tokens.length):
-            self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], tag_strs[i])
+            self.vocab.morphology.assign_tag(&tokens.data[i], tag_strs[i])
         tokens.is_tagged = True
         tokens._py_tokens = [None] * tokens.length
@@ -168,7 +169,9 @@ cdef class Tagger:
         for i in range(tokens.length):
             guess = self.update(i, tokens.data, golds[i])
             loss = golds[i] != -1 and guess != golds[i]
-            self.vocab.morphology.assign_tag(self.vocab.strings, &tokens.data[i], guess)
+
+            self.vocab.morphology.assign_tag(&tokens.data[i], guess)
+
             correct += loss == 0
             self.freqs[TAG][tokens.data[i].tag] += 1
         return correct
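All of these call sites can drop the StringStore argument because Morphology now holds its own reference, and assign_tag accepts either the model's integer tag id or a tag string (as in tag_from_strings), normalising with the isinstance(tag, basestring) check seen in morphology.pyx. The same normalisation in present-day Python, for reference:

    def to_tag_id(strings, tag):
        # strings maps tag names to ids, like the StringStore does.
        if isinstance(tag, str):  # the diff uses Python 2's basestring
            return strings[tag]
        return tag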

spacy/tokenizer.pxd

@@ -7,12 +7,7 @@ from .typedefs cimport hash_t
 from .structs cimport LexemeC, TokenC
 from .strings cimport StringStore
 from .tokens.doc cimport Doc
-from .vocab cimport Vocab, _Cached
+from .vocab cimport Vocab, LexemesOrTokens, _Cached
 
-
-cdef union LexemesOrTokens:
-    const LexemeC* const* lexemes
-    TokenC* tokens
 
 cdef class Tokenizer:

spacy/tokenizer.pyx

@@ -192,9 +192,7 @@ cdef class Tokenizer:
                 tokens.push_back(prefixes[0][i], False)
         if string:
             cache_hit = self._try_cache(hash_string(string), tokens)
-            if cache_hit:
-                pass
-            else:
+            if not cache_hit:
                 match = self.find_infix(string)
                 if match is None:
                     tokens.push_back(self.vocab.get(tokens.mem, string), False)
@@ -253,38 +251,10 @@ cdef class Tokenizer:
         cdef LexemeC** lexemes
         cdef hash_t hashed
         for chunk, substrings in sorted(special_cases.items()):
-            tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
-            for i, props in enumerate(substrings):
-                form = props['F']
-                tokens[i].lex = <LexemeC*>self.vocab.get(self.vocab.mem, form)
-                lemma = props.get('L', form)
-                tokens[i].lemma = self.vocab.strings[lemma]
-                #TODO
-                #self.vocab.morphology.assign_from_dict(&tokens[i], props)
             cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
             cached.length = len(substrings)
             cached.is_lex = False
-            cached.data.tokens = tokens
-            hashed = hash_string(chunk)
-            self._specials.set(hashed, cached)
-            self._cache.set(hashed, cached)
-            #if lemma is not None:
-            #    tokens[i].lemma = self.vocab.strings[lemma]
-            #else:
-            #    tokens[i].lemma = 0
-            #if 'pos' in props:
-            #    inflection = self.vocab.morphology.get(props['pos'])
-            #    inflection.assign(&tokens[i])
-            #    # These are defaults, which can be over-ridden by the
-            #    # token-specific props.
-            #    #pos, morph_features = self.vocab.morphology.tag_map[props['pos']]
-            #    #tokens[i].pos = pos
-            #    ## These are defaults, which can be over-ridden by the
-            #    ## token-specific props.
-            #    #set_morph_from_dict(&tokens[i].morph, morph_features)
-            #    #if tokens[i].lemma == 0:
-            #    #    tokens[i].lemma = tokens[i].lex.orth
-            ##set_morph_from_dict(&tokens[i].morph, props)
+            cached.data.tokens = self.vocab.make_fused_token(substrings)
+            key = hash_string(chunk)
+            self._specials.set(key, cached)
+            self._cache.set(key, cached)
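The special cases that now flow through Vocab.make_fused_token are lists of per-token property dicts keyed by 'F' (form), optional 'L' (lemma), 'pos', and 'morph', as read by the vocab.pyx code below. An illustrative rule in that shape (the entry is an example, not data from this commit):

    special_cases = {
        "isn't": [
            {'F': 'is'},
            {'F': "n't", 'L': 'not'},
        ],
    }

    for chunk, substrings in sorted(special_cases.items()):
        print(chunk, '->', [props['F'] for props in substrings])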

spacy/tokens/doc.pxd

@@ -12,11 +12,11 @@ cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil
 
 ctypedef const LexemeC* const_Lexeme_ptr
-ctypedef TokenC* TokenC_ptr
+ctypedef const TokenC* const_TokenC_ptr
 
 ctypedef fused LexemeOrToken:
     const_Lexeme_ptr
-    TokenC_ptr
+    const_TokenC_ptr
 
 cdef class Doc:

spacy/tokens/doc.pyx

@@ -209,7 +209,7 @@ cdef class Doc:
         if self.length == self.max_length:
             self._realloc(self.length * 2)
         cdef TokenC* t = &self.data[self.length]
-        if LexemeOrToken is TokenC_ptr:
+        if LexemeOrToken is const_TokenC_ptr:
             t[0] = lex_or_tok[0]
         else:
             t.lex = lex_or_tok
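push_back dispatches at compile time on the fused LexemeOrToken type: a token pointer has its whole struct copied, a lexeme pointer only fills the lex field. The rename to const_TokenC_ptr is what lets the first branch accept the read-only tokens Vocab.make_fused_token returns. The same dispatch sketched in plain Python, with dicts standing in for the structs:

    def push_back(doc, lex_or_tok):
        t = {'lex': None, 'lemma': 0, 'pos': 0, 'tag': 0, 'morph': 0}
        if isinstance(lex_or_tok, dict):   # const_TokenC_ptr branch
            t.update(lex_or_tok)           # t[0] = lex_or_tok[0]
        else:                              # const_Lexeme_ptr branch
            t['lex'] = lex_or_tok          # t.lex = lex_or_tok
        doc.append(t)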

spacy/vocab.pxd

@@ -15,7 +15,7 @@ cdef LexemeC EMPTY_LEXEME
 
 cdef union LexemesOrTokens:
     const LexemeC* const* lexemes
-    TokenC* tokens
+    const TokenC* tokens
 
 cdef struct _Cached:
@@ -37,6 +37,7 @@ cdef class Vocab:
     cdef const LexemeC* get(self, Pool mem, unicode string) except NULL
     cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL
+    cdef const TokenC* make_fused_token(self, substrings) except NULL
 
     cdef const LexemeC* _new_lexeme(self, Pool mem, unicode string) except NULL
     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1

spacy/vocab.pyx

@@ -17,6 +17,7 @@ from .strings cimport hash_string
 from .orth cimport word_shape
 from .typedefs cimport attr_t
 from .cfile cimport CFile
+from .lemmatizer import Lemmatizer
 
 from cymem.cymem cimport Address
 from . import util
@@ -36,20 +37,13 @@ EMPTY_LEXEME.repvec = EMPTY_VEC
 cdef class Vocab:
     '''A map container for a language's LexemeC structs.
     '''
-    @classmethod
-    def default_morphology(cls):
-        return Morphology({'VBZ': ['VERB', {}]}, [], None)
-
-    def __init__(self, get_lex_attr=None, morphology=None, vectors=None):
-        self.get_lex_attr = get_lex_attr
-        if morphology is None:
-            morphology = self.default_morphology()
-        self.morphology = morphology
+    def __init__(self, get_lex_attr=None, tag_map=None, vectors=None):
         self.mem = Pool()
         self._by_hash = PreshMap()
         self._by_orth = PreshMap()
         self.strings = StringStore()
+        self.get_lex_attr = get_lex_attr
+        self.morphology = Morphology(self.strings, tag_map, Lemmatizer({}, {}, {}))
 
         self.length = 1
         self._serializer = None
@@ -60,10 +54,9 @@ cdef class Vocab:
             raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
         if not path.isdir(data_dir):
             raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
-        cdef Vocab self = cls(get_lex_attr=get_lex_attr, vectors=vectors,
-                              morphology=morphology)
-        self.load_lexemes(path.join(data_dir, 'strings.txt'),
-                          path.join(data_dir, 'lexemes.bin'))
+        tag_map = json.load(open(path.join(data_dir, 'tag_map.json')))
+        cdef Vocab self = cls(get_lex_attr=get_lex_attr, vectors=vectors, tag_map=tag_map)
+        self.load_lexemes(path.join(data_dir, 'strings.txt'), path.join(data_dir, 'lexemes.bin'))
         if vectors is None and path.exists(path.join(data_dir, 'vec.bin')):
             self.repvec_length = self.load_rep_vectors(path.join(data_dir, 'vec.bin'))
         return self
@@ -172,6 +165,22 @@ cdef class Vocab:
             orth = id_or_string
         return Lexeme(self, orth)
 
+    cdef const TokenC* make_fused_token(self, substrings) except NULL:
+        cdef int i
+        tokens = <TokenC*>self.mem.alloc(len(substrings) + 1, sizeof(TokenC))
+        for i, props in enumerate(substrings):
+            token = &tokens[i]
+            # Set the special tokens up to have morphology and lemmas if
+            # specified, otherwise use the part-of-speech tag (if specified)
+            token.lex = <LexemeC*>self.get(self.mem, props['F'])
+            if 'pos' in props:
+                self.morphology.assign_tag(token, props['pos'])
+            if 'L' in props:
+                tokens[i].lemma = self.strings[props['L']]
+            for feature, value in props.get('morph', {}).items():
+                self.morphology.assign_feature(&token.morph, feature, value)
+        return tokens
+
     def dump(self, loc):
         if path.exists(loc):
             assert not path.isdir(loc)