Allow Morphology class to set up tokens

Add Morphology.assign_untagged() C-method, and call it from Doc.push_back() when a token is created. This gives the Morphology class a place to initialize token data.
2025-07-15 18:52:29 +03:00 · 2017-10-11 03:24:14 +02:00 · 2017-10-11 03:24:14 +02:00 · a6ac4699eb
commit a6ac4699eb
parent 820bf85075 3b527fa52b
5 changed files with 53 additions and 4 deletions
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@ -24,6 +24,8 @@ class Lemmatizer(object):
            univ_pos = 'adj'
        elif univ_pos == PUNCT:
            univ_pos = 'punct'
+        else:
+            return set([string.lower()])
        # See Issue #435 for example of where this logic is required.
        if self.is_base_form(univ_pos, morphology):
            return set([string.lower()])
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@ -35,6 +35,8 @@ cdef class Morphology:
    cdef RichTagC* rich_tags
    cdef PreshMapArray _cache

+    cdef int assign_untagged(self, TokenC* token) except -1
+
    cdef int assign_tag(self, TokenC* token, tag) except -1

    cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@ -42,7 +42,7 @@ cdef class Morphology:
        self.tag_names = tuple(sorted(tag_map.keys()))
        self.reverse_index = {}

-        self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags, sizeof(RichTagC))
+        self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags+1, sizeof(RichTagC))
        for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
            self.tag_map[tag_str] = dict(attrs)
            attrs = _normalize_props(attrs)
@ -52,6 +52,10 @@ cdef class Morphology:
            self.rich_tags[i].morph = 0
            self.rich_tags[i].pos = attrs[POS]
            self.reverse_index[self.rich_tags[i].name] = i
+        # Add a 'null' tag, which we can reference when assigning morphology to
+        # untagged tokens.
+        self.rich_tags[self.n_tags].id = self.n_tags
+ 
        self._cache = PreshMapArray(self.n_tags)
        self.exc = {}
        if exc is not None:
@ -62,6 +66,10 @@ cdef class Morphology:
        return (Morphology, (self.strings, self.tag_map, self.lemmatizer,
                             self.exc), None, None)

+    cdef int assign_untagged(self, TokenC* token) except -1:
+        '''Set morphological attributes on a token without a POS tag.'''
+        token.lemma = self.lemmatize(0, token.lex.orth, {})
+
    cdef int assign_tag(self, TokenC* token, tag) except -1:
        if isinstance(tag, basestring):
            tag = self.strings.add(tag)
@ -72,7 +80,7 @@ cdef class Morphology:
            token.tag = tag

    cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
-        if tag_id >= self.n_tags:
+        if tag_id > self.n_tags:
            raise ValueError("Unknown tag ID: %s" % tag_id)
        # TODO: It's pretty arbitrary to put this logic here. I guess the justification
        # is that this is where the specific word and the tag interact. Still,
@ -151,8 +159,6 @@ cdef class Morphology:
        cdef unicode py_string = self.strings[orth]
        if self.lemmatizer is None:
            return self.strings.add(py_string.lower())
-        if univ_pos not in (NOUN, VERB, ADJ, PUNCT):
-            return self.strings.add(py_string.lower())
        cdef set lemma_strings
        cdef unicode lemma_string
        lemma_strings = self.lemmatizer(py_string, univ_pos, morphology)
--- a/spacy/tests/doc/test_creation.py
+++ b/spacy/tests/doc/test_creation.py
@ -0,0 +1,37 @@
+'''Test Doc sets up tokens correctly.'''
+from __future__ import unicode_literals
+import pytest
+
+from ...vocab import Vocab
+from ...tokens.doc import Doc
+from ...lemmatizerlookup import Lemmatizer
+
+
+@pytest.fixture
+def lemmatizer():
+    return Lemmatizer({'dogs': 'dog', 'boxen': 'box', 'mice': 'mouse'})
+
+
+@pytest.fixture
+def vocab(lemmatizer):
+    return Vocab(lemmatizer=lemmatizer)
+
+
+def test_empty_doc(vocab):
+    doc = Doc(vocab)
+    assert len(doc) == 0
+
+
+def test_single_word(vocab):
+    doc = Doc(vocab, words=['a'])
+    assert doc.text == 'a '
+    doc = Doc(vocab, words=['a'], spaces=[False])
+    assert doc.text == 'a'
+
+
+def test_lookup_lemmatization(vocab):
+    doc = Doc(vocab, words=['dogs', 'dogses'])
+    assert doc[0].text == 'dogs'
+    assert doc[0].lemma_ == 'dog'
+    assert doc[1].text == 'dogses'
+    assert doc[1].lemma_ == 'dogses'
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -512,6 +512,8 @@ cdef class Doc:
        assert t.lex.orth != 0
        t.spacy = has_space
        self.length += 1
+        # Set morphological attributes, e.g. the lemma, if possible
+        self.vocab.morphology.assign_untagged(t)
        self._py_tokens.append(None)
        return t.idx + t.lex.length + t.spacy