* Revise interface to Token. Strings now have attribute names like norm1_

2025-12-03 08:14:20 +03:00 · 2015-01-15 03:51:47 +11:00 · 2015-01-15 03:51:47 +11:00 · 802867e96a
commit 802867e96a
parent 7d3c40de7d
5 changed files with 75 additions and 63 deletions
--- a/spacy/tokens.pxd
+++ b/spacy/tokens.pxd
@ -44,3 +44,20 @@ cdef class Tokens:
 cdef class Token:
    cdef readonly Tokens _seq
    cdef readonly int i
    cdef readonly attr_t idx
    cdef readonly attr_t cluster
    cdef readonly attr_t length
    cdef readonly attr_t sic
    cdef readonly attr_t norm1
    cdef readonly attr_t norm2
    cdef readonly attr_t shape
    cdef readonly attr_t prefix
    cdef readonly attr_t suffix
    cdef readonly float prob
    cdef readonly float sentiment
    cdef readonly attr_t flags
    cdef readonly attr_t lemma
    cdef readonly univ_tag_t pos
    cdef readonly attr_t fine_pos
    cdef readonly attr_t dep_tag
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@ -184,6 +184,23 @@ cdef class Token:
    def __init__(self, Tokens tokens, int i):
        self._seq = tokens
        self.i = i
        cdef const TokenC* t = &tokens.data[i]
        self.idx = t.idx
        self.cluster = t.lex.cluster
        self.length = t.lex.length
        self.sic = t.lex.sic
        self.norm1 = t.lex.norm1
        self.norm2 = t.lex.norm2
        self.shape = t.lex.shape
        self.prefix = t.lex.prefix
        self.suffix = t.lex.suffix
        self.prob = t.lex.prob
        self.sentiment = t.lex.sentiment
        self.flags = t.lex.flags
        self.lemma = t.lemma
        self.pos = t.pos
        self.fine_pos = t.fine_pos
        self.dep_tag = t.dep_tag
    def __unicode__(self):
        cdef const TokenC* t = &self._seq.data[self.i]
@ -203,34 +220,11 @@ cdef class Token:
        """
        return self._seq.data[self.i].lex.length
-    property idx:
+    property head:
-        """The index into the original string at which the token starts.
+        """The token predicted by the parser to be the head of the current token."""
        The following is supposed to always be true:
        >>> original_string[token.idx:token.idx + len(token)] == token.string
        """
        def __get__(self):
-            return self._seq.data[self.i].idx
+            cdef const TokenC* t = &self._seq.data[self.i]
-
+            return Token(self._seq, self.i + t.head)
    property cluster:
        """The Brown cluster ID of the word: en.wikipedia.org/wiki/Brown_clustering
        Similar words have better-than-chance likelihood of having similar cluster
        IDs, although the clustering is quite noisy.  Cluster IDs make good features,
        and help to make models slightly more robust to domain variation.
        A common trick is to use only the first N bits of a cluster ID in a feature,
        as the more general part of the hierarchical clustering is often more accurate
        than the lower categories.
        To assist in this, I encode the cluster IDs little-endian, to allow a simple
        bit-mask:
        >>> six_bits = cluster & (2**6 - 1)
        """
        def __get__(self):
            return self._seq.data[self.i].lex.cluster
    property string:
        """The unicode string of the word, with no whitespace padding."""
@ -241,10 +235,31 @@ cdef class Token:
            cdef unicode py_ustr = self._seq.vocab.strings[t.lex.sic]
            return py_ustr
-    property lemma:
+    property sic_:
-        """The unicode string of the word's lemma.  If no part-of-speech tag is
+        def __get__(self):
-        assigned, the most common part-of-speech tag of the word is used.
+            return self._seq.vocab.strings[self.sic]
-        """
+
    property norm1_:
        def __get__(self):
            return self._seq.vocab.strings[self.norm1]
    property norm2_:
        def __get__(self):
            return self._seq.vocab.strings[self.norm2]
    property shape_:
        def __get__(self):
            return self._seq.vocab.strings[self.shape]
    property prefix_:
        def __get__(self):
            return self._seq.vocab.strings[self.prefix]
    property suffix_:
        def __get__(self):
            return self._seq.vocab.strings[self.suffix]
    property lemma_:
        def __get__(self):
            cdef const TokenC* t = &self._seq.data[self.i]
            if t.lemma == 0:
@ -252,36 +267,16 @@ cdef class Token:
            cdef unicode py_ustr = self._seq.vocab.strings[t.lemma]
            return py_ustr
-    property dep_tag:
+    property pos_:
        """The ID integer of the word's dependency label.  If no parse has been
        assigned, defaults to 0.
        """
        def __get__(self):
-            return self._seq.data[self.i].dep_tag
+            return self._seq.vocab.strings[self.pos]
-    property pos:
+    property fine_pos_:
        """The ID integer of the word's part-of-speech tag, from the 13-tag
        Google Universal Tag Set.  Constants for this tag set are available in
        spacy.typedefs.
        """
        def __get__(self):
-            return self._seq.data[self.i].pos
+            return self._seq.vocab.strings[self.fine_pos]
-    property fine_pos:
+    property dep_tag_:
        """The ID integer of the word's fine-grained part-of-speech tag, as assigned
        by the tagger model.  Fine-grained tags include morphological information,
        and other distinctions, and allow a more accurate tagger to be trained.
        """
        def __get__(self):
-            return self._seq.data[self.i].fine_pos
+            return self._seq.vocab.strings[self.dep_tag]
    property sic:
        def __get__(self):
            return self._seq.data[self.i].lex.sic
    property head:
        """The token predicted by the parser to be the head of the current token."""
        def __get__(self):
            cdef const TokenC* t = &self._seq.data[self.i]
            return Token(self._seq, self.i + t.head)
--- a/tests/test_add_lemmas.py
+++ b/tests/test_add_lemmas.py
@ -15,7 +15,7 @@ def tagged(EN):
@pytest.fixture
 def lemmas(tagged):
-    return [t.lemma for t in tagged]
+    return [t.lemma_ for t in tagged]
 def test_lemmas(lemmas, tagged):
--- a/tests/test_contractions.py
+++ b/tests/test_contractions.py
@ -26,7 +26,7 @@ def test_LL(EN):
    tokens = EN("we'll")
    assert len(tokens) == 2
    assert tokens[1].string == "'ll"
-    assert tokens[1].lemma == "will"
+    assert tokens[1].lemma_ == "will"
    assert tokens[0].string == "we"
@ -34,9 +34,9 @@ def test_aint(EN):
    tokens = EN("ain't")
    assert len(tokens) == 2
    assert tokens[0].string == "ai"
-    assert tokens[0].lemma == "be"
+    assert tokens[0].lemma_ == "be"
    assert tokens[1].string == "n't"
-    assert tokens[1].lemma == "not"
+    assert tokens[1].lemma_ == "not"
 def test_capitalized(EN):
@ -47,7 +47,7 @@ def test_capitalized(EN):
    tokens = EN("Ain't")
    assert len(tokens) == 2
    assert tokens[0].string == "Ai"
-    assert tokens[0].lemma == "be"
+    assert tokens[0].lemma_ == "be"
 def test_punct(EN):
--- a/tests/test_morph_exceptions.py
+++ b/tests/test_morph_exceptions.py
@ -22,4 +22,4 @@ def test_load_exc(EN, morph_exc):
    tokens = EN('I like his style.', tag=True)
    his = tokens[2]
    assert EN.tagger.tag_names[his.fine_pos] == 'PRP$'
-    assert his.lemma == '-PRP-'
+    assert his.lemma_ == '-PRP-'