* Revise interface to Token. Strings now have attribute names like norm1_

Matthew Honnibal 2015-01-15 03:51:47 +11:00
parent 7d3c40de7d
commit 802867e96a
5 changed files with 75 additions and 63 deletions
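In short: each token attribute now comes in two forms. The bare name (sic, norm1, lemma, pos, ...) gives the integer ID, and the underscore-suffixed name (sic_, norm1_, lemma_, pos_, ...) gives the corresponding unicode string, looked up through the vocabulary's string store. A rough usage sketch of the new convention (the EN fixture is borrowed from the tests below; the exact strings are illustrative):

>>> tokens = EN("ain't")
>>> tokens[0].lemma       # integer ID (attr_t)
>>> tokens[0].lemma_      # the unicode string for that ID, here u'be'
>>> tokens[0].shape_, tokens[0].prefix_, tokens[0].suffix_   # further string views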

View File

@@ -44,3 +44,20 @@ cdef class Tokens:
cdef class Token:
    cdef readonly Tokens _seq
    cdef readonly int i
    cdef readonly attr_t idx
    cdef readonly attr_t cluster
    cdef readonly attr_t length
    cdef readonly attr_t sic
    cdef readonly attr_t norm1
    cdef readonly attr_t norm2
    cdef readonly attr_t shape
    cdef readonly attr_t prefix
    cdef readonly attr_t suffix
    cdef readonly float prob
    cdef readonly float sentiment
    cdef readonly attr_t flags
    cdef readonly attr_t lemma
    cdef readonly univ_tag_t pos
    cdef readonly attr_t fine_pos
    cdef readonly attr_t dep_tag

View File

@@ -184,6 +184,23 @@ cdef class Token:
    def __init__(self, Tokens tokens, int i):
        self._seq = tokens
        self.i = i
        cdef const TokenC* t = &tokens.data[i]
        self.idx = t.idx
        self.cluster = t.lex.cluster
        self.length = t.lex.length
        self.sic = t.lex.sic
        self.norm1 = t.lex.norm1
        self.norm2 = t.lex.norm2
        self.shape = t.lex.shape
        self.prefix = t.lex.prefix
        self.suffix = t.lex.suffix
        self.prob = t.lex.prob
        self.sentiment = t.lex.sentiment
        self.flags = t.lex.flags
        self.lemma = t.lemma
        self.pos = t.pos
        self.fine_pos = t.fine_pos
        self.dep_tag = t.dep_tag
    def __unicode__(self):
        cdef const TokenC* t = &self._seq.data[self.i]
@@ -203,34 +220,11 @@ cdef class Token:
        """
        return self._seq.data[self.i].lex.length
    property idx:
        """The index into the original string at which the token starts.
        The following is supposed to always be true:
        >>> original_string[token.idx:token.idx + len(token)] == token.string
"""
property head:
"""The token predicted by the parser to be the head of the current token."""
def __get__(self):
return self._seq.data[self.i].idx
property cluster:
"""The Brown cluster ID of the word: en.wikipedia.org/wiki/Brown_clustering
Similar words have better-than-chance likelihood of having similar cluster
IDs, although the clustering is quite noisy. Cluster IDs make good features,
and help to make models slightly more robust to domain variation.
A common trick is to use only the first N bits of a cluster ID in a feature,
as the more general part of the hierarchical clustering is often more accurate
than the lower categories.
To assist in this, I encode the cluster IDs little-endian, to allow a simple
bit-mask:
>>> six_bits = cluster & (2**6 - 1)
"""
def __get__(self):
return self._seq.data[self.i].lex.cluster
cdef const TokenC* t = &self._seq.data[self.i]
return Token(self._seq, self.i + t.head)
property string:
"""The unicode string of the word, with no whitespace padding."""
@@ -241,10 +235,31 @@ cdef class Token:
            cdef unicode py_ustr = self._seq.vocab.strings[t.lex.sic]
            return py_ustr
    property lemma:
        """The unicode string of the word's lemma. If no part-of-speech tag is
        assigned, the most common part-of-speech tag of the word is used.
        """
    property sic_:
        def __get__(self):
            return self._seq.vocab.strings[self.sic]
    property norm1_:
        def __get__(self):
            return self._seq.vocab.strings[self.norm1]
    property norm2_:
        def __get__(self):
            return self._seq.vocab.strings[self.norm2]
    property shape_:
        def __get__(self):
            return self._seq.vocab.strings[self.shape]
    property prefix_:
        def __get__(self):
            return self._seq.vocab.strings[self.prefix]
    property suffix_:
        def __get__(self):
            return self._seq.vocab.strings[self.suffix]
    property lemma_:
        def __get__(self):
            cdef const TokenC* t = &self._seq.data[self.i]
            if t.lemma == 0:
@@ -252,36 +267,16 @@ cdef class Token:
            cdef unicode py_ustr = self._seq.vocab.strings[t.lemma]
            return py_ustr
    property dep_tag:
        """The ID integer of the word's dependency label. If no parse has been
        assigned, defaults to 0.
        """
    property pos_:
        def __get__(self):
            return self._seq.data[self.i].dep_tag
            return self._seq.vocab.strings[self.pos]
    property pos:
        """The ID integer of the word's part-of-speech tag, from the 13-tag
        Google Universal Tag Set. Constants for this tag set are available in
        spacy.typedefs.
        """
    property fine_pos_:
        def __get__(self):
            return self._seq.data[self.i].pos
    property fine_pos:
        """The ID integer of the word's fine-grained part-of-speech tag, as assigned
        by the tagger model. Fine-grained tags include morphological information,
        and other distinctions, and allow a more accurate tagger to be trained.
        """
            return self._seq.vocab.strings[self.fine_pos]
    property dep_tag_:
        def __get__(self):
            return self._seq.data[self.i].fine_pos
            return self._seq.vocab.strings[self.dep_tag]
    property sic:
        def __get__(self):
            return self._seq.data[self.i].lex.sic
    property head:
        """The token predicted by the parser to be the head of the current token."""
        def __get__(self):
            cdef const TokenC* t = &self._seq.data[self.i]
            return Token(self._seq, self.i + t.head)
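Note that head resolves the stored relative offset into another Token over the same sequence, so the new string-valued properties compose with it. A hedged sketch, assuming the sequence has been parsed so that head offsets are populated (the token index is arbitrary):

>>> token = tokens[2]
>>> governor = token.head          # Token at index token.i + stored offset
>>> governor.string, governor.pos_, governor.dep_tag_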

View File

@@ -15,7 +15,7 @@ def tagged(EN):
@pytest.fixture
def lemmas(tagged):
    return [t.lemma for t in tagged]
    return [t.lemma_ for t in tagged]
def test_lemmas(lemmas, tagged):

View File

@@ -26,7 +26,7 @@ def test_LL(EN):
    tokens = EN("we'll")
    assert len(tokens) == 2
    assert tokens[1].string == "'ll"
    assert tokens[1].lemma == "will"
    assert tokens[1].lemma_ == "will"
    assert tokens[0].string == "we"
@@ -34,9 +34,9 @@ def test_aint(EN):
    tokens = EN("ain't")
    assert len(tokens) == 2
    assert tokens[0].string == "ai"
    assert tokens[0].lemma == "be"
    assert tokens[0].lemma_ == "be"
    assert tokens[1].string == "n't"
    assert tokens[1].lemma == "not"
    assert tokens[1].lemma_ == "not"
def test_capitalized(EN):
@@ -47,7 +47,7 @@ def test_capitalized(EN):
    tokens = EN("Ain't")
    assert len(tokens) == 2
    assert tokens[0].string == "Ai"
    assert tokens[0].lemma == "be"
    assert tokens[0].lemma_ == "be"
def test_punct(EN):

View File

@@ -22,4 +22,4 @@ def test_load_exc(EN, morph_exc):
    tokens = EN('I like his style.', tag=True)
    his = tokens[2]
    assert EN.tagger.tag_names[his.fine_pos] == 'PRP$'
    assert his.lemma == '-PRP-'
    assert his.lemma_ == '-PRP-'