diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd
index 101bcad63..d13aea7cc 100644
--- a/spacy/tokens.pxd
+++ b/spacy/tokens.pxd
@@ -44,3 +44,20 @@ cdef class Tokens:
 cdef class Token:
     cdef readonly Tokens _seq
     cdef readonly int i
+
+    cdef readonly attr_t idx
+    cdef readonly attr_t cluster
+    cdef readonly attr_t length
+    cdef readonly attr_t sic
+    cdef readonly attr_t norm1
+    cdef readonly attr_t norm2
+    cdef readonly attr_t shape
+    cdef readonly attr_t prefix
+    cdef readonly attr_t suffix
+    cdef readonly float prob
+    cdef readonly float sentiment
+    cdef readonly attr_t flags
+    cdef readonly attr_t lemma
+    cdef readonly univ_tag_t pos
+    cdef readonly attr_t fine_pos
+    cdef readonly attr_t dep_tag
diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx
index 4c0156df3..3583ffef6 100644
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@@ -184,6 +184,23 @@ cdef class Token:
     def __init__(self, Tokens tokens, int i):
         self._seq = tokens
         self.i = i
+        cdef const TokenC* t = &tokens.data[i]
+        self.idx = t.idx
+        self.cluster = t.lex.cluster
+        self.length = t.lex.length
+        self.sic = t.lex.sic
+        self.norm1 = t.lex.norm1
+        self.norm2 = t.lex.norm2
+        self.shape = t.lex.shape
+        self.prefix = t.lex.prefix
+        self.suffix = t.lex.suffix
+        self.prob = t.lex.prob
+        self.sentiment = t.lex.sentiment
+        self.flags = t.lex.flags
+        self.lemma = t.lemma
+        self.pos = t.pos
+        self.fine_pos = t.fine_pos
+        self.dep_tag = t.dep_tag
 
     def __unicode__(self):
         cdef const TokenC* t = &self._seq.data[self.i]
@@ -203,34 +220,11 @@ cdef class Token:
         """
         return self._seq.data[self.i].lex.length
 
-    property idx:
-        """The index into the original string at which the token starts.
-
-        The following is supposed to always be true:
-
-        >>> original_string[token.idx:token.idx + len(token)] == token.string
-        """
+    property head:
+        """The token predicted by the parser to be the head of the current token."""
         def __get__(self):
-            return self._seq.data[self.i].idx
-
-    property cluster:
-        """The Brown cluster ID of the word: en.wikipedia.org/wiki/Brown_clustering
-
-        Similar words have better-than-chance likelihood of having similar cluster
-        IDs, although the clustering is quite noisy. Cluster IDs make good features,
-        and help to make models slightly more robust to domain variation.
-
-        A common trick is to use only the first N bits of a cluster ID in a feature,
-        as the more general part of the hierarchical clustering is often more accurate
-        than the lower categories.
-
-        To assist in this, I encode the cluster IDs little-endian, to allow a simple
-        bit-mask:
-
-        >>> six_bits = cluster & (2**6 - 1)
-        """
-        def __get__(self):
-            return self._seq.data[self.i].lex.cluster
+            cdef const TokenC* t = &self._seq.data[self.i]
+            return Token(self._seq, self.i + t.head)
 
     property string:
         """The unicode string of the word, with no whitespace padding."""
@@ -241,10 +235,31 @@ cdef class Token:
             cdef unicode py_ustr = self._seq.vocab.strings[t.lex.sic]
             return py_ustr
 
-    property lemma:
-        """The unicode string of the word's lemma. If no part-of-speech tag is
-        assigned, the most common part-of-speech tag of the word is used.
-        """
+    property sic_:
+        def __get__(self):
+            return self._seq.vocab.strings[self.sic]
+
+    property norm1_:
+        def __get__(self):
+            return self._seq.vocab.strings[self.norm1]
+
+    property norm2_:
+        def __get__(self):
+            return self._seq.vocab.strings[self.norm2]
+
+    property shape_:
+        def __get__(self):
+            return self._seq.vocab.strings[self.shape]
+
+    property prefix_:
+        def __get__(self):
+            return self._seq.vocab.strings[self.prefix]
+
+    property suffix_:
+        def __get__(self):
+            return self._seq.vocab.strings[self.suffix]
+
+    property lemma_:
         def __get__(self):
             cdef const TokenC* t = &self._seq.data[self.i]
             if t.lemma == 0:
@@ -252,36 +267,16 @@ cdef class Token:
             cdef unicode py_ustr = self._seq.vocab.strings[t.lemma]
             return py_ustr
 
-    property dep_tag:
-        """The ID integer of the word's dependency label. If no parse has been
-        assigned, defaults to 0.
-        """
+    property pos_:
         def __get__(self):
-            return self._seq.data[self.i].dep_tag
+            return self._seq.vocab.strings[self.pos]
 
-    property pos:
-        """The ID integer of the word's part-of-speech tag, from the 13-tag
-        Google Universal Tag Set. Constants for this tag set are available in
-        spacy.typedefs.
-        """
+    property fine_pos_:
         def __get__(self):
-            return self._seq.data[self.i].pos
+            return self._seq.vocab.strings[self.fine_pos]
 
-    property fine_pos:
-        """The ID integer of the word's fine-grained part-of-speech tag, as assigned
-        by the tagger model. Fine-grained tags include morphological information,
-        and other distinctions, and allow a more accurate tagger to be trained.
-        """
-
+    property dep_tag_:
         def __get__(self):
-            return self._seq.data[self.i].fine_pos
+            return self._seq.vocab.strings[self.dep_tag]
 
-    property sic:
-        def __get__(self):
-            return self._seq.data[self.i].lex.sic
-
-    property head:
-        """The token predicted by the parser to be the head of the current token."""
-        def __get__(self):
-            cdef const TokenC* t = &self._seq.data[self.i]
-            return Token(self._seq, self.i + t.head)
diff --git a/tests/test_add_lemmas.py b/tests/test_add_lemmas.py
index 877a2ddee..44d7888fe 100644
--- a/tests/test_add_lemmas.py
+++ b/tests/test_add_lemmas.py
@@ -15,7 +15,7 @@ def tagged(EN):
 
 @pytest.fixture
 def lemmas(tagged):
-    return [t.lemma for t in tagged]
+    return [t.lemma_ for t in tagged]
 
 
 def test_lemmas(lemmas, tagged):
diff --git a/tests/test_contractions.py b/tests/test_contractions.py
index aeaccaaf2..420d1aec3 100644
--- a/tests/test_contractions.py
+++ b/tests/test_contractions.py
@@ -26,7 +26,7 @@ def test_LL(EN):
     tokens = EN("we'll")
     assert len(tokens) == 2
     assert tokens[1].string == "'ll"
-    assert tokens[1].lemma == "will"
+    assert tokens[1].lemma_ == "will"
     assert tokens[0].string == "we"
 
 
@@ -34,9 +34,9 @@ def test_aint(EN):
     tokens = EN("ain't")
     assert len(tokens) == 2
     assert tokens[0].string == "ai"
-    assert tokens[0].lemma == "be"
+    assert tokens[0].lemma_ == "be"
     assert tokens[1].string == "n't"
-    assert tokens[1].lemma == "not"
+    assert tokens[1].lemma_ == "not"
 
 
 def test_capitalized(EN):
@@ -47,7 +47,7 @@ def test_capitalized(EN):
     tokens = EN("Ain't")
     assert len(tokens) == 2
     assert tokens[0].string == "Ai"
-    assert tokens[0].lemma == "be"
+    assert tokens[0].lemma_ == "be"
 
 
 def test_punct(EN):
diff --git a/tests/test_morph_exceptions.py b/tests/test_morph_exceptions.py
index c9f066101..22e2c702a 100644
--- a/tests/test_morph_exceptions.py
+++ b/tests/test_morph_exceptions.py
@@ -22,4 +22,4 @@ def test_load_exc(EN, morph_exc):
     tokens = EN('I like his style.', tag=True)
     his = tokens[2]
     assert EN.tagger.tag_names[his.fine_pos] == 'PRP$'
-    assert his.lemma == '-PRP-'
+    assert his.lemma_ == '-PRP-'
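
Usage note: after this patch, the bare attributes (token.lemma, token.pos, token.sic, ...) are integer IDs copied once from the TokenC struct in Token.__init__, while the trailing-underscore properties (token.lemma_, token.pos_, ...) decode those IDs through the vocabulary's string store. A minimal sketch of the new access pattern, mirroring test_aint above; it assumes spacy.en.English is the loading entry point, as in the test fixtures:

    from spacy.en import English

    EN = English()
    tokens = EN("ain't")

    # Bare attribute: the integer lemma ID cached on the Token at
    # construction time -- cheap to compare, hash, or use as a feature.
    lemma_id = tokens[0].lemma

    # Underscored property: the same value decoded to a unicode string
    # via the vocab's string store, matching the assertions in test_aint.
    assert tokens[0].lemma_ == 'be'
    assert tokens[1].lemma_ == 'not'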