* Revise interface to Token. Strings now have attribute names like norm1_

Matthew Honnibal 2015-01-15 03:51:47 +11:00
parent 7d3c40de7d
commit 802867e96a
5 changed files with 75 additions and 63 deletions
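
The substance of the change: Token now copies the integer attribute IDs out of the underlying TokenC struct once, at construction, and exposes underscore-suffixed properties (sic_, norm1_, lemma_, and so on) that resolve an ID to its unicode string through vocab.strings. A minimal sketch of the resulting usage, assuming an English pipeline callable EN as in the tests below:

    tokens = EN(u"we'll")
    token = tokens[1]
    token.lemma     # integer ID into the vocabulary's string store
    token.lemma_    # the unicode string, here u'will'
    token.norm1_    # string view of the norm1 attribute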

View File

@@ -44,3 +44,20 @@ cdef class Tokens:
 cdef class Token:
     cdef readonly Tokens _seq
     cdef readonly int i
+    cdef readonly attr_t idx
+    cdef readonly attr_t cluster
+    cdef readonly attr_t length
+    cdef readonly attr_t sic
+    cdef readonly attr_t norm1
+    cdef readonly attr_t norm2
+    cdef readonly attr_t shape
+    cdef readonly attr_t prefix
+    cdef readonly attr_t suffix
+    cdef readonly float prob
+    cdef readonly float sentiment
+    cdef readonly attr_t flags
+    cdef readonly attr_t lemma
+    cdef readonly univ_tag_t pos
+    cdef readonly attr_t fine_pos
+    cdef readonly attr_t dep_tag
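
These are plain readonly fields, filled once in Token.__init__ (second file below); only the string views cost a lookup. A self-contained plain-Python analogue of the pattern, with hypothetical StringStore and TokenView classes standing in for the Cython types:

    class StringStore:
        """Maps interned strings to integer IDs and back (toy version)."""
        def __init__(self):
            self._strings = ['']           # ID 0 is reserved for "missing"
        def add(self, s):
            self._strings.append(s)
            return len(self._strings) - 1  # the new string's integer ID
        def __getitem__(self, i):
            return self._strings[i]

    class TokenView:
        def __init__(self, strings, lemma_id):
            self.strings = strings
            self.lemma = lemma_id          # eager copy: cdef readonly attr_t lemma
        @property
        def lemma_(self):                  # lazy string view: property lemma_
            return self.strings[self.lemma]

    store = StringStore()
    token = TokenView(store, store.add('will'))
    assert token.lemma == 1 and token.lemma_ == 'will'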

View File

@@ -184,6 +184,23 @@ cdef class Token:
     def __init__(self, Tokens tokens, int i):
         self._seq = tokens
         self.i = i
+        cdef const TokenC* t = &tokens.data[i]
+        self.idx = t.idx
+        self.cluster = t.lex.cluster
+        self.length = t.lex.length
+        self.sic = t.lex.sic
+        self.norm1 = t.lex.norm1
+        self.norm2 = t.lex.norm2
+        self.shape = t.lex.shape
+        self.prefix = t.lex.prefix
+        self.suffix = t.lex.suffix
+        self.prob = t.lex.prob
+        self.sentiment = t.lex.sentiment
+        self.flags = t.lex.flags
+        self.lemma = t.lemma
+        self.pos = t.pos
+        self.fine_pos = t.fine_pos
+        self.dep_tag = t.dep_tag

     def __unicode__(self):
         cdef const TokenC* t = &self._seq.data[self.i]
@@ -203,34 +220,11 @@ cdef class Token:
         """
         return self._seq.data[self.i].lex.length

-    property idx:
-        """The index into the original string at which the token starts.
-
-        The following is supposed to always be true:
-
-        >>> original_string[token.idx:token.idx + len(token)] == token.string
-        """
+    property head:
+        """The token predicted by the parser to be the head of the current token."""
         def __get__(self):
-            return self._seq.data[self.i].idx
-
-    property cluster:
-        """The Brown cluster ID of the word: en.wikipedia.org/wiki/Brown_clustering
-
-        Similar words have better-than-chance likelihood of having similar cluster
-        IDs, although the clustering is quite noisy. Cluster IDs make good features,
-        and help to make models slightly more robust to domain variation.
-
-        A common trick is to use only the first N bits of a cluster ID in a feature,
-        as the more general part of the hierarchical clustering is often more accurate
-        than the lower categories.
-
-        To assist in this, I encode the cluster IDs little-endian, to allow a simple
-        bit-mask:
-
-        >>> six_bits = cluster & (2**6 - 1)
-        """
-        def __get__(self):
-            return self._seq.data[self.i].lex.cluster
+            cdef const TokenC* t = &self._seq.data[self.i]
+            return Token(self._seq, self.i + t.head)

     property string:
         """The unicode string of the word, with no whitespace padding."""
@@ -241,10 +235,31 @@ cdef class Token:
             cdef unicode py_ustr = self._seq.vocab.strings[t.lex.sic]
             return py_ustr

-    property lemma:
-        """The unicode string of the word's lemma. If no part-of-speech tag is
-        assigned, the most common part-of-speech tag of the word is used.
-        """
+    property sic_:
+        def __get__(self):
+            return self._seq.vocab.strings[self.sic]
+
+    property norm1_:
+        def __get__(self):
+            return self._seq.vocab.strings[self.norm1]
+
+    property norm2_:
+        def __get__(self):
+            return self._seq.vocab.strings[self.norm2]
+
+    property shape_:
+        def __get__(self):
+            return self._seq.vocab.strings[self.shape]
+
+    property prefix_:
+        def __get__(self):
+            return self._seq.vocab.strings[self.prefix]
+
+    property suffix_:
+        def __get__(self):
+            return self._seq.vocab.strings[self.suffix]
+
+    property lemma_:
         def __get__(self):
             cdef const TokenC* t = &self._seq.data[self.i]
             if t.lemma == 0:
@@ -252,36 +267,16 @@ cdef class Token:
             cdef unicode py_ustr = self._seq.vocab.strings[t.lemma]
             return py_ustr

-    property dep_tag:
-        """The ID integer of the word's dependency label. If no parse has been
-        assigned, defaults to 0.
-        """
+    property pos_:
         def __get__(self):
-            return self._seq.data[self.i].dep_tag
+            return self._seq.vocab.strings[self.pos]

-    property pos:
-        """The ID integer of the word's part-of-speech tag, from the 13-tag
-        Google Universal Tag Set. Constants for this tag set are available in
-        spacy.typedefs.
-        """
+    property fine_pos_:
         def __get__(self):
-            return self._seq.data[self.i].pos
+            return self._seq.vocab.strings[self.fine_pos]

-    property fine_pos:
-        """The ID integer of the word's fine-grained part-of-speech tag, as assigned
-        by the tagger model. Fine-grained tags include morphological information,
-        and other distinctions, and allow a more accurate tagger to be trained.
-        """
+    property dep_tag_:
         def __get__(self):
-            return self._seq.data[self.i].fine_pos
+            return self._seq.vocab.strings[self.dep_tag]
-
-    property sic:
-        def __get__(self):
-            return self._seq.data[self.i].lex.sic
-
-    property head:
-        """The token predicted by the parser to be the head of the current token."""
-        def __get__(self):
-            cdef const TokenC* t = &self._seq.data[self.i]
-            return Token(self._seq, self.i + t.head)
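
The relocated head property shows how the parse is stored: each token keeps a signed offset to its parent, so the parent Token is simply the one at self.i + t.head. A toy illustration of that offset arithmetic (hypothetical data, not the spaCy API):

    # heads[i] is the signed offset from token i to its parent;
    # offset 0 would mark a root that is its own head.
    heads = [1, 0, -1]
    parents = [i + offset for i, offset in enumerate(heads)]
    assert parents == [1, 1, 1]   # tokens 0 and 2 both attach to token 1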

View File

@@ -15,7 +15,7 @@ def tagged(EN):
 @pytest.fixture
 def lemmas(tagged):
-    return [t.lemma for t in tagged]
+    return [t.lemma_ for t in tagged]

 def test_lemmas(lemmas, tagged):

View File

@@ -26,7 +26,7 @@ def test_LL(EN):
     tokens = EN("we'll")
     assert len(tokens) == 2
     assert tokens[1].string == "'ll"
-    assert tokens[1].lemma == "will"
+    assert tokens[1].lemma_ == "will"
     assert tokens[0].string == "we"
@@ -34,9 +34,9 @@ def test_aint(EN):
     tokens = EN("ain't")
     assert len(tokens) == 2
     assert tokens[0].string == "ai"
-    assert tokens[0].lemma == "be"
+    assert tokens[0].lemma_ == "be"
     assert tokens[1].string == "n't"
-    assert tokens[1].lemma == "not"
+    assert tokens[1].lemma_ == "not"

 def test_capitalized(EN):
@@ -47,7 +47,7 @@ def test_capitalized(EN):
     tokens = EN("Ain't")
     assert len(tokens) == 2
     assert tokens[0].string == "Ai"
-    assert tokens[0].lemma == "be"
+    assert tokens[0].lemma_ == "be"

 def test_punct(EN):

View File

@@ -22,4 +22,4 @@ def test_load_exc(EN, morph_exc):
     tokens = EN('I like his style.', tag=True)
     his = tokens[2]
     assert EN.tagger.tag_names[his.fine_pos] == 'PRP$'
-    assert his.lemma == '-PRP-'
+    assert his.lemma_ == '-PRP-'