Mirror of https://github.com/explosion/spaCy.git
* Revise interface to Token. Strings now have attribute names like norm1_
This commit is contained in:
parent 7d3c40de7d
commit 802867e96a
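The revised interface follows a simple convention: plain attributes such as lemma or norm1 hold integer IDs into a string store, while the trailing-underscore variants such as lemma_ or norm1_ resolve the ID back to the unicode string. The sketch below is illustrative only, not spaCy's actual code; StringStore here is a hypothetical stand-in for the vocab's string table.

# Illustrative sketch only (not spaCy's actual implementation).
class StringStore(object):
    # Hypothetical stand-in for the vocab's string table: a two-way
    # mapping between unicode strings and integer IDs.
    def __init__(self):
        self._by_id = [u'']        # ID 0 is reserved for "missing"
        self._by_str = {u'': 0}

    def add(self, s):
        if s not in self._by_str:
            self._by_str[s] = len(self._by_id)
            self._by_id.append(s)
        return self._by_str[s]

    def __getitem__(self, i):
        return self._by_id[i]


class Token(object):
    def __init__(self, store, lemma_id):
        self._store = store
        self.lemma = lemma_id      # integer ID: cheap to store and compare

    @property
    def lemma_(self):
        # Trailing underscore: resolve the ID to unicode on demand.
        return self._store[self.lemma]


store = StringStore()
token = Token(store, store.add(u'be'))
assert token.lemma == 1
assert token.lemma_ == u'be'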
@@ -44,3 +44,20 @@ cdef class Tokens:
 cdef class Token:
     cdef readonly Tokens _seq
     cdef readonly int i
+
+    cdef readonly attr_t idx
+    cdef readonly attr_t cluster
+    cdef readonly attr_t length
+    cdef readonly attr_t sic
+    cdef readonly attr_t norm1
+    cdef readonly attr_t norm2
+    cdef readonly attr_t shape
+    cdef readonly attr_t prefix
+    cdef readonly attr_t suffix
+    cdef readonly float prob
+    cdef readonly float sentiment
+    cdef readonly attr_t flags
+    cdef readonly attr_t lemma
+    cdef readonly univ_tag_t pos
+    cdef readonly attr_t fine_pos
+    cdef readonly attr_t dep_tag
spacy/tokens.pyx (109 lines changed)
@@ -184,6 +184,23 @@ cdef class Token:
     def __init__(self, Tokens tokens, int i):
         self._seq = tokens
         self.i = i
+        cdef const TokenC* t = &tokens.data[i]
+        self.idx = t.idx
+        self.cluster = t.lex.cluster
+        self.length = t.lex.length
+        self.sic = t.lex.sic
+        self.norm1 = t.lex.norm1
+        self.norm2 = t.lex.norm2
+        self.shape = t.lex.shape
+        self.prefix = t.lex.prefix
+        self.suffix = t.lex.suffix
+        self.prob = t.lex.prob
+        self.sentiment = t.lex.sentiment
+        self.flags = t.lex.flags
+        self.lemma = t.lemma
+        self.pos = t.pos
+        self.fine_pos = t.fine_pos
+        self.dep_tag = t.dep_tag

     def __unicode__(self):
         cdef const TokenC* t = &self._seq.data[self.i]
@@ -203,34 +220,11 @@ cdef class Token:
         """
         return self._seq.data[self.i].lex.length

-    property idx:
-        """The index into the original string at which the token starts.
-
-        The following is supposed to always be true:
-
-        >>> original_string[token.idx:token.idx + len(token)] == token.string
-        """
+    property head:
+        """The token predicted by the parser to be the head of the current token."""
         def __get__(self):
-            return self._seq.data[self.i].idx
-
-    property cluster:
-        """The Brown cluster ID of the word: en.wikipedia.org/wiki/Brown_clustering
-
-        Similar words have better-than-chance likelihood of having similar cluster
-        IDs, although the clustering is quite noisy. Cluster IDs make good features,
-        and help to make models slightly more robust to domain variation.
-
-        A common trick is to use only the first N bits of a cluster ID in a feature,
-        as the more general part of the hierarchical clustering is often more accurate
-        than the lower categories.
-
-        To assist in this, I encode the cluster IDs little-endian, to allow a simple
-        bit-mask:
-
-        >>> six_bits = cluster & (2**6 - 1)
-        """
-        def __get__(self):
-            return self._seq.data[self.i].lex.cluster
+            cdef const TokenC* t = &self._seq.data[self.i]
+            return Token(self._seq, self.i + t.head)

     property string:
         """The unicode string of the word, with no whitespace padding."""
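The cluster docstring removed above describes a bit-mask trick that is easy to try out. A runnable illustration, with a made-up cluster ID:

# Hypothetical Brown cluster ID; the value is made up for illustration.
cluster = 0b101101

# Because IDs are encoded little-endian, masking keeps the first (most
# general, most reliable) bits of the hierarchical clustering.
six_bits = cluster & (2**6 - 1)
four_bits = cluster & (2**4 - 1)

assert six_bits == 0b101101
assert four_bits == 0b1101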
@@ -241,10 +235,31 @@ cdef class Token:
         cdef unicode py_ustr = self._seq.vocab.strings[t.lex.sic]
         return py_ustr

-    property lemma:
-        """The unicode string of the word's lemma. If no part-of-speech tag is
-        assigned, the most common part-of-speech tag of the word is used.
-        """
+    property sic_:
+        def __get__(self):
+            return self._seq.vocab.strings[self.sic]
+
+    property norm1_:
+        def __get__(self):
+            return self._seq.vocab.strings[self.norm1]
+
+    property norm2_:
+        def __get__(self):
+            return self._seq.vocab.strings[self.norm2]
+
+    property shape_:
+        def __get__(self):
+            return self._seq.vocab.strings[self.shape]
+
+    property prefix_:
+        def __get__(self):
+            return self._seq.vocab.strings[self.prefix]
+
+    property suffix_:
+        def __get__(self):
+            return self._seq.vocab.strings[self.suffix]
+
+    property lemma_:
         def __get__(self):
             cdef const TokenC* t = &self._seq.data[self.i]
             if t.lemma == 0:
@@ -252,36 +267,16 @@ cdef class Token:
         cdef unicode py_ustr = self._seq.vocab.strings[t.lemma]
         return py_ustr

-    property dep_tag:
-        """The ID integer of the word's dependency label. If no parse has been
-        assigned, defaults to 0.
-        """
+    property pos_:
         def __get__(self):
-            return self._seq.data[self.i].dep_tag
+            return self._seq.vocab.strings[self.pos]

-    property pos:
-        """The ID integer of the word's part-of-speech tag, from the 13-tag
-        Google Universal Tag Set. Constants for this tag set are available in
-        spacy.typedefs.
-        """
+    property fine_pos_:
         def __get__(self):
-            return self._seq.data[self.i].pos
+            return self._seq.vocab.strings[self.fine_pos]

-    property fine_pos:
-        """The ID integer of the word's fine-grained part-of-speech tag, as assigned
-        by the tagger model. Fine-grained tags include morphological information,
-        and other distinctions, and allow a more accurate tagger to be trained.
-        """
-
+    property dep_tag_:
         def __get__(self):
-            return self._seq.data[self.i].fine_pos
+            return self._seq.vocab.strings[self.dep_tag]

-    property sic:
-        def __get__(self):
-            return self._seq.data[self.i].lex.sic
-
-    property head:
-        """The token predicted by the parser to be the head of the current token."""
-        def __get__(self):
-            cdef const TokenC* t = &self._seq.data[self.i]
-            return Token(self._seq, self.i + t.head)
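The head property above computes self.i + t.head, which implies heads are stored as offsets relative to the token's own index (an inference from the diff, not from documentation). A minimal sketch with made-up offsets:

# Made-up relative head offsets for the sentence "I like cheese":
# "like" is the root (offset 0); "I" and "cheese" both attach to it.
head_offsets = [1, 0, -1]

def head_index(i):
    # Mirrors Token(self._seq, self.i + t.head): add the stored offset.
    return i + head_offsets[i]

assert head_index(0) == 1   # "I"      -> "like"
assert head_index(1) == 1   # root points at itself
assert head_index(2) == 1   # "cheese" -> "like"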
@@ -15,7 +15,7 @@ def tagged(EN):

 @pytest.fixture
 def lemmas(tagged):
-    return [t.lemma for t in tagged]
+    return [t.lemma_ for t in tagged]


 def test_lemmas(lemmas, tagged):
@@ -26,7 +26,7 @@ def test_LL(EN):
     tokens = EN("we'll")
     assert len(tokens) == 2
     assert tokens[1].string == "'ll"
-    assert tokens[1].lemma == "will"
+    assert tokens[1].lemma_ == "will"
     assert tokens[0].string == "we"

@@ -34,9 +34,9 @@ def test_aint(EN):
     tokens = EN("ain't")
     assert len(tokens) == 2
     assert tokens[0].string == "ai"
-    assert tokens[0].lemma == "be"
+    assert tokens[0].lemma_ == "be"
     assert tokens[1].string == "n't"
-    assert tokens[1].lemma == "not"
+    assert tokens[1].lemma_ == "not"


 def test_capitalized(EN):
@@ -47,7 +47,7 @@ def test_capitalized(EN):
     tokens = EN("Ain't")
     assert len(tokens) == 2
     assert tokens[0].string == "Ai"
-    assert tokens[0].lemma == "be"
+    assert tokens[0].lemma_ == "be"


 def test_punct(EN):
@@ -22,4 +22,4 @@ def test_load_exc(EN, morph_exc):
     tokens = EN('I like his style.', tag=True)
     his = tokens[2]
     assert EN.tagger.tag_names[his.fine_pos] == 'PRP$'
-    assert his.lemma == '-PRP-'
+    assert his.lemma_ == '-PRP-'