mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 01:46:28 +03:00
* Revise interface to Token. Strings now have attribute names like norm1_
This commit is contained in:
parent
7d3c40de7d
commit
802867e96a
|
@ -44,3 +44,20 @@ cdef class Tokens:
|
||||||
cdef class Token:
|
cdef class Token:
|
||||||
cdef readonly Tokens _seq
|
cdef readonly Tokens _seq
|
||||||
cdef readonly int i
|
cdef readonly int i
|
||||||
|
|
||||||
|
cdef readonly attr_t idx
|
||||||
|
cdef readonly attr_t cluster
|
||||||
|
cdef readonly attr_t length
|
||||||
|
cdef readonly attr_t sic
|
||||||
|
cdef readonly attr_t norm1
|
||||||
|
cdef readonly attr_t norm2
|
||||||
|
cdef readonly attr_t shape
|
||||||
|
cdef readonly attr_t prefix
|
||||||
|
cdef readonly attr_t suffix
|
||||||
|
cdef readonly float prob
|
||||||
|
cdef readonly float sentiment
|
||||||
|
cdef readonly attr_t flags
|
||||||
|
cdef readonly attr_t lemma
|
||||||
|
cdef readonly univ_tag_t pos
|
||||||
|
cdef readonly attr_t fine_pos
|
||||||
|
cdef readonly attr_t dep_tag
|
||||||
|
|
109
spacy/tokens.pyx
109
spacy/tokens.pyx
|
@ -184,6 +184,23 @@ cdef class Token:
|
||||||
def __init__(self, Tokens tokens, int i):
|
def __init__(self, Tokens tokens, int i):
|
||||||
self._seq = tokens
|
self._seq = tokens
|
||||||
self.i = i
|
self.i = i
|
||||||
|
cdef const TokenC* t = &tokens.data[i]
|
||||||
|
self.idx = t.idx
|
||||||
|
self.cluster = t.lex.cluster
|
||||||
|
self.length = t.lex.length
|
||||||
|
self.sic = t.lex.sic
|
||||||
|
self.norm1 = t.lex.norm1
|
||||||
|
self.norm2 = t.lex.norm2
|
||||||
|
self.shape = t.lex.shape
|
||||||
|
self.prefix = t.lex.prefix
|
||||||
|
self.suffix = t.lex.suffix
|
||||||
|
self.prob = t.lex.prob
|
||||||
|
self.sentiment = t.lex.sentiment
|
||||||
|
self.flags = t.lex.flags
|
||||||
|
self.lemma = t.lemma
|
||||||
|
self.pos = t.pos
|
||||||
|
self.fine_pos = t.fine_pos
|
||||||
|
self.dep_tag = t.dep_tag
|
||||||
|
|
||||||
def __unicode__(self):
|
def __unicode__(self):
|
||||||
cdef const TokenC* t = &self._seq.data[self.i]
|
cdef const TokenC* t = &self._seq.data[self.i]
|
||||||
|
@ -203,34 +220,11 @@ cdef class Token:
|
||||||
"""
|
"""
|
||||||
return self._seq.data[self.i].lex.length
|
return self._seq.data[self.i].lex.length
|
||||||
|
|
||||||
property idx:
|
property head:
|
||||||
"""The index into the original string at which the token starts.
|
"""The token predicted by the parser to be the head of the current token."""
|
||||||
|
|
||||||
The following is supposed to always be true:
|
|
||||||
|
|
||||||
>>> original_string[token.idx:token.idx + len(token)] == token.string
|
|
||||||
"""
|
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self._seq.data[self.i].idx
|
cdef const TokenC* t = &self._seq.data[self.i]
|
||||||
|
return Token(self._seq, self.i + t.head)
|
||||||
property cluster:
|
|
||||||
"""The Brown cluster ID of the word: en.wikipedia.org/wiki/Brown_clustering
|
|
||||||
|
|
||||||
Similar words have better-than-chance likelihood of having similar cluster
|
|
||||||
IDs, although the clustering is quite noisy. Cluster IDs make good features,
|
|
||||||
and help to make models slightly more robust to domain variation.
|
|
||||||
|
|
||||||
A common trick is to use only the first N bits of a cluster ID in a feature,
|
|
||||||
as the more general part of the hierarchical clustering is often more accurate
|
|
||||||
than the lower categories.
|
|
||||||
|
|
||||||
To assist in this, I encode the cluster IDs little-endian, to allow a simple
|
|
||||||
bit-mask:
|
|
||||||
|
|
||||||
>>> six_bits = cluster & (2**6 - 1)
|
|
||||||
"""
|
|
||||||
def __get__(self):
|
|
||||||
return self._seq.data[self.i].lex.cluster
|
|
||||||
|
|
||||||
property string:
|
property string:
|
||||||
"""The unicode string of the word, with no whitespace padding."""
|
"""The unicode string of the word, with no whitespace padding."""
|
||||||
|
@ -241,10 +235,31 @@ cdef class Token:
|
||||||
cdef unicode py_ustr = self._seq.vocab.strings[t.lex.sic]
|
cdef unicode py_ustr = self._seq.vocab.strings[t.lex.sic]
|
||||||
return py_ustr
|
return py_ustr
|
||||||
|
|
||||||
property lemma:
|
property sic_:
|
||||||
"""The unicode string of the word's lemma. If no part-of-speech tag is
|
def __get__(self):
|
||||||
assigned, the most common part-of-speech tag of the word is used.
|
return self._seq.vocab.strings[self.sic]
|
||||||
"""
|
|
||||||
|
property norm1_:
|
||||||
|
def __get__(self):
|
||||||
|
return self._seq.vocab.strings[self.norm1]
|
||||||
|
|
||||||
|
property norm2_:
|
||||||
|
def __get__(self):
|
||||||
|
return self._seq.vocab.strings[self.norm2]
|
||||||
|
|
||||||
|
property shape_:
|
||||||
|
def __get__(self):
|
||||||
|
return self._seq.vocab.strings[self.shape]
|
||||||
|
|
||||||
|
property prefix_:
|
||||||
|
def __get__(self):
|
||||||
|
return self._seq.vocab.strings[self.prefix]
|
||||||
|
|
||||||
|
property suffix_:
|
||||||
|
def __get__(self):
|
||||||
|
return self._seq.vocab.strings[self.suffix]
|
||||||
|
|
||||||
|
property lemma_:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
cdef const TokenC* t = &self._seq.data[self.i]
|
cdef const TokenC* t = &self._seq.data[self.i]
|
||||||
if t.lemma == 0:
|
if t.lemma == 0:
|
||||||
|
@ -252,36 +267,16 @@ cdef class Token:
|
||||||
cdef unicode py_ustr = self._seq.vocab.strings[t.lemma]
|
cdef unicode py_ustr = self._seq.vocab.strings[t.lemma]
|
||||||
return py_ustr
|
return py_ustr
|
||||||
|
|
||||||
property dep_tag:
|
property pos_:
|
||||||
"""The ID integer of the word's dependency label. If no parse has been
|
|
||||||
assigned, defaults to 0.
|
|
||||||
"""
|
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self._seq.data[self.i].dep_tag
|
return self._seq.vocab.strings[self.pos]
|
||||||
|
|
||||||
property pos:
|
property fine_pos_:
|
||||||
"""The ID integer of the word's part-of-speech tag, from the 13-tag
|
|
||||||
Google Universal Tag Set. Constants for this tag set are available in
|
|
||||||
spacy.typedefs.
|
|
||||||
"""
|
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self._seq.data[self.i].pos
|
return self._seq.vocab.strings[self.fine_pos]
|
||||||
|
|
||||||
property fine_pos:
|
property dep_tag_:
|
||||||
"""The ID integer of the word's fine-grained part-of-speech tag, as assigned
|
|
||||||
by the tagger model. Fine-grained tags include morphological information,
|
|
||||||
and other distinctions, and allow a more accurate tagger to be trained.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
return self._seq.data[self.i].fine_pos
|
return self._seq.vocab.strings[self.dep_tag]
|
||||||
|
|
||||||
property sic:
|
|
||||||
def __get__(self):
|
|
||||||
return self._seq.data[self.i].lex.sic
|
|
||||||
|
|
||||||
property head:
|
|
||||||
"""The token predicted by the parser to be the head of the current token."""
|
|
||||||
def __get__(self):
|
|
||||||
cdef const TokenC* t = &self._seq.data[self.i]
|
|
||||||
return Token(self._seq, self.i + t.head)
|
|
||||||
|
|
|
@ -15,7 +15,7 @@ def tagged(EN):
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def lemmas(tagged):
|
def lemmas(tagged):
|
||||||
return [t.lemma for t in tagged]
|
return [t.lemma_ for t in tagged]
|
||||||
|
|
||||||
|
|
||||||
def test_lemmas(lemmas, tagged):
|
def test_lemmas(lemmas, tagged):
|
||||||
|
|
|
@ -26,7 +26,7 @@ def test_LL(EN):
|
||||||
tokens = EN("we'll")
|
tokens = EN("we'll")
|
||||||
assert len(tokens) == 2
|
assert len(tokens) == 2
|
||||||
assert tokens[1].string == "'ll"
|
assert tokens[1].string == "'ll"
|
||||||
assert tokens[1].lemma == "will"
|
assert tokens[1].lemma_ == "will"
|
||||||
assert tokens[0].string == "we"
|
assert tokens[0].string == "we"
|
||||||
|
|
||||||
|
|
||||||
|
@ -34,9 +34,9 @@ def test_aint(EN):
|
||||||
tokens = EN("ain't")
|
tokens = EN("ain't")
|
||||||
assert len(tokens) == 2
|
assert len(tokens) == 2
|
||||||
assert tokens[0].string == "ai"
|
assert tokens[0].string == "ai"
|
||||||
assert tokens[0].lemma == "be"
|
assert tokens[0].lemma_ == "be"
|
||||||
assert tokens[1].string == "n't"
|
assert tokens[1].string == "n't"
|
||||||
assert tokens[1].lemma == "not"
|
assert tokens[1].lemma_ == "not"
|
||||||
|
|
||||||
|
|
||||||
def test_capitalized(EN):
|
def test_capitalized(EN):
|
||||||
|
@ -47,7 +47,7 @@ def test_capitalized(EN):
|
||||||
tokens = EN("Ain't")
|
tokens = EN("Ain't")
|
||||||
assert len(tokens) == 2
|
assert len(tokens) == 2
|
||||||
assert tokens[0].string == "Ai"
|
assert tokens[0].string == "Ai"
|
||||||
assert tokens[0].lemma == "be"
|
assert tokens[0].lemma_ == "be"
|
||||||
|
|
||||||
|
|
||||||
def test_punct(EN):
|
def test_punct(EN):
|
||||||
|
|
|
@ -22,4 +22,4 @@ def test_load_exc(EN, morph_exc):
|
||||||
tokens = EN('I like his style.', tag=True)
|
tokens = EN('I like his style.', tag=True)
|
||||||
his = tokens[2]
|
his = tokens[2]
|
||||||
assert EN.tagger.tag_names[his.fine_pos] == 'PRP$'
|
assert EN.tagger.tag_names[his.fine_pos] == 'PRP$'
|
||||||
assert his.lemma == '-PRP-'
|
assert his.lemma_ == '-PRP-'
|
||||||
|
|
Loading…
Reference in New Issue
Block a user