mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
* Move Span class to own file
This commit is contained in:
parent
f02c39dfaf
commit
6f47a667cf
|
@ -10,6 +10,7 @@ from .typedefs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLU
|
||||||
from .typedefs cimport POS, LEMMA
|
from .typedefs cimport POS, LEMMA
|
||||||
from .parts_of_speech import UNIV_POS_NAMES
|
from .parts_of_speech import UNIV_POS_NAMES
|
||||||
from .lexeme cimport check_flag
|
from .lexeme cimport check_flag
|
||||||
|
from .spans import Span
|
||||||
|
|
||||||
from unidecode import unidecode
|
from unidecode import unidecode
|
||||||
|
|
||||||
|
@ -132,7 +133,7 @@ cdef class Tokens:
|
||||||
cdef int i
|
cdef int i
|
||||||
cdef const TokenC* token
|
cdef const TokenC* token
|
||||||
cdef int start = -1
|
cdef int start = -1
|
||||||
cdef object label = None
|
cdef int label = 0
|
||||||
for i in range(self.length):
|
for i in range(self.length):
|
||||||
token = &self.data[i]
|
token = &self.data[i]
|
||||||
if token.ent_iob == 1:
|
if token.ent_iob == 1:
|
||||||
|
@ -140,14 +141,14 @@ cdef class Tokens:
|
||||||
pass
|
pass
|
||||||
elif token.ent_iob == 2:
|
elif token.ent_iob == 2:
|
||||||
if start != -1:
|
if start != -1:
|
||||||
yield (start, i, label)
|
yield Span(self, start, i, label=label)
|
||||||
start = -1
|
start = -1
|
||||||
label = None
|
label = 0
|
||||||
elif token.ent_iob == 3:
|
elif token.ent_iob == 3:
|
||||||
start = i
|
start = i
|
||||||
label = self.vocab.strings[token.ent_type]
|
label = token.ent_type
|
||||||
if start != -1:
|
if start != -1:
|
||||||
yield (start, self.length, label)
|
yield Span(self, start, self.length, label=label)
|
||||||
|
|
||||||
cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1:
|
cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1:
|
||||||
if self.length == self.max_length:
|
if self.length == self.max_length:
|
||||||
|
@ -253,35 +254,6 @@ cdef class Tokens:
|
||||||
self.data[i] = parsed[i]
|
self.data[i] = parsed[i]
|
||||||
|
|
||||||
|
|
||||||
cdef class Span:
|
|
||||||
"""A slice from a Tokens object."""
|
|
||||||
def __cinit__(self, Tokens tokens, int start, int end):
|
|
||||||
self._seq = tokens
|
|
||||||
self.start = start
|
|
||||||
self.end = end
|
|
||||||
|
|
||||||
def __richcmp__(self, Span other, int op):
|
|
||||||
# Eq
|
|
||||||
if op in (1, 2, 5):
|
|
||||||
if self._seq is other._seq and \
|
|
||||||
self.start == other.start and \
|
|
||||||
self.end == other.end:
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
def __len__(self):
|
|
||||||
if self.end < self.start:
|
|
||||||
return 0
|
|
||||||
return self.end - self.start
|
|
||||||
|
|
||||||
def __getitem__(self, int i):
|
|
||||||
return self._seq[self.start + i]
|
|
||||||
|
|
||||||
def __iter__(self):
|
|
||||||
for i in range(self.start, self.end):
|
|
||||||
yield self._seq[i]
|
|
||||||
|
|
||||||
|
|
||||||
cdef class Token:
|
cdef class Token:
|
||||||
"""An individual token --- i.e. a word, a punctuation symbol, etc. Created
|
"""An individual token --- i.e. a word, a punctuation symbol, etc. Created
|
||||||
via Tokens.__getitem__ and Tokens.__iter__.
|
via Tokens.__getitem__ and Tokens.__iter__.
|
||||||
|
|
Loading…
Reference in New Issue
Block a user