* Rename Tokens to Doc

Matthew Honnibal 2015-07-08 18:53:00 +02:00
parent d0fc7f5ba9
commit bb522496dd
11 changed files with 41 additions and 43 deletions
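For orientation (not part of the commit itself): the rename is purely a renaming of the container class, so downstream code only needs to swap the class name it imports and, in Cython code, the type it declares. A minimal sketch of the user-facing effect, assuming the 2015-era spacy.en entry point; the sample text and printed attributes are illustrative only:

# Hedged sketch: user-facing effect of the Tokens -> Doc rename
# (assumes the 2015-era spacy.en API; not taken from this diff).
from spacy.en import English
from spacy.tokens import Doc   # previously: from spacy.tokens import Tokens

nlp = English()
doc = nlp(u'Hello world.')     # the pipeline now returns a Doc, formerly a Tokens
assert isinstance(doc, Doc)
for token in doc:
    print(token.orth_, token.tag_)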

View File

@@ -9,7 +9,6 @@ from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
 from preshed.maps cimport PreshMapArray
 from .typedefs cimport hash_t, id_t
-from .tokens cimport Tokens
 cdef int arg_max(const weight_t* scores, const int n_classes) nogil

View File

@@ -9,7 +9,7 @@ from ..syntax.arc_eager import ArcEager
 from ..syntax.ner import BiluoPushDown
 from ..syntax.parser import ParserFactory
-from ..tokens import Tokens
+from ..tokens import Doc
 from ..multi_words import RegexMerger
 from .pos import EnPosTagger

View File

@@ -14,7 +14,7 @@ from ..parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
 from ..parts_of_speech cimport PRT, VERB, X, PUNCT, EOL
 from ..typedefs cimport id_t
 from ..structs cimport TokenC, Morphology, LexemeC
-from ..tokens cimport Tokens
+from ..tokens cimport Doc
 from ..morphology cimport set_morph_from_dict
 from .._ml cimport arg_max
@@ -260,11 +260,11 @@ cdef class EnPosTagger:
             'morphs.json'))))
         self.lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ)
-    def __call__(self, Tokens tokens):
-        """Apply the tagger, setting the POS tags onto the Tokens object.
+    def __call__(self, Doc tokens):
+        """Apply the tagger, setting the POS tags onto the Doc object.
         Args:
-            tokens (Tokens): The tokens to be tagged.
+            tokens (Doc): The tokens to be tagged.
         """
         if tokens.length == 0:
             return 0
@@ -282,7 +282,7 @@ cdef class EnPosTagger:
         tokens.is_tagged = True
         tokens._py_tokens = [None] * tokens.length
-    def tag_from_strings(self, Tokens tokens, object tag_strs):
+    def tag_from_strings(self, Doc tokens, object tag_strs):
         cdef int i
         for i in range(tokens.length):
            tokens.data[i].tag = self.strings[tag_strs[i]]
@@ -291,7 +291,7 @@ cdef class EnPosTagger:
         tokens.is_tagged = True
         tokens._py_tokens = [None] * tokens.length
-    def train(self, Tokens tokens, object gold_tag_strs):
+    def train(self, Doc tokens, object gold_tag_strs):
         cdef int i
         cdef int loss
         cdef atom_t[N_CONTEXT_FIELDS] context

View File

@@ -1,4 +1,4 @@
-from .tokens cimport Tokens
+from .tokens cimport Doc
 from .typedefs cimport flags_t, attr_id_t, attr_t
 from .parts_of_speech cimport univ_pos_t
 from .structs cimport Morphology, TokenC, LexemeC
@@ -7,7 +7,7 @@ from .strings cimport StringStore
 cdef class Span:
-    cdef readonly Tokens _seq
+    cdef readonly Doc _seq
     cdef public int i
     cdef public int start
     cdef public int end

View File

@@ -3,8 +3,8 @@ from collections import defaultdict
 cdef class Span:
-    """A slice from a Tokens object."""
-    def __cinit__(self, Tokens tokens, int start, int end, int label=0):
+    """A slice from a Doc object."""
+    def __cinit__(self, Doc tokens, int start, int end, int label=0):
         self._seq = tokens
         self.start = start
         self.end = end

View File

@@ -4,7 +4,7 @@ from .._ml cimport Model
 from .arc_eager cimport TransitionSystem
-from ..tokens cimport Tokens, TokenC
+from ..tokens cimport Doc, TokenC
 cdef class Parser:
@@ -12,5 +12,5 @@ cdef class Parser:
     cdef readonly Model model
     cdef readonly TransitionSystem moves
-    cdef int _greedy_parse(self, Tokens tokens) except -1
-    cdef int _beam_parse(self, Tokens tokens) except -1
+    cdef int _greedy_parse(self, Doc tokens) except -1
+    cdef int _beam_parse(self, Doc tokens) except -1

View File

@@ -31,7 +31,7 @@ from thinc.learner cimport LinearModel
 from thinc.search cimport Beam
 from thinc.search cimport MaxViolation
-from ..tokens cimport Tokens, TokenC
+from ..tokens cimport Doc, TokenC
 from ..strings cimport StringStore
@@ -75,20 +75,20 @@ cdef class Parser:
         templates = get_templates(self.cfg.features)
         self.model = Model(self.moves.n_moves, templates, model_dir)
-    def __call__(self, Tokens tokens):
+    def __call__(self, Doc tokens):
         if self.cfg.get('beam_width', 1) < 1:
             self._greedy_parse(tokens)
         else:
             self._beam_parse(tokens)
-    def train(self, Tokens tokens, GoldParse gold):
+    def train(self, Doc tokens, GoldParse gold):
         self.moves.preprocess_gold(gold)
         if self.cfg.beam_width < 1:
             return self._greedy_train(tokens, gold)
         else:
             return self._beam_train(tokens, gold)
-    cdef int _greedy_parse(self, Tokens tokens) except -1:
+    cdef int _greedy_parse(self, Doc tokens) except -1:
         cdef atom_t[CONTEXT_SIZE] context
         cdef int n_feats
         cdef Pool mem = Pool()
@@ -106,7 +106,7 @@ cdef class Parser:
         self.moves.finalize_state(stcls)
         tokens.set_parse(stcls._sent)
-    cdef int _beam_parse(self, Tokens tokens) except -1:
+    cdef int _beam_parse(self, Doc tokens) except -1:
         cdef Beam beam = Beam(self.moves.n_moves, self.cfg.beam_width)
         words = [w.orth_ for w in tokens]
         beam.initialize(_init_state, tokens.length, tokens.data)
@@ -118,7 +118,7 @@ cdef class Parser:
         tokens.set_parse(state._sent)
         _cleanup(beam)
-    def _greedy_train(self, Tokens tokens, GoldParse gold):
+    def _greedy_train(self, Doc tokens, GoldParse gold):
         cdef Pool mem = Pool()
         cdef StateClass stcls = StateClass.init(tokens.data, tokens.length)
         self.moves.initialize_state(stcls)
@@ -143,7 +143,7 @@ cdef class Parser:
             loss += cost
         return loss
-    def _beam_train(self, Tokens tokens, GoldParse gold_parse):
+    def _beam_train(self, Doc tokens, GoldParse gold_parse):
         cdef Beam pred = Beam(self.moves.n_moves, self.cfg.beam_width)
         pred.initialize(_init_state, tokens.length, tokens.data)
         pred.check_done(_check_final_state, NULL)
@@ -190,7 +190,7 @@ cdef class Parser:
             beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
             beam.check_done(_check_final_state, NULL)
-    def _count_feats(self, dict counts, Tokens tokens, list hist, int inc):
+    def _count_feats(self, dict counts, Doc tokens, list hist, int inc):
         cdef atom_t[CONTEXT_SIZE] context
         cdef Pool mem = Pool()
         cdef StateClass stcls = StateClass.init(tokens.data, tokens.length)

View File

@@ -8,7 +8,7 @@ from cymem.cymem cimport Pool
 from .typedefs cimport hash_t
 from .structs cimport LexemeC, TokenC, Morphology, UniStr
 from .strings cimport StringStore
-from .tokens cimport Tokens
+from .tokens cimport Doc
 from .vocab cimport Vocab, _Cached
@@ -27,13 +27,13 @@ cdef class Tokenizer:
     cdef object _suffix_re
     cdef object _infix_re
-    cpdef Tokens tokens_from_list(self, list strings)
-    cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1
-    cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1
+    cpdef Doc tokens_from_list(self, list strings)
+    cdef int _try_cache(self, int idx, hash_t key, Doc tokens) except -1
+    cdef int _tokenize(self, Doc tokens, UniStr* span, int start, int end) except -1
     cdef UniStr* _split_affixes(self, UniStr* string, vector[LexemeC*] *prefixes,
                                 vector[LexemeC*] *suffixes) except NULL
-    cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
+    cdef int _attach_tokens(self, Doc tokens, int idx, UniStr* string,
                             vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1
     cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
     cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1

View File

@@ -16,7 +16,7 @@ from .morphology cimport set_morph_from_dict
 from . import util
 from .util import read_lang_data
-from .tokens import Tokens
+from .tokens import Doc
 cdef class Tokenizer:
@@ -38,9 +38,9 @@ cdef class Tokenizer:
         infix_re = re.compile(infix_re)
         return cls(vocab, rules, prefix_re, suffix_re, infix_re)
-    cpdef Tokens tokens_from_list(self, list strings):
+    cpdef Doc tokens_from_list(self, list strings):
         cdef int length = sum([len(s) for s in strings])
-        cdef Tokens tokens = Tokens(self.vocab, ' '.join(strings))
+        cdef Doc tokens = Doc(self.vocab, ' '.join(strings))
         if length == 0:
             return tokens
         cdef UniStr string_struct
@@ -70,10 +70,10 @@ cdef class Tokenizer:
             string (unicode): The string to be tokenized.
         Returns:
-            tokens (Tokens): A Tokens object, giving access to a sequence of LexemeCs.
+            tokens (Doc): A Doc object, giving access to a sequence of LexemeCs.
         """
         cdef int length = len(string)
-        cdef Tokens tokens = Tokens(self.vocab, string)
+        cdef Doc tokens = Doc(self.vocab, string)
         if length == 0:
             return tokens
         cdef int i = 0
@@ -101,7 +101,7 @@ cdef class Tokenizer:
             self._tokenize(tokens, &span, start, i)
         return tokens
-    cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1:
+    cdef int _try_cache(self, int idx, hash_t key, Doc tokens) except -1:
         cached = <_Cached*>self._cache.get(key)
         if cached == NULL:
             return False
@@ -114,7 +114,7 @@ cdef class Tokenizer:
             idx = tokens.push_back(idx, &cached.data.tokens[i])
         return True
-    cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1:
+    cdef int _tokenize(self, Doc tokens, UniStr* span, int start, int end) except -1:
         cdef vector[LexemeC*] prefixes
         cdef vector[LexemeC*] suffixes
         cdef hash_t orig_key
@@ -167,7 +167,7 @@ cdef class Tokenizer:
                 break
         return string
-    cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
+    cdef int _attach_tokens(self, Doc tokens, int idx, UniStr* string,
                             vector[const LexemeC*] *prefixes,
                             vector[const LexemeC*] *suffixes) except -1:
         cdef bint cache_hit

View File

@@ -28,13 +28,12 @@ cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
     return lexeme.flags & (1 << flag_id)
-cdef class Tokens:
+cdef class Doc:
     cdef Pool mem
     cdef Vocab vocab
     cdef TokenC* data
     cdef list _py_tokens
     cdef unicode _string
     cdef tuple _tag_strings
@@ -62,12 +61,12 @@ cdef class Token:
     cdef bint _owns_c_data
-    cdef Tokens _seq
+    cdef Doc _seq
    @staticmethod
     cdef inline Token cinit(Vocab vocab, unicode string,
                             const TokenC* token, int offset, int array_len,
-                            Tokens parent_seq):
+                            Doc parent_seq):
         if offset < 0 or offset >= array_len:
             msg = "Attempt to access token at %d, max length %d"

View File

@@ -78,7 +78,7 @@ cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
         return 0
-cdef class Tokens:
+cdef class Doc:
     """
     Container class for annotated text. Constructed via English.__call__ or
     Tokenizer.__call__.
@@ -185,7 +185,7 @@ cdef class Tokens:
        Yield a list of sentence Span objects, calculated from the dependency parse.
        """
        cdef int i
-        cdef Tokens sent = Tokens(self.vocab, self._string[self.data[0].idx:])
+        cdef Doc sent = Doc(self.vocab, self._string[self.data[0].idx:])
        start = 0
        for i in range(1, self.length):
            if self.data[i].sent_start:
@@ -370,7 +370,7 @@ cdef class Tokens:
 cdef class Token:
     """An individual token --- i.e. a word, a punctuation symbol, etc. Created
-    via Tokens.__getitem__ and Tokens.__iter__.
+    via Doc.__getitem__ and Doc.__iter__.
     """
     def __cinit__(self, Vocab vocab, unicode string):
         self.vocab = vocab