* Rename Tokens to Doc

Matthew Honnibal 2015-07-08 18:53:00 +02:00
parent d0fc7f5ba9
commit bb522496dd
11 changed files with 41 additions and 43 deletions

View File

@@ -9,7 +9,6 @@ from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
from preshed.maps cimport PreshMapArray
from .typedefs cimport hash_t, id_t
from .tokens cimport Tokens
cdef int arg_max(const weight_t* scores, const int n_classes) nogil

View File

@@ -9,7 +9,7 @@ from ..syntax.arc_eager import ArcEager
from ..syntax.ner import BiluoPushDown
from ..syntax.parser import ParserFactory
from ..tokens import Tokens
from ..tokens import Doc
from ..multi_words import RegexMerger
from .pos import EnPosTagger
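For code that imported the container class directly, the import changes the same way. A minimal sketch, assuming the installed package name spacy:

    from spacy.tokens import Doc   # previously: from spacy.tokens import Tokens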

View File

@@ -14,7 +14,7 @@ from ..parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
from ..parts_of_speech cimport PRT, VERB, X, PUNCT, EOL
from ..typedefs cimport id_t
from ..structs cimport TokenC, Morphology, LexemeC
from ..tokens cimport Tokens
from ..tokens cimport Doc
from ..morphology cimport set_morph_from_dict
from .._ml cimport arg_max
@@ -260,11 +260,11 @@ cdef class EnPosTagger:
'morphs.json'))))
self.lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ)
def __call__(self, Tokens tokens):
"""Apply the tagger, setting the POS tags onto the Tokens object.
def __call__(self, Doc tokens):
"""Apply the tagger, setting the POS tags onto the Doc object.
Args:
tokens (Tokens): The tokens to be tagged.
tokens (Doc): The tokens to be tagged.
"""
if tokens.length == 0:
return 0
@@ -282,7 +282,7 @@ cdef class EnPosTagger:
tokens.is_tagged = True
tokens._py_tokens = [None] * tokens.length
def tag_from_strings(self, Tokens tokens, object tag_strs):
def tag_from_strings(self, Doc tokens, object tag_strs):
cdef int i
for i in range(tokens.length):
tokens.data[i].tag = self.strings[tag_strs[i]]
@@ -291,7 +291,7 @@ cdef class EnPosTagger:
tokens.is_tagged = True
tokens._py_tokens = [None] * tokens.length
def train(self, Tokens tokens, object gold_tag_strs):
def train(self, Doc tokens, object gold_tag_strs):
cdef int i
cdef int loss
cdef atom_t[N_CONTEXT_FIELDS] context
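For orientation, a brief usage sketch of the renamed signatures above. The tokenizer and tagger names are hypothetical stand-ins for the objects the English pipeline constructs; the tagger mutates the Doc in place rather than returning a new object:

    doc = tokenizer(u'They tagged the sentence.')   # a Doc produced by the Tokenizer
    tagger(doc)                                     # EnPosTagger.__call__(Doc) sets POS tags in place
    # Alternatively, copy tags from known strings (per tag_from_strings above):
    tagger.tag_from_strings(doc, [u'PRP', u'VBD', u'DT', u'NN', u'.'])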

View File

@@ -1,4 +1,4 @@
from .tokens cimport Tokens
from .tokens cimport Doc
from .typedefs cimport flags_t, attr_id_t, attr_t
from .parts_of_speech cimport univ_pos_t
from .structs cimport Morphology, TokenC, LexemeC
@@ -7,7 +7,7 @@ from .strings cimport StringStore
cdef class Span:
cdef readonly Tokens _seq
cdef readonly Doc _seq
cdef public int i
cdef public int start
cdef public int end

View File

@@ -3,8 +3,8 @@ from collections import defaultdict
cdef class Span:
"""A slice from a Tokens object."""
def __cinit__(self, Tokens tokens, int start, int end, int label=0):
"""A slice from a Doc object."""
def __cinit__(self, Doc tokens, int start, int end, int label=0):
self._seq = tokens
self.start = start
self.end = end
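A minimal sketch of the renamed constructor, assuming doc is a Doc with at least four tokens; in practice Span objects are usually created for you, for example by the sentence iterator touched later in this commit:

    span = Span(doc, 2, 4)   # covers tokens doc[2] and doc[3]; label defaults to 0
    # The Span keeps a back-reference to its parent Doc in span._seq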

View File

@@ -4,7 +4,7 @@ from .._ml cimport Model
from .arc_eager cimport TransitionSystem
from ..tokens cimport Tokens, TokenC
from ..tokens cimport Doc, TokenC
cdef class Parser:
@@ -12,5 +12,5 @@ cdef class Parser:
cdef readonly Model model
cdef readonly TransitionSystem moves
cdef int _greedy_parse(self, Tokens tokens) except -1
cdef int _beam_parse(self, Tokens tokens) except -1
cdef int _greedy_parse(self, Doc tokens) except -1
cdef int _beam_parse(self, Doc tokens) except -1

View File

@@ -31,7 +31,7 @@ from thinc.learner cimport LinearModel
from thinc.search cimport Beam
from thinc.search cimport MaxViolation
from ..tokens cimport Tokens, TokenC
from ..tokens cimport Doc, TokenC
from ..strings cimport StringStore
@@ -75,20 +75,20 @@ cdef class Parser:
templates = get_templates(self.cfg.features)
self.model = Model(self.moves.n_moves, templates, model_dir)
def __call__(self, Tokens tokens):
def __call__(self, Doc tokens):
if self.cfg.get('beam_width', 1) < 1:
self._greedy_parse(tokens)
else:
self._beam_parse(tokens)
def train(self, Tokens tokens, GoldParse gold):
def train(self, Doc tokens, GoldParse gold):
self.moves.preprocess_gold(gold)
if self.cfg.beam_width < 1:
return self._greedy_train(tokens, gold)
else:
return self._beam_train(tokens, gold)
cdef int _greedy_parse(self, Tokens tokens) except -1:
cdef int _greedy_parse(self, Doc tokens) except -1:
cdef atom_t[CONTEXT_SIZE] context
cdef int n_feats
cdef Pool mem = Pool()
@@ -106,7 +106,7 @@ cdef class Parser:
self.moves.finalize_state(stcls)
tokens.set_parse(stcls._sent)
cdef int _beam_parse(self, Tokens tokens) except -1:
cdef int _beam_parse(self, Doc tokens) except -1:
cdef Beam beam = Beam(self.moves.n_moves, self.cfg.beam_width)
words = [w.orth_ for w in tokens]
beam.initialize(_init_state, tokens.length, tokens.data)
@@ -118,7 +118,7 @@ cdef class Parser:
tokens.set_parse(state._sent)
_cleanup(beam)
def _greedy_train(self, Tokens tokens, GoldParse gold):
def _greedy_train(self, Doc tokens, GoldParse gold):
cdef Pool mem = Pool()
cdef StateClass stcls = StateClass.init(tokens.data, tokens.length)
self.moves.initialize_state(stcls)
@@ -143,7 +143,7 @@ cdef class Parser:
loss += cost
return loss
def _beam_train(self, Tokens tokens, GoldParse gold_parse):
def _beam_train(self, Doc tokens, GoldParse gold_parse):
cdef Beam pred = Beam(self.moves.n_moves, self.cfg.beam_width)
pred.initialize(_init_state, tokens.length, tokens.data)
pred.check_done(_check_final_state, NULL)
@@ -190,7 +190,7 @@ cdef class Parser:
beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
beam.check_done(_check_final_state, NULL)
def _count_feats(self, dict counts, Tokens tokens, list hist, int inc):
def _count_feats(self, dict counts, Doc tokens, list hist, int inc):
cdef atom_t[CONTEXT_SIZE] context
cdef Pool mem = Pool()
cdef StateClass stcls = StateClass.init(tokens.data, tokens.length)
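A hedged sketch of how the renamed Parser entry points are driven, assuming parser is a constructed Parser, doc a tagged Doc, and gold a GoldParse for the same sentence:

    parser(doc)                      # Parser.__call__: greedy or beam parse, depending on the configured beam width
    loss = parser.train(doc, gold)   # one training update; returns the loss from the greedy or beam trainer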

View File

@@ -8,7 +8,7 @@ from cymem.cymem cimport Pool
from .typedefs cimport hash_t
from .structs cimport LexemeC, TokenC, Morphology, UniStr
from .strings cimport StringStore
from .tokens cimport Tokens
from .tokens cimport Doc
from .vocab cimport Vocab, _Cached
@@ -27,13 +27,13 @@ cdef class Tokenizer:
cdef object _suffix_re
cdef object _infix_re
cpdef Tokens tokens_from_list(self, list strings)
cpdef Doc tokens_from_list(self, list strings)
cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1
cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1
cdef int _try_cache(self, int idx, hash_t key, Doc tokens) except -1
cdef int _tokenize(self, Doc tokens, UniStr* span, int start, int end) except -1
cdef UniStr* _split_affixes(self, UniStr* string, vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes) except NULL
cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
cdef int _attach_tokens(self, Doc tokens, int idx, UniStr* string,
vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1
cdef int _find_prefix(self, Py_UNICODE* characters, size_t length) except -1
cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1

View File

@@ -16,7 +16,7 @@ from .morphology cimport set_morph_from_dict
from . import util
from .util import read_lang_data
from .tokens import Tokens
from .tokens import Doc
cdef class Tokenizer:
@@ -38,9 +38,9 @@ cdef class Tokenizer:
infix_re = re.compile(infix_re)
return cls(vocab, rules, prefix_re, suffix_re, infix_re)
cpdef Tokens tokens_from_list(self, list strings):
cpdef Doc tokens_from_list(self, list strings):
cdef int length = sum([len(s) for s in strings])
cdef Tokens tokens = Tokens(self.vocab, ' '.join(strings))
cdef Doc tokens = Doc(self.vocab, ' '.join(strings))
if length == 0:
return tokens
cdef UniStr string_struct
@@ -70,10 +70,10 @@ cdef class Tokenizer:
string (unicode): The string to be tokenized.
Returns:
tokens (Tokens): A Tokens object, giving access to a sequence of LexemeCs.
tokens (Doc): A Doc object, giving access to a sequence of LexemeCs.
"""
cdef int length = len(string)
cdef Tokens tokens = Tokens(self.vocab, string)
cdef Doc tokens = Doc(self.vocab, string)
if length == 0:
return tokens
cdef int i = 0
@@ -101,7 +101,7 @@ cdef class Tokenizer:
self._tokenize(tokens, &span, start, i)
return tokens
cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1:
cdef int _try_cache(self, int idx, hash_t key, Doc tokens) except -1:
cached = <_Cached*>self._cache.get(key)
if cached == NULL:
return False
@@ -114,7 +114,7 @@ cdef class Tokenizer:
idx = tokens.push_back(idx, &cached.data.tokens[i])
return True
cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1:
cdef int _tokenize(self, Doc tokens, UniStr* span, int start, int end) except -1:
cdef vector[LexemeC*] prefixes
cdef vector[LexemeC*] suffixes
cdef hash_t orig_key
@@ -167,7 +167,7 @@ cdef class Tokenizer:
break
return string
cdef int _attach_tokens(self, Tokens tokens, int idx, UniStr* string,
cdef int _attach_tokens(self, Doc tokens, int idx, UniStr* string,
vector[const LexemeC*] *prefixes,
vector[const LexemeC*] *suffixes) except -1:
cdef bint cache_hit
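Both Tokenizer entry points now build and return a Doc. A short sketch, with tokenizer standing in for a constructed Tokenizer:

    doc = tokenizer(u'Give it back! He pleaded.')                          # Tokenizer.__call__ -> Doc
    doc_pre_split = tokenizer.tokens_from_list([u'Give', u'it', u'back'])  # pre-split input, joined on whitespace -> Doc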

View File

@@ -28,13 +28,12 @@ cdef inline bint check_flag(const LexemeC* lexeme, attr_id_t flag_id) nogil:
return lexeme.flags & (1 << flag_id)
cdef class Tokens:
cdef class Doc:
cdef Pool mem
cdef Vocab vocab
cdef TokenC* data
cdef list _py_tokens
cdef unicode _string
cdef tuple _tag_strings
@@ -62,12 +61,12 @@ cdef class Token:
cdef bint _owns_c_data
cdef Tokens _seq
cdef Doc _seq
@staticmethod
cdef inline Token cinit(Vocab vocab, unicode string,
const TokenC* token, int offset, int array_len,
Tokens parent_seq):
Doc parent_seq):
if offset < 0 or offset >= array_len:
msg = "Attempt to access token at %d, max length %d"

View File

@@ -78,7 +78,7 @@ cdef attr_t get_lex_attr(const LexemeC* lex, attr_id_t feat_name) nogil:
return 0
cdef class Tokens:
cdef class Doc:
"""
Container class for annotated text. Constructed via English.__call__ or
Tokenizer.__call__.
@@ -185,7 +185,7 @@ cdef class Tokens:
Yield a list of sentence Span objects, calculated from the dependency parse.
"""
cdef int i
cdef Tokens sent = Tokens(self.vocab, self._string[self.data[0].idx:])
cdef Doc sent = Doc(self.vocab, self._string[self.data[0].idx:])
start = 0
for i in range(1, self.length):
if self.data[i].sent_start:
@@ -370,7 +370,7 @@ cdef class Tokens:
cdef class Token:
"""An individual token --- i.e. a word, a punctuation symbol, etc. Created
via Tokens.__getitem__ and Tokens.__iter__.
via Doc.__getitem__ and Doc.__iter__.
"""
def __cinit__(self, Vocab vocab, unicode string):
self.vocab = vocab
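
Putting the rename together from the caller's side, a hedged end-to-end sketch using the English pipeline of this era; attribute names such as orth_ appear elsewhere in this commit, while sents is assumed from the contemporary API:

    from spacy.en import English

    nlp = English()
    doc = nlp(u'Doc is the new name. Tokens is gone.')   # English.__call__ returns a Doc
    first_token = doc[0]                                 # a Token, via Doc.__getitem__
    words = [w.orth_ for w in doc]                       # iteration via Doc.__iter__
    sentences = list(doc.sents)                          # sentence Spans from the dependency parse (name assumed)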