From 793430aa7ac886f00c3d0f705a69bac7b6f96dc9 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Wed, 17 May 2017 12:04:50 +0200
Subject: [PATCH] Get spaCy train command working with neural network

* Integrate models into pipeline
* Add basic serialization (maybe incorrect)
* Fix pickle on vocab
---
 spacy/__main__.py                |   6 +-
 spacy/cfile.pxd                  |  11 +-
 spacy/cfile.pyx                  |  24 +-
 spacy/cli/train.py               |  48 ++-
 spacy/gold.pyx                   |  16 +-
 spacy/language.py                |  84 ++++-
 spacy/lexeme.pxd                 |  20 +-
 spacy/lexeme.pyx                 |  23 ++
 spacy/pipeline.pyx               |  57 +++-
 spacy/structs.pxd                |  18 ++
 spacy/syntax/nn_parser.pyx       |  27 +-
 spacy/tests/test_pickles.py      |   1 -
 spacy/tests/vocab/test_lexeme.py |  15 +
 spacy/tokenizer.pyx              |   7 +-
 spacy/train.py                   |  12 +-
 spacy/vocab.pyx                  | 520 ++++++++++++++++++++-----------
 16 files changed, 649 insertions(+), 240 deletions(-)

diff --git a/spacy/__main__.py b/spacy/__main__.py
index aec5f94d9..cf959def7 100644
--- a/spacy/__main__.py
+++ b/spacy/__main__.py
@@ -81,17 +81,19 @@ class CLI(object):
         train_data=("location of JSON-formatted training data", "positional", None, str),
         dev_data=("location of JSON-formatted development data (optional)", "positional", None, str),
         n_iter=("number of iterations", "option", "n", int),
+        nsents=("number of sentences", "option", None, int),
         parser_L1=("L1 regularization penalty for parser", "option", "L", float),
         no_tagger=("Don't train tagger", "flag", "T", bool),
         no_parser=("Don't train parser", "flag", "P", bool),
         no_ner=("Don't train NER", "flag", "N", bool)
     )
     def train(self, lang, output_dir, train_data, dev_data=None, n_iter=15,
-              parser_L1=0.0, no_tagger=False, no_parser=False, no_ner=False):
+              nsents=0, parser_L1=0.0, no_tagger=False, no_parser=False, no_ner=False):
         """
         Train a model. Expects data in spaCy's JSON format.
""" - cli_train(lang, output_dir, train_data, dev_data, n_iter, not no_tagger, + nsents = nsents or None + cli_train(lang, output_dir, train_data, dev_data, n_iter, nsents, not no_tagger, not no_parser, not no_ner, parser_L1) @plac.annotations( diff --git a/spacy/cfile.pxd b/spacy/cfile.pxd index cb0077587..b95fbb2be 100644 --- a/spacy/cfile.pxd +++ b/spacy/cfile.pxd @@ -3,9 +3,11 @@ from cymem.cymem cimport Pool cdef class CFile: cdef FILE* fp - cdef bint is_open + cdef unsigned char* data + cdef int is_open cdef Pool mem cdef int size # For compatibility with subclass + cdef int i # For compatibility with subclass cdef int _capacity # For compatibility with subclass cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1 @@ -16,8 +18,13 @@ cdef class CFile: -cdef class StringCFile(CFile): +cdef class StringCFile: cdef unsigned char* data + cdef int is_open + cdef Pool mem + cdef int size # For compatibility with subclass + cdef int i # For compatibility with subclass + cdef int _capacity # For compatibility with subclass cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1 diff --git a/spacy/cfile.pyx b/spacy/cfile.pyx index b83b55498..006ff78ac 100644 --- a/spacy/cfile.pyx +++ b/spacy/cfile.pyx @@ -53,31 +53,43 @@ cdef class CFile: cdef class StringCFile: - def __init__(self, mode, bytes data=b'', on_open_error=None): + def __init__(self, bytes data, mode, on_open_error=None): self.mem = Pool() - self.is_open = 'w' in mode + self.is_open = 1 if 'w' in mode else 0 self._capacity = max(len(data), 8) self.size = len(data) + self.i = 0 self.data = self.mem.alloc(1, self._capacity) for i in range(len(data)): self.data[i] = data[i] + def __dealloc__(self): + # Important to override this -- or + # we try to close a non-existant file pointer! + pass + def close(self): self.is_open = False def string_data(self): - return (self.data-self.size)[:self.size] + cdef bytes byte_string = b'\0' * (self.size) + bytes_ptr = byte_string + for i in range(self.size): + bytes_ptr[i] = self.data[i] + print(byte_string) + return byte_string cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1: - memcpy(dest, self.data, elem_size * number) - self.data += elem_size * number + if self.i+(number * elem_size) < self.size: + memcpy(dest, &self.data[self.i], elem_size * number) + self.i += elem_size * number cdef int write_from(self, void* src, size_t elem_size, size_t number) except -1: write_size = number * elem_size if (self.size + write_size) >= self._capacity: self._capacity = (self.size + write_size) * 2 self.data = self.mem.realloc(self.data, self._capacity) - memcpy(&self.data[self.size], src, elem_size * number) + memcpy(&self.data[self.size], src, write_size) self.size += write_size cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *: diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 66cecadc8..5c23587bc 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -4,16 +4,20 @@ from __future__ import unicode_literals, division, print_function import json from collections import defaultdict import cytoolz +from pathlib import Path +import dill +from ..tokens.doc import Doc from ..scorer import Scorer from ..gold import GoldParse, merge_sents from ..gold import read_json_file as read_gold_json from ..util import prints from .. import util +from .. 
import displacy -def train(language, output_dir, train_data, dev_data, n_iter, tagger, parser, ner, - parser_L1): +def train(language, output_dir, train_data, dev_data, n_iter, n_sents, + tagger, parser, ner, parser_L1): output_path = util.ensure_path(output_dir) train_path = util.ensure_path(train_data) dev_path = util.ensure_path(dev_data) @@ -39,10 +43,8 @@ def train(language, output_dir, train_data, dev_data, n_iter, tagger, parser, ne 'n_iter': n_iter, 'lang': language, 'features': lang.Defaults.tagger_features} - gold_train = list(read_gold_json(train_path))[:100] - gold_dev = list(read_gold_json(dev_path)) if dev_path else None - - gold_dev = gold_dev[:100] + gold_train = list(read_gold_json(train_path, limit=n_sents)) + gold_dev = list(read_gold_json(dev_path, limit=n_sents)) if dev_path else None train_model(lang, gold_train, gold_dev, output_path, n_iter) if gold_dev: @@ -63,34 +65,48 @@ def train_config(config): def train_model(Language, train_data, dev_data, output_path, n_iter, **cfg): print("Itn.\tDep. Loss\tUAS\tNER F.\tTag %\tToken %") - nlp = Language(pipeline=['token_vectors', 'tags', 'dependencies', 'entities']) + nlp = Language(pipeline=['token_vectors', 'tags', 'dependencies']) # TODO: Get spaCy using Thinc's trainer and optimizer with nlp.begin_training(train_data, **cfg) as (trainer, optimizer): - for itn, epoch in enumerate(trainer.epochs(n_iter)): + for itn, epoch in enumerate(trainer.epochs(n_iter, gold_preproc=True)): losses = defaultdict(float) - for docs, golds in epoch: + to_render = [] + for i, (docs, golds) in enumerate(epoch): state = nlp.update(docs, golds, drop=0., sgd=optimizer) losses['dep_loss'] += state.get('parser_loss', 0.0) + to_render.insert(0, nlp(docs[-1].text)) + to_render[0].user_data['title'] = "Batch %d" % i + with Path('/tmp/entities.html').open('w') as file_: + html = displacy.render(to_render[:5], style='ent', page=True, + options={'compact': True}) + file_.write(html) + with Path('/tmp/parses.html').open('w') as file_: + html = displacy.render(to_render[:5], style='dep', page=True, + options={'compact': True}) + file_.write(html) if dev_data: dev_scores = trainer.evaluate(dev_data).scores else: dev_scores = defaultdict(float) print_progress(itn, losses, dev_scores) + with (output_path / 'model.bin').open('wb') as file_: + dill.dump(nlp, file_, -1) + #nlp.to_disk(output_path, tokenizer=False) -def evaluate(Language, gold_tuples, output_path): - print("Load parser", output_path) - nlp = Language(path=output_path) +def evaluate(Language, gold_tuples, path): + with (path / 'model.bin').open('rb') as file_: + nlp = dill.load(file_) scorer = Scorer() for raw_text, sents in gold_tuples: sents = merge_sents(sents) for annot_tuples, brackets in sents: if raw_text is None: - tokens = nlp.tokenizer.tokens_from_list(annot_tuples[1]) - nlp.tagger(tokens) - nlp.parser(tokens) - nlp.entity(tokens) + tokens = Doc(nlp.vocab, words=annot_tuples[1]) + state = None + for proc in nlp.pipeline: + state = proc(tokens, state=state) else: tokens = nlp(raw_text) gold = GoldParse.from_annot_tuples(tokens, annot_tuples) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 1e55075c7..e7098843b 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -138,14 +138,16 @@ def _min_edit_path(cand_words, gold_words): return prev_costs[n_gold], previous_row[-1] -def read_json_file(loc, docs_filter=None): +def read_json_file(loc, docs_filter=None, make_supertags=False, limit=None): loc = ensure_path(loc) if loc.is_dir(): for filename in loc.iterdir(): - yield from 
read_json_file(loc / filename) + yield from read_json_file(loc / filename, limit=limit) else: with loc.open('r', encoding='utf8') as file_: docs = ujson.load(file_) + if limit is not None: + docs = docs[:limit] for doc in docs: if docs_filter is not None and not docs_filter(doc): continue @@ -169,11 +171,13 @@ def read_json_file(loc, docs_filter=None): if labels[-1].lower() == 'root': labels[-1] = 'ROOT' ner.append(token.get('ner', '-')) - sents.append(( - (ids, words, tags, heads, labels, ner), - sent.get('brackets', []))) + if make_supertags: + tags[-1] = '-'.join((tags[-1], labels[-1], ner[-1])) + sents.append([ + [ids, words, tags, heads, labels, ner], + sent.get('brackets', [])]) if sents: - yield (paragraph.get('raw', None), sents) + yield [paragraph.get('raw', None), sents] def _iob_to_biluo(tags): diff --git a/spacy/language.py b/spacy/language.py index 485a56573..a4ef24de0 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1,6 +1,7 @@ # coding: utf8 from __future__ import absolute_import, unicode_literals from contextlib import contextmanager +import dill from .tokenizer import Tokenizer from .vocab import Vocab @@ -188,10 +189,18 @@ class Language(object): @contextmanager def begin_training(self, gold_tuples, **cfg): + # Populate vocab + for _, annots_brackets in gold_tuples: + for annots, _ in annots_brackets: + for word in annots[1]: + _ = self.vocab[word] + # Handle crossing dependencies + gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples) contexts = [] for proc in self.pipeline: if hasattr(proc, 'begin_training'): - context = proc.begin_training(gold_tuples, pipeline=self.pipeline) + context = proc.begin_training(gold_tuples, + pipeline=self.pipeline) contexts.append(context) trainer = Trainer(self, gold_tuples, **cfg) yield trainer, trainer.optimizer @@ -221,15 +230,72 @@ class Language(object): for doc, state in stream: yield doc - def to_disk(self, path): - raise NotImplemented + def to_disk(self, path, **exclude): + """Save the current state to a directory. - def from_disk(self, path): - raise NotImplemented + Args: + path: A path to a directory, which will be created if it doesn't + exist. Paths may be either strings or pathlib.Path-like + objects. + **exclude: Prevent named attributes from being saved. + """ + path = util.ensure_path(path) + if not path.exists(): + path.mkdir() + if not path.is_dir(): + raise IOError("Output path must be a directory") + props = {} + for name, value in self.__dict__.items(): + if name in exclude: + continue + if hasattr(value, 'to_disk'): + value.to_disk(path / name) + else: + props[name] = value + with (path / 'props.pickle').open('wb') as file_: + dill.dump(props, file_) - def to_bytes(self, path): - raise NotImplemented + def from_disk(self, path, **exclude): + """Load the current state from a directory. - def from_bytes(self, path): - raise NotImplemented + Args: + path: A path to a directory. Paths may be either strings or + pathlib.Path-like objects. + **exclude: Prevent named attributes from being saved. + """ + path = util.ensure_path(path) + for name in path.iterdir(): + if name not in exclude and hasattr(self, str(name)): + getattr(self, name).from_disk(path / name) + with (path / 'props.pickle').open('rb') as file_: + bytes_data = file_.read() + self.from_bytes(bytes_data, **exclude) + return self + + def to_bytes(self, **exclude): + """Serialize the current state to a binary string. + + Args: + path: A path to a directory. Paths may be either strings or + pathlib.Path-like objects. 
+ **exclude: Prevent named attributes from being serialized. + """ + props = dict(self.__dict__) + for key in exclude: + if key in props: + props.pop(key) + return dill.dumps(props, -1) + + def from_bytes(self, bytes_data, **exclude): + """Load state from a binary string. + + Args: + bytes_data (bytes): The data to load from. + **exclude: Prevent named attributes from being loaded. + """ + props = dill.loads(bytes_data) + for key, value in props.items(): + if key not in exclude: + setattr(self, key, value) + return self diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd index 12d4e3de3..b058c66e3 100644 --- a/spacy/lexeme.pxd +++ b/spacy/lexeme.pxd @@ -2,7 +2,7 @@ from .typedefs cimport attr_t, hash_t, flags_t, len_t, tag_t from .attrs cimport attr_id_t from .attrs cimport ID, ORTH, LOWER, NORM, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER, LANG -from .structs cimport LexemeC +from .structs cimport LexemeC, SerializedLexemeC from .strings cimport StringStore from .vocab cimport Vocab @@ -22,7 +22,23 @@ cdef class Lexeme: self.c = lex self.vocab = vocab self.orth = lex.orth - + + @staticmethod + cdef inline SerializedLexemeC c_to_bytes(const LexemeC* lex) nogil: + cdef SerializedLexemeC lex_data + buff = &lex.flags + end = &lex.l2_norm + sizeof(lex.l2_norm) + for i in range(sizeof(lex_data.data)): + lex_data.data[i] = buff[i] + return lex_data + + @staticmethod + cdef inline void c_from_bytes(LexemeC* lex, SerializedLexemeC lex_data) nogil: + buff = &lex.flags + end = &lex.l2_norm + sizeof(lex.l2_norm) + for i in range(sizeof(lex_data.data)): + buff[i] = lex_data.data[i] + @staticmethod cdef inline void set_struct_attr(LexemeC* lex, attr_id_t name, attr_t value) nogil: if name < (sizeof(flags_t) * 8): diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 0c65f101e..effffbac8 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -116,6 +116,29 @@ cdef class Lexeme: return 0.0 return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) + def to_bytes(self): + lex_data = Lexeme.c_to_bytes(self.c) + start = &self.c.flags + end = &self.c.l2_norm + sizeof(self.c.l2_norm) + assert (end-start) == sizeof(lex_data.data), (end-start, sizeof(lex_data.data)) + byte_string = b'\0' * sizeof(lex_data.data) + byte_chars = byte_string + for i in range(sizeof(lex_data.data)): + byte_chars[i] = lex_data.data[i] + assert len(byte_string) == sizeof(lex_data.data), (len(byte_string), + sizeof(lex_data.data)) + return byte_string + + def from_bytes(self, bytes byte_string): + # This method doesn't really have a use-case --- wrote it for testing. + # Possibly delete? It puts the Lexeme out of synch with the vocab. 
+ cdef SerializedLexemeC lex_data + assert len(byte_string) == sizeof(lex_data.data) + for i in range(len(byte_string)): + lex_data.data[i] = byte_string[i] + Lexeme.c_from_bytes(self.c, lex_data) + self.orth = self.c.orth + property has_vector: def __get__(self): cdef int i diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index eacd29396..1636b29a1 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -26,10 +26,14 @@ from .syntax.beam_parser cimport BeamParser from .syntax.ner cimport BiluoPushDown from .syntax.arc_eager cimport ArcEager from .tagger import Tagger +from .syntax.stateclass cimport StateClass from .gold cimport GoldParse +from .morphology cimport Morphology +from .vocab cimport Vocab -from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP +from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS from ._ml import Tok2Vec, flatten, get_col, doc2feats +from .parts_of_speech import X class TokenVectorEncoder(object): @@ -50,7 +54,7 @@ class TokenVectorEncoder(object): docs = [docs] tokvecs = self.predict(docs) self.set_annotations(docs, tokvecs) - state = {} if state is not None else state + state = {} if state is None else state state['tokvecs'] = tokvecs return state @@ -58,7 +62,6 @@ class TokenVectorEncoder(object): raise NotImplementedError def predict(self, docs): - cdef Doc doc feats = self.doc2feats(docs) tokvecs = self.model(feats) return tokvecs @@ -68,7 +71,7 @@ class TokenVectorEncoder(object): for doc in docs: doc.tensor = tokvecs[start : start + len(doc)] start += len(doc) - + def update(self, docs, golds, state=None, drop=0., sgd=None): if isinstance(docs, Doc): @@ -88,9 +91,9 @@ class TokenVectorEncoder(object): class NeuralTagger(object): name = 'nn_tagger' - def __init__(self, vocab): + def __init__(self, vocab, model=True): self.vocab = vocab - self.model = Softmax(self.vocab.morphology.n_tags) + self.model = model def __call__(self, doc, state=None): assert state is not None @@ -132,7 +135,7 @@ class NeuralTagger(object): bp_tokvecs = state['bp_tokvecs'] if self.model.nI is None: self.model.nI = tokvecs.shape[1] - + tag_scores, bp_tag_scores = self.model.begin_update(tokvecs, drop=drop) loss, d_tag_scores = self.get_loss(docs, golds, tag_scores) d_tokvecs = bp_tag_scores(d_tag_scores, sgd) @@ -141,7 +144,7 @@ class NeuralTagger(object): state['bp_tag_scores'] = bp_tag_scores state['d_tag_scores'] = d_tag_scores state['tag_loss'] = loss - + if 'd_tokvecs' in state: state['d_tokvecs'] += d_tokvecs else: @@ -161,6 +164,22 @@ class NeuralTagger(object): d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1]) return (d_scores**2).sum(), d_scores + def begin_training(self, gold_tuples, pipeline=None): + # Populate tag map, if anything's missing. 
+ tag_map = dict(self.vocab.morphology.tag_map) + for raw_text, annots_brackets in gold_tuples: + for annots, brackets in annots_brackets: + ids, words, tags, heads, deps, ents = annots + for tag in tags: + if tag not in tag_map: + tag_map[tag] = {POS: X} + + cdef Vocab vocab = self.vocab + vocab.morphology = Morphology(self.vocab.strings, tag_map, + self.vocab.morphology.lemmatizer) + self.model = Softmax(self.vocab.morphology.n_tags) + + cdef class EntityRecognizer(LinearParser): """ @@ -209,6 +228,28 @@ cdef class NeuralEntityRecognizer(NeuralParser): name = 'entity' TransitionSystem = BiluoPushDown + nr_feature = 6 + + def get_token_ids(self, states): + cdef StateClass state + cdef int n_tokens = 6 + ids = numpy.zeros((len(states), n_tokens), dtype='i', order='c') + for i, state in enumerate(states): + ids[i, 0] = state.c.B(0)-1 + ids[i, 1] = state.c.B(0) + ids[i, 2] = state.c.B(1) + ids[i, 3] = state.c.E(0) + ids[i, 4] = state.c.E(0)-1 + ids[i, 5] = state.c.E(0)+1 + for j in range(6): + if ids[i, j] >= state.c.length: + ids[i, j] = -1 + if ids[i, j] != -1: + ids[i, j] += state.c.offset + return ids + + + cdef class BeamDependencyParser(BeamParser): TransitionSystem = ArcEager diff --git a/spacy/structs.pxd b/spacy/structs.pxd index ae1cfb434..41bfbb62c 100644 --- a/spacy/structs.pxd +++ b/spacy/structs.pxd @@ -28,6 +28,24 @@ cdef struct LexemeC: float l2_norm +cdef struct SerializedLexemeC: + unsigned char[4*13 + 8] data + # sizeof(flags_t) # flags + # + sizeof(attr_t) # lang + # + sizeof(attr_t) # id + # + sizeof(attr_t) # length + # + sizeof(attr_t) # orth + # + sizeof(attr_t) # lower + # + sizeof(attr_t) # norm + # + sizeof(attr_t) # shape + # + sizeof(attr_t) # prefix + # + sizeof(attr_t) # suffix + # + sizeof(attr_t) # cluster + # + sizeof(float) # prob + # + sizeof(float) # cluster + # + sizeof(float) # l2_norm + + cdef struct Entity: hash_t id int start diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index b1910a270..1f4f2f9bb 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -12,6 +12,7 @@ from libc.math cimport exp cimport cython cimport cython.parallel import cytoolz +import dill import numpy.random cimport numpy as np @@ -35,6 +36,7 @@ from thinc.api import layerize, chain from thinc.neural import Model, Affine, ELU, ReLu, Maxout from thinc.neural.ops import NumpyOps +from .. 
import util from ..util import get_async, get_cuda_stream from .._ml import zero_init, PrecomputableAffine, PrecomputableMaxouts from .._ml import Tok2Vec, doc2feats @@ -218,9 +220,8 @@ cdef class Parser: """ @classmethod def Model(cls, nr_class, token_vector_width=128, hidden_width=128, **cfg): - nr_context_tokens = StateClass.nr_context_tokens() lower = PrecomputableMaxouts(hidden_width, - nF=nr_context_tokens, + nF=cls.nr_feature, nI=token_vector_width, pieces=cfg.get('maxout_pieces', 1)) @@ -267,7 +268,7 @@ cdef class Parser: self.model = model def __reduce__(self): - return (Parser, (self.vocab, self.moves, self.model, self.cfg), None, None) + return (Parser, (self.vocab, self.moves, self.model), None, None) def __call__(self, Doc tokens, state=None): """ @@ -392,9 +393,11 @@ cdef class Parser: lower, stream, drop=dropout) return state2vec, upper + nr_feature = 13 + def get_token_ids(self, states): cdef StateClass state - cdef int n_tokens = states[0].nr_context_tokens() + cdef int n_tokens = self.nr_feature ids = numpy.zeros((len(states), n_tokens), dtype='i', order='c') for i, state in enumerate(states): state.set_context_tokens(ids[i]) @@ -458,6 +461,22 @@ cdef class Parser: if self.model is True: self.model = self.Model(self.moves.n_moves, **cfg) + def to_disk(self, path): + path = util.ensure_path(path) + with (path / 'model.bin').open('wb') as file_: + dill.dump(self.model, file_) + + def from_disk(self, path): + path = util.ensure_path(path) + with (path / 'model.bin').open('wb') as file_: + self.model = dill.load(file_) + + def to_bytes(self): + pass + + def from_bytes(self, data): + pass + class ParserStateError(ValueError): def __init__(self, doc): diff --git a/spacy/tests/test_pickles.py b/spacy/tests/test_pickles.py index 41e15884f..b22a2864b 100644 --- a/spacy/tests/test_pickles.py +++ b/spacy/tests/test_pickles.py @@ -19,7 +19,6 @@ def test_pickle_string_store(stringstore, text1, text2): assert len(stringstore) == len(unpickled) -@pytest.mark.xfail @pytest.mark.parametrize('text1,text2', [('dog', 'cat')]) def test_pickle_vocab(text1, text2): vocab = Vocab(lex_attr_getters={int(NORM): lambda string: string[:-1]}) diff --git a/spacy/tests/vocab/test_lexeme.py b/spacy/tests/vocab/test_lexeme.py index 5fe1eef5c..163df8591 100644 --- a/spacy/tests/vocab/test_lexeme.py +++ b/spacy/tests/vocab/test_lexeme.py @@ -56,3 +56,18 @@ def test_vocab_lexeme_add_flag_provided_id(en_vocab): assert en_vocab['199'].check_flag(IS_DIGIT) == False assert en_vocab['the'].check_flag(is_len4) == False assert en_vocab['dogs'].check_flag(is_len4) == True + + +def test_lexeme_bytes_roundtrip(en_vocab): + one = en_vocab['one'] + alpha = en_vocab['alpha'] + assert one.orth != alpha.orth + assert one.lower != alpha.lower + print(one.orth, alpha.orth) + alpha.from_bytes(one.to_bytes()) + + assert one.orth_ == alpha.orth_ + assert one.orth == alpha.orth + assert one.lower == alpha.lower + assert one.lower_ == alpha.lower_ + diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index c094bea0d..05a73ea34 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -97,11 +97,10 @@ cdef class Tokenizer: def __reduce__(self): args = (self.vocab, self._rules, - self._prefix_re, - self._suffix_re, - self._infix_re, + self.prefix_search, + self.suffix_search, + self.infix_finditer, self.token_match) - return (self.__class__, args, None, None) cpdef Doc tokens_from_list(self, list strings): diff --git a/spacy/train.py b/spacy/train.py index 69572356e..1f5e5250c 100644 --- a/spacy/train.py +++ b/spacy/train.py 
@@ -20,14 +20,16 @@ class Trainer(object): """ def __init__(self, nlp, gold_tuples): self.nlp = nlp - self.gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples) self.nr_epoch = 0 self.optimizer = Adam(NumpyOps(), 0.001) + self.gold_tuples = gold_tuples def epochs(self, nr_epoch, augment_data=None, gold_preproc=False): cached_golds = {} def _epoch(indices): - for i in tqdm.tqdm(indices): + all_docs = [] + all_golds = [] + for i in indices: raw_text, paragraph_tuples = self.gold_tuples[i] if gold_preproc: raw_text = None @@ -43,7 +45,11 @@ class Trainer(object): raw_text, paragraph_tuples = augment_data(raw_text, paragraph_tuples) docs = self.make_docs(raw_text, paragraph_tuples) golds = self.make_golds(docs, paragraph_tuples) - yield docs, golds + all_docs.extend(docs) + all_golds.extend(golds) + for batch in tqdm.tqdm(partition_all(12, zip(all_docs, all_golds))): + X, y = zip(*batch) + yield X, y indices = list(range(len(self.gold_tuples))) for itn in range(nr_epoch): diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 4255819aa..0a68c6ef3 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -5,7 +5,7 @@ import bz2 import ujson import re -from libc.string cimport memset +from libc.string cimport memset, memcpy from libc.stdint cimport int32_t from libc.math cimport sqrt from cymem.cymem cimport Address @@ -13,9 +13,10 @@ from .lexeme cimport EMPTY_LEXEME from .lexeme cimport Lexeme from .strings cimport hash_string from .typedefs cimport attr_t -from .cfile cimport CFile, StringCFile +from .cfile cimport CFile from .tokens.token cimport Token from .attrs cimport PROB, LANG +from .structs cimport SerializedLexemeC from .compat import copy_reg, pickle from .lemmatizer import Lemmatizer @@ -42,6 +43,7 @@ cdef class Vocab: def load(cls, path, lex_attr_getters=None, lemmatizer=True, tag_map=True, oov_prob=True, **deprecated_kwargs): """ + Deprecated --- replace in spaCy 2 Load the vocabulary from a path. Arguments: @@ -88,6 +90,7 @@ cdef class Vocab: self.load_lexemes(path / 'vocab' / 'lexemes.bin') return self + def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None, strings=tuple(), **deprecated_kwargs): """ @@ -149,24 +152,7 @@ cdef class Vocab: The current number of lexemes stored. """ return self.length - - def resize_vectors(self, int new_size): - """ - Set vectors_length to a new size, and allocate more memory for the Lexeme - vectors if necessary. The memory will be zeroed. - - Arguments: - new_size (int): The new size of the vectors. - """ - cdef hash_t key - cdef size_t addr - if new_size > self.vectors_length: - for key, addr in self._by_hash.items(): - lex = addr - lex.vector = self.mem.realloc(lex.vector, - new_size * sizeof(lex.vector[0])) - self.vectors_length = new_size - + def add_flag(self, flag_getter, int flag_id=-1): """ Set a new boolean flag to words in the vocabulary. @@ -224,7 +210,7 @@ cdef class Vocab: if lex != NULL: if lex.orth != self.strings[string]: raise LookupError.mismatched_strings( - lex.orth, self.strings[string], self.strings[lex.orth], string) + lex.orth, self.strings[string], string) return lex else: return self._new_lexeme(mem, string) @@ -337,148 +323,75 @@ cdef class Vocab: Token.set_struct_attr(token, attr_id, value) return tokens - def dump(self, loc=None): - """ - Save the lexemes binary data to the given location, or - return a byte-string with the data if loc is None. 
+ def to_disk(self, path): + path = util.ensure_path(path) + if not path.exists(): + path.mkdir() + strings_loc = path / 'strings.json' + with strings_loc.open('w', encoding='utf8') as file_: + self.strings.dump(file_) + self.dump(path / 'lexemes.bin') - Arguments: - loc (Path or None): The path to save to, or None. - """ - cdef CFile fp - if loc is None: - fp = StringCFile('wb') - else: - fp = CFile(loc, 'wb') - cdef size_t st + def from_disk(self, path): + path = util.ensure_path(path) + with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_: + strings_list = ujson.load(file_) + for string in strings_list: + self.strings[string] + self.load_lexemes(path / 'lexemes.bin') + + def lexemes_to_bytes(self, **exclude): + cdef hash_t key cdef size_t addr - cdef hash_t key cdef LexemeC* lexeme = NULL + cdef SerializedLexemeC lex_data + cdef int size = 0 for key, addr in self._by_hash.items(): + if addr == 0: + continue + size += sizeof(lex_data.data) + byte_string = b'\0' * size + byte_ptr = byte_string + cdef int j + cdef int i = 0 + for key, addr in self._by_hash.items(): + if addr == 0: + continue lexeme = addr - fp.write_from(&lexeme.orth, sizeof(lexeme.orth), 1) - fp.write_from(&lexeme.flags, sizeof(lexeme.flags), 1) - fp.write_from(&lexeme.id, sizeof(lexeme.id), 1) - fp.write_from(&lexeme.length, sizeof(lexeme.length), 1) - fp.write_from(&lexeme.orth, sizeof(lexeme.orth), 1) - fp.write_from(&lexeme.lower, sizeof(lexeme.lower), 1) - fp.write_from(&lexeme.norm, sizeof(lexeme.norm), 1) - fp.write_from(&lexeme.shape, sizeof(lexeme.shape), 1) - fp.write_from(&lexeme.prefix, sizeof(lexeme.prefix), 1) - fp.write_from(&lexeme.suffix, sizeof(lexeme.suffix), 1) - fp.write_from(&lexeme.cluster, sizeof(lexeme.cluster), 1) - fp.write_from(&lexeme.prob, sizeof(lexeme.prob), 1) - fp.write_from(&lexeme.sentiment, sizeof(lexeme.sentiment), 1) - fp.write_from(&lexeme.l2_norm, sizeof(lexeme.l2_norm), 1) - fp.write_from(&lexeme.lang, sizeof(lexeme.lang), 1) - fp.close() - if loc is None: - return fp.string_data() + lex_data = Lexeme.c_to_bytes(lexeme) + for j in range(sizeof(lex_data.data)): + byte_ptr[i] = lex_data.data[j] + i += 1 + return byte_string - def load_lexemes(self, loc): + def lexemes_from_bytes(self, bytes bytes_data): """ - Load the binary vocabulary data from the given location. - - Arguments: - loc (Path): The path to load from. - - Returns: - None + Load the binary vocabulary data from the given string. 
""" - fp = CFile(loc, 'rb', - on_open_error=lambda: IOError('LexemeCs file not found at %s' % loc)) - cdef LexemeC* lexeme = NULL + cdef LexemeC* lexeme cdef hash_t key cdef unicode py_str - cdef attr_t orth = 0 - assert sizeof(orth) == sizeof(lexeme.orth) - i = 0 - while True: - try: - fp.read_into(&orth, 1, sizeof(orth)) - except IOError: - break - lexeme = self.mem.alloc(sizeof(LexemeC), 1) - # Copy data from the file into the lexeme - fp.read_into(&lexeme.flags, 1, sizeof(lexeme.flags)) - fp.read_into(&lexeme.id, 1, sizeof(lexeme.id)) - fp.read_into(&lexeme.length, 1, sizeof(lexeme.length)) - fp.read_into(&lexeme.orth, 1, sizeof(lexeme.orth)) - fp.read_into(&lexeme.lower, 1, sizeof(lexeme.lower)) - fp.read_into(&lexeme.norm, 1, sizeof(lexeme.norm)) - fp.read_into(&lexeme.shape, 1, sizeof(lexeme.shape)) - fp.read_into(&lexeme.prefix, 1, sizeof(lexeme.prefix)) - fp.read_into(&lexeme.suffix, 1, sizeof(lexeme.suffix)) - fp.read_into(&lexeme.cluster, 1, sizeof(lexeme.cluster)) - fp.read_into(&lexeme.prob, 1, sizeof(lexeme.prob)) - fp.read_into(&lexeme.sentiment, 1, sizeof(lexeme.sentiment)) - fp.read_into(&lexeme.l2_norm, 1, sizeof(lexeme.l2_norm)) - fp.read_into(&lexeme.lang, 1, sizeof(lexeme.lang)) + cdef int i = 0 + cdef int j = 0 + cdef SerializedLexemeC lex_data + chunk_size = sizeof(lex_data.data) + cdef unsigned char* bytes_ptr = bytes_data + for i in range(0, len(bytes_data), chunk_size): + lexeme = self.mem.alloc(1, sizeof(LexemeC)) + for j in range(sizeof(lex_data.data)): + lex_data.data[j] = bytes_ptr[i+j] + Lexeme.c_from_bytes(lexeme, lex_data) lexeme.vector = EMPTY_VEC py_str = self.strings[lexeme.orth] + assert self.strings[py_str] == lexeme.orth, (py_str, lexeme.orth) key = hash_string(py_str) self._by_hash.set(key, lexeme) self._by_orth.set(lexeme.orth, lexeme) self.length += 1 - i += 1 - fp.close() - - def _deserialize_lexemes(self, CFile fp): - """ - Load the binary vocabulary data from the given CFile. 
- """ - cdef LexemeC* lexeme = NULL - cdef hash_t key - cdef unicode py_str - cdef attr_t orth = 0 - assert sizeof(orth) == sizeof(lexeme.orth) - i = 0 - cdef int todo = fp.size - cdef int lex_size = sizeof(lexeme.flags) - lex_size += sizeof(lexeme.id) - lex_size += sizeof(lexeme.length) - lex_size += sizeof(lexeme.orth) - lex_size += sizeof(lexeme.lower) - lex_size += sizeof(lexeme.norm) - lex_size += sizeof(lexeme.shape) - lex_size += sizeof(lexeme.prefix) - lex_size += sizeof(lexeme.suffix) - lex_size += sizeof(lexeme.cluster) - lex_size += sizeof(lexeme.prob) - lex_size += sizeof(lexeme.sentiment) - lex_size += sizeof(lexeme.l2_norm) - lex_size += sizeof(lexeme.lang) - while True: - if todo < lex_size: - break - todo -= lex_size - lexeme = self.mem.alloc(sizeof(LexemeC), 1) - # Copy data from the file into the lexeme - fp.read_into(&lexeme.flags, 1, sizeof(lexeme.flags)) - fp.read_into(&lexeme.id, 1, sizeof(lexeme.id)) - fp.read_into(&lexeme.length, 1, sizeof(lexeme.length)) - fp.read_into(&lexeme.orth, 1, sizeof(lexeme.orth)) - fp.read_into(&lexeme.lower, 1, sizeof(lexeme.lower)) - fp.read_into(&lexeme.norm, 1, sizeof(lexeme.norm)) - fp.read_into(&lexeme.shape, 1, sizeof(lexeme.shape)) - fp.read_into(&lexeme.prefix, 1, sizeof(lexeme.prefix)) - fp.read_into(&lexeme.suffix, 1, sizeof(lexeme.suffix)) - fp.read_into(&lexeme.cluster, 1, sizeof(lexeme.cluster)) - fp.read_into(&lexeme.prob, 1, sizeof(lexeme.prob)) - fp.read_into(&lexeme.sentiment, 1, sizeof(lexeme.sentiment)) - fp.read_into(&lexeme.l2_norm, 1, sizeof(lexeme.l2_norm)) - fp.read_into(&lexeme.lang, 1, sizeof(lexeme.lang)) - - lexeme.vector = EMPTY_VEC - py_str = self.strings[lexeme.orth] - key = hash_string(py_str) - self._by_hash.set(key, lexeme) - self._by_orth.set(lexeme.orth, lexeme) - self.length += 1 - i += 1 - fp.close() + # Deprecated --- delete these once stable + def dump_vectors(self, out_loc): """ Save the word vectors to a binary file. @@ -487,7 +400,7 @@ cdef class Vocab: loc (Path): The path to save to. Returns: None - """ + #""" cdef int32_t vec_len = self.vectors_length cdef int32_t word_len cdef bytes word_str @@ -508,6 +421,8 @@ cdef class Vocab: out_file.write_from(vec, vec_len, sizeof(float)) out_file.close() + + def load_vectors(self, file_): """ Load vectors from a text-based file. @@ -610,38 +525,22 @@ cdef class Vocab: return vec_len -def pickle_vocab(vocab): - sstore = vocab.strings - morph = vocab.morphology - length = vocab.length - data_dir = vocab.data_dir - lex_attr_getters = vocab.lex_attr_getters + def resize_vectors(self, int new_size): + """ + Set vectors_length to a new size, and allocate more memory for the Lexeme + vectors if necessary. The memory will be zeroed. - lexemes_data = vocab.dump() - vectors_length = vocab.vectors_length - - return (unpickle_vocab, - (sstore, morph, data_dir, lex_attr_getters, - lexemes_data, length, vectors_length)) - - -def unpickle_vocab(sstore, morphology, data_dir, - lex_attr_getters, bytes lexemes_data, int length, int vectors_length): - cdef Vocab vocab = Vocab() - vocab.length = length - vocab.vectors_length = vectors_length - vocab.strings = sstore - cdef CFile fp = StringCFile('r', data=lexemes_data) - vocab.morphology = morphology - vocab.data_dir = data_dir - vocab.lex_attr_getters = lex_attr_getters - vocab._deserialize_lexemes(fp) - vocab.length = length - vocab.vectors_length = vectors_length - return vocab - - -copy_reg.pickle(Vocab, pickle_vocab, unpickle_vocab) + Arguments: + new_size (int): The new size of the vectors. 
+ """ + cdef hash_t key + cdef size_t addr + if new_size > self.vectors_length: + for key, addr in self._by_hash.items(): + lex = addr + lex.vector = self.mem.realloc(lex.vector, + new_size * sizeof(lex.vector[0])) + self.vectors_length = new_size def write_binary_vectors(in_loc, out_loc): @@ -670,6 +569,39 @@ def write_binary_vectors(in_loc, out_loc): out_file.write_from(vec, vec_len, sizeof(float)) +def pickle_vocab(vocab): + sstore = vocab.strings + morph = vocab.morphology + length = vocab.length + data_dir = vocab.data_dir + lex_attr_getters = vocab.lex_attr_getters + + lexemes_data = vocab.lexemes_to_bytes() + vectors_length = vocab.vectors_length + + return (unpickle_vocab, + (sstore, morph, data_dir, lex_attr_getters, + lexemes_data, length, vectors_length)) + + +def unpickle_vocab(sstore, morphology, data_dir, + lex_attr_getters, bytes lexemes_data, int length, int vectors_length): + cdef Vocab vocab = Vocab() + vocab.length = length + vocab.vectors_length = vectors_length + vocab.strings = sstore + vocab.morphology = morphology + vocab.data_dir = data_dir + vocab.lex_attr_getters = lex_attr_getters + vocab.lexemes_from_bytes(lexemes_data) + vocab.length = length + vocab.vectors_length = vectors_length + return vocab + + +copy_reg.pickle(Vocab, pickle_vocab, unpickle_vocab) + + class LookupError(Exception): @classmethod def mismatched_strings(cls, id_, id_string, original_string): @@ -701,3 +633,237 @@ class VectorReadError(Exception): "Vector size: %d\n" "Max size: %d\n" "Min size: 1\n" % (loc, size, MAX_VEC_SIZE)) + + +# +#Deprecated --- delete these once stable +# +# def dump_vectors(self, out_loc): +# """ +# Save the word vectors to a binary file. +# +# Arguments: +# loc (Path): The path to save to. +# Returns: +# None +# #""" +# cdef int32_t vec_len = self.vectors_length +# cdef int32_t word_len +# cdef bytes word_str +# cdef char* chars +# +# cdef Lexeme lexeme +# cdef CFile out_file = CFile(out_loc, 'wb') +# for lexeme in self: +# word_str = lexeme.orth_.encode('utf8') +# vec = lexeme.c.vector +# word_len = len(word_str) +# +# out_file.write_from(&word_len, 1, sizeof(word_len)) +# out_file.write_from(&vec_len, 1, sizeof(vec_len)) +# +# chars = word_str +# out_file.write_from(chars, word_len, sizeof(char)) +# out_file.write_from(vec, vec_len, sizeof(float)) +# out_file.close() +# +# +# +# def load_vectors(self, file_): +# """ +# Load vectors from a text-based file. +# +# Arguments: +# file_ (buffer): The file to read from. Entries should be separated by newlines, +# and each entry should be whitespace delimited. The first value of the entry +# should be the word string, and subsequent entries should be the values of the +# vector. +# +# Returns: +# vec_len (int): The length of the vectors loaded. 
+# """ +# cdef LexemeC* lexeme +# cdef attr_t orth +# cdef int32_t vec_len = -1 +# cdef double norm = 0.0 +# +# whitespace_pattern = re.compile(r'\s', re.UNICODE) +# +# for line_num, line in enumerate(file_): +# pieces = line.split() +# word_str = " " if whitespace_pattern.match(line) else pieces.pop(0) +# if vec_len == -1: +# vec_len = len(pieces) +# elif vec_len != len(pieces): +# raise VectorReadError.mismatched_sizes(file_, line_num, +# vec_len, len(pieces)) +# orth = self.strings[word_str] +# lexeme = self.get_by_orth(self.mem, orth) +# lexeme.vector = self.mem.alloc(vec_len, sizeof(float)) +# for i, val_str in enumerate(pieces): +# lexeme.vector[i] = float(val_str) +# norm = 0.0 +# for i in range(vec_len): +# norm += lexeme.vector[i] * lexeme.vector[i] +# lexeme.l2_norm = sqrt(norm) +# self.vectors_length = vec_len +# return vec_len +# +# def load_vectors_from_bin_loc(self, loc): +# """ +# Load vectors from the location of a binary file. +# +# Arguments: +# loc (unicode): The path of the binary file to load from. +# +# Returns: +# vec_len (int): The length of the vectors loaded. +# """ +# cdef CFile file_ = CFile(loc, b'rb') +# cdef int32_t word_len +# cdef int32_t vec_len = 0 +# cdef int32_t prev_vec_len = 0 +# cdef float* vec +# cdef Address mem +# cdef attr_t string_id +# cdef bytes py_word +# cdef vector[float*] vectors +# cdef int line_num = 0 +# cdef Pool tmp_mem = Pool() +# while True: +# try: +# file_.read_into(&word_len, sizeof(word_len), 1) +# except IOError: +# break +# file_.read_into(&vec_len, sizeof(vec_len), 1) +# if prev_vec_len != 0 and vec_len != prev_vec_len: +# raise VectorReadError.mismatched_sizes(loc, line_num, +# vec_len, prev_vec_len) +# if 0 >= vec_len >= MAX_VEC_SIZE: +# raise VectorReadError.bad_size(loc, vec_len) +# +# chars = file_.alloc_read(tmp_mem, word_len, sizeof(char)) +# vec = file_.alloc_read(self.mem, vec_len, sizeof(float)) +# +# string_id = self.strings[chars[:word_len]] +# # Insert words into vocab to add vector. +# self.get_by_orth(self.mem, string_id) +# while string_id >= vectors.size(): +# vectors.push_back(EMPTY_VEC) +# assert vec != NULL +# vectors[string_id] = vec +# line_num += 1 +# cdef LexemeC* lex +# cdef size_t lex_addr +# cdef double norm = 0.0 +# cdef int i +# for orth, lex_addr in self._by_orth.items(): +# lex = lex_addr +# if lex.lower < vectors.size(): +# lex.vector = vectors[lex.lower] +# norm = 0.0 +# for i in range(vec_len): +# norm += lex.vector[i] * lex.vector[i] +# lex.l2_norm = sqrt(norm) +# else: +# lex.vector = EMPTY_VEC +# self.vectors_length = vec_len +# return vec_len +# +# +#def write_binary_vectors(in_loc, out_loc): +# cdef CFile out_file = CFile(out_loc, 'wb') +# cdef Address mem +# cdef int32_t word_len +# cdef int32_t vec_len +# cdef char* chars +# with bz2.BZ2File(in_loc, 'r') as file_: +# for line in file_: +# pieces = line.split() +# word = pieces.pop(0) +# mem = Address(len(pieces), sizeof(float)) +# vec = mem.ptr +# for i, val_str in enumerate(pieces): +# vec[i] = float(val_str) +# +# word_len = len(word) +# vec_len = len(pieces) +# +# out_file.write_from(&word_len, 1, sizeof(word_len)) +# out_file.write_from(&vec_len, 1, sizeof(vec_len)) +# +# chars = word +# out_file.write_from(chars, len(word), sizeof(char)) +# out_file.write_from(vec, vec_len, sizeof(float)) +# +# +# def resize_vectors(self, int new_size): +# """ +# Set vectors_length to a new size, and allocate more memory for the Lexeme +# vectors if necessary. The memory will be zeroed. 
+# +# Arguments: +# new_size (int): The new size of the vectors. +# """ +# cdef hash_t key +# cdef size_t addr +# if new_size > self.vectors_length: +# for key, addr in self._by_hash.items(): +# lex = addr +# lex.vector = self.mem.realloc(lex.vector, +# new_size * sizeof(lex.vector[0])) +# self.vectors_length = new_size +# +# + +# +# def dump(self, loc=None): +# """ +# Save the lexemes binary data to the given location, or +# return a byte-string with the data if loc is None. +# +# Arguments: +# loc (Path or None): The path to save to, or None. +# """ +# if loc is None: +# return self.to_bytes() +# else: +# return self.to_disk(loc) +# +# def load_lexemes(self, loc): +# """ +# Load the binary vocabulary data from the given location. +# +# Arguments: +# loc (Path): The path to load from. +# +# Returns: +# None +# """ +# fp = CFile(loc, 'rb', +# on_open_error=lambda: IOError('LexemeCs file not found at %s' % loc)) +# cdef LexemeC* lexeme = NULL +# cdef SerializedLexemeC lex_data +# cdef hash_t key +# cdef unicode py_str +# cdef attr_t orth = 0 +# assert sizeof(orth) == sizeof(lexeme.orth) +# i = 0 +# while True: +# try: +# fp.read_into(&orth, 1, sizeof(orth)) +# except IOError: +# break +# lexeme = self.mem.alloc(sizeof(LexemeC), 1) +# # Copy data from the file into the lexeme +# fp.read_into(&lex_data.data, 1, sizeof(lex_data.data)) +# Lexeme.c_from_bytes(lexeme, lex_data) +# +# lexeme.vector = EMPTY_VEC +# py_str = self.strings[lexeme.orth] +# key = hash_string(py_str) +# self._by_hash.set(key, lexeme) +# self._by_orth.set(lexeme.orth, lexeme) +# self.length += 1 +# i += 1 +# fp.close()
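
As a minimal sketch of the serialization scheme introduced above: each LexemeC is copied into a fixed-width byte record (SerializedLexemeC, a 4*13 + 8 byte buffer), Vocab.lexemes_to_bytes() concatenates the records, and Vocab.lexemes_from_bytes() walks the byte string in sizeof(lex_data.data) steps, which is also what pickle_vocab/unpickle_vocab rely on. The stand-alone Python sketch below mirrors that pattern; the field names, the struct format LEX_FORMAT, and the dict-based lexeme stand-in are illustrative assumptions rather than spaCy's actual memory layout or API.

# Illustrative analogue of the fixed-width lexeme serialization used above.
# Field names and widths are hypothetical; the real SerializedLexemeC packs
# thirteen 4-byte attributes plus two floats (see the comment in structs.pxd).
import struct

# One record: orth, lower, length, cluster as unsigned 32-bit ints,
# prob and l2_norm as 32-bit floats -> 4*4 + 2*4 = 24 bytes per lexeme.
LEX_FORMAT = '<4I2f'
LEX_SIZE = struct.calcsize(LEX_FORMAT)


def lexemes_to_bytes(lexemes):
    # Concatenate one fixed-width record per lexeme, mirroring how
    # Vocab.lexemes_to_bytes copies sizeof(lex_data.data) bytes per entry
    # into a single contiguous byte string.
    chunks = []
    for lex in lexemes:
        chunks.append(struct.pack(LEX_FORMAT,
                                  lex['orth'], lex['lower'],
                                  lex['length'], lex['cluster'],
                                  lex['prob'], lex['l2_norm']))
    return b''.join(chunks)


def lexemes_from_bytes(byte_string):
    # Walk the byte string in fixed-size steps, mirroring the chunk_size
    # loop in Vocab.lexemes_from_bytes.
    lexemes = []
    for i in range(0, len(byte_string), LEX_SIZE):
        orth, lower, length, cluster, prob, l2_norm = struct.unpack(
            LEX_FORMAT, byte_string[i:i + LEX_SIZE])
        lexemes.append({'orth': orth, 'lower': lower, 'length': length,
                        'cluster': cluster, 'prob': prob, 'l2_norm': l2_norm})
    return lexemes


if __name__ == '__main__':
    # Round-trip check, analogous in spirit to test_lexeme_bytes_roundtrip.
    vocab = [
        {'orth': 1, 'lower': 1, 'length': 3, 'cluster': 7,
         'prob': -4.25, 'l2_norm': 1.0},
        {'orth': 2, 'lower': 2, 'length': 5, 'cluster': 9,
         'prob': -6.5, 'l2_norm': 1.0},
    ]
    data = lexemes_to_bytes(vocab)
    assert len(data) == len(vocab) * LEX_SIZE
    assert lexemes_from_bytes(data) == vocab

Because every record has the same size, the reader needs no per-record headers or delimiters; that is the property that lets pickle_vocab hand the vocab's byte string straight to unpickle_vocab and rebuild the lexeme table with lexemes_from_bytes.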