From a902b5f2175012bc57bd326dbfcf43e5ed2a91a5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 3 Jul 2020 12:58:16 +0200 Subject: [PATCH] Record whether Doc objects are built from known spacing (#5697) * Tell convert CLI to store user data for Doc * Remove assert * Add has_unknwon_spaces flag on Doc * Do not tokenize docs with unknown spaces in Corpus * Handle conversion of unknown spaces in Example * Fixes * Fixes * Draft has_known_spaces support in DocBin * Add test for serialize has_unknown_spaces * Fix DocBin serialization when has_unknown_spaces * Use serialization in test --- spacy/cli/convert.py | 4 +- spacy/gold/converters/json2docs.py | 2 - spacy/gold/corpus.py | 36 +++++++++------- spacy/gold/example.pyx | 4 +- spacy/tests/serialize/test_serialize_doc.py | 16 +++++++ spacy/tokens/_serialize.py | 10 +++++ spacy/tokens/doc.pxd | 3 ++ spacy/tokens/doc.pyx | 46 ++++++++++----------- 8 files changed, 76 insertions(+), 45 deletions(-) diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index 976fe7910..56f38766a 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -137,7 +137,7 @@ def _print_docs_to_stdout(docs, output_type): if output_type == "json": srsly.write_json("-", docs_to_json(docs)) else: - sys.stdout.buffer.write(DocBin(docs=docs).to_bytes()) + sys.stdout.buffer.write(DocBin(docs=docs, store_user_data=True).to_bytes()) def _write_docs_to_file(docs, output_file, output_type): @@ -146,7 +146,7 @@ def _write_docs_to_file(docs, output_file, output_type): if output_type == "json": srsly.write_json(output_file, docs_to_json(docs)) else: - data = DocBin(docs=docs).to_bytes() + data = DocBin(docs=docs, store_user_data=True).to_bytes() with output_file.open("wb") as file_: file_.write(data) diff --git a/spacy/gold/converters/json2docs.py b/spacy/gold/converters/json2docs.py index 50ad16faf..342f94848 100644 --- a/spacy/gold/converters/json2docs.py +++ b/spacy/gold/converters/json2docs.py @@ -17,8 +17,6 @@ def json2docs(input_data, model=None, **kwargs): for json_para in json_to_annotations(json_doc): example_dict = _fix_legacy_dict_data(json_para) tok_dict, doc_dict = _parse_example_dict_data(example_dict) - if json_para.get("raw"): - assert tok_dict.get("SPACY") doc = annotations2doc(nlp.vocab, tok_dict, doc_dict) docs.append(doc) return docs diff --git a/spacy/gold/corpus.py b/spacy/gold/corpus.py index 9a688987c..64f38d21c 100644 --- a/spacy/gold/corpus.py +++ b/spacy/gold/corpus.py @@ -43,25 +43,36 @@ class Corpus: locs.append(path) return locs + def _make_example(self, nlp, reference, gold_preproc): + if gold_preproc or reference.has_unknown_spaces: + return Example( + Doc( + nlp.vocab, + words=[word.text for word in reference], + spaces=[bool(word.whitespace_) for word in reference] + ), + reference + ) + else: + return Example( + nlp.make_doc(reference.text), + reference + ) + def make_examples(self, nlp, reference_docs, max_length=0): for reference in reference_docs: if len(reference) == 0: continue elif max_length == 0 or len(reference) < max_length: - yield Example( - nlp.make_doc(reference.text), - reference - ) + yield self._make_example(nlp, reference, False) elif reference.is_sentenced: for ref_sent in reference.sents: if len(ref_sent) == 0: continue elif max_length == 0 or len(ref_sent) < max_length: - yield Example( - nlp.make_doc(ref_sent.text), - ref_sent.as_doc() - ) + yield self._make_example(nlp, ref_sent.as_doc(), False) + def make_examples_gold_preproc(self, nlp, reference_docs): for reference in reference_docs: if reference.is_sentenced: @@ -69,14 +80,7 @@ class Corpus: else: ref_sents = [reference] for ref_sent in ref_sents: - eg = Example( - Doc( - nlp.vocab, - words=[w.text for w in ref_sent], - spaces=[bool(w.whitespace_) for w in ref_sent] - ), - ref_sent - ) + eg = self._make_example(nlp, ref_sent, True) if len(eg.x): yield eg diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx index 2ecee1821..7b629dcd2 100644 --- a/spacy/gold/example.pyx +++ b/spacy/gold/example.pyx @@ -15,7 +15,7 @@ from ..syntax import nonproj cpdef Doc annotations2doc(vocab, tok_annot, doc_annot): - """ Create a Doc from dictionaries with token and doc annotations. Assumes ORTH & SPACY are set. """ + """ Create a Doc from dictionaries with token and doc annotations. """ attrs, array = _annot2array(vocab, tok_annot, doc_annot) output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"]) if "entities" in doc_annot: @@ -406,7 +406,7 @@ def _parse_links(vocab, words, spaces, links): def _guess_spaces(text, words): if text is None: - return [True] * len(words) + return None spaces = [] text_pos = 0 # align words with text diff --git a/spacy/tests/serialize/test_serialize_doc.py b/spacy/tests/serialize/test_serialize_doc.py index 615bb1cd9..85c21f7f9 100644 --- a/spacy/tests/serialize/test_serialize_doc.py +++ b/spacy/tests/serialize/test_serialize_doc.py @@ -75,3 +75,19 @@ def test_serialize_doc_bin(): for i, doc in enumerate(reloaded_docs): assert doc.text == texts[i] assert doc.cats == cats + + +def test_serialize_doc_bin_unknown_spaces(en_vocab): + doc1 = Doc(en_vocab, words=["that", "'s"]) + assert doc1.has_unknown_spaces + assert doc1.text == "that 's " + doc2 = Doc(en_vocab, words=["that", "'s"], spaces=[False, False]) + assert not doc2.has_unknown_spaces + assert doc2.text == "that's" + + doc_bin = DocBin().from_bytes(DocBin(docs=[doc1, doc2]).to_bytes()) + re_doc1, re_doc2 = doc_bin.get_docs(en_vocab) + assert re_doc1.has_unknown_spaces + assert re_doc1.text == "that 's " + assert not re_doc2.has_unknown_spaces + assert re_doc2.text == "that's" diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py index a56900988..edc183e0d 100644 --- a/spacy/tokens/_serialize.py +++ b/spacy/tokens/_serialize.py @@ -61,6 +61,7 @@ class DocBin(object): self.spaces = [] self.cats = [] self.user_data = [] + self.flags = [] self.strings = set() self.store_user_data = store_user_data for doc in docs: @@ -85,6 +86,9 @@ class DocBin(object): assert array.shape[0] == spaces.shape[0] # this should never happen spaces = spaces.reshape((spaces.shape[0], 1)) self.spaces.append(numpy.asarray(spaces, dtype=bool)) + self.flags.append({ + "has_unknown_spaces": doc.has_unknown_spaces + }) for token in doc: self.strings.add(token.text) self.strings.add(token.tag_) @@ -109,8 +113,11 @@ class DocBin(object): vocab[string] orth_col = self.attrs.index(ORTH) for i in range(len(self.tokens)): + flags = self.flags[i] tokens = self.tokens[i] spaces = self.spaces[i] + if flags.get("has_unknown_spaces"): + spaces = None doc = Doc(vocab, words=tokens[:, orth_col], spaces=spaces) doc = doc.from_array(self.attrs, tokens) doc.cats = self.cats[i] @@ -134,6 +141,7 @@ class DocBin(object): self.spaces.extend(other.spaces) self.strings.update(other.strings) self.cats.extend(other.cats) + self.flags.extend(other.flags) if self.store_user_data: self.user_data.extend(other.user_data) @@ -158,6 +166,7 @@ class DocBin(object): "lengths": numpy.asarray(lengths, dtype="int32").tobytes("C"), "strings": list(self.strings), "cats": self.cats, + "flags": self.flags, } if self.store_user_data: msg["user_data"] = self.user_data @@ -183,6 +192,7 @@ class DocBin(object): self.tokens = NumpyOps().unflatten(flat_tokens, lengths) self.spaces = NumpyOps().unflatten(flat_spaces, lengths) self.cats = msg["cats"] + self.flags = msg.get("flags", [{} for _ in lengths]) if self.store_user_data and "user_data" in msg: self.user_data = list(msg["user_data"]) for tokens in self.tokens: diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd index 42918ab6d..2775aa97e 100644 --- a/spacy/tokens/doc.pxd +++ b/spacy/tokens/doc.pxd @@ -59,11 +59,14 @@ cdef class Doc: cdef public dict user_token_hooks cdef public dict user_span_hooks + cdef public bint has_unknown_spaces + cdef public list _py_tokens cdef int length cdef int max_length + cdef public object noun_chunks_iterator cdef object __weakref__ diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index f69a6811d..723873e1f 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -172,8 +172,7 @@ cdef class Doc: raise ValueError(Errors.E046.format(name=name)) return Underscore.doc_extensions.pop(name) - def __init__(self, Vocab vocab, words=None, spaces=None, user_data=None, - orths_and_spaces=None): + def __init__(self, Vocab vocab, words=None, spaces=None, user_data=None): """Create a Doc object. vocab (Vocab): A vocabulary object, which must match any models you @@ -215,28 +214,25 @@ cdef class Doc: self._vector = None self.noun_chunks_iterator = _get_chunker(self.vocab.lang) cdef bint has_space - if orths_and_spaces is None and words is not None: - if spaces is None: - spaces = [True] * len(words) - elif len(spaces) != len(words): - raise ValueError(Errors.E027) - orths_and_spaces = zip(words, spaces) + if words is None and spaces is not None: + raise ValueError("words must be set if spaces is set") + elif spaces is None and words is not None: + self.has_unknown_spaces = True + else: + self.has_unknown_spaces = False + words = words if words is not None else [] + spaces = spaces if spaces is not None else ([True] * len(words)) + if len(spaces) != len(words): + raise ValueError(Errors.E027) cdef const LexemeC* lexeme - if orths_and_spaces is not None: - orths_and_spaces = list(orths_and_spaces) - for orth_space in orths_and_spaces: - if isinstance(orth_space, unicode): - lexeme = self.vocab.get(self.mem, orth_space) - has_space = True - elif isinstance(orth_space, bytes): - raise ValueError(Errors.E028.format(value=orth_space)) - elif isinstance(orth_space[0], unicode): - lexeme = self.vocab.get(self.mem, orth_space[0]) - has_space = orth_space[1] - else: - lexeme = self.vocab.get_by_orth(self.mem, orth_space[0]) - has_space = orth_space[1] - self.push_back(lexeme, has_space) + for word, has_space in zip(words, spaces): + if isinstance(word, unicode): + lexeme = self.vocab.get(self.mem, word) + elif isinstance(word, bytes): + raise ValueError(Errors.E028.format(value=word)) + else: + lexeme = self.vocab.get_by_orth(self.mem, word) + self.push_back(lexeme, has_space) # Tough to decide on policy for this. Is an empty doc tagged and parsed? # There's no information we'd like to add to it, so I guess so? if self.length == 0: @@ -1082,6 +1078,7 @@ cdef class Doc: "sentiment": lambda: self.sentiment, "tensor": lambda: self.tensor, "cats": lambda: self.cats, + "has_unknown_spaces": lambda: self.has_unknown_spaces } for key in kwargs: if key in serializers or key in ("user_data", "user_data_keys", "user_data_values"): @@ -1114,6 +1111,7 @@ cdef class Doc: "cats": lambda b: None, "user_data_keys": lambda b: None, "user_data_values": lambda b: None, + "has_unknown_spaces": lambda b: None } for key in kwargs: if key in deserializers or key in ("user_data",): @@ -1134,6 +1132,8 @@ cdef class Doc: self.tensor = msg["tensor"] if "cats" not in exclude and "cats" in msg: self.cats = msg["cats"] + if "has_unknown_spaces" not in exclude and "has_unknown_spaces" in msg: + self.has_unknown_spaces = msg["has_unknown_spaces"] start = 0 cdef const LexemeC* lex cdef unicode orth_