Mirror of https://github.com/explosion/spaCy.git
Record whether Doc objects are built from known spacing (#5697)
* Tell convert CLI to store user data for Doc
* Remove assert
* Add has_unknown_spaces flag on Doc
* Do not tokenize docs with unknown spaces in Corpus
* Handle conversion of unknown spaces in Example
* Fixes
* Fixes
* Draft has_unknown_spaces support in DocBin
* Add test for serializing has_unknown_spaces
* Fix DocBin serialization when has_unknown_spaces
* Use serialization in test
This commit is contained in:
parent abad56db7d
commit a902b5f217
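In practice (a minimal sketch mirroring the test added in this commit; en_vocab is assumed to be spaCy's usual English-vocab test fixture):

    from spacy.tokens import Doc, DocBin

    # Omitting `spaces` now records that the spacing is unknown
    doc = Doc(en_vocab, words=["that", "'s"])
    assert doc.has_unknown_spaces
    assert doc.text == "that 's "  # unknown spaces default to trailing spaces

    # The flag survives a DocBin round-trip
    doc_bin = DocBin().from_bytes(DocBin(docs=[doc]).to_bytes())
    redoc = next(doc_bin.get_docs(en_vocab))
    assert redoc.has_unknown_spaces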
@@ -137,7 +137,7 @@ def _print_docs_to_stdout(docs, output_type):
     if output_type == "json":
         srsly.write_json("-", docs_to_json(docs))
     else:
-        sys.stdout.buffer.write(DocBin(docs=docs).to_bytes())
+        sys.stdout.buffer.write(DocBin(docs=docs, store_user_data=True).to_bytes())


 def _write_docs_to_file(docs, output_file, output_type):
@@ -146,7 +146,7 @@ def _write_docs_to_file(docs, output_file, output_type):
     if output_type == "json":
         srsly.write_json(output_file, docs_to_json(docs))
     else:
-        data = DocBin(docs=docs).to_bytes()
+        data = DocBin(docs=docs, store_user_data=True).to_bytes()
         with output_file.open("wb") as file_:
             file_.write(data)
@@ -17,8 +17,6 @@ def json2docs(input_data, model=None, **kwargs):
         for json_para in json_to_annotations(json_doc):
             example_dict = _fix_legacy_dict_data(json_para)
             tok_dict, doc_dict = _parse_example_dict_data(example_dict)
-            if json_para.get("raw"):
-                assert tok_dict.get("SPACY")
             doc = annotations2doc(nlp.vocab, tok_dict, doc_dict)
             docs.append(doc)
     return docs
@@ -43,24 +43,35 @@ class Corpus:
             locs.append(path)
         return locs

+    def _make_example(self, nlp, reference, gold_preproc):
+        if gold_preproc or reference.has_unknown_spaces:
+            return Example(
+                Doc(
+                    nlp.vocab,
+                    words=[word.text for word in reference],
+                    spaces=[bool(word.whitespace_) for word in reference]
+                ),
+                reference
+            )
+        else:
+            return Example(
+                nlp.make_doc(reference.text),
+                reference
+            )
+
     def make_examples(self, nlp, reference_docs, max_length=0):
         for reference in reference_docs:
             if len(reference) == 0:
                 continue
             elif max_length == 0 or len(reference) < max_length:
-                yield Example(
-                    nlp.make_doc(reference.text),
-                    reference
-                )
+                yield self._make_example(nlp, reference, False)
             elif reference.is_sentenced:
                 for ref_sent in reference.sents:
                     if len(ref_sent) == 0:
                         continue
                     elif max_length == 0 or len(ref_sent) < max_length:
-                        yield Example(
-                            nlp.make_doc(ref_sent.text),
-                            ref_sent.as_doc()
-                        )
+                        yield self._make_example(nlp, ref_sent.as_doc(), False)

     def make_examples_gold_preproc(self, nlp, reference_docs):
         for reference in reference_docs:
@@ -69,14 +80,7 @@ class Corpus:
             else:
                 ref_sents = [reference]
             for ref_sent in ref_sents:
-                eg = Example(
-                    Doc(
-                        nlp.vocab,
-                        words=[w.text for w in ref_sent],
-                        spaces=[bool(w.whitespace_) for w in ref_sent]
-                    ),
-                    ref_sent
-                )
+                eg = self._make_example(nlp, ref_sent, True)
                 if len(eg.x):
                     yield eg
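The observable effect of the new helper, as a hedged sketch (assuming corpus is a Corpus instance and nlp any loaded pipeline): a reference Doc built without explicit spaces is copied token-for-token rather than re-tokenized from reference.text, which would otherwise bake the guessed spacing into the predicted tokens.

    # Hypothetical illustration of the unknown-spaces branch in _make_example
    ref = Doc(nlp.vocab, words=["that", "'s"])       # spaces unknown
    eg = corpus._make_example(nlp, ref, False)       # gold_preproc=False
    assert [t.text for t in eg.x] == ["that", "'s"]  # gold tokens preserved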
@@ -15,7 +15,7 @@ from ..syntax import nonproj


 cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
-    """ Create a Doc from dictionaries with token and doc annotations. Assumes ORTH & SPACY are set. """
+    """ Create a Doc from dictionaries with token and doc annotations. """
     attrs, array = _annot2array(vocab, tok_annot, doc_annot)
     output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"])
     if "entities" in doc_annot:
@@ -406,7 +406,7 @@ def _parse_links(vocab, words, spaces, links):

 def _guess_spaces(text, words):
     if text is None:
-        return [True] * len(words)
+        return None
     spaces = []
     text_pos = 0
     # align words with text
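Returning None here, instead of guessing all-True spacing, lets annotations without raw text propagate the unknown state into the Doc constructor (a sketch, assuming a Vocab instance vocab):

    spaces = _guess_spaces(None, ["that", "'s"])  # was [True, True], now None
    doc = Doc(vocab, words=["that", "'s"], spaces=spaces)
    assert doc.has_unknown_spaces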
@@ -75,3 +75,19 @@ def test_serialize_doc_bin():
     for i, doc in enumerate(reloaded_docs):
         assert doc.text == texts[i]
         assert doc.cats == cats
+
+
+def test_serialize_doc_bin_unknown_spaces(en_vocab):
+    doc1 = Doc(en_vocab, words=["that", "'s"])
+    assert doc1.has_unknown_spaces
+    assert doc1.text == "that 's "
+    doc2 = Doc(en_vocab, words=["that", "'s"], spaces=[False, False])
+    assert not doc2.has_unknown_spaces
+    assert doc2.text == "that's"
+
+    doc_bin = DocBin().from_bytes(DocBin(docs=[doc1, doc2]).to_bytes())
+    re_doc1, re_doc2 = doc_bin.get_docs(en_vocab)
+    assert re_doc1.has_unknown_spaces
+    assert re_doc1.text == "that 's "
+    assert not re_doc2.has_unknown_spaces
+    assert re_doc2.text == "that's"
@@ -61,6 +61,7 @@ class DocBin(object):
         self.spaces = []
         self.cats = []
         self.user_data = []
+        self.flags = []
         self.strings = set()
         self.store_user_data = store_user_data
         for doc in docs:
@@ -85,6 +86,9 @@ class DocBin(object):
         assert array.shape[0] == spaces.shape[0]  # this should never happen
         spaces = spaces.reshape((spaces.shape[0], 1))
         self.spaces.append(numpy.asarray(spaces, dtype=bool))
+        self.flags.append({
+            "has_unknown_spaces": doc.has_unknown_spaces
+        })
         for token in doc:
             self.strings.add(token.text)
             self.strings.add(token.tag_)
@@ -109,8 +113,11 @@ class DocBin(object):
             vocab[string]
         orth_col = self.attrs.index(ORTH)
         for i in range(len(self.tokens)):
+            flags = self.flags[i]
             tokens = self.tokens[i]
             spaces = self.spaces[i]
+            if flags.get("has_unknown_spaces"):
+                spaces = None
             doc = Doc(vocab, words=tokens[:, orth_col], spaces=spaces)
             doc = doc.from_array(self.attrs, tokens)
             doc.cats = self.cats[i]
@@ -134,6 +141,7 @@ class DocBin(object):
         self.spaces.extend(other.spaces)
         self.strings.update(other.strings)
         self.cats.extend(other.cats)
+        self.flags.extend(other.flags)
         if self.store_user_data:
             self.user_data.extend(other.user_data)
@@ -158,6 +166,7 @@ class DocBin(object):
             "lengths": numpy.asarray(lengths, dtype="int32").tobytes("C"),
             "strings": list(self.strings),
             "cats": self.cats,
+            "flags": self.flags,
         }
         if self.store_user_data:
             msg["user_data"] = self.user_data
@@ -183,6 +192,7 @@ class DocBin(object):
         self.tokens = NumpyOps().unflatten(flat_tokens, lengths)
         self.spaces = NumpyOps().unflatten(flat_spaces, lengths)
         self.cats = msg["cats"]
+        self.flags = msg.get("flags", [{} for _ in lengths])
         if self.store_user_data and "user_data" in msg:
             self.user_data = list(msg["user_data"])
         for tokens in self.tokens:
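Per-doc flags travel with their docs through merge() and (de)serialization, and payloads written before this change deserialize with empty flag dicts via msg.get("flags", ...). A sketch reusing doc1 and doc2 from the test above:

    db = DocBin(docs=[doc1])       # doc1: unknown spaces
    db.merge(DocBin(docs=[doc2]))  # doc2: explicit spaces
    re1, re2 = db.get_docs(en_vocab)
    assert re1.has_unknown_spaces
    assert not re2.has_unknown_spaces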
@@ -59,11 +59,14 @@ cdef class Doc:
     cdef public dict user_token_hooks
     cdef public dict user_span_hooks

+    cdef public bint has_unknown_spaces
+
     cdef public list _py_tokens

     cdef int length
     cdef int max_length

     cdef public object noun_chunks_iterator

     cdef object __weakref__
@@ -172,8 +172,7 @@ cdef class Doc:
             raise ValueError(Errors.E046.format(name=name))
         return Underscore.doc_extensions.pop(name)

-    def __init__(self, Vocab vocab, words=None, spaces=None, user_data=None,
-                 orths_and_spaces=None):
+    def __init__(self, Vocab vocab, words=None, spaces=None, user_data=None):
         """Create a Doc object.

         vocab (Vocab): A vocabulary object, which must match any models you
@@ -215,27 +214,24 @@ cdef class Doc:
         self._vector = None
         self.noun_chunks_iterator = _get_chunker(self.vocab.lang)
         cdef bint has_space
-        if orths_and_spaces is None and words is not None:
-            if spaces is None:
-                spaces = [True] * len(words)
-            elif len(spaces) != len(words):
-                raise ValueError(Errors.E027)
-            orths_and_spaces = zip(words, spaces)
-        cdef const LexemeC* lexeme
-        if orths_and_spaces is not None:
-            orths_and_spaces = list(orths_and_spaces)
-            for orth_space in orths_and_spaces:
-                if isinstance(orth_space, unicode):
-                    lexeme = self.vocab.get(self.mem, orth_space)
-                    has_space = True
-                elif isinstance(orth_space, bytes):
-                    raise ValueError(Errors.E028.format(value=orth_space))
-                elif isinstance(orth_space[0], unicode):
-                    lexeme = self.vocab.get(self.mem, orth_space[0])
-                    has_space = orth_space[1]
-                else:
-                    lexeme = self.vocab.get_by_orth(self.mem, orth_space[0])
-                    has_space = orth_space[1]
-                self.push_back(lexeme, has_space)
+        if words is None and spaces is not None:
+            raise ValueError("words must be set if spaces is set")
+        elif spaces is None and words is not None:
+            self.has_unknown_spaces = True
+        else:
+            self.has_unknown_spaces = False
+        words = words if words is not None else []
+        spaces = spaces if spaces is not None else ([True] * len(words))
+        if len(spaces) != len(words):
+            raise ValueError(Errors.E027)
+        cdef const LexemeC* lexeme
+        for word, has_space in zip(words, spaces):
+            if isinstance(word, unicode):
+                lexeme = self.vocab.get(self.mem, word)
+            elif isinstance(word, bytes):
+                raise ValueError(Errors.E028.format(value=word))
+            else:
+                lexeme = self.vocab.get_by_orth(self.mem, word)
+            self.push_back(lexeme, has_space)
         # Tough to decide on policy for this. Is an empty doc tagged and parsed?
         # There's no information we'd like to add to it, so I guess so?
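Note that this drops the legacy orths_and_spaces argument entirely; callers that passed pre-zipped pairs now use the separate keywords (sketch, with words and spaces as parallel lists):

    # before (no longer accepted):
    #   doc = Doc(vocab, orths_and_spaces=zip(words, spaces))
    # after:
    doc = Doc(vocab, words=words, spaces=spaces)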
@@ -1082,6 +1078,7 @@ cdef class Doc:
             "sentiment": lambda: self.sentiment,
             "tensor": lambda: self.tensor,
             "cats": lambda: self.cats,
+            "has_unknown_spaces": lambda: self.has_unknown_spaces
         }
         for key in kwargs:
             if key in serializers or key in ("user_data", "user_data_keys", "user_data_values"):
@@ -1114,6 +1111,7 @@ cdef class Doc:
             "cats": lambda b: None,
             "user_data_keys": lambda b: None,
             "user_data_values": lambda b: None,
+            "has_unknown_spaces": lambda b: None
         }
         for key in kwargs:
             if key in deserializers or key in ("user_data",):
@@ -1134,6 +1132,8 @@ cdef class Doc:
             self.tensor = msg["tensor"]
         if "cats" not in exclude and "cats" in msg:
             self.cats = msg["cats"]
+        if "has_unknown_spaces" not in exclude and "has_unknown_spaces" in msg:
+            self.has_unknown_spaces = msg["has_unknown_spaces"]
         start = 0
         cdef const LexemeC* lex
         cdef unicode orth_