Record whether Doc objects are built from known spacing (#5697)

* Tell convert CLI to store user data for Doc

* Remove assert

* Add has_unknown_spaces flag on Doc

* Do not tokenize docs with unknown spaces in Corpus

* Handle conversion of unknown spaces in Example

* Fixes

* Fixes

* Draft has_unknown_spaces support in DocBin

* Add test for serialize has_unknown_spaces

* Fix DocBin serialization when has_unknown_spaces

* Use serialization in test
Matthew Honnibal 2020-07-03 12:58:16 +02:00 committed by GitHub
parent abad56db7d
commit a902b5f217
8 changed files with 76 additions and 45 deletions
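
In a nutshell, the new flag records whether a Doc's token spacing was given or guessed. A minimal sketch against the spaCy v3 development API (a bare Vocab() stands in for a real pipeline):

from spacy.tokens import Doc
from spacy.vocab import Vocab

vocab = Vocab()
# No spaces passed: the Doc is marked as built from unknown spacing,
# and single trailing spaces are guessed for rendering.
doc_a = Doc(vocab, words=["that", "'s"])
assert doc_a.has_unknown_spaces
assert doc_a.text == "that 's "
# Explicit spaces: spacing is known and reproduced exactly.
doc_b = Doc(vocab, words=["that", "'s"], spaces=[False, False])
assert not doc_b.has_unknown_spaces
assert doc_b.text == "that's"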

View File

@@ -137,7 +137,7 @@ def _print_docs_to_stdout(docs, output_type):
     if output_type == "json":
         srsly.write_json("-", docs_to_json(docs))
     else:
-        sys.stdout.buffer.write(DocBin(docs=docs).to_bytes())
+        sys.stdout.buffer.write(DocBin(docs=docs, store_user_data=True).to_bytes())
 
 
 def _write_docs_to_file(docs, output_file, output_type):
@@ -146,7 +146,7 @@ def _write_docs_to_file(docs, output_file, output_type):
     if output_type == "json":
         srsly.write_json(output_file, docs_to_json(docs))
     else:
-        data = DocBin(docs=docs).to_bytes()
+        data = DocBin(docs=docs, store_user_data=True).to_bytes()
         with output_file.open("wb") as file_:
             file_.write(data)
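
For context, DocBin only serializes doc.user_data when store_user_data=True, which is why the convert CLI now passes it on both output paths. A hedged round-trip sketch (the user_data key is purely illustrative):

from spacy.tokens import Doc, DocBin
from spacy.vocab import Vocab

vocab = Vocab()
doc = Doc(vocab, words=["hello", "world"], spaces=[True, False])
doc.user_data[("meta", "source")] = "train.json"  # illustrative entry

data = DocBin(docs=[doc], store_user_data=True).to_bytes()
reloaded = list(DocBin(store_user_data=True).from_bytes(data).get_docs(vocab))
assert reloaded[0].user_data[("meta", "source")] == "train.json"  # preserved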

View File

@@ -17,8 +17,6 @@ def json2docs(input_data, model=None, **kwargs):
     for json_para in json_to_annotations(json_doc):
         example_dict = _fix_legacy_dict_data(json_para)
         tok_dict, doc_dict = _parse_example_dict_data(example_dict)
-        if json_para.get("raw"):
-            assert tok_dict.get("SPACY")
         doc = annotations2doc(nlp.vocab, tok_dict, doc_dict)
         docs.append(doc)
     return docs
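
The removed assertion rejected paragraphs whose raw text was absent, where SPACY can now legitimately be None (see the _guess_spaces change below). A sketch of the case that is now accepted, assuming a tok_dict shaped like the converter's:

from spacy.tokens import Doc
from spacy.vocab import Vocab

tok_dict = {"ORTH": ["that", "'s"], "SPACY": None}  # no raw text available
# annotations2doc builds the Doc the same way and records unknown spacing
# instead of failing the assert:
doc = Doc(Vocab(), words=tok_dict["ORTH"], spaces=tok_dict["SPACY"])
assert doc.has_unknown_spaces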

View File

@@ -43,24 +43,35 @@ class Corpus:
             locs.append(path)
         return locs
 
+    def _make_example(self, nlp, reference, gold_preproc):
+        if gold_preproc or reference.has_unknown_spaces:
+            return Example(
+                Doc(
+                    nlp.vocab,
+                    words=[word.text for word in reference],
+                    spaces=[bool(word.whitespace_) for word in reference]
+                ),
+                reference
+            )
+        else:
+            return Example(
+                nlp.make_doc(reference.text),
+                reference
+            )
+
     def make_examples(self, nlp, reference_docs, max_length=0):
         for reference in reference_docs:
             if len(reference) == 0:
                 continue
             elif max_length == 0 or len(reference) < max_length:
-                yield Example(
-                    nlp.make_doc(reference.text),
-                    reference
-                )
+                yield self._make_example(nlp, reference, False)
             elif reference.is_sentenced:
                 for ref_sent in reference.sents:
                     if len(ref_sent) == 0:
                         continue
                     elif max_length == 0 or len(ref_sent) < max_length:
-                        yield Example(
-                            nlp.make_doc(ref_sent.text),
-                            ref_sent.as_doc()
-                        )
+                        yield self._make_example(nlp, ref_sent.as_doc(), False)
 
     def make_examples_gold_preproc(self, nlp, reference_docs):
         for reference in reference_docs:
@@ -69,14 +80,7 @@ class Corpus:
             else:
                 ref_sents = [reference]
             for ref_sent in ref_sents:
-                eg = Example(
-                    Doc(
-                        nlp.vocab,
-                        words=[w.text for w in ref_sent],
-                        spaces=[bool(w.whitespace_) for w in ref_sent]
-                    ),
-                    ref_sent
-                )
+                eg = self._make_example(nlp, ref_sent, True)
                 if len(eg.x):
                     yield eg
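
The motivation for the shared helper: text rebuilt from guessed spaces generally differs from the original, so retokenizing it with nlp.make_doc would yield a misaligned predicted doc. A sketch assuming a hypothetical loaded pipeline nlp:

from spacy.tokens import Doc

reference = Doc(nlp.vocab, words=["that", "'s"])  # spacing unknown
assert reference.text == "that 's "  # guessed, not the original "that's"
# So _make_example copies the reference tokens rather than retokenizing:
predicted = Doc(
    nlp.vocab,
    words=[t.text for t in reference],
    spaces=[bool(t.whitespace_) for t in reference],
)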

View File

@@ -15,7 +15,7 @@ from ..syntax import nonproj
 
 
 cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
-    """ Create a Doc from dictionaries with token and doc annotations. Assumes ORTH & SPACY are set. """
+    """ Create a Doc from dictionaries with token and doc annotations. """
     attrs, array = _annot2array(vocab, tok_annot, doc_annot)
     output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"])
     if "entities" in doc_annot:
@@ -406,7 +406,7 @@ def _parse_links(vocab, words, spaces, links):
 
 def _guess_spaces(text, words):
     if text is None:
-        return [True] * len(words)
+        return None
     spaces = []
     text_pos = 0
     # align words with text
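
Returning None instead of [True] * len(words) means the guess is no longer passed off as known spacing; the Doc constructor records it instead. A small sketch:

from spacy.tokens import Doc
from spacy.vocab import Vocab

words = ["Hello", "world"]
spaces = None  # what _guess_spaces(None, words) now returns
doc = Doc(Vocab(), words=words, spaces=spaces)
assert doc.has_unknown_spaces  # the guess is recorded as a guess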

View File

@@ -75,3 +75,19 @@ def test_serialize_doc_bin():
     for i, doc in enumerate(reloaded_docs):
         assert doc.text == texts[i]
         assert doc.cats == cats
+
+
+def test_serialize_doc_bin_unknown_spaces(en_vocab):
+    doc1 = Doc(en_vocab, words=["that", "'s"])
+    assert doc1.has_unknown_spaces
+    assert doc1.text == "that 's "
+    doc2 = Doc(en_vocab, words=["that", "'s"], spaces=[False, False])
+    assert not doc2.has_unknown_spaces
+    assert doc2.text == "that's"
+    doc_bin = DocBin().from_bytes(DocBin(docs=[doc1, doc2]).to_bytes())
+    re_doc1, re_doc2 = doc_bin.get_docs(en_vocab)
+    assert re_doc1.has_unknown_spaces
+    assert re_doc1.text == "that 's "
+    assert not re_doc2.has_unknown_spaces
+    assert re_doc2.text == "that's"

View File

@@ -61,6 +61,7 @@ class DocBin(object):
         self.spaces = []
         self.cats = []
         self.user_data = []
+        self.flags = []
         self.strings = set()
         self.store_user_data = store_user_data
         for doc in docs:
@@ -85,6 +86,9 @@
         assert array.shape[0] == spaces.shape[0]  # this should never happen
         spaces = spaces.reshape((spaces.shape[0], 1))
         self.spaces.append(numpy.asarray(spaces, dtype=bool))
+        self.flags.append({
+            "has_unknown_spaces": doc.has_unknown_spaces
+        })
         for token in doc:
             self.strings.add(token.text)
             self.strings.add(token.tag_)
@@ -109,8 +113,11 @@
             vocab[string]
         orth_col = self.attrs.index(ORTH)
         for i in range(len(self.tokens)):
+            flags = self.flags[i]
             tokens = self.tokens[i]
             spaces = self.spaces[i]
+            if flags.get("has_unknown_spaces"):
+                spaces = None
             doc = Doc(vocab, words=tokens[:, orth_col], spaces=spaces)
             doc = doc.from_array(self.attrs, tokens)
             doc.cats = self.cats[i]
@@ -134,6 +141,7 @@
         self.spaces.extend(other.spaces)
         self.strings.update(other.strings)
         self.cats.extend(other.cats)
+        self.flags.extend(other.flags)
         if self.store_user_data:
             self.user_data.extend(other.user_data)
@@ -158,6 +166,7 @@
             "lengths": numpy.asarray(lengths, dtype="int32").tobytes("C"),
             "strings": list(self.strings),
             "cats": self.cats,
+            "flags": self.flags,
         }
         if self.store_user_data:
             msg["user_data"] = self.user_data
@@ -183,6 +192,7 @@
         self.tokens = NumpyOps().unflatten(flat_tokens, lengths)
         self.spaces = NumpyOps().unflatten(flat_spaces, lengths)
         self.cats = msg["cats"]
+        self.flags = msg.get("flags", [{} for _ in lengths])
         if self.store_user_data and "user_data" in msg:
             self.user_data = list(msg["user_data"])
         for tokens in self.tokens:
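
The per-doc flags travel through merge() as well as to_bytes()/from_bytes(), and the msg.get("flags", ...) fallback keeps payloads serialized before this change loadable. A merge sketch:

from spacy.tokens import Doc, DocBin
from spacy.vocab import Vocab

vocab = Vocab()
bin_a = DocBin(docs=[Doc(vocab, words=["a"])])                  # unknown spacing
bin_b = DocBin(docs=[Doc(vocab, words=["b"], spaces=[False])])  # known spacing
bin_a.merge(bin_b)
doc_a, doc_b = bin_a.get_docs(vocab)
assert doc_a.has_unknown_spaces
assert not doc_b.has_unknown_spaces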

View File

@@ -59,11 +59,14 @@
     cdef public dict user_token_hooks
     cdef public dict user_span_hooks
 
+    cdef public bint has_unknown_spaces
+
     cdef public list _py_tokens
 
     cdef int length
     cdef int max_length
 
     cdef public object noun_chunks_iterator
 
     cdef object __weakref__

View File

@@ -172,8 +172,7 @@
             raise ValueError(Errors.E046.format(name=name))
         return Underscore.doc_extensions.pop(name)
 
-    def __init__(self, Vocab vocab, words=None, spaces=None, user_data=None,
-                 orths_and_spaces=None):
+    def __init__(self, Vocab vocab, words=None, spaces=None, user_data=None):
         """Create a Doc object.
 
         vocab (Vocab): A vocabulary object, which must match any models you
@@ -215,27 +214,24 @@
         self._vector = None
         self.noun_chunks_iterator = _get_chunker(self.vocab.lang)
         cdef bint has_space
-        if orths_and_spaces is None and words is not None:
-            if spaces is None:
-                spaces = [True] * len(words)
-            elif len(spaces) != len(words):
-                raise ValueError(Errors.E027)
-            orths_and_spaces = zip(words, spaces)
-        cdef const LexemeC* lexeme
-        if orths_and_spaces is not None:
-            orths_and_spaces = list(orths_and_spaces)
-            for orth_space in orths_and_spaces:
-                if isinstance(orth_space, unicode):
-                    lexeme = self.vocab.get(self.mem, orth_space)
-                    has_space = True
-                elif isinstance(orth_space, bytes):
-                    raise ValueError(Errors.E028.format(value=orth_space))
-                elif isinstance(orth_space[0], unicode):
-                    lexeme = self.vocab.get(self.mem, orth_space[0])
-                    has_space = orth_space[1]
-                else:
-                    lexeme = self.vocab.get_by_orth(self.mem, orth_space[0])
-                    has_space = orth_space[1]
-                self.push_back(lexeme, has_space)
+        if words is None and spaces is not None:
+            raise ValueError("words must be set if spaces is set")
+        elif spaces is None and words is not None:
+            self.has_unknown_spaces = True
+        else:
+            self.has_unknown_spaces = False
+        words = words if words is not None else []
+        spaces = spaces if spaces is not None else ([True] * len(words))
+        if len(spaces) != len(words):
+            raise ValueError(Errors.E027)
+        cdef const LexemeC* lexeme
+        for word, has_space in zip(words, spaces):
+            if isinstance(word, unicode):
+                lexeme = self.vocab.get(self.mem, word)
+            elif isinstance(word, bytes):
+                raise ValueError(Errors.E028.format(value=word))
+            else:
+                lexeme = self.vocab.get_by_orth(self.mem, word)
+            self.push_back(lexeme, has_space)
         # Tough to decide on policy for this. Is an empty doc tagged and parsed?
         # There's no information we'd like to add to it, so I guess so?
@@ -1082,6 +1078,7 @@
             "sentiment": lambda: self.sentiment,
             "tensor": lambda: self.tensor,
             "cats": lambda: self.cats,
+            "has_unknown_spaces": lambda: self.has_unknown_spaces
         }
         for key in kwargs:
             if key in serializers or key in ("user_data", "user_data_keys", "user_data_values"):
@@ -1114,6 +1111,7 @@
             "cats": lambda b: None,
             "user_data_keys": lambda b: None,
             "user_data_values": lambda b: None,
+            "has_unknown_spaces": lambda b: None
         }
         for key in kwargs:
             if key in deserializers or key in ("user_data",):
@@ -1134,6 +1132,8 @@
             self.tensor = msg["tensor"]
         if "cats" not in exclude and "cats" in msg:
            self.cats = msg["cats"]
+        if "has_unknown_spaces" not in exclude and "has_unknown_spaces" in msg:
+            self.has_unknown_spaces = msg["has_unknown_spaces"]
         start = 0
         cdef const LexemeC* lex
         cdef unicode orth_
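
The constructor's new contract in brief: spaces without words is an error, mismatched lengths still raise E027, and the flag round-trips through Doc.to_bytes()/from_bytes(). A sketch:

from spacy.tokens import Doc
from spacy.vocab import Vocab

vocab = Vocab()
Doc(vocab, words=["hi"])                  # ok: spacing recorded as unknown
Doc(vocab, words=["hi"], spaces=[False])  # ok: spacing known
try:
    Doc(vocab, spaces=[True])             # spaces set without words
except ValueError:
    pass
try:
    Doc(vocab, words=["a", "b"], spaces=[True])  # length mismatch (E027)
except ValueError:
    pass
# has_unknown_spaces survives Doc serialization:
data = Doc(vocab, words=["hi"]).to_bytes()
assert Doc(vocab).from_bytes(data).has_unknown_spaces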