Record whether Doc objects are built from known spacing (#5697)
* Tell convert CLI to store user data for Doc
* Remove assert
* Add has_unknown_spaces flag on Doc
* Do not tokenize docs with unknown spaces in Corpus
* Handle conversion of unknown spaces in Example
* Fixes
* Fixes
* Draft has_unknown_spaces support in DocBin
* Add test for serialize has_unknown_spaces
* Fix DocBin serialization when has_unknown_spaces
* Use serialization in test
This commit is contained in:
parent abad56db7d
commit a902b5f217
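
In short: a Doc built from words alone no longer silently assumes the spacing; it records that the spacing is unknown, and that flag now survives serialization. A minimal sketch of the new behavior (assuming a build that includes this commit):

    from spacy.vocab import Vocab
    from spacy.tokens import Doc

    vocab = Vocab()
    # No spaces given: a default space is rendered after each token, but
    # the Doc remembers that the true spacing is unknown.
    doc = Doc(vocab, words=["Hello", "world"])
    assert doc.has_unknown_spaces

    # Explicit spaces: the spacing is known.
    doc = Doc(vocab, words=["Hello", "world"], spaces=[True, False])
    assert not doc.has_unknown_spaces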
@@ -137,7 +137,7 @@ def _print_docs_to_stdout(docs, output_type):
     if output_type == "json":
         srsly.write_json("-", docs_to_json(docs))
     else:
-        sys.stdout.buffer.write(DocBin(docs=docs).to_bytes())
+        sys.stdout.buffer.write(DocBin(docs=docs, store_user_data=True).to_bytes())


 def _write_docs_to_file(docs, output_file, output_type):
@@ -146,7 +146,7 @@ def _write_docs_to_file(docs, output_file, output_type):
     if output_type == "json":
         srsly.write_json(output_file, docs_to_json(docs))
     else:
-        data = DocBin(docs=docs).to_bytes()
+        data = DocBin(docs=docs, store_user_data=True).to_bytes()
         with output_file.open("wb") as file_:
             file_.write(data)

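
Both converter output paths now pass store_user_data=True so that user data attached to the converted Docs survives serialization. A minimal round-trip sketch (the "key"/"value" entry is illustrative, not from the diff):

    from spacy.tokens import Doc, DocBin
    from spacy.vocab import Vocab

    vocab = Vocab()
    doc = Doc(vocab, words=["a", "b"], spaces=[True, False])
    doc.user_data["key"] = "value"  # illustrative user data entry

    data = DocBin(docs=[doc], store_user_data=True).to_bytes()
    reloaded = DocBin(store_user_data=True).from_bytes(data)
    new_doc = list(reloaded.get_docs(vocab))[0]
    assert new_doc.user_data["key"] == "value"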
@@ -17,8 +17,6 @@ def json2docs(input_data, model=None, **kwargs):
         for json_para in json_to_annotations(json_doc):
             example_dict = _fix_legacy_dict_data(json_para)
             tok_dict, doc_dict = _parse_example_dict_data(example_dict)
-            if json_para.get("raw"):
-                assert tok_dict.get("SPACY")
             doc = annotations2doc(nlp.vocab, tok_dict, doc_dict)
             docs.append(doc)
     return docs
@@ -43,24 +43,35 @@ class Corpus:
                 locs.append(path)
         return locs

+    def _make_example(self, nlp, reference, gold_preproc):
+        if gold_preproc or reference.has_unknown_spaces:
+            return Example(
+                Doc(
+                    nlp.vocab,
+                    words=[word.text for word in reference],
+                    spaces=[bool(word.whitespace_) for word in reference]
+                ),
+                reference
+            )
+        else:
+            return Example(
+                nlp.make_doc(reference.text),
+                reference
+            )
+
     def make_examples(self, nlp, reference_docs, max_length=0):
         for reference in reference_docs:
             if len(reference) == 0:
                 continue
             elif max_length == 0 or len(reference) < max_length:
-                yield Example(
-                    nlp.make_doc(reference.text),
-                    reference
-                )
+                yield self._make_example(nlp, reference, False)
             elif reference.is_sentenced:
                 for ref_sent in reference.sents:
                     if len(ref_sent) == 0:
                         continue
                     elif max_length == 0 or len(ref_sent) < max_length:
-                        yield Example(
-                            nlp.make_doc(ref_sent.text),
-                            ref_sent.as_doc()
-                        )
+                        yield self._make_example(nlp, ref_sent.as_doc(), False)

     def make_examples_gold_preproc(self, nlp, reference_docs):
         for reference in reference_docs:
@@ -69,14 +80,7 @@ class Corpus:
             else:
                 ref_sents = [reference]
             for ref_sent in ref_sents:
-                eg = Example(
-                    Doc(
-                        nlp.vocab,
-                        words=[w.text for w in ref_sent],
-                        spaces=[bool(w.whitespace_) for w in ref_sent]
-                    ),
-                    ref_sent
-                )
+                eg = self._make_example(nlp, ref_sent, True)
                 if len(eg.x):
                     yield eg

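
The duplicated Example construction is hoisted into _make_example. The reason for the dispatch: when a reference Doc has unknown spacing, its rendered text inserts a default space after every token, so retokenizing that text is not guaranteed to reproduce the reference tokens. A small illustration (the mismatch is a possibility, not a certainty; exact tokenizer output depends on the language data):

    import spacy
    from spacy.tokens import Doc

    nlp = spacy.blank("en")
    gold = Doc(nlp.vocab, words=["do", "n't"])  # spacing unknown
    assert gold.text == "do n't "               # default spacing applied
    retok = nlp.make_doc(gold.text)
    # retok's tokens need not match gold's, which is why _make_example
    # copies the tokens instead of retokenizing in this case.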
@@ -15,7 +15,7 @@ from ..syntax import nonproj


 cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
-    """ Create a Doc from dictionaries with token and doc annotations. Assumes ORTH & SPACY are set. """
+    """ Create a Doc from dictionaries with token and doc annotations. """
     attrs, array = _annot2array(vocab, tok_annot, doc_annot)
     output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"])
     if "entities" in doc_annot:
@@ -406,7 +406,7 @@ def _parse_links(vocab, words, spaces, links):

 def _guess_spaces(text, words):
     if text is None:
-        return [True] * len(words)
+        return None
     spaces = []
     text_pos = 0
     # align words with text
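
_guess_spaces now distinguishes "no raw text" (spacing unknown, return None) from "text available" (align and compute). The None sentinel flows through to the Doc constructor, e.g. (a sketch assuming this commit):

    from spacy.vocab import Vocab
    from spacy.tokens import Doc

    spaces = None  # what _guess_spaces returns when text is None
    doc = Doc(Vocab(), words=["Hello", "world"], spaces=spaces)
    assert doc.has_unknown_spaces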
@@ -75,3 +75,19 @@ def test_serialize_doc_bin():
     for i, doc in enumerate(reloaded_docs):
         assert doc.text == texts[i]
         assert doc.cats == cats
+
+
+def test_serialize_doc_bin_unknown_spaces(en_vocab):
+    doc1 = Doc(en_vocab, words=["that", "'s"])
+    assert doc1.has_unknown_spaces
+    assert doc1.text == "that 's "
+    doc2 = Doc(en_vocab, words=["that", "'s"], spaces=[False, False])
+    assert not doc2.has_unknown_spaces
+    assert doc2.text == "that's"
+
+    doc_bin = DocBin().from_bytes(DocBin(docs=[doc1, doc2]).to_bytes())
+    re_doc1, re_doc2 = doc_bin.get_docs(en_vocab)
+    assert re_doc1.has_unknown_spaces
+    assert re_doc1.text == "that 's "
+    assert not re_doc2.has_unknown_spaces
+    assert re_doc2.text == "that's"
@@ -61,6 +61,7 @@ class DocBin(object):
         self.spaces = []
         self.cats = []
         self.user_data = []
+        self.flags = []
         self.strings = set()
         self.store_user_data = store_user_data
         for doc in docs:
@@ -85,6 +86,9 @@ class DocBin(object):
         assert array.shape[0] == spaces.shape[0]  # this should never happen
         spaces = spaces.reshape((spaces.shape[0], 1))
         self.spaces.append(numpy.asarray(spaces, dtype=bool))
+        self.flags.append({
+            "has_unknown_spaces": doc.has_unknown_spaces
+        })
         for token in doc:
             self.strings.add(token.text)
             self.strings.add(token.tag_)
@@ -109,8 +113,11 @@ class DocBin(object):
             vocab[string]
         orth_col = self.attrs.index(ORTH)
         for i in range(len(self.tokens)):
+            flags = self.flags[i]
             tokens = self.tokens[i]
             spaces = self.spaces[i]
+            if flags.get("has_unknown_spaces"):
+                spaces = None
             doc = Doc(vocab, words=tokens[:, orth_col], spaces=spaces)
             doc = doc.from_array(self.attrs, tokens)
             doc.cats = self.cats[i]
@@ -134,6 +141,7 @@ class DocBin(object):
         self.spaces.extend(other.spaces)
         self.strings.update(other.strings)
         self.cats.extend(other.cats)
+        self.flags.extend(other.flags)
         if self.store_user_data:
             self.user_data.extend(other.user_data)

@@ -158,6 +166,7 @@ class DocBin(object):
             "lengths": numpy.asarray(lengths, dtype="int32").tobytes("C"),
             "strings": list(self.strings),
             "cats": self.cats,
+            "flags": self.flags,
         }
         if self.store_user_data:
             msg["user_data"] = self.user_data
@@ -183,6 +192,7 @@ class DocBin(object):
         self.tokens = NumpyOps().unflatten(flat_tokens, lengths)
         self.spaces = NumpyOps().unflatten(flat_spaces, lengths)
         self.cats = msg["cats"]
+        self.flags = msg.get("flags", [{} for _ in lengths])
         if self.store_user_data and "user_data" in msg:
             self.user_data = list(msg["user_data"])
         for tokens in self.tokens:
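
Note the backward-compatible default in from_bytes: payloads produced before this change carry no "flags" entry, so every doc falls back to an empty dict and its stored spaces are used unchanged. A sketch of that fallback:

    lengths = [3, 5]  # illustrative per-doc token counts
    msg = {}          # stands in for a pre-change msgpack payload
    flags = msg.get("flags", [{} for _ in lengths])
    assert flags == [{}, {}]
    assert not flags[0].get("has_unknown_spaces")  # falsy: keep stored spaces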
@@ -59,11 +59,14 @@ cdef class Doc:
     cdef public dict user_token_hooks
     cdef public dict user_span_hooks

+    cdef public bint has_unknown_spaces
+
     cdef public list _py_tokens

     cdef int length
     cdef int max_length

+
     cdef public object noun_chunks_iterator

     cdef object __weakref__
@@ -172,8 +172,7 @@ cdef class Doc:
             raise ValueError(Errors.E046.format(name=name))
         return Underscore.doc_extensions.pop(name)

-    def __init__(self, Vocab vocab, words=None, spaces=None, user_data=None,
-                 orths_and_spaces=None):
+    def __init__(self, Vocab vocab, words=None, spaces=None, user_data=None):
         """Create a Doc object.

         vocab (Vocab): A vocabulary object, which must match any models you
@@ -215,27 +214,24 @@ cdef class Doc:
         self._vector = None
         self.noun_chunks_iterator = _get_chunker(self.vocab.lang)
         cdef bint has_space
-        if orths_and_spaces is None and words is not None:
-            if spaces is None:
-                spaces = [True] * len(words)
-            elif len(spaces) != len(words):
-                raise ValueError(Errors.E027)
-            orths_and_spaces = zip(words, spaces)
-        cdef const LexemeC* lexeme
-        if orths_and_spaces is not None:
-            orths_and_spaces = list(orths_and_spaces)
-            for orth_space in orths_and_spaces:
-                if isinstance(orth_space, unicode):
-                    lexeme = self.vocab.get(self.mem, orth_space)
-                    has_space = True
-                elif isinstance(orth_space, bytes):
-                    raise ValueError(Errors.E028.format(value=orth_space))
-                elif isinstance(orth_space[0], unicode):
-                    lexeme = self.vocab.get(self.mem, orth_space[0])
-                    has_space = orth_space[1]
-                else:
-                    lexeme = self.vocab.get_by_orth(self.mem, orth_space[0])
-                    has_space = orth_space[1]
-                self.push_back(lexeme, has_space)
+        if words is None and spaces is not None:
+            raise ValueError("words must be set if spaces is set")
+        elif spaces is None and words is not None:
+            self.has_unknown_spaces = True
+        else:
+            self.has_unknown_spaces = False
+        words = words if words is not None else []
+        spaces = spaces if spaces is not None else ([True] * len(words))
+        if len(spaces) != len(words):
+            raise ValueError(Errors.E027)
+        cdef const LexemeC* lexeme
+        for word, has_space in zip(words, spaces):
+            if isinstance(word, unicode):
+                lexeme = self.vocab.get(self.mem, word)
+            elif isinstance(word, bytes):
+                raise ValueError(Errors.E028.format(value=word))
+            else:
+                lexeme = self.vocab.get_by_orth(self.mem, word)
+            self.push_back(lexeme, has_space)
         # Tough to decide on policy for this. Is an empty doc tagged and parsed?
         # There's no information we'd like to add to it, so I guess so?
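
The rewritten constructor drops the legacy orths_and_spaces argument and adds two explicit checks: spaces without words is now an error, and a length mismatch still raises E027. For instance (behavior as defined by this diff):

    from spacy.vocab import Vocab
    from spacy.tokens import Doc

    vocab = Vocab()
    try:
        Doc(vocab, spaces=[True])  # spaces without words
    except ValueError as err:
        print(err)  # "words must be set if spaces is set"

    try:
        Doc(vocab, words=["a", "b"], spaces=[True])  # length mismatch
    except ValueError:
        pass  # Errors.E027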
@@ -1082,6 +1078,7 @@ cdef class Doc:
             "sentiment": lambda: self.sentiment,
             "tensor": lambda: self.tensor,
             "cats": lambda: self.cats,
+            "has_unknown_spaces": lambda: self.has_unknown_spaces
         }
         for key in kwargs:
             if key in serializers or key in ("user_data", "user_data_keys", "user_data_values"):
@@ -1114,6 +1111,7 @@ cdef class Doc:
             "cats": lambda b: None,
             "user_data_keys": lambda b: None,
             "user_data_values": lambda b: None,
+            "has_unknown_spaces": lambda b: None
         }
         for key in kwargs:
             if key in deserializers or key in ("user_data",):
@@ -1134,6 +1132,8 @@ cdef class Doc:
             self.tensor = msg["tensor"]
         if "cats" not in exclude and "cats" in msg:
             self.cats = msg["cats"]
+        if "has_unknown_spaces" not in exclude and "has_unknown_spaces" in msg:
+            self.has_unknown_spaces = msg["has_unknown_spaces"]
         start = 0
         cdef const LexemeC* lex
         cdef unicode orth_
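
With has_unknown_spaces wired into Doc's serializers and deserializers, a plain to_bytes/from_bytes round trip preserves the flag as well:

    from spacy.vocab import Vocab
    from spacy.tokens import Doc

    vocab = Vocab()
    doc = Doc(vocab, words=["that", "'s"])  # spacing unknown
    reloaded = Doc(vocab).from_bytes(doc.to_bytes())
    assert reloaded.has_unknown_spaces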