Mirror of https://github.com/explosion/spaCy.git
Record whether Doc objects are built from known spacing (#5697)
* Tell convert CLI to store user data for Doc
* Remove assert
* Add has_unknown_spaces flag on Doc
* Do not tokenize docs with unknown spaces in Corpus
* Handle conversion of unknown spaces in Example
* Fixes
* Fixes
* Draft has_unknown_spaces support in DocBin
* Add test for serializing has_unknown_spaces
* Fix DocBin serialization when has_unknown_spaces
* Use serialization in test
This commit is contained in:
parent abad56db7d
commit a902b5f217
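In practice (a minimal sketch mirroring the test added in this commit; en_vocab is assumed to be spaCy's usual English-vocab test fixture):

    from spacy.tokens import Doc, DocBin

    # Omitting `spaces` now records that the spacing is unknown
    doc = Doc(en_vocab, words=["that", "'s"])
    assert doc.has_unknown_spaces
    assert doc.text == "that 's "  # unknown spaces default to trailing spaces

    # The flag survives a DocBin round-trip
    doc_bin = DocBin().from_bytes(DocBin(docs=[doc]).to_bytes())
    redoc = next(doc_bin.get_docs(en_vocab))
    assert redoc.has_unknown_spaces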
@@ -137,7 +137,7 @@ def _print_docs_to_stdout(docs, output_type):
     if output_type == "json":
         srsly.write_json("-", docs_to_json(docs))
     else:
-        sys.stdout.buffer.write(DocBin(docs=docs).to_bytes())
+        sys.stdout.buffer.write(DocBin(docs=docs, store_user_data=True).to_bytes())


 def _write_docs_to_file(docs, output_file, output_type):
@@ -146,7 +146,7 @@ def _write_docs_to_file(docs, output_file, output_type):
     if output_type == "json":
         srsly.write_json(output_file, docs_to_json(docs))
     else:
-        data = DocBin(docs=docs).to_bytes()
+        data = DocBin(docs=docs, store_user_data=True).to_bytes()
         with output_file.open("wb") as file_:
             file_.write(data)
@@ -17,8 +17,6 @@ def json2docs(input_data, model=None, **kwargs):
         for json_para in json_to_annotations(json_doc):
             example_dict = _fix_legacy_dict_data(json_para)
             tok_dict, doc_dict = _parse_example_dict_data(example_dict)
-            if json_para.get("raw"):
-                assert tok_dict.get("SPACY")
             doc = annotations2doc(nlp.vocab, tok_dict, doc_dict)
             docs.append(doc)
     return docs
@@ -43,24 +43,35 @@ class Corpus:
             locs.append(path)
         return locs

+    def _make_example(self, nlp, reference, gold_preproc):
+        if gold_preproc or reference.has_unknown_spaces:
+            return Example(
+                Doc(
+                    nlp.vocab,
+                    words=[word.text for word in reference],
+                    spaces=[bool(word.whitespace_) for word in reference]
+                ),
+                reference
+            )
+        else:
+            return Example(
+                nlp.make_doc(reference.text),
+                reference
+            )
+
     def make_examples(self, nlp, reference_docs, max_length=0):
         for reference in reference_docs:
             if len(reference) == 0:
                 continue
             elif max_length == 0 or len(reference) < max_length:
-                yield Example(
-                    nlp.make_doc(reference.text),
-                    reference
-                )
+                yield self._make_example(nlp, reference, False)
             elif reference.is_sentenced:
                 for ref_sent in reference.sents:
                     if len(ref_sent) == 0:
                         continue
                     elif max_length == 0 or len(ref_sent) < max_length:
-                        yield Example(
-                            nlp.make_doc(ref_sent.text),
-                            ref_sent.as_doc()
-                        )
+                        yield self._make_example(nlp, ref_sent.as_doc(), False)

     def make_examples_gold_preproc(self, nlp, reference_docs):
         for reference in reference_docs:
@@ -69,14 +80,7 @@ class Corpus:
             else:
                 ref_sents = [reference]
             for ref_sent in ref_sents:
-                eg = Example(
-                    Doc(
-                        nlp.vocab,
-                        words=[w.text for w in ref_sent],
-                        spaces=[bool(w.whitespace_) for w in ref_sent]
-                    ),
-                    ref_sent
-                )
+                eg = self._make_example(nlp, ref_sent, True)
                 if len(eg.x):
                     yield eg
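The observable effect of the new helper, as a hedged sketch (assuming corpus is a Corpus instance and nlp any loaded pipeline): a reference Doc built without explicit spaces is copied token-for-token rather than re-tokenized from reference.text, which would otherwise bake the guessed spacing into the predicted tokens.

    # Hypothetical illustration of the unknown-spaces branch in _make_example
    ref = Doc(nlp.vocab, words=["that", "'s"])       # spaces unknown
    eg = corpus._make_example(nlp, ref, False)       # gold_preproc=False
    assert [t.text for t in eg.x] == ["that", "'s"]  # gold tokens preserved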
@@ -15,7 +15,7 @@ from ..syntax import nonproj


 cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
-    """ Create a Doc from dictionaries with token and doc annotations. Assumes ORTH & SPACY are set. """
+    """ Create a Doc from dictionaries with token and doc annotations. """
     attrs, array = _annot2array(vocab, tok_annot, doc_annot)
     output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"])
     if "entities" in doc_annot:
@@ -406,7 +406,7 @@ def _parse_links(vocab, words, spaces, links):

 def _guess_spaces(text, words):
     if text is None:
-        return [True] * len(words)
+        return None
     spaces = []
     text_pos = 0
     # align words with text
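Returning None here, instead of guessing all-True spacing, lets annotations without raw text propagate the unknown state into the Doc constructor (a sketch, assuming a Vocab instance vocab):

    spaces = _guess_spaces(None, ["that", "'s"])  # was [True, True], now None
    doc = Doc(vocab, words=["that", "'s"], spaces=spaces)
    assert doc.has_unknown_spaces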
@@ -75,3 +75,19 @@ def test_serialize_doc_bin():
     for i, doc in enumerate(reloaded_docs):
         assert doc.text == texts[i]
         assert doc.cats == cats
+
+
+def test_serialize_doc_bin_unknown_spaces(en_vocab):
+    doc1 = Doc(en_vocab, words=["that", "'s"])
+    assert doc1.has_unknown_spaces
+    assert doc1.text == "that 's "
+    doc2 = Doc(en_vocab, words=["that", "'s"], spaces=[False, False])
+    assert not doc2.has_unknown_spaces
+    assert doc2.text == "that's"
+
+    doc_bin = DocBin().from_bytes(DocBin(docs=[doc1, doc2]).to_bytes())
+    re_doc1, re_doc2 = doc_bin.get_docs(en_vocab)
+    assert re_doc1.has_unknown_spaces
+    assert re_doc1.text == "that 's "
+    assert not re_doc2.has_unknown_spaces
+    assert re_doc2.text == "that's"
@@ -61,6 +61,7 @@ class DocBin(object):
         self.spaces = []
         self.cats = []
         self.user_data = []
+        self.flags = []
         self.strings = set()
         self.store_user_data = store_user_data
         for doc in docs:
@@ -85,6 +86,9 @@ class DocBin(object):
         assert array.shape[0] == spaces.shape[0]  # this should never happen
         spaces = spaces.reshape((spaces.shape[0], 1))
         self.spaces.append(numpy.asarray(spaces, dtype=bool))
+        self.flags.append({
+            "has_unknown_spaces": doc.has_unknown_spaces
+        })
         for token in doc:
             self.strings.add(token.text)
             self.strings.add(token.tag_)
@@ -109,8 +113,11 @@ class DocBin(object):
             vocab[string]
         orth_col = self.attrs.index(ORTH)
         for i in range(len(self.tokens)):
+            flags = self.flags[i]
             tokens = self.tokens[i]
             spaces = self.spaces[i]
+            if flags.get("has_unknown_spaces"):
+                spaces = None
             doc = Doc(vocab, words=tokens[:, orth_col], spaces=spaces)
             doc = doc.from_array(self.attrs, tokens)
             doc.cats = self.cats[i]
@@ -134,6 +141,7 @@ class DocBin(object):
         self.spaces.extend(other.spaces)
         self.strings.update(other.strings)
         self.cats.extend(other.cats)
+        self.flags.extend(other.flags)
         if self.store_user_data:
             self.user_data.extend(other.user_data)
@@ -158,6 +166,7 @@ class DocBin(object):
             "lengths": numpy.asarray(lengths, dtype="int32").tobytes("C"),
             "strings": list(self.strings),
             "cats": self.cats,
+            "flags": self.flags,
         }
         if self.store_user_data:
             msg["user_data"] = self.user_data
@@ -183,6 +192,7 @@ class DocBin(object):
         self.tokens = NumpyOps().unflatten(flat_tokens, lengths)
         self.spaces = NumpyOps().unflatten(flat_spaces, lengths)
         self.cats = msg["cats"]
+        self.flags = msg.get("flags", [{} for _ in lengths])
         if self.store_user_data and "user_data" in msg:
             self.user_data = list(msg["user_data"])
         for tokens in self.tokens:
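Per-doc flags travel with their docs through merge() and (de)serialization, and payloads written before this change deserialize with empty flag dicts via msg.get("flags", ...). A sketch reusing doc1 and doc2 from the test above:

    db = DocBin(docs=[doc1])       # doc1: unknown spaces
    db.merge(DocBin(docs=[doc2]))  # doc2: explicit spaces
    re1, re2 = db.get_docs(en_vocab)
    assert re1.has_unknown_spaces
    assert not re2.has_unknown_spaces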
@@ -59,11 +59,14 @@ cdef class Doc:
     cdef public dict user_token_hooks
     cdef public dict user_span_hooks

+    cdef public bint has_unknown_spaces
+
     cdef public list _py_tokens

     cdef int length
     cdef int max_length

     cdef public object noun_chunks_iterator

     cdef object __weakref__
@@ -172,8 +172,7 @@ cdef class Doc:
             raise ValueError(Errors.E046.format(name=name))
         return Underscore.doc_extensions.pop(name)

-    def __init__(self, Vocab vocab, words=None, spaces=None, user_data=None,
-                 orths_and_spaces=None):
+    def __init__(self, Vocab vocab, words=None, spaces=None, user_data=None):
         """Create a Doc object.

         vocab (Vocab): A vocabulary object, which must match any models you
@@ -215,27 +214,24 @@ cdef class Doc:
         self._vector = None
         self.noun_chunks_iterator = _get_chunker(self.vocab.lang)
         cdef bint has_space
-        if orths_and_spaces is None and words is not None:
-            if spaces is None:
-                spaces = [True] * len(words)
-            elif len(spaces) != len(words):
-                raise ValueError(Errors.E027)
-            orths_and_spaces = zip(words, spaces)
-        cdef const LexemeC* lexeme
-        if orths_and_spaces is not None:
-            orths_and_spaces = list(orths_and_spaces)
-            for orth_space in orths_and_spaces:
-                if isinstance(orth_space, unicode):
-                    lexeme = self.vocab.get(self.mem, orth_space)
-                    has_space = True
-                elif isinstance(orth_space, bytes):
-                    raise ValueError(Errors.E028.format(value=orth_space))
-                elif isinstance(orth_space[0], unicode):
-                    lexeme = self.vocab.get(self.mem, orth_space[0])
-                    has_space = orth_space[1]
-                else:
-                    lexeme = self.vocab.get_by_orth(self.mem, orth_space[0])
-                    has_space = orth_space[1]
-                self.push_back(lexeme, has_space)
+        if words is None and spaces is not None:
+            raise ValueError("words must be set if spaces is set")
+        elif spaces is None and words is not None:
+            self.has_unknown_spaces = True
+        else:
+            self.has_unknown_spaces = False
+        words = words if words is not None else []
+        spaces = spaces if spaces is not None else ([True] * len(words))
+        if len(spaces) != len(words):
+            raise ValueError(Errors.E027)
+        cdef const LexemeC* lexeme
+        for word, has_space in zip(words, spaces):
+            if isinstance(word, unicode):
+                lexeme = self.vocab.get(self.mem, word)
+            elif isinstance(word, bytes):
+                raise ValueError(Errors.E028.format(value=word))
+            else:
+                lexeme = self.vocab.get_by_orth(self.mem, word)
+            self.push_back(lexeme, has_space)
         # Tough to decide on policy for this. Is an empty doc tagged and parsed?
         # There's no information we'd like to add to it, so I guess so?
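Note that this drops the legacy orths_and_spaces argument entirely; callers that passed pre-zipped pairs now use the separate keywords (sketch, with words and spaces as parallel lists):

    # before (no longer accepted):
    #   doc = Doc(vocab, orths_and_spaces=zip(words, spaces))
    # after:
    doc = Doc(vocab, words=words, spaces=spaces)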
@@ -1082,6 +1078,7 @@ cdef class Doc:
             "sentiment": lambda: self.sentiment,
             "tensor": lambda: self.tensor,
             "cats": lambda: self.cats,
+            "has_unknown_spaces": lambda: self.has_unknown_spaces
         }
         for key in kwargs:
             if key in serializers or key in ("user_data", "user_data_keys", "user_data_values"):
@@ -1114,6 +1111,7 @@ cdef class Doc:
             "cats": lambda b: None,
             "user_data_keys": lambda b: None,
             "user_data_values": lambda b: None,
+            "has_unknown_spaces": lambda b: None
         }
         for key in kwargs:
             if key in deserializers or key in ("user_data",):
@@ -1134,6 +1132,8 @@ cdef class Doc:
             self.tensor = msg["tensor"]
         if "cats" not in exclude and "cats" in msg:
             self.cats = msg["cats"]
+        if "has_unknown_spaces" not in exclude and "has_unknown_spaces" in msg:
+            self.has_unknown_spaces = msg["has_unknown_spaces"]
         start = 0
         cdef const LexemeC* lex
         cdef unicode orth_