Merge branch 'master' into docs/new-in-v3-1

This commit is contained in:
Ines Montani 2021-06-24 13:11:37 +10:00
commit 528746129d
6 changed files with 43 additions and 23 deletions

View File

@ -2,11 +2,7 @@
# Contribute to spaCy
Thanks for your interest in contributing to spaCy 🎉 The project is maintained
by **[@honnibal](https://github.com/honnibal)**,
**[@ines](https://github.com/ines)**, **[@svlandeg](https://github.com/svlandeg)** and
**[@adrianeboyd](https://github.com/adrianeboyd)**,
and we'll do our best to help you get started. This page will give you a quick
Thanks for your interest in contributing to spaCy 🎉 This page will give you a quick
overview of how things are organized and, most importantly, how to get involved.
## Table of contents

View File

@ -61,11 +61,11 @@ open-source software, released under the MIT license.
## 💬 Where to ask questions
The spaCy project is maintained by **[@honnibal](https://github.com/honnibal)**,
**[@ines](https://github.com/ines)**, **[@svlandeg](https://github.com/svlandeg)** and
**[@adrianeboyd](https://github.com/adrianeboyd)**. Please understand that we won't
be able to provide individual support via email. We also believe that help is
much more valuable if it's shared publicly, so that more people can benefit from
it.
**[@ines](https://github.com/ines)**, **[@svlandeg](https://github.com/svlandeg)**,
**[@adrianeboyd](https://github.com/adrianeboyd)** and **[@polm](https://github.com/polm)**.
Please understand that we won't be able to provide individual support via email.
We also believe that help is much more valuable if it's shared publicly, so that
more people can benefit from it.
| Type | Platforms |
| ------------------------------- | --------------------------------------- |

View File

@ -346,13 +346,21 @@ def test_doc_from_array_morph(en_vocab):
@pytest.mark.usefixtures("clean_underscore")
def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
en_texts = ["Merging the docs is fun.", "", "They don't think alike."]
en_texts = [
"Merging the docs is fun.",
"",
"They don't think alike. ",
"Another doc.",
]
en_texts_without_empty = [t for t in en_texts if len(t)]
de_text = "Wie war die Frage?"
en_docs = [en_tokenizer(text) for text in en_texts]
en_docs[0].spans["group"] = [en_docs[0][1:4]]
en_docs[2].spans["group"] = [en_docs[2][1:4]]
span_group_texts = sorted([en_docs[0][1:4].text, en_docs[2][1:4].text])
en_docs[3].spans["group"] = [en_docs[3][0:1]]
span_group_texts = sorted(
[en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[3][0:1].text]
)
de_doc = de_tokenizer(de_text)
Token.set_extension("is_ambiguous", default=False)
en_docs[0][2]._.is_ambiguous = True # docs
@ -366,8 +374,8 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
m_doc = Doc.from_docs(en_docs)
assert len(en_texts_without_empty) == len(list(m_doc.sents))
assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
assert str(m_doc) == " ".join(en_texts_without_empty)
assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1])
assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty])
p_token = m_doc[len(en_docs[0]) - 1]
assert p_token.text == "." and bool(p_token.whitespace_)
en_docs_tokens = [t for doc in en_docs for t in doc]
@ -379,11 +387,12 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
assert not any([t._.is_ambiguous for t in m_doc[3:8]])
assert "group" in m_doc.spans
assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
assert bool(m_doc[11].whitespace_)
m_doc = Doc.from_docs(en_docs, ensure_whitespace=False)
assert len(en_texts_without_empty) == len(list(m_doc.sents))
assert len(str(m_doc)) == sum(len(t) for t in en_texts)
assert str(m_doc) == "".join(en_texts)
assert len(m_doc.text) == sum(len(t) for t in en_texts)
assert m_doc.text == "".join(en_texts_without_empty)
p_token = m_doc[len(en_docs[0]) - 1]
assert p_token.text == "." and not bool(p_token.whitespace_)
en_docs_tokens = [t for doc in en_docs for t in doc]
@ -392,11 +401,12 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
assert m_doc[9].idx == think_idx
assert "group" in m_doc.spans
assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
assert bool(m_doc[11].whitespace_)
m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"])
assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1])
# the space delimiter is still inserted even though the spacy attribute is not in attrs
assert str(m_doc) == " ".join(en_texts_without_empty)
assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty])
p_token = m_doc[len(en_docs[0]) - 1]
assert p_token.text == "." and bool(p_token.whitespace_)
en_docs_tokens = [t for doc in en_docs for t in doc]
@ -409,6 +419,16 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
# can merge empty docs
doc = Doc.from_docs([en_tokenizer("")] * 10)
# spans keys that are set but empty are preserved
en_docs = [en_tokenizer(text) for text in en_texts]
m_doc = Doc.from_docs(en_docs)
assert "group" not in m_doc.spans
for doc in en_docs:
doc.spans["group"] = []
m_doc = Doc.from_docs(en_docs)
assert "group" in m_doc.spans
assert len(m_doc.spans["group"]) == 0
def test_doc_api_from_docs_ents(en_tokenizer):
texts = ["Merging the docs is fun.", "They don't think alike."]

View File

@ -1141,6 +1141,10 @@ cdef class Doc:
else:
warnings.warn(Warnings.W102.format(key=key, value=value))
for key in doc.spans:
# if a spans key is in any doc, include it in the merged doc
# even if it is empty
if key not in concat_spans:
concat_spans[key] = []
for span in doc.spans[key]:
concat_spans[key].append((
span.start_char + char_offset,
@ -1150,7 +1154,7 @@ cdef class Doc:
span.text, # included as a check
))
char_offset += len(doc.text)
if len(doc) > 0 and ensure_whitespace and not doc[-1].is_space:
if len(doc) > 0 and ensure_whitespace and not doc[-1].is_space and not bool(doc[-1].whitespace_):
char_offset += 1
arrays = [doc.to_array(attrs) for doc in docs]

View File

@ -1248,7 +1248,7 @@ hyperparameters, pipeline and tokenizer used for constructing and training the
pipeline. The `[nlp.tokenizer]` block refers to a **registered function** that
takes the `nlp` object and returns a tokenizer. Here, we're registering a
function called `whitespace_tokenizer` in the
[`@tokenizers` registry](/api/registry). To make sure spaCy knows how to
[`@tokenizers` registry](/api/top-level#registry). To make sure spaCy knows how to
construct your tokenizer during training, you can pass in your Python file by
setting `--code functions.py` when you run [`spacy train`](/api/cli#train).

View File

@ -800,7 +800,7 @@ vars:
commands:
- name: annotate
- script:
- 'python -m prodigy ner.correct ${vars.prodigy.dataset} ./assets/raw_data.jsonl ${vars.prodigy.model} --labels ${vars.prodigy.labels}'
- 'python -m prodigy ner.correct ${vars.prodigy.dataset} ${vars.prodigy.model} ./assets/raw_data.jsonl --labels ${vars.prodigy.labels}'
- 'python -m prodigy data-to-spacy ./corpus/train.json ./corpus/eval.json --ner ${vars.prodigy.dataset}'
- 'python -m spacy convert ./corpus/train.json ./corpus/train.spacy'
- 'python -m spacy convert ./corpus/eval.json ./corpus/eval.spacy'