Merge branch 'master' into docs/new-in-v3-1

This commit is contained in:
Ines Montani 2021-06-24 13:11:37 +10:00
commit 528746129d
6 changed files with 43 additions and 23 deletions

View File

@@ -2,11 +2,7 @@
# Contribute to spaCy # Contribute to spaCy
Thanks for your interest in contributing to spaCy 🎉 The project is maintained Thanks for your interest in contributing to spaCy 🎉 This page will give you a quick
by **[@honnibal](https://github.com/honnibal)**,
**[@ines](https://github.com/ines)**, **[@svlandeg](https://github.com/svlandeg)** and
**[@adrianeboyd](https://github.com/adrianeboyd)**,
and we'll do our best to help you get started. This page will give you a quick
overview of how things are organized and most importantly, how to get involved. overview of how things are organized and most importantly, how to get involved.
## Table of contents ## Table of contents

View File

@@ -61,11 +61,11 @@ open-source software, released under the MIT license.
## 💬 Where to ask questions ## 💬 Where to ask questions
The spaCy project is maintained by **[@honnibal](https://github.com/honnibal)**, The spaCy project is maintained by **[@honnibal](https://github.com/honnibal)**,
**[@ines](https://github.com/ines)**, **[@svlandeg](https://github.com/svlandeg)** and **[@ines](https://github.com/ines)**, **[@svlandeg](https://github.com/svlandeg)**,
**[@adrianeboyd](https://github.com/adrianeboyd)**. Please understand that we won't **[@adrianeboyd](https://github.com/adrianeboyd)** and **[@polm](https://github.com/polm)**.
be able to provide individual support via email. We also believe that help is Please understand that we won't be able to provide individual support via email.
much more valuable if it's shared publicly, so that more people can benefit from We also believe that help is much more valuable if it's shared publicly, so that
it. more people can benefit from it.
| Type | Platforms | | Type | Platforms |
| ------------------------------- | --------------------------------------- | | ------------------------------- | --------------------------------------- |

View File

@@ -346,13 +346,21 @@ def test_doc_from_array_morph(en_vocab):
@pytest.mark.usefixtures("clean_underscore") @pytest.mark.usefixtures("clean_underscore")
def test_doc_api_from_docs(en_tokenizer, de_tokenizer): def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
en_texts = ["Merging the docs is fun.", "", "They don't think alike."] en_texts = [
"Merging the docs is fun.",
"",
"They don't think alike. ",
"Another doc.",
]
en_texts_without_empty = [t for t in en_texts if len(t)] en_texts_without_empty = [t for t in en_texts if len(t)]
de_text = "Wie war die Frage?" de_text = "Wie war die Frage?"
en_docs = [en_tokenizer(text) for text in en_texts] en_docs = [en_tokenizer(text) for text in en_texts]
en_docs[0].spans["group"] = [en_docs[0][1:4]] en_docs[0].spans["group"] = [en_docs[0][1:4]]
en_docs[2].spans["group"] = [en_docs[2][1:4]] en_docs[2].spans["group"] = [en_docs[2][1:4]]
span_group_texts = sorted([en_docs[0][1:4].text, en_docs[2][1:4].text]) en_docs[3].spans["group"] = [en_docs[3][0:1]]
span_group_texts = sorted(
[en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[3][0:1].text]
)
de_doc = de_tokenizer(de_text) de_doc = de_tokenizer(de_text)
Token.set_extension("is_ambiguous", default=False) Token.set_extension("is_ambiguous", default=False)
en_docs[0][2]._.is_ambiguous = True # docs en_docs[0][2]._.is_ambiguous = True # docs
@@ -366,8 +374,8 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
m_doc = Doc.from_docs(en_docs) m_doc = Doc.from_docs(en_docs)
assert len(en_texts_without_empty) == len(list(m_doc.sents)) assert len(en_texts_without_empty) == len(list(m_doc.sents))
assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1]) assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1])
assert str(m_doc) == " ".join(en_texts_without_empty) assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty])
p_token = m_doc[len(en_docs[0]) - 1] p_token = m_doc[len(en_docs[0]) - 1]
assert p_token.text == "." and bool(p_token.whitespace_) assert p_token.text == "." and bool(p_token.whitespace_)
en_docs_tokens = [t for doc in en_docs for t in doc] en_docs_tokens = [t for doc in en_docs for t in doc]
@@ -379,11 +387,12 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
assert not any([t._.is_ambiguous for t in m_doc[3:8]]) assert not any([t._.is_ambiguous for t in m_doc[3:8]])
assert "group" in m_doc.spans assert "group" in m_doc.spans
assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]]) assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
assert bool(m_doc[11].whitespace_)
m_doc = Doc.from_docs(en_docs, ensure_whitespace=False) m_doc = Doc.from_docs(en_docs, ensure_whitespace=False)
assert len(en_texts_without_empty) == len(list(m_doc.sents)) assert len(en_texts_without_empty) == len(list(m_doc.sents))
assert len(str(m_doc)) == sum(len(t) for t in en_texts) assert len(m_doc.text) == sum(len(t) for t in en_texts)
assert str(m_doc) == "".join(en_texts) assert m_doc.text == "".join(en_texts_without_empty)
p_token = m_doc[len(en_docs[0]) - 1] p_token = m_doc[len(en_docs[0]) - 1]
assert p_token.text == "." and not bool(p_token.whitespace_) assert p_token.text == "." and not bool(p_token.whitespace_)
en_docs_tokens = [t for doc in en_docs for t in doc] en_docs_tokens = [t for doc in en_docs for t in doc]
@@ -392,11 +401,12 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
assert m_doc[9].idx == think_idx assert m_doc[9].idx == think_idx
assert "group" in m_doc.spans assert "group" in m_doc.spans
assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]]) assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
assert bool(m_doc[11].whitespace_)
m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"]) m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"])
assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1]) assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1])
# space delimiter considered, although spacy attribute was missing # space delimiter considered, although spacy attribute was missing
assert str(m_doc) == " ".join(en_texts_without_empty) assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty])
p_token = m_doc[len(en_docs[0]) - 1] p_token = m_doc[len(en_docs[0]) - 1]
assert p_token.text == "." and bool(p_token.whitespace_) assert p_token.text == "." and bool(p_token.whitespace_)
en_docs_tokens = [t for doc in en_docs for t in doc] en_docs_tokens = [t for doc in en_docs for t in doc]
@@ -409,6 +419,16 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
# can merge empty docs # can merge empty docs
doc = Doc.from_docs([en_tokenizer("")] * 10) doc = Doc.from_docs([en_tokenizer("")] * 10)
# empty but set spans keys are preserved
en_docs = [en_tokenizer(text) for text in en_texts]
m_doc = Doc.from_docs(en_docs)
assert "group" not in m_doc.spans
for doc in en_docs:
doc.spans["group"] = []
m_doc = Doc.from_docs(en_docs)
assert "group" in m_doc.spans
assert len(m_doc.spans["group"]) == 0
def test_doc_api_from_docs_ents(en_tokenizer): def test_doc_api_from_docs_ents(en_tokenizer):
texts = ["Merging the docs is fun.", "They don't think alike."] texts = ["Merging the docs is fun.", "They don't think alike."]

View File

@@ -1141,6 +1141,10 @@ cdef class Doc:
else: else:
warnings.warn(Warnings.W102.format(key=key, value=value)) warnings.warn(Warnings.W102.format(key=key, value=value))
for key in doc.spans: for key in doc.spans:
# if a spans key is in any doc, include it in the merged doc
# even if it is empty
if key not in concat_spans:
concat_spans[key] = []
for span in doc.spans[key]: for span in doc.spans[key]:
concat_spans[key].append(( concat_spans[key].append((
span.start_char + char_offset, span.start_char + char_offset,
@@ -1150,7 +1154,7 @@ cdef class Doc:
span.text, # included as a check span.text, # included as a check
)) ))
char_offset += len(doc.text) char_offset += len(doc.text)
if len(doc) > 0 and ensure_whitespace and not doc[-1].is_space: if len(doc) > 0 and ensure_whitespace and not doc[-1].is_space and not bool(doc[-1].whitespace_):
char_offset += 1 char_offset += 1
arrays = [doc.to_array(attrs) for doc in docs] arrays = [doc.to_array(attrs) for doc in docs]

View File

@@ -1248,7 +1248,7 @@ hyperparameters, pipeline and tokenizer used for constructing and training the
pipeline. The `[nlp.tokenizer]` block refers to a **registered function** that pipeline. The `[nlp.tokenizer]` block refers to a **registered function** that
takes the `nlp` object and returns a tokenizer. Here, we're registering a takes the `nlp` object and returns a tokenizer. Here, we're registering a
function called `whitespace_tokenizer` in the function called `whitespace_tokenizer` in the
[`@tokenizers` registry](/api/registry). To make sure spaCy knows how to [`@tokenizers` registry](/api/top-level#registry). To make sure spaCy knows how to
construct your tokenizer during training, you can pass in your Python file by construct your tokenizer during training, you can pass in your Python file by
setting `--code functions.py` when you run [`spacy train`](/api/cli#train). setting `--code functions.py` when you run [`spacy train`](/api/cli#train).

View File

@@ -800,7 +800,7 @@ vars:
commands: commands:
- name: annotate - name: annotate
- script: - script:
- 'python -m prodigy ner.correct ${vars.prodigy.dataset} ./assets/raw_data.jsonl ${vars.prodigy.model} --labels ${vars.prodigy.labels}' - 'python -m prodigy ner.correct ${vars.prodigy.dataset} ${vars.prodigy.model} ./assets/raw_data.jsonl --labels ${vars.prodigy.labels}'
- 'python -m prodigy data-to-spacy ./corpus/train.json ./corpus/eval.json --ner ${vars.prodigy.dataset}' - 'python -m prodigy data-to-spacy ./corpus/train.json ./corpus/eval.json --ner ${vars.prodigy.dataset}'
- 'python -m spacy convert ./corpus/train.json ./corpus/train.spacy' - 'python -m spacy convert ./corpus/train.json ./corpus/train.spacy'
- 'python -m spacy convert ./corpus/eval.json ./corpus/eval.spacy' - 'python -m spacy convert ./corpus/eval.json ./corpus/eval.spacy'