From 31504f59828a17de7e71bfdf9dd0e05ba8d229bc Mon Sep 17 00:00:00 2001
From: Nick Sorros
Date: Tue, 22 Jun 2021 10:41:45 +0300
Subject: [PATCH 1/4] Switch model and data path in prodigy project.yml recipe (#8467)

---
 website/docs/usage/projects.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md
index fc191824a..d30a50302 100644
--- a/website/docs/usage/projects.md
+++ b/website/docs/usage/projects.md
@@ -800,7 +800,7 @@ vars:
 commands:
   - name: annotate
   - script:
-      - 'python -m prodigy ner.correct ${vars.prodigy.dataset} ./assets/raw_data.jsonl ${vars.prodigy.model} --labels ${vars.prodigy.labels}'
+      - 'python -m prodigy ner.correct ${vars.prodigy.dataset} ${vars.prodigy.model} ./assets/raw_data.jsonl --labels ${vars.prodigy.labels}'
       - 'python -m prodigy data-to-spacy ./corpus/train.json ./corpus/eval.json --ner ${vars.prodigy.dataset}'
       - 'python -m spacy convert ./corpus/train.json ./corpus/train.spacy'
       - 'python -m spacy convert ./corpus/eval.json ./corpus/eval.spacy'

From d96c422cfc1c63e7cdf56dcc3608cbda5efcd948 Mon Sep 17 00:00:00 2001
From: themrmax
Date: Tue, 22 Jun 2021 15:34:06 -0700
Subject: [PATCH 2/4] Fix broken link

change /api/registry to /api/top-level#registry
---
 website/docs/usage/linguistic-features.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index 5a1293c2e..b05d16da3 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -1248,7 +1248,7 @@ hyperparameters, pipeline and tokenizer used for constructing and training the
 pipeline. The `[nlp.tokenizer]` block refers to a **registered function** that
 takes the `nlp` object and returns a tokenizer. Here, we're registering a
 function called `whitespace_tokenizer` in the
-[`@tokenizers` registry](/api/registry). To make sure spaCy knows how to
+[`@tokenizers` registry](/api/top-level#registry). To make sure spaCy knows how to
 construct your tokenizer during training, you can pass in your Python file by
 setting `--code functions.py` when you run [`spacy train`](/api/cli#train).
 

From 393c3c70d7ca1c00d4357faa4cd05450285db160 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Wed, 23 Jun 2021 15:51:35 +0200
Subject: [PATCH 3/4] Various fixes for spans in Docs.from_docs (#8487)

* Fix spans offsets if a doc ends in a single space and no space is inserted
* Also include spans key in merged doc for empty spans lists
---
 spacy/tests/doc/test_doc_api.py | 40 ++++++++++++++++++++++++---------
 spacy/tokens/doc.pyx            |  6 ++++-
 2 files changed, 35 insertions(+), 11 deletions(-)

diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index 7ca231bc4..6f3f9ae0b 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -346,17 +346,25 @@ def test_doc_from_array_morph(en_vocab):
 
 @pytest.mark.usefixtures("clean_underscore")
 def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
-    en_texts = ["Merging the docs is fun.", "", "They don't think alike."]
+    en_texts = [
+        "Merging the docs is fun.",
+        "",
+        "They don't think alike. ",
+        "Another doc.",
+    ]
     en_texts_without_empty = [t for t in en_texts if len(t)]
     de_text = "Wie war die Frage?"
     en_docs = [en_tokenizer(text) for text in en_texts]
     en_docs[0].spans["group"] = [en_docs[0][1:4]]
     en_docs[2].spans["group"] = [en_docs[2][1:4]]
-    span_group_texts = sorted([en_docs[0][1:4].text, en_docs[2][1:4].text])
+    en_docs[3].spans["group"] = [en_docs[3][0:1]]
+    span_group_texts = sorted(
+        [en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[3][0:1].text]
+    )
     de_doc = de_tokenizer(de_text)
     Token.set_extension("is_ambiguous", default=False)
-    en_docs[0][2]._.is_ambiguous = True # docs
-    en_docs[2][3]._.is_ambiguous = True # think
+    en_docs[0][2]._.is_ambiguous = True  # docs
+    en_docs[2][3]._.is_ambiguous = True  # think
     assert Doc.from_docs([]) is None
     assert de_doc is not Doc.from_docs([de_doc])
     assert str(de_doc) == str(Doc.from_docs([de_doc]))
@@ -366,8 +374,8 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
 
     m_doc = Doc.from_docs(en_docs)
     assert len(en_texts_without_empty) == len(list(m_doc.sents))
-    assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
-    assert str(m_doc) == " ".join(en_texts_without_empty)
+    assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1])
+    assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty])
     p_token = m_doc[len(en_docs[0]) - 1]
     assert p_token.text == "." and bool(p_token.whitespace_)
     en_docs_tokens = [t for doc in en_docs for t in doc]
@@ -379,11 +387,12 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     assert not any([t._.is_ambiguous for t in m_doc[3:8]])
     assert "group" in m_doc.spans
    assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
+    assert bool(m_doc[11].whitespace_)
 
     m_doc = Doc.from_docs(en_docs, ensure_whitespace=False)
     assert len(en_texts_without_empty) == len(list(m_doc.sents))
-    assert len(str(m_doc)) == sum(len(t) for t in en_texts)
-    assert str(m_doc) == "".join(en_texts)
+    assert len(m_doc.text) == sum(len(t) for t in en_texts)
+    assert m_doc.text == "".join(en_texts_without_empty)
     p_token = m_doc[len(en_docs[0]) - 1]
     assert p_token.text == "." and not bool(p_token.whitespace_)
     en_docs_tokens = [t for doc in en_docs for t in doc]
@@ -392,11 +401,12 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     assert m_doc[9].idx == think_idx
     assert "group" in m_doc.spans
     assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
+    assert bool(m_doc[11].whitespace_)
 
     m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"])
-    assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
+    assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1])
     # space delimiter considered, although spacy attribute was missing
-    assert str(m_doc) == " ".join(en_texts_without_empty)
+    assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty])
     p_token = m_doc[len(en_docs[0]) - 1]
     assert p_token.text == "." and bool(p_token.whitespace_)
     en_docs_tokens = [t for doc in en_docs for t in doc]
@@ -409,6 +419,16 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     # can merge empty docs
     doc = Doc.from_docs([en_tokenizer("")] * 10)
 
+    # empty but set spans keys are preserved
+    en_docs = [en_tokenizer(text) for text in en_texts]
+    m_doc = Doc.from_docs(en_docs)
+    assert "group" not in m_doc.spans
+    for doc in en_docs:
+        doc.spans["group"] = []
+    m_doc = Doc.from_docs(en_docs)
+    assert "group" in m_doc.spans
+    assert len(m_doc.spans["group"]) == 0
+
 
 def test_doc_api_from_docs_ents(en_tokenizer):
     texts = ["Merging the docs is fun.", "They don't think alike."]

diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 912d8c17d..cd2bd6f6c 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1141,6 +1141,10 @@ cdef class Doc:
                 else:
                     warnings.warn(Warnings.W102.format(key=key, value=value))
             for key in doc.spans:
+                # if a spans key is in any doc, include it in the merged doc
+                # even if it is empty
+                if key not in concat_spans:
+                    concat_spans[key] = []
                 for span in doc.spans[key]:
                     concat_spans[key].append((
                         span.start_char + char_offset,
@@ -1150,7 +1154,7 @@ cdef class Doc:
                         span.end_char + char_offset,
                         span.label,
                         span.kb_id,
                         span.text, # included as a check
                     ))
             char_offset += len(doc.text)
-            if len(doc) > 0 and ensure_whitespace and not doc[-1].is_space:
+            if len(doc) > 0 and ensure_whitespace and not doc[-1].is_space and not bool(doc[-1].whitespace_):
                 char_offset += 1
 
         arrays = [doc.to_array(attrs) for doc in docs]

From 3e3d87a068019fd66625ef4f1686f0ad616f7ab5 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Thu, 24 Jun 2021 12:37:55 +1000
Subject: [PATCH 4/4] Update maintainer info [ci skip]

---
 CONTRIBUTING.md |  6 +-----
 README.md       | 10 +++++-----
 2 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 072981270..682e5134c 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -2,11 +2,7 @@
 
 # Contribute to spaCy
 
-Thanks for your interest in contributing to spaCy 🎉 The project is maintained
-by **[@honnibal](https://github.com/honnibal)**,
-**[@ines](https://github.com/ines)**, **[@svlandeg](https://github.com/svlandeg)** and
-**[@adrianeboyd](https://github.com/adrianeboyd)**,
-and we'll do our best to help you get started. This page will give you a quick
+Thanks for your interest in contributing to spaCy 🎉 This page will give you a quick
 overview of how things are organized and most importantly, how to get involved.
 
 ## Table of contents

diff --git a/README.md b/README.md
index 3bc7ba0f1..61d5449a4 100644
--- a/README.md
+++ b/README.md
@@ -61,11 +61,11 @@ open-source software, released under the MIT license.
 ## 💬 Where to ask questions
 
 The spaCy project is maintained by **[@honnibal](https://github.com/honnibal)**,
-**[@ines](https://github.com/ines)**, **[@svlandeg](https://github.com/svlandeg)** and
-**[@adrianeboyd](https://github.com/adrianeboyd)**. Please understand that we won't
-be able to provide individual support via email. We also believe that help is
-much more valuable if it's shared publicly, so that more people can benefit from
-it.
+**[@ines](https://github.com/ines)**, **[@svlandeg](https://github.com/svlandeg)**,
+**[@adrianeboyd](https://github.com/adrianeboyd)** and **[@polm](https://github.com/polm)**.
+Please understand that we won't be able to provide individual support via email.
+We also believe that help is much more valuable if it's shared publicly, so that
+more people can benefit from it.
 
 | Type                            | Platforms                                 |
 | ------------------------------- | ----------------------------------------- |
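
Note, not part of the patches above: the linguistic-features.md context shown in PATCH 2/4
refers to a function called whitespace_tokenizer registered in the @tokenizers registry and
passed to spacy train via --code functions.py. As a reminder of what such a registered
function can look like, here is a minimal sketch along the lines of that docs section; the
WhitespaceTokenizer class and the functions.py file name are illustrative assumptions and
are not added by these patches:

    # functions.py -- illustrative sketch only, not part of this patch series
    import spacy
    from spacy.tokens import Doc

    class WhitespaceTokenizer:
        def __init__(self, vocab):
            self.vocab = vocab

        def __call__(self, text):
            # naive whitespace splitting; assumes no repeated spaces in the input
            words = text.split(" ")
            return Doc(self.vocab, words=words)

    @spacy.registry.tokenizers("whitespace_tokenizer")
    def create_whitespace_tokenizer():
        def create_tokenizer(nlp):
            return WhitespaceTokenizer(nlp.vocab)
        return create_tokenizer

Passing this file with --code functions.py when running spacy train, as the docs text
describes, makes the registered name resolvable when the [nlp.tokenizer] block of the
config is loaded.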
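
The Doc.from_docs behaviour covered by PATCH 3/4 can also be exercised outside the test
suite. A minimal sketch, assuming spaCy v3.x with that patch applied and nothing more
than a blank English pipeline:

    import spacy
    from spacy.tokens import Doc

    nlp = spacy.blank("en")
    doc1 = nlp("They don't think alike. ")  # note the trailing space
    doc2 = nlp("Another doc.")
    doc1.spans["group"] = []                # set, but left empty
    doc2.spans["group"] = [doc2[0:1]]       # the token "Another"

    merged = Doc.from_docs([doc1, doc2])    # ensure_whitespace=True is the default

    # The "group" key is carried over, and the copied span still covers "Another":
    # because doc1 already ends in a space, no extra space is inserted, so the
    # character offsets of spans from later docs are not shifted by one.
    assert "group" in merged.spans
    assert [span.text for span in merged.spans["group"]] == ["Another"]

    # A spans key that is only ever set to empty lists is preserved as well:
    doc1.spans["empty"] = []
    doc2.spans["empty"] = []
    merged = Doc.from_docs([doc1, doc2])
    assert "empty" in merged.spans and len(merged.spans["empty"]) == 0

This mirrors the assertions added to test_doc_api_from_docs above rather than introducing
any behaviour beyond what the patch itself tests.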