diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 072981270..682e5134c 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -2,11 +2,7 @@
 
 # Contribute to spaCy
 
-Thanks for your interest in contributing to spaCy 🎉 The project is maintained
-by **[@honnibal](https://github.com/honnibal)**,
-**[@ines](https://github.com/ines)**, **[@svlandeg](https://github.com/svlandeg)** and
-**[@adrianeboyd](https://github.com/adrianeboyd)**,
-and we'll do our best to help you get started. This page will give you a quick
+Thanks for your interest in contributing to spaCy 🎉 This page will give you a quick
 overview of how things are organized and most importantly, how to get involved.
 
 ## Table of contents
diff --git a/README.md b/README.md
index 3bc7ba0f1..61d5449a4 100644
--- a/README.md
+++ b/README.md
@@ -61,11 +61,11 @@ open-source software, released under the MIT license.
 ## 💬 Where to ask questions
 
 The spaCy project is maintained by **[@honnibal](https://github.com/honnibal)**,
-**[@ines](https://github.com/ines)**, **[@svlandeg](https://github.com/svlandeg)** and
-**[@adrianeboyd](https://github.com/adrianeboyd)**. Please understand that we won't
-be able to provide individual support via email. We also believe that help is
-much more valuable if it's shared publicly, so that more people can benefit from
-it.
+**[@ines](https://github.com/ines)**, **[@svlandeg](https://github.com/svlandeg)**,
+**[@adrianeboyd](https://github.com/adrianeboyd)** and **[@polm](https://github.com/polm)**.
+Please understand that we won't be able to provide individual support via email.
+We also believe that help is much more valuable if it's shared publicly, so that
+more people can benefit from it.
 
 | Type                            | Platforms                                 |
 | ------------------------------- | ----------------------------------------- |
diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index 7ca231bc4..6f3f9ae0b 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -346,17 +346,25 @@ def test_doc_from_array_morph(en_vocab):
 
 @pytest.mark.usefixtures("clean_underscore")
 def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
-    en_texts = ["Merging the docs is fun.", "", "They don't think alike."]
+    en_texts = [
+        "Merging the docs is fun.",
+        "",
+        "They don't think alike. ",
+        "Another doc.",
+    ]
     en_texts_without_empty = [t for t in en_texts if len(t)]
     de_text = "Wie war die Frage?"
     en_docs = [en_tokenizer(text) for text in en_texts]
     en_docs[0].spans["group"] = [en_docs[0][1:4]]
     en_docs[2].spans["group"] = [en_docs[2][1:4]]
-    span_group_texts = sorted([en_docs[0][1:4].text, en_docs[2][1:4].text])
+    en_docs[3].spans["group"] = [en_docs[3][0:1]]
+    span_group_texts = sorted(
+        [en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[3][0:1].text]
+    )
     de_doc = de_tokenizer(de_text)
     Token.set_extension("is_ambiguous", default=False)
-    en_docs[0][2]._.is_ambiguous = True # docs
-    en_docs[2][3]._.is_ambiguous = True # think
+    en_docs[0][2]._.is_ambiguous = True  # docs
+    en_docs[2][3]._.is_ambiguous = True  # think
     assert Doc.from_docs([]) is None
     assert de_doc is not Doc.from_docs([de_doc])
     assert str(de_doc) == str(Doc.from_docs([de_doc]))
@@ -366,8 +374,8 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
 
     m_doc = Doc.from_docs(en_docs)
     assert len(en_texts_without_empty) == len(list(m_doc.sents))
-    assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
-    assert str(m_doc) == " ".join(en_texts_without_empty)
+    assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1])
+    assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty])
     p_token = m_doc[len(en_docs[0]) - 1]
     assert p_token.text == "." and bool(p_token.whitespace_)
     en_docs_tokens = [t for doc in en_docs for t in doc]
@@ -379,11 +387,12 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     assert not any([t._.is_ambiguous for t in m_doc[3:8]])
     assert "group" in m_doc.spans
     assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
+    assert bool(m_doc[11].whitespace_)
 
     m_doc = Doc.from_docs(en_docs, ensure_whitespace=False)
     assert len(en_texts_without_empty) == len(list(m_doc.sents))
-    assert len(str(m_doc)) == sum(len(t) for t in en_texts)
-    assert str(m_doc) == "".join(en_texts)
+    assert len(m_doc.text) == sum(len(t) for t in en_texts)
+    assert m_doc.text == "".join(en_texts_without_empty)
     p_token = m_doc[len(en_docs[0]) - 1]
     assert p_token.text == "." and not bool(p_token.whitespace_)
     en_docs_tokens = [t for doc in en_docs for t in doc]
@@ -392,11 +401,12 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     assert m_doc[9].idx == think_idx
     assert "group" in m_doc.spans
     assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
+    assert bool(m_doc[11].whitespace_)
 
     m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"])
-    assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
+    assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1])
     # space delimiter considered, although spacy attribute was missing
-    assert str(m_doc) == " ".join(en_texts_without_empty)
+    assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty])
     p_token = m_doc[len(en_docs[0]) - 1]
     assert p_token.text == "." and bool(p_token.whitespace_)
     en_docs_tokens = [t for doc in en_docs for t in doc]
@@ -409,6 +419,16 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
 
     # can merge empty docs
     doc = Doc.from_docs([en_tokenizer("")] * 10)
+    # empty but set spans keys are preserved
+    en_docs = [en_tokenizer(text) for text in en_texts]
+    m_doc = Doc.from_docs(en_docs)
+    assert "group" not in m_doc.spans
+    for doc in en_docs:
+        doc.spans["group"] = []
+    m_doc = Doc.from_docs(en_docs)
+    assert "group" in m_doc.spans
+    assert len(m_doc.spans["group"]) == 0
+
 
 def test_doc_api_from_docs_ents(en_tokenizer):
     texts = ["Merging the docs is fun.", "They don't think alike."]
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 912d8c17d..cd2bd6f6c 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1141,6 +1141,10 @@ cdef class Doc:
                 else:
                     warnings.warn(Warnings.W102.format(key=key, value=value))
             for key in doc.spans:
+                # if a spans key is in any doc, include it in the merged doc
+                # even if it is empty
+                if key not in concat_spans:
+                    concat_spans[key] = []
                 for span in doc.spans[key]:
                     concat_spans[key].append((
                         span.start_char + char_offset,
@@ -1150,7 +1154,7 @@ cdef class Doc:
                         span.text, # included as a check
                     ))
             char_offset += len(doc.text)
-            if len(doc) > 0 and ensure_whitespace and not doc[-1].is_space:
+            if len(doc) > 0 and ensure_whitespace and not doc[-1].is_space and not bool(doc[-1].whitespace_):
                 char_offset += 1
 
         arrays = [doc.to_array(attrs) for doc in docs]
diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index 5a1293c2e..b05d16da3 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -1248,7 +1248,7 @@ hyperparameters, pipeline and tokenizer used for constructing and training the
 pipeline. The `[nlp.tokenizer]` block refers to a **registered function** that
 takes the `nlp` object and returns a tokenizer. Here, we're registering a
 function called `whitespace_tokenizer` in the
-[`@tokenizers` registry](/api/registry). To make sure spaCy knows how to
+[`@tokenizers` registry](/api/top-level#registry). To make sure spaCy knows how to
 construct your tokenizer during training, you can pass in your Python file by
 setting `--code functions.py` when you run [`spacy train`](/api/cli#train).
 
diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md
index fc191824a..d30a50302 100644
--- a/website/docs/usage/projects.md
+++ b/website/docs/usage/projects.md
@@ -800,7 +800,7 @@ vars:
 commands:
   - name: annotate
   - script:
-      - 'python -m prodigy ner.correct ${vars.prodigy.dataset} ./assets/raw_data.jsonl ${vars.prodigy.model} --labels ${vars.prodigy.labels}'
+      - 'python -m prodigy ner.correct ${vars.prodigy.dataset} ${vars.prodigy.model} ./assets/raw_data.jsonl --labels ${vars.prodigy.labels}'
       - 'python -m prodigy data-to-spacy ./corpus/train.json ./corpus/eval.json --ner ${vars.prodigy.dataset}'
       - 'python -m spacy convert ./corpus/train.json ./corpus/train.spacy'
      - 'python -m spacy convert ./corpus/eval.json ./corpus/eval.spacy'
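
Taken together, the doc.pyx hunks change two things about Doc.from_docs: a spans
key present on any input doc is carried over even when it holds no spans, and no
artificial space is inserted after a doc whose last token already carries trailing
whitespace. A minimal sketch of the merged behavior, assuming spaCy 3.x with this
patch applied; the blank "en" pipeline and the example texts are illustrative,
not part of the patch:

    import spacy
    from spacy.tokens import Doc

    nlp = spacy.blank("en")

    # A spans key that is set but empty on the inputs survives the merge.
    docs = [nlp("First doc."), nlp("Second doc.")]
    for doc in docs:
        doc.spans["group"] = []
    merged = Doc.from_docs(docs)
    assert "group" in merged.spans
    assert len(merged.spans["group"]) == 0

    # With ensure_whitespace=True (the default), no extra space is added
    # after a doc whose last token already has trailing whitespace.
    docs = [nlp("Trailing space here. "), nlp("Next doc.")]
    merged = Doc.from_docs(docs)
    assert merged.text == "Trailing space here. Next doc."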