Mirror of https://github.com/explosion/spaCy.git
Synced 2024-12-25 17:36:30 +03:00

Commit 528746129d: Merge branch 'master' into docs/new-in-v3-1
```diff
@@ -2,11 +2,7 @@
 # Contribute to spaCy
 
-Thanks for your interest in contributing to spaCy 🎉 The project is maintained
-by **[@honnibal](https://github.com/honnibal)**,
-**[@ines](https://github.com/ines)**, **[@svlandeg](https://github.com/svlandeg)** and
-**[@adrianeboyd](https://github.com/adrianeboyd)**,
-and we'll do our best to help you get started. This page will give you a quick
+Thanks for your interest in contributing to spaCy 🎉 This page will give you a quick
 overview of how things are organized and most importantly, how to get involved.
 
 ## Table of contents
```
README.md (10 lines changed)
```diff
@@ -61,11 +61,11 @@ open-source software, released under the MIT license.
 ## 💬 Where to ask questions
 
 The spaCy project is maintained by **[@honnibal](https://github.com/honnibal)**,
-**[@ines](https://github.com/ines)**, **[@svlandeg](https://github.com/svlandeg)** and
-**[@adrianeboyd](https://github.com/adrianeboyd)**. Please understand that we won't
-be able to provide individual support via email. We also believe that help is
-much more valuable if it's shared publicly, so that more people can benefit from
-it.
+**[@ines](https://github.com/ines)**, **[@svlandeg](https://github.com/svlandeg)**,
+**[@adrianeboyd](https://github.com/adrianeboyd)** and **[@polm](https://github.com/polm)**.
+Please understand that we won't be able to provide individual support via email.
+We also believe that help is much more valuable if it's shared publicly, so that
+more people can benefit from it.
 
 | Type | Platforms |
 | ------------------------------- | --------------------------------------- |
```
```diff
@@ -346,17 +346,25 @@ def test_doc_from_array_morph(en_vocab):
 
 @pytest.mark.usefixtures("clean_underscore")
 def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
-    en_texts = ["Merging the docs is fun.", "", "They don't think alike."]
+    en_texts = [
+        "Merging the docs is fun.",
+        "",
+        "They don't think alike. ",
+        "Another doc.",
+    ]
     en_texts_without_empty = [t for t in en_texts if len(t)]
     de_text = "Wie war die Frage?"
     en_docs = [en_tokenizer(text) for text in en_texts]
     en_docs[0].spans["group"] = [en_docs[0][1:4]]
     en_docs[2].spans["group"] = [en_docs[2][1:4]]
-    span_group_texts = sorted([en_docs[0][1:4].text, en_docs[2][1:4].text])
+    en_docs[3].spans["group"] = [en_docs[3][0:1]]
+    span_group_texts = sorted(
+        [en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[3][0:1].text]
+    )
     de_doc = de_tokenizer(de_text)
     Token.set_extension("is_ambiguous", default=False)
     en_docs[0][2]._.is_ambiguous = True  # docs
     en_docs[2][3]._.is_ambiguous = True  # think
     assert Doc.from_docs([]) is None
     assert de_doc is not Doc.from_docs([de_doc])
     assert str(de_doc) == str(Doc.from_docs([de_doc]))
```
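For readers skimming the test changes: `Doc.spans`, exercised throughout this test, stores named groups of arbitrary, possibly overlapping spans. A minimal sketch of that API, using a blank pipeline in place of the `en_tokenizer` fixture:

```python
import spacy

nlp = spacy.blank("en")  # tokenizer-only pipeline
doc = nlp("Merging the docs is fun.")

# store a named span group on the doc; keys are arbitrary strings
doc.spans["group"] = [doc[1:4]]
assert doc.spans["group"][0].text == "the docs is"
```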
```diff
@@ -366,8 +374,8 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
 
     m_doc = Doc.from_docs(en_docs)
     assert len(en_texts_without_empty) == len(list(m_doc.sents))
-    assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
-    assert str(m_doc) == " ".join(en_texts_without_empty)
+    assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1])
+    assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty])
     p_token = m_doc[len(en_docs[0]) - 1]
     assert p_token.text == "." and bool(p_token.whitespace_)
     en_docs_tokens = [t for doc in en_docs for t in doc]
@@ -379,11 +387,12 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     assert not any([t._.is_ambiguous for t in m_doc[3:8]])
     assert "group" in m_doc.spans
     assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
+    assert bool(m_doc[11].whitespace_)
 
     m_doc = Doc.from_docs(en_docs, ensure_whitespace=False)
     assert len(en_texts_without_empty) == len(list(m_doc.sents))
-    assert len(str(m_doc)) == sum(len(t) for t in en_texts)
-    assert str(m_doc) == "".join(en_texts)
+    assert len(m_doc.text) == sum(len(t) for t in en_texts)
+    assert m_doc.text == "".join(en_texts_without_empty)
     p_token = m_doc[len(en_docs[0]) - 1]
     assert p_token.text == "." and not bool(p_token.whitespace_)
     en_docs_tokens = [t for doc in en_docs for t in doc]
@@ -392,11 +401,12 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     assert m_doc[9].idx == think_idx
     assert "group" in m_doc.spans
     assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
+    assert bool(m_doc[11].whitespace_)
 
     m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"])
-    assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
+    assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1])
     # space delimiter considered, although spacy attribute was missing
-    assert str(m_doc) == " ".join(en_texts_without_empty)
+    assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty])
     p_token = m_doc[len(en_docs[0]) - 1]
     assert p_token.text == "." and bool(p_token.whitespace_)
     en_docs_tokens = [t for doc in en_docs for t in doc]
```
```diff
@@ -409,6 +419,16 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     # can merge empty docs
     doc = Doc.from_docs([en_tokenizer("")] * 10)
 
+    # empty but set spans keys are preserved
+    en_docs = [en_tokenizer(text) for text in en_texts]
+    m_doc = Doc.from_docs(en_docs)
+    assert "group" not in m_doc.spans
+    for doc in en_docs:
+        doc.spans["group"] = []
+    m_doc = Doc.from_docs(en_docs)
+    assert "group" in m_doc.spans
+    assert len(m_doc.spans["group"]) == 0
+
 
 def test_doc_api_from_docs_ents(en_tokenizer):
     texts = ["Merging the docs is fun.", "They don't think alike."]
```
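Together these tests pin down the merging behavior. A short sketch of the same behavior, assuming spaCy v3.1+ (where `Doc.from_docs` gained the spans handling tested here):

```python
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
docs = [nlp("Merging the docs is fun."), nlp("They don't think alike.")]

# by default a space is inserted between docs whose text does not
# already end in whitespace, keeping character offsets consistent
merged = Doc.from_docs(docs)
assert merged.text == "Merging the docs is fun. They don't think alike."

# with ensure_whitespace=False the texts are concatenated verbatim
merged = Doc.from_docs(docs, ensure_whitespace=False)
assert merged.text == "Merging the docs is fun.They don't think alike."

# merging an empty list returns None rather than an empty Doc
assert Doc.from_docs([]) is None
```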
```diff
@@ -1141,6 +1141,10 @@ cdef class Doc:
                 else:
                     warnings.warn(Warnings.W102.format(key=key, value=value))
             for key in doc.spans:
+                # if a spans key is in any doc, include it in the merged doc
+                # even if it is empty
+                if key not in concat_spans:
+                    concat_spans[key] = []
                 for span in doc.spans[key]:
                     concat_spans[key].append((
                         span.start_char + char_offset,
@@ -1150,7 +1154,7 @@
                         span.text,  # included as a check
                     ))
             char_offset += len(doc.text)
-            if len(doc) > 0 and ensure_whitespace and not doc[-1].is_space:
+            if len(doc) > 0 and ensure_whitespace and not doc[-1].is_space and not bool(doc[-1].whitespace_):
                 char_offset += 1
 
         arrays = [doc.to_array(attrs) for doc in docs]
```
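The user-visible effect of this `Doc.from_docs` change, sketched under the same v3.1+ assumption: span groups survive the merge with their character offsets shifted, and a key that is present but empty in every input is still carried over:

```python
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
doc1 = nlp("Merging the docs is fun.")
doc2 = nlp("They don't think alike.")
doc1.spans["group"] = [doc1[1:4]]
doc2.spans["group"] = []  # set, but empty

merged = Doc.from_docs([doc1, doc2])
# the key survives even though doc2 contributed no spans ...
assert "group" in merged.spans
# ... and doc1's span still covers the same text at its new offset
assert merged.spans["group"][0].text == "the docs is"
```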
```diff
@@ -1248,7 +1248,7 @@ hyperparameters, pipeline and tokenizer used for constructing and training the
 pipeline. The `[nlp.tokenizer]` block refers to a **registered function** that
 takes the `nlp` object and returns a tokenizer. Here, we're registering a
 function called `whitespace_tokenizer` in the
-[`@tokenizers` registry](/api/registry). To make sure spaCy knows how to
+[`@tokenizers` registry](/api/top-level#registry). To make sure spaCy knows how to
 construct your tokenizer during training, you can pass in your Python file by
 setting `--code functions.py` when you run [`spacy train`](/api/cli#train).
 
```
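For reference, the registration pattern this passage describes looks roughly like the following (a sketch adapted from the spaCy usage docs; the file name `functions.py` and the `WhitespaceTokenizer` class are illustrative):

```python
# functions.py
import spacy
from spacy.tokens import Doc

class WhitespaceTokenizer:
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        # naive tokenization: split on single spaces only
        words = text.split(" ")
        return Doc(self.vocab, words=words)

@spacy.registry.tokenizers("whitespace_tokenizer")
def create_whitespace_tokenizer():
    def create_tokenizer(nlp):
        return WhitespaceTokenizer(nlp.vocab)
    return create_tokenizer
```

The config then points `[nlp.tokenizer]` at the registered name via `@tokenizers = "whitespace_tokenizer"`, and `--code functions.py` makes the registration visible to `spacy train`.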
```diff
@@ -800,7 +800,7 @@ vars:
 commands:
   - name: annotate
   - script:
-      - 'python -m prodigy ner.correct ${vars.prodigy.dataset} ./assets/raw_data.jsonl ${vars.prodigy.model} --labels ${vars.prodigy.labels}'
+      - 'python -m prodigy ner.correct ${vars.prodigy.dataset} ${vars.prodigy.model} ./assets/raw_data.jsonl --labels ${vars.prodigy.labels}'
       - 'python -m prodigy data-to-spacy ./corpus/train.json ./corpus/eval.json --ner ${vars.prodigy.dataset}'
       - 'python -m spacy convert ./corpus/train.json ./corpus/train.spacy'
       - 'python -m spacy convert ./corpus/eval.json ./corpus/eval.spacy'
```
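The fix above reorders the recipe arguments: Prodigy's `ner.correct` takes the dataset, then the loadable spaCy model, then the input source, so the model has to come before `./assets/raw_data.jsonl`.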