Merge branch 'master' into spacy.io

2025-07-15 02:32:37 +03:00 · 2019-05-11 23:04:13 +02:00 · 2019-05-11 23:04:13 +02:00 · ac5990f793
commit ac5990f793
parent f60c9a94ba f96af8526a
3 changed files with 16 additions and 11 deletions
--- a/bin/push-tag.sh
+++ b/bin/push-tag.sh
@ -16,4 +16,4 @@ version=${version/\'/}
 version=${version/\"/}
 version=${version/\"/}
 git tag "v$version"
-git push origin "v$version" --tags
+git push origin "v$version"
--- a/spacy/about.py
+++ b/spacy/about.py
@ -4,7 +4,7 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "2.1.4.dev0"
+__version__ = "2.1.4"
 __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython"
 __uri__ = "https://spacy.io"
 __author__ = "Explosion AI"
--- a/spacy/cli/converters/iob2json.py
+++ b/spacy/cli/converters/iob2json.py
@ -11,14 +11,8 @@ def iob2json(input_data, n_sents=10, *args, **kwargs):
    """
    Convert IOB files into JSON format for use with train cli.
    """
-    docs = []
+    sentences = read_iob(input_data.split("\n"))
-    for group in minibatch(docs, n_sents):
+    docs = merge_sentences(sentences, n_sents)
        group = list(group)
        first = group.pop(0)
        to_extend = first["paragraphs"][0]["sentences"]
        for sent in group[1:]:
            to_extend.extend(sent["paragraphs"][0]["sentences"])
        docs.append(first)
    return docs
@ -27,7 +21,6 @@ def read_iob(raw_sents):
    for line in raw_sents:
        if not line.strip():
            continue
        # tokens = [t.split("|") for t in line.split()]
        tokens = [re.split("[^\w\-]", line.strip())]
        if len(tokens[0]) == 3:
            words, pos, iob = zip(*tokens)
@ -49,3 +42,15 @@ def read_iob(raw_sents):
    paragraphs = [{"sentences": [sent]} for sent in sentences]
    docs = [{"id": 0, "paragraphs": [para]} for para in paragraphs]
    return docs
 def merge_sentences(docs, n_sents):
    """
    Merge batches of docs into single docs that hold all of their sentences.

    docs: iterable of dicts shaped like
        {"paragraphs": [{"sentences": [...]}]}.
    n_sents: batch size handed to `minibatch` (presumably the number of
        docs folded into each merged doc -- confirm against `minibatch`).
    RETURNS (list): the first doc of every batch. The sentence list of its
        first paragraph is extended in place with the sentences from the
        first paragraph of every other doc in the same batch.
    """
    merged = []
    for group in minibatch(docs, size=n_sents):
        group = list(group)
        # pop(0) already removes the first doc from `group`, so what is left
        # is exactly the set of docs to fold in. Slicing with [1:] here would
        # skip the second doc of the batch and lose its sentences.
        first = group.pop(0)
        to_extend = first["paragraphs"][0]["sentences"]
        for sent in group:
            to_extend.extend(sent["paragraphs"][0]["sentences"])
        merged.append(first)
    return merged