diff --git a/bin/push-tag.sh b/bin/push-tag.sh
index 57133499c..50b50c986 100755
--- a/bin/push-tag.sh
+++ b/bin/push-tag.sh
@@ -16,4 +16,4 @@ version=${version/\'/}
 version=${version/\"/}
 version=${version/\"/}
 git tag "v$version"
-git push origin "v$version" --tags
+git push origin "v$version"
diff --git a/spacy/about.py b/spacy/about.py
index d1906fd76..5e7093606 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -4,7 +4,7 @@
 # fmt: off
 
 __title__ = "spacy"
-__version__ = "2.1.4.dev0"
+__version__ = "2.1.4"
 __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython"
 __uri__ = "https://spacy.io"
 __author__ = "Explosion AI"
diff --git a/spacy/cli/converters/iob2json.py b/spacy/cli/converters/iob2json.py
index b986ea61e..e0086afa0 100644
--- a/spacy/cli/converters/iob2json.py
+++ b/spacy/cli/converters/iob2json.py
@@ -11,14 +11,8 @@ def iob2json(input_data, n_sents=10, *args, **kwargs):
     """
     Convert IOB files into JSON format for use with train cli.
     """
-    docs = []
-    for group in minibatch(docs, n_sents):
-        group = list(group)
-        first = group.pop(0)
-        to_extend = first["paragraphs"][0]["sentences"]
-        for sent in group[1:]:
-            to_extend.extend(sent["paragraphs"][0]["sentences"])
-        docs.append(first)
+    sentences = read_iob(input_data.split("\n"))
+    docs = merge_sentences(sentences, n_sents)
     return docs
 
 
@@ -27,7 +21,6 @@ def read_iob(raw_sents):
     for line in raw_sents:
         if not line.strip():
             continue
-        # tokens = [t.split("|") for t in line.split()]
         tokens = [re.split("[^\w\-]", line.strip())]
         if len(tokens[0]) == 3:
             words, pos, iob = zip(*tokens)
@@ -49,3 +42,24 @@ def read_iob(raw_sents):
     paragraphs = [{"sentences": [sent]} for sent in sentences]
     docs = [{"id": 0, "paragraphs": [para]} for para in paragraphs]
     return docs
+
+
+def merge_sentences(docs, n_sents):
+    """Merge the docs of each batch into the batch's first doc.
+
+    docs: Docs shaped like the output of read_iob, i.e. dicts whose
+        ["paragraphs"][0]["sentences"] entry is a list.
+    n_sents: Batch size handed to minibatch (iob2json defaults it to 10).
+    RETURNS (list): The first doc of every batch; its sentence list is
+        extended in place with the sentences of the batch's other docs.
+    """
+    merged = []
+    for group in minibatch(docs, size=n_sents):
+        group = list(group)
+        first = group.pop(0)
+        to_extend = first["paragraphs"][0]["sentences"]
+        # pop(0) has already removed the first doc, so walk the whole
+        # remainder; slicing with [1:] here would drop the second doc too.
+        for sent in group:
+            to_extend.extend(sent["paragraphs"][0]["sentences"])
+        merged.append(first)
+    return merged