Merge branch 'master' into spacy.io

This commit is contained in:
Ines Montani 2019-05-11 23:04:13 +02:00
commit ac5990f793
3 changed files with 16 additions and 11 deletions

View File

@ -16,4 +16,4 @@ version=${version/\'/}
version=${version/\"/} version=${version/\"/}
version=${version/\"/} version=${version/\"/}
git tag "v$version" git tag "v$version"
git push origin "v$version" --tags git push origin "v$version"

View File

@ -4,7 +4,7 @@
# fmt: off # fmt: off
__title__ = "spacy" __title__ = "spacy"
__version__ = "2.1.4.dev0" __version__ = "2.1.4"
__summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython" __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython"
__uri__ = "https://spacy.io" __uri__ = "https://spacy.io"
__author__ = "Explosion AI" __author__ = "Explosion AI"

View File

@ -11,14 +11,8 @@ def iob2json(input_data, n_sents=10, *args, **kwargs):
""" """
Convert IOB files into JSON format for use with train cli. Convert IOB files into JSON format for use with train cli.
""" """
docs = [] sentences = read_iob(input_data.split("\n"))
for group in minibatch(docs, n_sents): docs = merge_sentences(sentences, n_sents)
group = list(group)
first = group.pop(0)
to_extend = first["paragraphs"][0]["sentences"]
for sent in group[1:]:
to_extend.extend(sent["paragraphs"][0]["sentences"])
docs.append(first)
return docs return docs
@ -27,7 +21,6 @@ def read_iob(raw_sents):
for line in raw_sents: for line in raw_sents:
if not line.strip(): if not line.strip():
continue continue
# tokens = [t.split("|") for t in line.split()]
tokens = [re.split("[^\w\-]", line.strip())] tokens = [re.split("[^\w\-]", line.strip())]
if len(tokens[0]) == 3: if len(tokens[0]) == 3:
words, pos, iob = zip(*tokens) words, pos, iob = zip(*tokens)
@ -49,3 +42,15 @@ def read_iob(raw_sents):
paragraphs = [{"sentences": [sent]} for sent in sentences] paragraphs = [{"sentences": [sent]} for sent in sentences]
docs = [{"id": 0, "paragraphs": [para]} for para in paragraphs] docs = [{"id": 0, "paragraphs": [para]} for para in paragraphs]
return docs return docs
def merge_sentences(docs, n_sents):
    """Merge single-sentence docs into docs holding several sentences each.

    The docs are batched with ``minibatch(docs, size=n_sents)``. Within each
    batch, the sentences of every later doc are appended to the first
    paragraph of the batch's first doc, and only that first doc is kept.

    docs: Iterable of dicts where ``doc["paragraphs"][0]["sentences"]`` is a
        list. The first doc of each batch is extended in place.
    n_sents: Batch size passed to ``minibatch`` (presumably the maximum
        number of docs merged into one output doc -- confirm against
        ``minibatch``).
    RETURNS (list): One merged doc per batch, in batch order.
    """
    merged = []
    for group in minibatch(docs, size=n_sents):
        group = list(group)
        # Take the first doc out of the batch; it becomes the merge target.
        first = group.pop(0)
        to_extend = first["paragraphs"][0]["sentences"]
        # `group` no longer contains `first`, so iterate over all of it.
        # Slicing with [1:] here would silently drop the batch's second doc.
        for doc in group:
            to_extend.extend(doc["paragraphs"][0]["sentences"])
        merged.append(first)
    return merged