From 4896ce33200352248b48590510e47bbe1b5b8e41 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 2 Oct 2017 00:09:14 +0200 Subject: [PATCH 1/6] Remove misleading comment --- spacy/cli/convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index fef6753e6..89615bbe8 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -14,7 +14,7 @@ from ..util import prints CONVERTERS = { '.conllu': conllu2json, '.conll': conllu2json, - '.iob': iob2json + '.iob': iob2json, } From 31681d20e038fb0b318a9479856da473b0e0e926 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 2 Oct 2017 16:50:26 +0200 Subject: [PATCH 2/6] Fix concatenation in iob2json converter --- spacy/cli/converters/iob2json.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/spacy/cli/converters/iob2json.py b/spacy/cli/converters/iob2json.py index 4849345e9..4d456fa57 100644 --- a/spacy/cli/converters/iob2json.py +++ b/spacy/cli/converters/iob2json.py @@ -1,5 +1,6 @@ # coding: utf8 from __future__ import unicode_literals +from cytoolz import partition_all, concat from ...compat import json_dumps, path2str from ...util import prints @@ -10,22 +11,24 @@ def iob2json(input_path, output_path, n_sents=10, *a, **k): """ Convert IOB files into JSON format for use with train cli. """ - # TODO: This isn't complete yet -- need to map from IOB to - # BILUO with input_path.open('r', encoding='utf8') as file_: - docs = read_iob(file_) + if n_sents: + lines = [' '.join(para) for para in partition_all(n_sents, file_)] + else: + lines = file_ + sentences = read_iob(lines) output_filename = input_path.parts[-1].replace(".iob", ".json") output_file = output_path / output_filename with output_file.open('w', encoding='utf-8') as f: - f.write(json_dumps(docs)) - prints("Created %d documents" % len(docs), + f.write(json_dumps(sentences)) + prints("Created %d documents" % len(sentences), title="Generated output file %s" % path2str(output_file)) -def read_iob(file_): +def read_iob(raw_sents): sentences = [] - for line in file_: + for line in raw_sents: if not line.strip(): continue tokens = [t.split('|') for t in line.split()] From f942903429b33b920c18ed7f9c4fe4715733d55f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 2 Oct 2017 17:02:10 +0200 Subject: [PATCH 3/6] Improve sentence merging in iob2json --- spacy/cli/converters/iob2json.py | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/spacy/cli/converters/iob2json.py b/spacy/cli/converters/iob2json.py index 4d456fa57..74bc22ada 100644 --- a/spacy/cli/converters/iob2json.py +++ b/spacy/cli/converters/iob2json.py @@ -12,17 +12,13 @@ def iob2json(input_path, output_path, n_sents=10, *a, **k): Convert IOB files into JSON format for use with train cli. """ with input_path.open('r', encoding='utf8') as file_: - if n_sents: - lines = [' '.join(para) for para in partition_all(n_sents, file_)] - else: - lines = file_ - sentences = read_iob(lines) - + sentences = read_iob(file_) + docs = merge_sentences(sentences, n_sents) output_filename = input_path.parts[-1].replace(".iob", ".json") output_file = output_path / output_filename with output_file.open('w', encoding='utf-8') as f: - f.write(json_dumps(sentences)) - prints("Created %d documents" % len(sentences), + f.write(json_dumps(docs)) + prints("Created %d documents" % len(docs), title="Generated output file %s" % path2str(output_file)) @@ -46,3 +42,15 @@ def read_iob(raw_sents): paragraphs = [{'sentences': [sent]} for sent in sentences] docs = [{'id': 0, 'paragraphs': [para]} for para in paragraphs] return docs + +def merge_sentences(docs, n_sents): + counter = 0 + merged = [] + for group in partition_all(n_sents, docs): + group = list(group) + first = group.pop(0) + to_extend = first['paragraphs'][0]['sentences'] + for sent in group[1:]: + to_extend.extend(sent['paragraphs'][0]['sentences']) + merged.append(first) + return merged From c617d288d8fb9636f9dd077f8393c9e1d2d8626a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 2 Oct 2017 17:20:19 +0200 Subject: [PATCH 4/6] Update pipeline component names in spaCy train --- spacy/cli/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index d973effb6..2096bf0a1 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -69,7 +69,7 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0, prints("Expected dict but got: {}".format(type(meta)), title="Not a valid meta.json format", exits=1) - pipeline = ['tags', 'dependencies', 'entities'] + pipeline = ['tagger', 'parser', 'ner'] if no_tagger and 'tags' in pipeline: pipeline.remove('tags') if no_parser and 'dependencies' in pipeline: pipeline.remove('dependencies') if no_entities and 'entities' in pipeline: pipeline.remove('entities') From 8902df44de0a4b6fb4b4d23a3e3cb1d4088db492 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 2 Oct 2017 21:07:23 +0200 Subject: [PATCH 5/6] Fix component disabling during training --- spacy/cli/train.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 2096bf0a1..651fafb05 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -70,9 +70,9 @@ def train(cmd, lang, output_dir, train_data, dev_data, n_iter=10, n_sents=0, title="Not a valid meta.json format", exits=1) pipeline = ['tagger', 'parser', 'ner'] - if no_tagger and 'tags' in pipeline: pipeline.remove('tags') - if no_parser and 'dependencies' in pipeline: pipeline.remove('dependencies') - if no_entities and 'entities' in pipeline: pipeline.remove('entities') + if no_tagger and 'tagger' in pipeline: pipeline.remove('tagger') + if no_parser and 'parser' in pipeline: pipeline.remove('parser') + if no_entities and 'ner' in pipeline: pipeline.remove('ner') # Take dropout and batch size as generators of values -- dropout # starts high and decays sharply, to force the optimizer to explore. From e49cd7aeaf81ed12490d82b8a65ca93088ec916e Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 3 Oct 2017 15:22:19 +0200 Subject: [PATCH 6/6] Move import into load to avoid circular imports --- spacy/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index 1cb7c0cbd..9acc566ad 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -3,12 +3,12 @@ from __future__ import unicode_literals from .cli.info import info as cli_info from .glossary import explain -from .deprecated import resolve_load_name from .about import __version__ from . import util def load(name, **overrides): + from .deprecated import resolve_load_name name = resolve_load_name(name, **overrides) return util.load_model(name, **overrides)