From 49e2de900e7d59c5691cc0e49c295d2d9957898c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 10 Apr 2017 11:37:04 +0200 Subject: [PATCH 01/88] Add costs property to StepwiseState, to show which moves are gold. --- spacy/syntax/parser.pyx | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 123ae03da..344ac5568 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -334,15 +334,16 @@ cdef class Parser: self.moves.finalize_state(stcls.c) return loss - def step_through(self, Doc doc): + def step_through(self, Doc doc, GoldParse gold=None): """Set up a stepwise state, to introspect and control the transition sequence. Arguments: doc (Doc): The document to step through. + gold (GoldParse): Optional gold parse Returns (StepwiseState): A state object, to step through the annotation process. """ - return StepwiseState(self, doc) + return StepwiseState(self, doc, gold=gold) def from_transition_sequence(self, Doc doc, sequence): """Control the annotations on a document by specifying a transition sequence @@ -367,13 +368,19 @@ cdef class StepwiseState: cdef readonly StateClass stcls cdef readonly Example eg cdef readonly Doc doc + cdef readonly GoldParse gold cdef readonly Parser parser - def __init__(self, Parser parser, Doc doc): + def __init__(self, Parser parser, Doc doc, GoldParse gold=None): self.parser = parser self.doc = doc + if gold: + self.gold = gold + else: + self.gold = GoldParse(doc) self.stcls = StateClass.init(doc.c, doc.length) self.parser.moves.initialize_state(self.stcls.c) + self.parser.moves.preprocess_gold(gold) self.eg = Example( nr_class=self.parser.moves.n_moves, nr_atom=CONTEXT_SIZE, @@ -406,6 +413,20 @@ cdef class StepwiseState: return [self.doc.vocab.strings[self.stcls.c._sent[i].dep] for i in range(self.stcls.c.length)] + @property + def costs(self): + '''Find the action-costs for the current state''' + self.parser.moves.set_costs(self.eg.c.is_valid, self.eg.c.costs, + self.stcls, self.gold) + costs = {} + for i in range(self.parser.moves.n_moves): + if not self.eg.c.is_valid[i]: + continue + transition = self.parser.moves.c[i] + name = self.parser.moves.move_name(transition.move, transition.label) + costs[name] = self.eg.c.costs[i] + return costs + def predict(self): self.eg.reset() self.eg.c.nr_feat = self.parser.model.set_featuresC(self.eg.c.atoms, self.eg.c.features, From ecfbc0b621ed5b2070a8c5716dd548c2e418cafd Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 10 Apr 2017 17:47:11 +0200 Subject: [PATCH 02/88] Update user survey badge --- website/assets/img/graphics.svg | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/website/assets/img/graphics.svg b/website/assets/img/graphics.svg index 38ea3aaec..c24473b4c 100644 --- a/website/assets/img/graphics.svg +++ b/website/assets/img/graphics.svg @@ -2,9 +2,11 @@ spaCy user survey 2017 - - - + + + + + From 1b92c8d5d591bcd248a3012feaa894afbb6f2e92 Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 10 Apr 2017 17:49:49 +0200 Subject: [PATCH 03/88] Use unicode paths on Windows/Python 2 and catch other errors (resolves #970) try/except here is quite dirty, but it'll at least make sure users see an error message that explains what's going on --- spacy/cli/link.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/spacy/cli/link.py b/spacy/cli/link.py index 82d1d9a33..a92d809f5 100644 --- a/spacy/cli/link.py +++ b/spacy/cli/link.py @@ -46,8 +46,18 @@ def 
symlink(model_path, link_name, force): # Add workaround for Python 2 on Windows (see issue #909) if util.is_python2() and util.is_windows(): import subprocess - command = ['mklink', '/d', link_path, model_path] - subprocess.call(command, shell=True) + command = ['mklink', '/d', unicode(link_path), unicode(model_path)] + try: + subprocess.call(command, shell=True) + except: + # This is quite dirty, but just making sure other Windows-specific + # errors are caught so users at least see a proper error message. + util.sys_exit( + "Creating a symlink in spacy/data failed. You can still import " + "the model as a Python package and call its load() method, or " + "create the symlink manually:", + "{a} --> {b}".format(a=unicode(model_path), b=unicode(link_path)), + title="Error: Couldn't link model to '{l}'".format(l=link_name)) else: link_path.symlink_to(model_path) From 328678c7e998389ce78ea9ddc869ffa83a5a0762 Mon Sep 17 00:00:00 2001 From: Sohil Date: Thu, 13 Apr 2017 17:12:28 +0530 Subject: [PATCH 04/88] Extra brace ")" creating error There is an extra closing brace `)` which is creating error while running example. --- website/docs/usage/entity-recognition.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/entity-recognition.jade b/website/docs/usage/entity-recognition.jade index 210b04337..ab8ce22d0 100644 --- a/website/docs/usage/entity-recognition.jade +++ b/website/docs/usage/entity-recognition.jade @@ -57,7 +57,7 @@ p doc.ents = [Span(doc, 0, 1, label=doc.vocab.strings['GPE'])] assert doc[0].ent_type_ == 'GPE' doc.ents = [] - doc.ents = [(u'LondonCity', doc.vocab.strings['GPE']), 0, 1)] + doc.ents = [(u'LondonCity', doc.vocab.strings['GPE'], 0, 1)] p | The value you assign should be a sequence, the values of which From 41037f0f077c033b77b23fd211ee78e3495005c1 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 13 Apr 2017 13:51:54 +0200 Subject: [PATCH 05/88] Remove unused imports --- spacy/cli/convert.py | 4 ++-- spacy/cli/converters/conllu2json.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index d9a08b385..bd6eaba65 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -1,8 +1,8 @@ # coding: utf8 -from __future__ import unicode_literals, division, print_function +from __future__ import unicode_literals import io -from pathlib import Path, PurePosixPath +from pathlib import Path from .converters import conllu2json from .. 
import util diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py index e13b7c81c..c3f21cffe 100644 --- a/spacy/cli/converters/conllu2json.py +++ b/spacy/cli/converters/conllu2json.py @@ -1,5 +1,5 @@ # coding: utf8 -from __future__ import unicode_literals, division, print_function +from __future__ import unicode_literals import json from ...gold import read_json_file, merge_sents From cf558e37c39b9a5d9107b80e8b90ae85562e6841 Mon Sep 17 00:00:00 2001 From: ines Date: Thu, 13 Apr 2017 13:52:09 +0200 Subject: [PATCH 06/88] Update adding languages docs with new commands --- website/docs/usage/adding-languages.jade | 18 ++++++++---------- website/docs/usage/cli.jade | 4 +++- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade index e1631102a..d1541bc87 100644 --- a/website/docs/usage/adding-languages.jade +++ b/website/docs/usage/adding-languages.jade @@ -398,11 +398,12 @@ p | vectors files, you can use the | #[+src(gh("spacy-dev-resources", "training/init.py")) init.py] | script from our - | #[+a(gh("spacy-dev-resources")) developer resources] to create a - | spaCy data directory: + | #[+a(gh("spacy-dev-resources")) developer resources], or use the new + | #[+a("/docs/usage/cli#model") #[code model] command] to create a data + | directory: +code(false, "bash"). - python training/init.py xx your_data_directory/ my_data/word_freqs.txt my_data/clusters.txt my_data/word_vectors.bz2 + python -m spacy model [lang] [model_dir] [freqs_data] [clusters_data] [vectors_data] +aside-code("your_data_directory", "yaml"). ├── vocab/ @@ -421,17 +422,14 @@ p p | This creates a spaCy data directory with a vocabulary model, ready to be - | loaded. By default, the - | #[+src(gh("spacy-dev-resources", "training/init.py")) init.py] - | script expects to be able to find your language class using - | #[code spacy.util.get_lang_class(lang_id)]. You can edit the script to - | help it find your language class if necessary. + | loaded. By default, the command expects to be able to find your language + | class using #[code spacy.util.get_lang_class(lang_id)]. +h(3, "word-frequencies") Word frequencies p - | The #[+src(gh("spacy-dev-resources", "training/init.py")) init.py] - | script expects a tab-separated word frequencies file with three columns: + | The #[+a("/docs/usage/cli#model") #[code model] command] expects a + | tab-separated word frequencies file with three columns: +list("numbers") +item The number of times the word occurred in your language sample. diff --git a/website/docs/usage/cli.jade b/website/docs/usage/cli.jade index 6c57061db..ebd034bb8 100644 --- a/website/docs/usage/cli.jade +++ b/website/docs/usage/cli.jade @@ -145,7 +145,9 @@ p +h(2, "model") Model +tag experimental -p Initialise a new model and its data directory. +p + | Initialise a new model and its data directory. For more info on this, see + | the documentation on #[+a("/docs/usage/adding-languages") adding languages]. +code(false, "bash"). 
python -m spacy model [lang] [model_dir] [freqs_data] [clusters_data] [vectors_data] From a9469c81731eaaf0dfc25721da3ee371711b491f Mon Sep 17 00:00:00 2001 From: Gyorgy Orosz Date: Thu, 13 Apr 2017 15:24:14 +0200 Subject: [PATCH 07/88] Fixed typo --- spacy/cli/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/model.py b/spacy/cli/model.py index 4cfd9a6f6..d697df05b 100644 --- a/spacy/cli/model.py +++ b/spacy/cli/model.py @@ -95,7 +95,7 @@ def read_clusters(clusters_path): return clusters -def populate_vocab(vocab, clusters, probs, oov_probs): +def populate_vocab(vocab, clusters, probs, oov_prob): # Ensure probs has entries for all words seen during clustering. for word in clusters: if word not in probs: From dd3244c08ab6a7816698fdedd9297cef9eac3629 Mon Sep 17 00:00:00 2001 From: Gyorgy Orosz Date: Thu, 13 Apr 2017 23:30:47 +0200 Subject: [PATCH 08/88] Made json dump to produce unicode strings in py2 --- spacy/cli/package.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 6de2fd140..e88f91bf4 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -6,9 +6,15 @@ import shutil import requests from pathlib import Path +import six + from .. import about from .. import util +if six.PY2: + json_dumps = lambda data: json.dumps(data, indent=2).decode("utf8") +elif six.PY3: + json_dumps = lambda data: json.dumps(data, indent=2) def package(input_dir, output_dir, force): input_path = Path(input_dir) @@ -27,7 +33,7 @@ def package(input_dir, output_dir, force): create_dirs(package_path, force) shutil.copytree(input_path.as_posix(), (package_path / model_name_v).as_posix()) - create_file(main_path / 'meta.json', json.dumps(meta, indent=2)) + create_file(main_path / 'meta.json', json_dumps(meta)) create_file(main_path / 'setup.py', template_setup) create_file(main_path / 'MANIFEST.in', template_manifest) create_file(package_path / '__init__.py', template_init) From 84341c2975b82e711fff31ff3a3f062a465beb9e Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 14 Apr 2017 16:48:02 +0200 Subject: [PATCH 09/88] Only compile list of models if data_path exists --- spacy/cli/info.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/cli/info.py b/spacy/cli/info.py index 878f3ed5f..eae0593a4 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -49,5 +49,6 @@ def list_models(): # won't show up in list, but it seems worth it exclude = ['cache', 'pycache', '__pycache__'] data_path = util.get_data_path() - models = [f.parts[-1] for f in data_path.iterdir() if f.is_dir()] - return [m for m in models if m not in exclude] + if data_path: + models = [f.parts[-1] for f in data_path.iterdir() if f.is_dir()] + return [m for m in models if m not in exclude] From 33ba5066ebf200fee69fb29bb3722f2b0cb5b7f2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 14 Apr 2017 23:51:24 +0200 Subject: [PATCH 10/88] Refactor Language.end_training, making new save_to_directory method --- spacy/language.py | 125 +++++++++++++++++++--------------------------- 1 file changed, 50 insertions(+), 75 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 25bfb9e08..43bebd71d 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -209,46 +209,34 @@ class Language(object): lang = None @classmethod - @contextmanager - def train(cls, path, gold_tuples, *configs): - if isinstance(path, basestring): - path = pathlib.Path(path) - tagger_cfg, parser_cfg, entity_cfg = configs - 
dep_model_dir = path / 'deps' - ner_model_dir = path / 'ner' - pos_model_dir = path / 'pos' - if dep_model_dir.exists(): - shutil.rmtree(str(dep_model_dir)) - if ner_model_dir.exists(): - shutil.rmtree(str(ner_model_dir)) - if pos_model_dir.exists(): - shutil.rmtree(str(pos_model_dir)) - dep_model_dir.mkdir() - ner_model_dir.mkdir() - pos_model_dir.mkdir() + def setup_directory(cls, path, **configs): + for name, config in configs.items(): + directory = path / name + if directory.exists(): + shutil.rmtree(str(directory)) + directory.mkdir() + with (directory / 'config.json').open('wb') as file_: + data = ujson.dumps(config, indent=2) + if isinstance(data, unicode): + data = data.encode('utf8') + file_.write(data) + if not (path / 'vocab').exists(): + (path / 'vocab').mkdir() + @classmethod + @contextmanager + def train(cls, path, gold_tuples, **configs): if parser_cfg['pseudoprojective']: # preprocess training data here before ArcEager.get_labels() is called gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples) - parser_cfg['actions'] = ArcEager.get_actions(gold_parses=gold_tuples) - entity_cfg['actions'] = BiluoPushDown.get_actions(gold_parses=gold_tuples) + for subdir in ('deps', 'ner', 'pos'): + if subdir not in configs: + configs[subdir] = {} + configs['deps']['actions'] = ArcEager.get_actions(gold_parses=gold_tuples) + configs['ner']['actions'] = BiluoPushDown.get_actions(gold_parses=gold_tuples) - with (dep_model_dir / 'config.json').open('wb') as file_: - data = ujson.dumps(parser_cfg) - if isinstance(data, unicode): - data = data.encode('utf8') - file_.write(data) - with (ner_model_dir / 'config.json').open('wb') as file_: - data = ujson.dumps(entity_cfg) - if isinstance(data, unicode): - data = data.encode('utf8') - file_.write(data) - with (pos_model_dir / 'config.json').open('wb') as file_: - data = ujson.dumps(tagger_cfg) - if isinstance(data, unicode): - data = data.encode('utf8') - file_.write(data) + cls.setup_directory(path, **configs) self = cls( path=path, @@ -269,7 +257,9 @@ class Language(object): self.entity = self.Defaults.create_entity(self) self.pipeline = self.Defaults.create_pipeline(self) yield Trainer(self, gold_tuples) - self.end_training(path=path) + self.end_training() + self.save_to_directory(path, deps=self.parser.cfg, ner=self.entity.cfg, + pos=self.tagger.cfg) def __init__(self, **overrides): if 'data_dir' in overrides and 'path' not in overrides: @@ -373,51 +363,36 @@ class Language(object): for doc in stream: yield doc - def end_training(self, path=None): - if path is None: - path = self.path - elif isinstance(path, basestring): - path = pathlib.Path(path) - - if self.tagger: - self.tagger.model.end_training() - self.tagger.model.dump(str(path / 'pos' / 'model')) - if self.parser: - self.parser.model.end_training() - self.parser.model.dump(str(path / 'deps' / 'model')) - if self.entity: - self.entity.model.end_training() - self.entity.model.dump(str(path / 'ner' / 'model')) + def save_to_directory(self, path): + configs = { + 'pos': self.tagger.cfg if self.tagger else {}, + 'deps': self.parser.cfg if self.parser else {}, + 'ner': self.entity.cfg if self.entity else {}, + } + self.setup_directory(path, **configs) + strings_loc = path / 'vocab' / 'strings.json' with strings_loc.open('w', encoding='utf8') as file_: self.vocab.strings.dump(file_) self.vocab.dump(path / 'vocab' / 'lexemes.bin') - + # TODO: Word vectors? 
if self.tagger: - tagger_freqs = list(self.tagger.freqs[TAG].items()) - else: - tagger_freqs = [] + self.tagger.model.dump(str(path / 'pos' / 'model')) if self.parser: - dep_freqs = list(self.parser.moves.freqs[DEP].items()) - head_freqs = list(self.parser.moves.freqs[HEAD].items()) - else: - dep_freqs = [] - head_freqs = [] + self.parser.model.dump(str(path / 'deps' / 'model')) if self.entity: - entity_iob_freqs = list(self.entity.moves.freqs[ENT_IOB].items()) - entity_type_freqs = list(self.entity.moves.freqs[ENT_TYPE].items()) - else: - entity_iob_freqs = [] - entity_type_freqs = [] - with (path / 'vocab' / 'serializer.json').open('wb') as file_: - data = ujson.dumps([ - (TAG, tagger_freqs), - (DEP, dep_freqs), - (ENT_IOB, entity_iob_freqs), - (ENT_TYPE, entity_type_freqs), - (HEAD, head_freqs) - ]) - if isinstance(data, unicode): - data = data.encode('utf8') - file_.write(data) + self.entity.model.dump(str(path / 'ner' / 'model')) + + def end_training(self, path=None): + if self.tagger: + self.tagger.model.end_training() + if self.parser: + self.parser.model.end_training() + if self.entity: + self.entity.model.end_training() + # NB: This is slightly different from before --- we no longer default + # to taking nlp.path + if path is not None: + self.save_to_directory(path) + From 354458484c070b2aae7348d257c560edb5aaa91a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 14 Apr 2017 23:52:17 +0200 Subject: [PATCH 11/88] WIP on add_label bug during NER training Currently when a new label is introduced to NER during training, it causes the labels to be read in in an unexpected order. This invalidates the model. --- spacy/pipeline.pyx | 30 ++++++---------------- spacy/syntax/arc_eager.pyx | 23 ++++++++++------- spacy/syntax/ner.pyx | 40 ++++++++++++++++++++++-------- spacy/syntax/parser.pyx | 13 +++++++++- spacy/syntax/transition_system.pyx | 3 ++- 5 files changed, 65 insertions(+), 44 deletions(-) diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index ea8221cff..32cb3a7d7 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -17,17 +17,13 @@ cdef class EntityRecognizer(Parser): feature_templates = get_feature_templates('ner') def add_label(self, label): - for action in self.moves.action_types: - self.moves.add_action(action, label) - if 'actions' in self.cfg: - self.cfg['actions'].setdefault(action, - {}).setdefault(label, True) + Parser.add_label(self, label) if isinstance(label, basestring): label = self.vocab.strings[label] + # Set label into serializer. Super hacky :( for attr, freqs in self.vocab.serializer_freqs: if attr == ENT_TYPE and label not in freqs: freqs.append([label, 1]) - # Super hacky :( self.vocab._serializer = None @@ -36,19 +32,15 @@ cdef class BeamEntityRecognizer(BeamParser): TransitionSystem = BiluoPushDown feature_templates = get_feature_templates('ner') - + def add_label(self, label): - for action in self.moves.action_types: - self.moves.add_action(action, label) - if 'actions' in self.cfg: - self.cfg['actions'].setdefault(action, - {}).setdefault(label, True) + Parser.add_label(self, label) if isinstance(label, basestring): label = self.vocab.strings[label] + # Set label into serializer. 
Super hacky :( for attr, freqs in self.vocab.serializer_freqs: if attr == ENT_TYPE and label not in freqs: freqs.append([label, 1]) - # Super hacky :( self.vocab._serializer = None @@ -58,11 +50,7 @@ cdef class DependencyParser(Parser): feature_templates = get_feature_templates('basic') def add_label(self, label): - for action in self.moves.action_types: - self.moves.add_action(action, label) - if 'actions' in self.cfg: - self.cfg['actions'].setdefault(action, - {}).setdefault(label, True) + Parser.add_label(self, label) if isinstance(label, basestring): label = self.vocab.strings[label] for attr, freqs in self.vocab.serializer_freqs: @@ -78,11 +66,7 @@ cdef class BeamDependencyParser(BeamParser): feature_templates = get_feature_templates('basic') def add_label(self, label): - for action in self.moves.action_types: - self.moves.add_action(action, label) - if 'actions' in self.cfg: - self.cfg['actions'].setdefault(action, - {}).setdefault(label, True) + Parser.add_label(self, label) if isinstance(label, basestring): label = self.vocab.strings[label] for attr, freqs in self.vocab.serializer_freqs: diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 93bc21e22..eac71eaa8 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -317,17 +317,20 @@ cdef class ArcEager(TransitionSystem): def get_actions(cls, **kwargs): actions = kwargs.get('actions', { - SHIFT: {'': True}, - REDUCE: {'': True}, - RIGHT: {}, - LEFT: {}, - BREAK: {'ROOT': True}}) + SHIFT: [''], + REDUCE: [''], + RIGHT: [], + LEFT: [], + BREAK: ['ROOT']}) + seen_actions = set() for label in kwargs.get('left_labels', []): if label.upper() != 'ROOT': - actions[LEFT][label] = True + if (LEFT, label) not in seen_actions: + actions[LEFT].append(label) for label in kwargs.get('right_labels', []): if label.upper() != 'ROOT': - actions[RIGHT][label] = True + if (RIGHT, label) not in seen_actions: + actions[RIGHT].append(label) for raw_text, sents in kwargs.get('gold_parses', []): for (ids, words, tags, heads, labels, iob), ctnts in sents: @@ -336,9 +339,11 @@ cdef class ArcEager(TransitionSystem): label = 'ROOT' if label != 'ROOT': if head < child: - actions[RIGHT][label] = True + if (RIGHT, label) not in seen_actions: + actions[RIGHT].append(label) elif head > child: - actions[LEFT][label] = True + if (LEFT, label) not in seen_actions: + actions[LEFT].append(label) return actions property action_types: diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index 736cc0039..1090f546f 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -21,6 +21,7 @@ cdef enum: LAST UNIT OUT + ISNT N_MOVES @@ -31,6 +32,7 @@ MOVE_NAMES[IN] = 'I' MOVE_NAMES[LAST] = 'L' MOVE_NAMES[UNIT] = 'U' MOVE_NAMES[OUT] = 'O' +MOVE_NAMES[ISNT] = 'x' cdef do_func_t[N_MOVES] do_funcs @@ -54,16 +56,20 @@ cdef class BiluoPushDown(TransitionSystem): def get_actions(cls, **kwargs): actions = kwargs.get('actions', { - MISSING: {'': True}, - BEGIN: {}, - IN: {}, - LAST: {}, - UNIT: {}, - OUT: {'': True} + MISSING: [''], + BEGIN: [], + IN: [], + LAST: [], + UNIT: [], + OUT: [''] }) + seen_entities = set() for entity_type in kwargs.get('entity_types', []): + if entity_type in seen_entities: + continue + seen_entities.add(entity_type) for action in (BEGIN, IN, LAST, UNIT): - actions[action][entity_type] = True + actions[action].append(entity_type) moves = ('M', 'B', 'I', 'L', 'U') for raw_text, sents in kwargs.get('gold_parses', []): for (ids, words, tags, heads, labels, biluo), _ in sents: @@ -72,8 +78,10 @@ cdef class 
BiluoPushDown(TransitionSystem): if ner_tag.count('-') != 1: raise ValueError(ner_tag) _, label = ner_tag.split('-') - for move_str in ('B', 'I', 'L', 'U'): - actions[moves.index(move_str)][label] = True + if label not in seen_entities: + seen_entities.add(label) + for move_str in ('B', 'I', 'L', 'U'): + actions[moves.index(move_str)].append(label) return actions property action_types: @@ -111,11 +119,17 @@ cdef class BiluoPushDown(TransitionSystem): label = 0 elif '-' in name: move_str, label_str = name.split('-', 1) + # Hacky way to denote 'not this entity' + if label_str.startswith('!'): + label_str = label_str[1:] + move_str = 'x' label = self.strings[label_str] else: move_str = name label = 0 move = MOVE_NAMES.index(move_str) + if move == ISNT: + return Transition(clas=0, move=ISNT, label=label, score=0) for i in range(self.n_moves): if self.c[i].move == move and self.c[i].label == label: return self.c[i] @@ -225,6 +239,9 @@ cdef class Begin: elif g_act == BEGIN: # B, Gold B --> Label match return label != g_tag + # Support partial supervision in the form of "not this label" + elif g_act == ISNT: + return label == g_tag else: # B, Gold I --> False (P) # B, Gold L --> False (P) @@ -359,6 +376,9 @@ cdef class Unit: elif g_act == UNIT: # U, Gold U --> True iff tag match return label != g_tag + # Support partial supervision in the form of "not this label" + elif g_act == ISNT: + return label == g_tag else: # U, Gold B --> False # U, Gold I --> False @@ -388,7 +408,7 @@ cdef class Out: cdef int g_act = gold.ner[s.B(0)].move cdef int g_tag = gold.ner[s.B(0)].label - if g_act == MISSING: + if g_act == MISSING or g_act == ISNT: return 0 elif g_act == BEGIN: # O, Gold B --> False diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 344ac5568..969c4ef06 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -52,7 +52,7 @@ from ._parse_features cimport fill_context from .stateclass cimport StateClass from ._state cimport StateC -USE_FTRL = True +USE_FTRL = False DEBUG = False def set_debug(val): global DEBUG @@ -152,6 +152,13 @@ cdef class Parser: # TODO: remove this shim when we don't have to support older data if 'labels' in cfg and 'actions' not in cfg: cfg['actions'] = cfg.pop('labels') + # TODO: remove this shim when we don't have to support older data + for action_name, labels in dict(cfg['actions']).items(): + # We need this to be sorted + if isinstance(labels, dict): + labels = list(sorted(labels.keys())) + cfg['actions'][action_name] = labels + print(cfg['actions']) self = cls(vocab, TransitionSystem=TransitionSystem, model=None, **cfg) if (path / 'model').exists(): self.model.load(str(path / 'model')) @@ -362,6 +369,10 @@ cdef class Parser: # Doesn't set label into serializer -- subclasses override it to do that. for action in self.moves.action_types: self.moves.add_action(action, label) + if 'actions' in self.cfg: + # Important that the labels be stored as a list! 
We need the + # order, or the model goes out of synch + self.cfg['actions'].setdefault(str(action), []).append(label) cdef class StepwiseState: diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 7e5577885..e6a96062b 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -32,7 +32,7 @@ cdef class TransitionSystem: self.c = self.mem.alloc(self._size, sizeof(Transition)) for action, label_strs in sorted(labels_by_action.items()): - for label_str in sorted(label_strs): + for label_str in label_strs: self.add_action(int(action), label_str) self.root_label = self.strings['ROOT'] self.freqs = {} if _freqs is None else _freqs @@ -105,5 +105,6 @@ cdef class TransitionSystem: self.c = self.mem.realloc(self.c, self._size * sizeof(self.c[0])) self.c[self.n_moves] = self.init_transition(self.n_moves, action, label) + print("Add action", action, self.strings[label], self.n_moves) self.n_moves += 1 return 1 From 97b83c74dc89a5ef2e85ba31697a68d35bf1899d Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Fri, 14 Apr 2017 23:54:27 +0200 Subject: [PATCH 12/88] WIP on training example --- examples/training/train_new_entity_type.py | 74 ++++++++++++++++++++++ 1 file changed, 74 insertions(+) create mode 100644 examples/training/train_new_entity_type.py diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py new file mode 100644 index 000000000..4e9b7c8a8 --- /dev/null +++ b/examples/training/train_new_entity_type.py @@ -0,0 +1,74 @@ +from __future__ import unicode_literals, print_function +import json +import pathlib +import random + +import spacy +from spacy.pipeline import EntityRecognizer +from spacy.gold import GoldParse +from spacy.tagger import Tagger + + +try: + unicode +except: + unicode = str + + +def train_ner(nlp, train_data, output_dir): + # Add new words to vocab. 
+ for raw_text, _ in train_data: + doc = nlp.make_doc(raw_text) + for word in doc: + _ = nlp.vocab[word.orth] + + for itn in range(20): + random.shuffle(train_data) + for raw_text, entity_offsets in train_data: + gold = GoldParse(doc, entities=entity_offsets) + doc = nlp.make_doc(raw_text) + nlp.tagger(doc) + loss = nlp.entity.update(doc, gold) + nlp.save_to_directory(output_dir) + #nlp.end_training(output_dir) + + +def main(model_name, output_directory=None): + nlp = spacy.load(model_name) + + train_data = [ + ( + "Horses are too tall and they pretend to care about your feelings", + [(0, 6, 'ANIMAL')], + ), + ( + "horses are too tall and they pretend to care about your feelings", + [(0, 6, 'ANIMAL')] + ), + ( + "horses pretend to care about your feelings", + [(0, 6, 'ANIMAL')] + ), + ( + "they pretend to care about your feelings, those horses", + [(48, 54, 'ANIMAL')] + ) + ] + nlp.entity.add_label('ANIMAL') + if output_directory is not None: + output_directory = pathlib.Path(output_directory) + ner = train_ner(nlp, train_data, output_directory) + + doc = nlp('Do you like horses?') + for ent in doc.ents: + print(ent.label_, ent.text) + nlp2 = spacy.load('en', path=output_directory) + nlp2.entity.add_label('ANIMAL') + doc2 = nlp2('Do you like horses?') + for ent in doc2.ents: + print(ent.label_, ent.text) + + +if __name__ == '__main__': + import plac + plac.call(main) From 561f2a3eb46f9c6553fd50eb6c82c75bd4d46d5e Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 15 Apr 2017 11:59:21 +0200 Subject: [PATCH 13/88] Use consistent formatting for docstrings --- spacy/__main__.py | 12 ++--- spacy/attrs.pyx | 5 +- spacy/cli/converters/conllu2json.py | 3 +- spacy/deprecated.py | 22 +++++---- spacy/gold.pyx | 14 ++++-- spacy/language.py | 10 ++-- spacy/lemmatizer.py | 6 ++- spacy/lexeme.pyx | 17 ++++--- spacy/matcher.pyx | 26 ++++++---- spacy/morphology.pyx | 9 ++-- spacy/pipeline.pyx | 8 +++- spacy/scorer.py | 4 +- spacy/strings.pyx | 29 +++++++---- spacy/tagger.pyx | 19 +++++--- spacy/tokenizer.pyx | 43 ++++++++++------- spacy/train.py | 4 +- spacy/vocab.pyx | 74 ++++++++++++++++++----------- 17 files changed, 192 insertions(+), 113 deletions(-) diff --git a/spacy/__main__.py b/spacy/__main__.py index 7151e3c74..8d511d823 100644 --- a/spacy/__main__.py +++ b/spacy/__main__.py @@ -14,8 +14,9 @@ from spacy.cli import convert as cli_convert class CLI(object): - """Command-line interface for spaCy""" - + """ + Command-line interface for spaCy + """ commands = ('download', 'link', 'info', 'package', 'train', 'model', 'convert') @plac.annotations( @@ -29,7 +30,6 @@ class CLI(object): can be shortcut, model name or, if --direct flag is set, full model name with version. """ - cli_download(model, direct) @@ -44,7 +44,6 @@ class CLI(object): either the name of a pip package, or the local path to the model data directory. Linking models allows loading them via spacy.load(link_name). """ - cli_link(origin, link_name, force) @@ -58,7 +57,6 @@ class CLI(object): speficied as an argument, print model information. Flag --markdown prints details in Markdown for easy copy-pasting to GitHub issues. """ - cli_info(model, markdown) @@ -73,7 +71,6 @@ class CLI(object): installation files. A new directory will be created in the specified output directory, and model data will be copied over. """ - cli_package(input_dir, output_dir, force) @@ -93,7 +90,6 @@ class CLI(object): """ Train a model. Expects data in spaCy's JSON format. 
""" - cli_train(lang, output_dir, train_data, dev_data, n_iter, not no_tagger, not no_parser, not no_ner, parser_L1) @@ -108,7 +104,6 @@ class CLI(object): """ Initialize a new model and its data directory. """ - cli_model(lang, model_dir, freqs_data, clusters_data, vectors_data) @plac.annotations( @@ -122,7 +117,6 @@ class CLI(object): Convert files into JSON format for use with train command and other experiment management functions. """ - cli_convert(input_file, output_dir, n_sents, morphology) diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx index f6b1d71ab..829bcc396 100644 --- a/spacy/attrs.pyx +++ b/spacy/attrs.pyx @@ -92,7 +92,8 @@ NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])] def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False): - '''Normalize a dictionary of attributes, converting them to ints. + """ + Normalize a dictionary of attributes, converting them to ints. Arguments: stringy_attrs (dict): @@ -105,7 +106,7 @@ def intify_attrs(stringy_attrs, strings_map=None, _do_deprecated=False): inty_attrs (dict): Attributes dictionary with keys and optionally values converted to ints. - ''' + """ inty_attrs = {} if _do_deprecated: if 'F' in stringy_attrs: diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py index c3f21cffe..188740d8b 100644 --- a/spacy/cli/converters/conllu2json.py +++ b/spacy/cli/converters/conllu2json.py @@ -7,7 +7,8 @@ from ... import util def conllu2json(input_path, output_path, n_sents=10, use_morphology=False): - """Convert conllu files into JSON format for use with train cli. + """ + Convert conllu files into JSON format for use with train cli. use_morphology parameter enables appending morphology to tags, which is useful for languages such as Spanish, where UD tags are not so rich. """ diff --git a/spacy/deprecated.py b/spacy/deprecated.py index ec528b308..e93c4c420 100644 --- a/spacy/deprecated.py +++ b/spacy/deprecated.py @@ -36,7 +36,8 @@ def align_tokens(ref, indices): # Deprecated, surely? def detokenize(token_rules, words): # Deprecated? - """To align with treebanks, return a list of "chunks", where a chunk is a + """ + To align with treebanks, return a list of "chunks", where a chunk is a sequence of tokens that are separated by whitespace in actual strings. Each chunk should be a tuple of token indices, e.g. @@ -57,10 +58,13 @@ def detokenize(token_rules, words): # Deprecated? return positions -def fix_glove_vectors_loading(overrides): - """Special-case hack for loading the GloVe vectors, to support deprecated - <1.0 stuff. Phase this out once the data is fixed.""" + +def fix_glove_vectors_loading(overrides): + """ + Special-case hack for loading the GloVe vectors, to support deprecated + <1.0 stuff. Phase this out once the data is fixed. + """ if 'data_dir' in overrides and 'path' not in overrides: raise ValueError("The argument 'data_dir' has been renamed to 'path'") if overrides.get('path') is False: @@ -88,13 +92,13 @@ def fix_glove_vectors_loading(overrides): def resolve_model_name(name): - """If spaCy is loaded with 'de', check if symlink already exists. If + """ + If spaCy is loaded with 'de', check if symlink already exists. If not, user have upgraded from older version and have old models installed. Check if old model directory exists and if so, return that instead and create shortcut link. If English model is found and no shortcut exists, raise error and tell user to install new model. 
""" - if name == 'en' or name == 'de': versions = ['1.0.0', '1.1.0'] data_path = Path(util.get_data_path()) @@ -117,9 +121,11 @@ def resolve_model_name(name): class ModelDownload(): - """Replace download modules within en and de with deprecation warning and + """ + Replace download modules within en and de with deprecation warning and download default language model (using shortcut). Use classmethods to allow - importing ModelDownload as download and calling download.en() etc.""" + importing ModelDownload as download and calling download.en() etc. + """ @classmethod def load(self, lang): diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 471018109..d667371fe 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -220,7 +220,8 @@ cdef class GoldParse: def __init__(self, doc, annot_tuples=None, words=None, tags=None, heads=None, deps=None, entities=None, make_projective=False): - """Create a GoldParse. + """ + Create a GoldParse. Arguments: doc (Doc): @@ -310,13 +311,16 @@ cdef class GoldParse: @property def is_projective(self): - """Whether the provided syntactic annotations form a projective dependency - tree.""" + """ + Whether the provided syntactic annotations form a projective dependency + tree. + """ return not nonproj.is_nonproj_tree(self.heads) def biluo_tags_from_offsets(doc, entities): - '''Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out + """ + Encode labelled spans into per-token tags, using the Begin/In/Last/Unit/Out scheme (biluo). Arguments: @@ -347,7 +351,7 @@ def biluo_tags_from_offsets(doc, entities): tags = biluo_tags_from_offsets(doc, entities) assert tags == ['O', 'O', 'U-LOC', 'O'] - ''' + """ starts = {token.idx: token.i for token in doc} ends = {token.idx+len(token): token.i for token in doc} biluo = ['-' for _ in doc] diff --git a/spacy/language.py b/spacy/language.py index 43bebd71d..2d435b728 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -202,9 +202,10 @@ class BaseDefaults(object): class Language(object): - '''A text-processing pipeline. Usually you'll load this once per process, and + """ + A text-processing pipeline. Usually you'll load this once per process, and pass the instance around your program. - ''' + """ Defaults = BaseDefaults lang = None @@ -342,7 +343,8 @@ class Language(object): return doc def pipe(self, texts, tag=True, parse=True, entity=True, n_threads=2, batch_size=1000): - '''Process texts as a stream, and yield Doc objects in order. + """ + Process texts as a stream, and yield Doc objects in order. Supports GIL-free multi-threading. @@ -351,7 +353,7 @@ class Language(object): tag (bool) parse (bool) entity (bool) - ''' + """ skip = {self.tagger: not tag, self.parser: not parse, self.entity: not entity} stream = (self.make_doc(text) for text in texts) for proc in self.pipeline: diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 905c74810..4693e88c3 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -38,8 +38,10 @@ class Lemmatizer(object): return lemmas def is_base_form(self, univ_pos, morphology=None): - '''Check whether we're dealing with an uninflected paradigm, so we can - avoid lemmatization entirely.''' + """ + Check whether we're dealing with an uninflected paradigm, so we can + avoid lemmatization entirely. 
+ """ morphology = {} if morphology is None else morphology others = [key for key in morphology if key not in (POS, 'number', 'pos', 'verbform')] true_morph_key = morphology.get('morph', 0) diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 3a26161bb..202dd5947 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -30,13 +30,15 @@ memset(&EMPTY_LEXEME, 0, sizeof(LexemeC)) cdef class Lexeme: - """An entry in the vocabulary. A Lexeme has no string context --- it's a + """ + An entry in the vocabulary. A Lexeme has no string context --- it's a word-type, as opposed to a word token. It therefore has no part-of-speech tag, dependency parse, or lemma (lemmatization depends on the part-of-speech tag). """ def __init__(self, Vocab vocab, int orth): - """Create a Lexeme object. + """ + Create a Lexeme object. Arguments: vocab (Vocab): The parent vocabulary @@ -80,7 +82,8 @@ cdef class Lexeme: return self.c.orth def set_flag(self, attr_id_t flag_id, bint value): - """Change the value of a boolean flag. + """ + Change the value of a boolean flag. Arguments: flag_id (int): The attribute ID of the flag to set. @@ -89,7 +92,8 @@ cdef class Lexeme: Lexeme.c_set_flag(self.c, flag_id, value) def check_flag(self, attr_id_t flag_id): - """Check the value of a boolean flag. + """ + Check the value of a boolean flag. Arguments: flag_id (int): The attribute ID of the flag to query. @@ -98,7 +102,8 @@ cdef class Lexeme: return True if Lexeme.c_check_flag(self.c, flag_id) else False def similarity(self, other): - '''Compute a semantic similarity estimate. Defaults to cosine over vectors. + """ + Compute a semantic similarity estimate. Defaults to cosine over vectors. Arguments: other: @@ -106,7 +111,7 @@ cdef class Lexeme: Token and Lexeme objects. Returns: score (float): A scalar similarity score. Higher is more similar. - ''' + """ if self.vector_norm == 0 or other.vector_norm == 0: return 0.0 return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index c5e520656..af4b2a51d 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -180,7 +180,8 @@ cdef class Matcher: @classmethod def load(cls, path, vocab): - '''Load the matcher and patterns from a file path. + """ + Load the matcher and patterns from a file path. Arguments: path (Path): @@ -189,7 +190,7 @@ cdef class Matcher: The vocabulary that the documents to match over will refer to. Returns: Matcher: The newly constructed object. - ''' + """ if (path / 'gazetteer.json').exists(): with (path / 'gazetteer.json').open('r', encoding='utf8') as file_: patterns = json.load(file_) @@ -198,7 +199,8 @@ cdef class Matcher: return cls(vocab, patterns) def __init__(self, vocab, patterns={}): - """Create the Matcher. + """ + Create the Matcher. Arguments: vocab (Vocab): @@ -227,7 +229,8 @@ cdef class Matcher: def add_entity(self, entity_key, attrs=None, if_exists='raise', acceptor=None, on_match=None): - """Add an entity to the matcher. + """ + Add an entity to the matcher. Arguments: entity_key (unicode or int): @@ -264,7 +267,8 @@ cdef class Matcher: self._callbacks[entity_key] = on_match def add_pattern(self, entity_key, token_specs, label=""): - """Add a pattern to the matcher. + """ + Add a pattern to the matcher. Arguments: entity_key (unicode or int): @@ -307,7 +311,8 @@ cdef class Matcher: return entity_key def has_entity(self, entity_key): - """Check whether the matcher has an entity. + """ + Check whether the matcher has an entity. 
Arguments: entity_key (string or int): The entity key to check. @@ -318,7 +323,8 @@ cdef class Matcher: return entity_key in self._entities def get_entity(self, entity_key): - """Retrieve the attributes stored for an entity. + """ + Retrieve the attributes stored for an entity. Arguments: entity_key (unicode or int): The entity to retrieve. @@ -332,7 +338,8 @@ cdef class Matcher: return None def __call__(self, Doc doc, acceptor=None): - """Find all token sequences matching the supplied patterns on the Doc. + """ + Find all token sequences matching the supplied patterns on the Doc. Arguments: doc (Doc): @@ -445,7 +452,8 @@ cdef class Matcher: return matches def pipe(self, docs, batch_size=1000, n_threads=2): - """Match a stream of documents, yielding them in turn. + """ + Match a stream of documents, yielding them in turn. Arguments: docs: A stream of documents. diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 372bbb5ce..b440ac818 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -16,7 +16,9 @@ from .attrs import LEMMA, intify_attrs def _normalize_props(props): - '''Transform deprecated string keys to correct names.''' + """ + Transform deprecated string keys to correct names. + """ out = {} for key, value in props.items(): if key == POS: @@ -98,13 +100,14 @@ cdef class Morphology: flags[0] &= ~(one << flag_id) def add_special_case(self, unicode tag_str, unicode orth_str, attrs, force=False): - '''Add a special-case rule to the morphological analyser. Tokens whose + """ + Add a special-case rule to the morphological analyser. Tokens whose tag and orth match the rule will receive the specified properties. Arguments: tag (unicode): The part-of-speech tag to key the exception. orth (unicode): The word-form to key the exception. - ''' + """ tag = self.strings[tag_str] tag_id = self.reverse_index[tag] orth = self.strings[orth_str] diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index 32cb3a7d7..e12dfb690 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -11,7 +11,9 @@ from .attrs import DEP, ENT_TYPE cdef class EntityRecognizer(Parser): - """Annotate named entities on Doc objects.""" + """ + Annotate named entities on Doc objects. + """ TransitionSystem = BiluoPushDown feature_templates = get_feature_templates('ner') @@ -28,7 +30,9 @@ cdef class EntityRecognizer(Parser): cdef class BeamEntityRecognizer(BeamParser): - """Annotate named entities on Doc objects.""" + """ + Annotate named entities on Doc objects. + """ TransitionSystem = BiluoPushDown feature_templates = get_feature_templates('ner') diff --git a/spacy/scorer.py b/spacy/scorer.py index f9265f373..5f899c454 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -6,7 +6,9 @@ from .gold import tags_to_entities class PRFScore(object): - """A precision / recall / F score""" + """ + A precision / recall / F score + """ def __init__(self): self.tp = 0 self.fp = 0 diff --git a/spacy/strings.pyx b/spacy/strings.pyx index 403ebd3c0..5eabdf6c1 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -73,13 +73,16 @@ cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, uint32_t length) ex cdef class StringStore: - '''Map strings to and from integer IDs.''' + """ + Map strings to and from integer IDs. + """ def __init__(self, strings=None, freeze=False): - '''Create the StringStore. + """ + Create the StringStore. Arguments: strings: A sequence of unicode strings to add to the store. 
- ''' + """ self.mem = Pool() self._map = PreshMap() self._oov = PreshMap() @@ -104,7 +107,8 @@ cdef class StringStore: return (StringStore, (list(self),)) def __len__(self): - """The number of strings in the store. + """ + The number of strings in the store. Returns: int The number of strings in the store. @@ -112,8 +116,9 @@ cdef class StringStore: return self.size-1 def __getitem__(self, object string_or_id): - """Retrieve a string from a given integer ID, or vice versa. - + """ + Retrieve a string from a given integer ID, or vice versa. + Arguments: string_or_id (bytes or unicode or int): The value to encode. @@ -159,7 +164,8 @@ cdef class StringStore: return utf8str - self.c def __contains__(self, unicode string not None): - """Check whether a string is in the store. + """ + Check whether a string is in the store. Arguments: string (unicode): The string to check. @@ -172,7 +178,8 @@ cdef class StringStore: return self._map.get(key) is not NULL def __iter__(self): - """Iterate over the strings in the store, in order. + """ + Iterate over the strings in the store, in order. Yields: unicode A string in the store. """ @@ -230,7 +237,8 @@ cdef class StringStore: return &self.c[self.size-1] def dump(self, file_): - """Save the strings to a JSON file. + """ + Save the strings to a JSON file. Arguments: file_ (buffer): The file to save the strings. @@ -244,7 +252,8 @@ cdef class StringStore: file_.write(string_data) def load(self, file_): - """Load the strings from a JSON file. + """ + Load the strings from a JSON file. Arguments: file_ (buffer): The file from which to load the strings. diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 4a2ef082a..8b10ded82 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -106,10 +106,13 @@ cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil: cdef class Tagger: - """Annotate part-of-speech tags on Doc objects.""" + """ + Annotate part-of-speech tags on Doc objects. + """ @classmethod def load(cls, path, vocab, require=False): - """Load the statistical model from the supplied path. + """ + Load the statistical model from the supplied path. Arguments: path (Path): @@ -142,7 +145,8 @@ cdef class Tagger: return self def __init__(self, Vocab vocab, TaggerModel model=None, **cfg): - """Create a Tagger. + """ + Create a Tagger. Arguments: vocab (Vocab): @@ -180,7 +184,8 @@ cdef class Tagger: tokens._py_tokens = [None] * tokens.length def __call__(self, Doc tokens): - """Apply the tagger, setting the POS tags onto the Doc object. + """ + Apply the tagger, setting the POS tags onto the Doc object. Arguments: doc (Doc): The tokens to be tagged. @@ -208,7 +213,8 @@ cdef class Tagger: tokens._py_tokens = [None] * tokens.length def pipe(self, stream, batch_size=1000, n_threads=2): - """Tag a stream of documents. + """ + Tag a stream of documents. Arguments: stream: The sequence of documents to tag. @@ -225,7 +231,8 @@ cdef class Tagger: yield doc def update(self, Doc tokens, GoldParse gold, itn=0): - """Update the statistical model, with tags supplied for the given document. + """ + Update the statistical model, with tags supplied for the given document. 
Arguments: doc (Doc): diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 42f090cde..1d4fc2dce 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -23,12 +23,15 @@ from .tokens.doc cimport Doc cdef class Tokenizer: - """Segment text, and create Doc objects with the discovered segment boundaries.""" + """ + Segment text, and create Doc objects with the discovered segment boundaries. + """ @classmethod def load(cls, path, Vocab vocab, rules=None, prefix_search=None, suffix_search=None, infix_finditer=None, token_match=None): - '''Load a Tokenizer, reading unsupplied components from the path. - + """ + Load a Tokenizer, reading unsupplied components from the path. + Arguments: path (Path): The path to load from. @@ -45,10 +48,10 @@ cdef class Tokenizer: infix_finditer: Signature of re.compile(string).finditer Returns Tokenizer - ''' if isinstance(path, basestring): path = pathlib.Path(path) + """ if rules is None: with (path / 'tokenizer' / 'specials.json').open('r', encoding='utf8') as file_: rules = json.load(file_) @@ -67,8 +70,9 @@ cdef class Tokenizer: return cls(vocab, rules, prefix_search, suffix_search, infix_finditer, token_match) def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer, token_match=None): - '''Create a Tokenizer, to create Doc objects given unicode text. - + """ + Create a Tokenizer, to create Doc objects given unicode text. + Arguments: vocab (Vocab): A storage container for lexical types. @@ -85,7 +89,7 @@ cdef class Tokenizer: to find infixes. token_match: A boolean function matching strings that becomes tokens. - ''' + """ self.mem = Pool() self._cache = PreshMap() self._specials = PreshMap() @@ -117,7 +121,8 @@ cdef class Tokenizer: @cython.boundscheck(False) def __call__(self, unicode string): - """Tokenize a string. + """ + Tokenize a string. Arguments: string (unicode): The string to tokenize. @@ -170,7 +175,8 @@ cdef class Tokenizer: return tokens def pipe(self, texts, batch_size=1000, n_threads=2): - """Tokenize a stream of texts. + """ + Tokenize a stream of texts. Arguments: texts: A sequence of unicode texts. @@ -324,7 +330,8 @@ cdef class Tokenizer: self._cache.set(key, cached) def find_infix(self, unicode string): - """Find internal split points of the string, such as hyphens. + """ + Find internal split points of the string, such as hyphens. string (unicode): The string to segment. @@ -337,7 +344,8 @@ cdef class Tokenizer: return list(self.infix_finditer(string)) def find_prefix(self, unicode string): - """Find the length of a prefix that should be segmented from the string, + """ + Find the length of a prefix that should be segmented from the string, or None if no prefix rules match. Arguments: @@ -350,7 +358,8 @@ cdef class Tokenizer: return (match.end() - match.start()) if match is not None else 0 def find_suffix(self, unicode string): - """Find the length of a suffix that should be segmented from the string, + """ + Find the length of a suffix that should be segmented from the string, or None if no suffix rules match. Arguments: @@ -363,13 +372,15 @@ cdef class Tokenizer: return (match.end() - match.start()) if match is not None else 0 def _load_special_tokenization(self, special_cases): - '''Add special-case tokenization rules. - ''' + """ + Add special-case tokenization rules. + """ for chunk, substrings in sorted(special_cases.items()): self.add_special_case(chunk, substrings) def add_special_case(self, unicode string, substrings): - '''Add a special-case tokenization rule. 
+ """ + Add a special-case tokenization rule. Arguments: string (unicode): The string to specially tokenize. @@ -378,7 +389,7 @@ cdef class Tokenizer: attributes. The ORTH fields of the attributes must exactly match the string when they are concatenated. Returns None - ''' + """ substrings = list(substrings) cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached)) cached.length = len(substrings) diff --git a/spacy/train.py b/spacy/train.py index 18269e0ad..1991c55fe 100644 --- a/spacy/train.py +++ b/spacy/train.py @@ -9,7 +9,9 @@ from .gold import merge_sents class Trainer(object): - '''Manage training of an NLP pipeline.''' + """ + Manage training of an NLP pipeline. + """ def __init__(self, nlp, gold_tuples): self.nlp = nlp self.gold_tuples = gold_tuples diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 55dbe7ba0..56e2d620a 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -48,8 +48,9 @@ EMPTY_LEXEME.vector = EMPTY_VEC cdef class Vocab: - '''A map container for a language's LexemeC structs. - ''' + """ + A map container for a language's LexemeC structs. + """ @classmethod def load(cls, path, lex_attr_getters=None, lemmatizer=True, tag_map=True, serializer_freqs=True, oov_prob=True, **deprecated_kwargs): @@ -108,7 +109,8 @@ cdef class Vocab: def __init__(self, lex_attr_getters=None, tag_map=None, lemmatizer=None, serializer_freqs=None, strings=tuple(), **deprecated_kwargs): - '''Create the vocabulary. + """ + Create the vocabulary. lex_attr_getters (dict): A dictionary mapping attribute IDs to functions to compute them. @@ -123,7 +125,7 @@ cdef class Vocab: Returns: Vocab: The newly constructed vocab object. - ''' + """ util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs) lex_attr_getters = lex_attr_getters if lex_attr_getters is not None else {} @@ -172,17 +174,19 @@ cdef class Vocab: return langfunc('_') if langfunc else '' def __len__(self): - """The current number of lexemes stored.""" + """ + The current number of lexemes stored. + """ return self.length def resize_vectors(self, int new_size): - ''' + """ Set vectors_length to a new size, and allocate more memory for the Lexeme vectors if necessary. The memory will be zeroed. Arguments: new_size (int): The new size of the vectors. - ''' + """ cdef hash_t key cdef size_t addr if new_size > self.vectors_length: @@ -193,7 +197,8 @@ cdef class Vocab: self.vectors_length = new_size def add_flag(self, flag_getter, int flag_id=-1): - '''Set a new boolean flag to words in the vocabulary. + """ + Set a new boolean flag to words in the vocabulary. The flag_setter function will be called over the words currently in the vocab, and then applied to new words as they occur. You'll then be able @@ -213,7 +218,7 @@ cdef class Vocab: Returns: flag_id (int): The integer ID by which the flag value can be checked. - ''' + """ if flag_id == -1: for bit in range(1, 64): if bit not in self.lex_attr_getters: @@ -234,9 +239,11 @@ cdef class Vocab: return flag_id cdef const LexemeC* get(self, Pool mem, unicode string) except NULL: - '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme + """ + Get a pointer to a LexemeC from the lexicon, creating a new Lexeme if necessary, using memory acquired from the given pool. If the pool - is the lexicon's own memory, the lexeme is saved in the lexicon.''' + is the lexicon's own memory, the lexeme is saved in the lexicon. 
+ """ if string == u'': return &EMPTY_LEXEME cdef LexemeC* lex @@ -252,9 +259,11 @@ cdef class Vocab: return self._new_lexeme(mem, string) cdef const LexemeC* get_by_orth(self, Pool mem, attr_t orth) except NULL: - '''Get a pointer to a LexemeC from the lexicon, creating a new Lexeme + """ + Get a pointer to a LexemeC from the lexicon, creating a new Lexeme if necessary, using memory acquired from the given pool. If the pool - is the lexicon's own memory, the lexeme is saved in the lexicon.''' + is the lexicon's own memory, the lexeme is saved in the lexicon. + """ if orth == 0: return &EMPTY_LEXEME cdef LexemeC* lex @@ -297,30 +306,33 @@ cdef class Vocab: self.length += 1 def __contains__(self, unicode string): - '''Check whether the string has an entry in the vocabulary. + """ + Check whether the string has an entry in the vocabulary. Arguments: string (unicode): The ID string. Returns: bool Whether the string has an entry in the vocabulary. - ''' + """ key = hash_string(string) lex = self._by_hash.get(key) return lex is not NULL def __iter__(self): - '''Iterate over the lexemes in the vocabulary. + """ + Iterate over the lexemes in the vocabulary. Yields: Lexeme An entry in the vocabulary. - ''' + """ cdef attr_t orth cdef size_t addr for orth, addr in self._by_orth.items(): yield Lexeme(self, orth) def __getitem__(self, id_or_string): - '''Retrieve a lexeme, given an int ID or a unicode string. If a previously + """ + Retrieve a lexeme, given an int ID or a unicode string. If a previously unseen unicode string is given, a new lexeme is created and stored. Arguments: @@ -332,7 +344,7 @@ cdef class Vocab: Returns: lexeme (Lexeme): The lexeme indicated by the given ID. - ''' + """ cdef attr_t orth if type(id_or_string) == unicode: orth = self.strings[id_or_string] @@ -355,7 +367,8 @@ cdef class Vocab: return tokens def dump(self, loc=None): - """Save the lexemes binary data to the given location, or + """ + Save the lexemes binary data to the given location, or return a byte-string with the data if loc is None. Arguments: @@ -392,14 +405,15 @@ cdef class Vocab: return fp.string_data() def load_lexemes(self, loc): - '''Load the binary vocabulary data from the given location. + """ + Load the binary vocabulary data from the given location. Arguments: loc (Path): The path to load from. Returns: None - ''' + """ fp = CFile(loc, 'rb', on_open_error=lambda: IOError('LexemeCs file not found at %s' % loc)) cdef LexemeC* lexeme = NULL @@ -440,8 +454,9 @@ cdef class Vocab: fp.close() def _deserialize_lexemes(self, CFile fp): - '''Load the binary vocabulary data from the given CFile. - ''' + """ + Load the binary vocabulary data from the given CFile. + """ cdef LexemeC* lexeme = NULL cdef hash_t key cdef unicode py_str @@ -494,13 +509,14 @@ cdef class Vocab: fp.close() def dump_vectors(self, out_loc): - '''Save the word vectors to a binary file. + """ + Save the word vectors to a binary file. Arguments: loc (Path): The path to save to. Returns: None - ''' + """ cdef int32_t vec_len = self.vectors_length cdef int32_t word_len cdef bytes word_str @@ -522,7 +538,8 @@ cdef class Vocab: out_file.close() def load_vectors(self, file_): - """Load vectors from a text-based file. + """ + Load vectors from a text-based file. Arguments: file_ (buffer): The file to read from. Entries should be separated by newlines, @@ -561,7 +578,8 @@ cdef class Vocab: return vec_len def load_vectors_from_bin_loc(self, loc): - """Load vectors from the location of a binary file. 
+ """ + Load vectors from the location of a binary file. Arguments: loc (unicode): The path of the binary file to load from. From d24589aa72f38d98198ee9333eea689195442cab Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 15 Apr 2017 12:05:47 +0200 Subject: [PATCH 14/88] Clean up imports, unused code, whitespace, docstrings --- spacy/__init__.py | 20 +++-------------- spacy/attrs.pyx | 4 ++++ spacy/cfile.pyx | 3 +++ spacy/cli/convert.py | 1 - spacy/cli/converters/conllu2json.py | 1 - spacy/cli/download.py | 1 - spacy/cli/train.py | 3 --- spacy/deprecated.py | 1 + spacy/en/__init__.py | 6 ------ spacy/gold.pyx | 11 +++++----- spacy/language.py | 33 +++++++++-------------------- spacy/lemmatizer.py | 11 +++------- spacy/lexeme.pyx | 8 +++---- spacy/matcher.pyx | 3 ++- spacy/morphology.pyx | 6 +----- spacy/multi_words.py | 8 ------- spacy/orth.pyx | 3 ++- spacy/parts_of_speech.pyx | 1 + spacy/pipeline.pyx | 5 ++++- spacy/scorer.py | 5 ++--- spacy/strings.pyx | 7 +++--- spacy/symbols.pyx | 1 + spacy/tagger.pyx | 5 +++-- spacy/tokenizer.pyx | 8 +++---- spacy/train.py | 7 +++--- spacy/util.py | 18 +++++++--------- spacy/vocab.pyx | 24 +++++++++------------ 27 files changed, 77 insertions(+), 127 deletions(-) delete mode 100644 spacy/multi_words.py diff --git a/spacy/__init__.py b/spacy/__init__.py index d2e0ad92b..bc668121f 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -1,27 +1,13 @@ # coding: utf8 -from __future__ import unicode_literals, print_function +from __future__ import unicode_literals -import json from pathlib import Path + from .util import set_lang_class, get_lang_class, parse_package_meta from .deprecated import resolve_model_name from .cli import info -from . import en -from . import de -from . import zh -from . import es -from . import it -from . import hu -from . import fr -from . import pt -from . import nl -from . import sv -from . import fi -from . import bn -from . import he - -from .about import * +from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he set_lang_class(en.English.lang, en.English) diff --git a/spacy/attrs.pyx b/spacy/attrs.pyx index 829bcc396..49a1e0438 100644 --- a/spacy/attrs.pyx +++ b/spacy/attrs.pyx @@ -1,3 +1,7 @@ +# coding: utf8 +from __future__ import unicode_literals + + IDS = { "": NULL_ATTR, "IS_ALPHA": IS_ALPHA, diff --git a/spacy/cfile.pyx b/spacy/cfile.pyx index d5d4bf353..b83b55498 100644 --- a/spacy/cfile.pyx +++ b/spacy/cfile.pyx @@ -1,3 +1,6 @@ +# coding: utf8 +from __future__ import unicode_literals + from libc.stdio cimport fopen, fclose, fread, fwrite from libc.string cimport memcpy diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index bd6eaba65..f3ebb4c15 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -1,7 +1,6 @@ # coding: utf8 from __future__ import unicode_literals -import io from pathlib import Path from .converters import conllu2json diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py index 188740d8b..3c5ebb0e4 100644 --- a/spacy/cli/converters/conllu2json.py +++ b/spacy/cli/converters/conllu2json.py @@ -2,7 +2,6 @@ from __future__ import unicode_literals import json -from ...gold import read_json_file, merge_sents from ... 
import util diff --git a/spacy/cli/download.py b/spacy/cli/download.py index 56dbd5264..06333eabf 100644 --- a/spacy/cli/download.py +++ b/spacy/cli/download.py @@ -1,7 +1,6 @@ # coding: utf8 from __future__ import unicode_literals -import pip import requests import os import subprocess diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 54338c7a7..489430634 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -5,8 +5,6 @@ import json from pathlib import Path from ..scorer import Scorer -from ..tagger import Tagger -from ..syntax.parser import Parser from ..gold import GoldParse, merge_sents from ..gold import read_json_file as read_gold_json from .. import util @@ -60,7 +58,6 @@ def train_model(Language, train_data, dev_data, output_path, tagger_cfg, parser_ print("Itn.\tN weight\tN feats\tUAS\tNER F.\tTag %\tToken %") with Language.train(output_path, train_data, tagger_cfg, parser_cfg, entity_cfg) as trainer: - loss = 0 for itn, epoch in enumerate(trainer.epochs(n_iter, augment_data=None)): for doc, gold in epoch: trainer.update(doc, gold) diff --git a/spacy/deprecated.py b/spacy/deprecated.py index e93c4c420..069a4f736 100644 --- a/spacy/deprecated.py +++ b/spacy/deprecated.py @@ -1,4 +1,5 @@ from pathlib import Path + from . import about from . import util from .cli import download diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py index 775d42a2b..5d808d4b3 100644 --- a/spacy/en/__init__.py +++ b/spacy/en/__init__.py @@ -11,12 +11,6 @@ from ..deprecated import fix_glove_vectors_loading from .language_data import * -try: - basestring -except NameError: - basestring = str - - class English(Language): lang = 'en' diff --git a/spacy/gold.pyx b/spacy/gold.pyx index d667371fe..8edf68aab 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -1,13 +1,11 @@ # cython: profile=True +# coding: utf8 from __future__ import unicode_literals, print_function import io -import json import re -import os -from os import path - -import ujson as json +import ujson +from pathlib import Path from .syntax import nonproj @@ -303,7 +301,8 @@ cdef class GoldParse: self.heads = proj_heads def __len__(self): - """Get the number of gold-standard tokens. + """ + Get the number of gold-standard tokens. Returns (int): The number of gold-standard tokens. """ diff --git a/spacy/language.py b/spacy/language.py index 2d435b728..9c70fcbf3 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1,9 +1,7 @@ -from __future__ import absolute_import -from __future__ import unicode_literals -import pathlib +# coding: utf8 +from __future__ import absolute_import, unicode_literals from contextlib import contextmanager import shutil - import ujson @@ -21,19 +19,18 @@ from .tokenizer import Tokenizer from .vocab import Vocab from .tagger import Tagger from .matcher import Matcher -from . import attrs -from . import orth -from . import util -from . import language_data from .lemmatizer import Lemmatizer from .train import Trainer - -from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD, PROB, LANG, IS_STOP from .syntax.parser import get_templates from .syntax.nonproj import PseudoProjectivity from .pipeline import DependencyParser, EntityRecognizer from .syntax.arc_eager import ArcEager from .syntax.ner import BiluoPushDown +from .attrs import IS_STOP +from . import attrs +from . import orth +from . import util +from . 
import language_data class BaseDefaults(object): @@ -150,25 +147,15 @@ class BaseDefaults(object): return pipeline token_match = language_data.TOKEN_MATCH - prefixes = tuple(language_data.TOKENIZER_PREFIXES) - suffixes = tuple(language_data.TOKENIZER_SUFFIXES) - infixes = tuple(language_data.TOKENIZER_INFIXES) - tag_map = dict(language_data.TAG_MAP) - tokenizer_exceptions = {} - parser_features = get_templates('parser') - entity_features = get_templates('ner') - tagger_features = Tagger.feature_templates # TODO -- fix this - stop_words = set() - lemma_rules = {} lemma_exc = {} lemma_index = {} @@ -313,7 +300,8 @@ class Language(object): self.pipeline = [self.tagger, self.parser, self.matcher, self.entity] def __call__(self, text, tag=True, parse=True, entity=True): - """Apply the pipeline to some text. The text can span multiple sentences, + """ + Apply the pipeline to some text. The text can span multiple sentences, and can contain arbtrary whitespace. Alignment into the original string is preserved. @@ -373,7 +361,7 @@ class Language(object): } self.setup_directory(path, **configs) - + strings_loc = path / 'vocab' / 'strings.json' with strings_loc.open('w', encoding='utf8') as file_: self.vocab.strings.dump(file_) @@ -397,4 +385,3 @@ class Language(object): # to taking nlp.path if path is not None: self.save_to_directory(path) - diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 4693e88c3..d7541c56b 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -1,13 +1,8 @@ -from __future__ import unicode_literals, print_function -import codecs -import pathlib - -import ujson as json +# coding: utf8 +from __future__ import unicode_literals from .symbols import POS, NOUN, VERB, ADJ, PUNCT -from .symbols import VerbForm_inf, VerbForm_none -from .symbols import Number_sing -from .symbols import Degree_pos +from .symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos class Lemmatizer(object): diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 202dd5947..5d9ce7b98 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -1,4 +1,7 @@ # cython: embedsignature=True +# coding: utf8 +from __future__ import unicode_literals, print_function + from libc.math cimport sqrt from cpython.ref cimport Py_INCREF from cymem.cymem cimport Pool @@ -9,14 +12,11 @@ from cython.view cimport array as cvarray cimport numpy as np np.import_array() - - from libc.string cimport memset +import numpy from .orth cimport word_shape from .typedefs cimport attr_t, flags_t -import numpy - from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP from .attrs cimport IS_BRACKET diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index af4b2a51d..37c34f0ca 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -1,5 +1,6 @@ # cython: profile=True # cython: infer_types=True +# coding: utf8 from __future__ import unicode_literals from .typedefs cimport attr_t @@ -164,7 +165,7 @@ def _convert_strings(token_specs, string_store): def merge_phrase(matcher, doc, i, matches): '''Callback to merge a phrase on match''' ent_id, label, start, end = matches[i] - span = doc[start : end] + span = doc[start : end] span.merge(ent_type=label, ent_id=ent_id) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index b440ac818..02da21f09 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -1,13 +1,9 @@ # cython: infer_types +# coding: utf8 from __future__ import unicode_literals from libc.string cimport 
memset -try: - import ujson as json -except ImportError: - import json - from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT from .attrs cimport POS, IS_SPACE from .parts_of_speech import IDS as POS_IDS diff --git a/spacy/multi_words.py b/spacy/multi_words.py deleted file mode 100644 index 748086d30..000000000 --- a/spacy/multi_words.py +++ /dev/null @@ -1,8 +0,0 @@ -class RegexMerger(object): - def __init__(self, regexes): - self.regexes = regexes - - def __call__(self, tokens): - for tag, entity_type, regex in self.regexes: - for m in regex.finditer(tokens.string): - tokens.merge(m.start(), m.end(), tag, m.group(), entity_type) diff --git a/spacy/orth.pyx b/spacy/orth.pyx index 1a6ffee34..a34058b8e 100644 --- a/spacy/orth.pyx +++ b/spacy/orth.pyx @@ -1,6 +1,7 @@ -# coding: utf8 # cython: infer_types=True +# coding: utf8 from __future__ import unicode_literals + import unicodedata import re diff --git a/spacy/parts_of_speech.pyx b/spacy/parts_of_speech.pyx index a5c770f61..38d5959b6 100644 --- a/spacy/parts_of_speech.pyx +++ b/spacy/parts_of_speech.pyx @@ -1,3 +1,4 @@ +# coding: utf8 from __future__ import unicode_literals diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index e12dfb690..147746a27 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -1,3 +1,6 @@ +# coding: utf8 +from __future__ import unicode_literals + from .syntax.parser cimport Parser from .syntax.beam_parser cimport BeamParser from .syntax.ner cimport BiluoPushDown @@ -36,7 +39,7 @@ cdef class BeamEntityRecognizer(BeamParser): TransitionSystem = BiluoPushDown feature_templates = get_feature_templates('ner') - + def add_label(self, label): Parser.add_label(self, label) if isinstance(label, basestring): diff --git a/spacy/scorer.py b/spacy/scorer.py index 5f899c454..b1ce3faa4 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -1,6 +1,5 @@ -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals +# coding: utf8 +from __future__ import division, print_function, unicode_literals from .gold import tags_to_entities diff --git a/spacy/strings.pyx b/spacy/strings.pyx index 5eabdf6c1..38afd7f02 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -1,12 +1,11 @@ # cython: infer_types=True +# coding: utf8 from __future__ import unicode_literals, absolute_import cimport cython from libc.string cimport memcpy from libc.stdint cimport uint64_t, uint32_t - from murmurhash.mrmr cimport hash64, hash32 - from preshed.maps cimport map_iter, key_t from .typedefs cimport hash_t @@ -154,11 +153,11 @@ cdef class StringStore: raise TypeError(type(string_or_id)) utf8str = self._intern_utf8(byte_string, len(byte_string)) if utf8str is NULL: - # TODO: We need to use 32 bit here, for compatibility with the + # TODO: We need to use 32 bit here, for compatibility with the # vocabulary values. This makes birthday paradox probabilities # pretty bad. # We could also get unlucky here, and hash into a value that - # collides with the 'real' strings. + # collides with the 'real' strings. 
return hash32_utf8(byte_string, len(byte_string)) else: return utf8str - self.c diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx index d9102037a..662aca777 100644 --- a/spacy/symbols.pyx +++ b/spacy/symbols.pyx @@ -1,3 +1,4 @@ +# coding: utf8 from __future__ import unicode_literals IDS = { diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 8b10ded82..a1a4f8886 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -1,5 +1,7 @@ import json -import pathlib +# coding: utf8 +from __future__ import unicode_literals + from collections import defaultdict from cymem.cymem cimport Pool @@ -12,7 +14,6 @@ from thinc.linalg cimport VecVec from .tokens.doc cimport Doc from .attrs cimport TAG from .gold cimport GoldParse - from .attrs cimport * diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 1d4fc2dce..beb450d4f 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -1,7 +1,7 @@ # cython: embedsignature=True +# coding: utf8 from __future__ import unicode_literals -import pathlib from cython.operator cimport dereference as deref from cython.operator cimport preincrement as preinc @@ -111,7 +111,7 @@ cdef class Tokenizer: self.token_match) return (self.__class__, args, None, None) - + cpdef Doc tokens_from_list(self, list strings): return Doc(self.vocab, words=strings) #raise NotImplementedError( @@ -276,7 +276,7 @@ cdef class Tokenizer: cache_hit = self._try_cache(hash_string(string), tokens) if cache_hit: pass - elif self.token_match and self.token_match(string): + elif self.token_match and self.token_match(string): # We're always saying 'no' to spaces here -- the caller will # fix up the outermost one, with reference to the original. # See Issue #859 @@ -377,7 +377,7 @@ cdef class Tokenizer: """ for chunk, substrings in sorted(special_cases.items()): self.add_special_case(chunk, substrings) - + def add_special_case(self, unicode string, substrings): """ Add a special-case tokenization rule. diff --git a/spacy/train.py b/spacy/train.py index 1991c55fe..13d7e1d84 100644 --- a/spacy/train.py +++ b/spacy/train.py @@ -1,11 +1,10 @@ -from __future__ import absolute_import -from __future__ import unicode_literals +# coding: utf8 +from __future__ import absolute_import, unicode_literals import random import tqdm -from .gold import GoldParse +from .gold import GoldParse, merge_sents from .scorer import Scorer -from .gold import merge_sents class Trainer(object): diff --git a/spacy/util.py b/spacy/util.py index 2d9812839..c574cd2c3 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1,11 +1,10 @@ # coding: utf8 from __future__ import unicode_literals, print_function -import os + import io import json import re -import os.path -import pathlib +from pathlib import Path import sys import textwrap @@ -23,7 +22,7 @@ except NameError: # Python 3 LANGUAGES = {} -_data_path = pathlib.Path(__file__).parent / 'data' +_data_path = Path(__file__).parent / 'data' def set_lang_class(name, cls): @@ -163,8 +162,8 @@ def is_python2(): def parse_package_meta(package_path, package, require=True): - location = os.path.join(str(package_path), package, 'meta.json') - if os.path.isfile(location): + location = package_path / package / 'meta.json' + if location.is_file(): with io.open(location, encoding='utf8') as f: meta = json.load(f) return meta @@ -209,10 +208,9 @@ def print_markdown(data, **kwargs): which will be converted to a list of tuples.""" def excl_value(value): - # don't print value if it contains absolute path of directory - # (i.e. 
personal info that shouldn't need to be shared) - # other conditions can be included here if necessary - if str(pathlib.Path(__file__).parent) in value: + # don't print value if it contains absolute path of directory (i.e. + # personal info). Other conditions can be included here if necessary. + if unicode_(Path(__file__).parent) in value: return True if type(data) == dict: diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 56e2d620a..d3eccb804 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -1,10 +1,6 @@ +# coding: utf8 from __future__ import unicode_literals -from libc.string cimport memset -from libc.stdint cimport int32_t -from libc.math cimport sqrt - -from pathlib import Path import bz2 import ujson as json import re @@ -14,28 +10,28 @@ try: except ImportError: import pickle +from libc.string cimport memset +from libc.stdint cimport int32_t +from libc.math cimport sqrt +from cymem.cymem cimport Address from .lexeme cimport EMPTY_LEXEME from .lexeme cimport Lexeme from .strings cimport hash_string from .typedefs cimport attr_t from .cfile cimport CFile, StringCFile -from .lemmatizer import Lemmatizer -from .attrs import intify_attrs from .tokens.token cimport Token - -from . import attrs -from . import symbols - -from cymem.cymem cimport Address from .serialize.packer cimport Packer from .attrs cimport PROB, LANG -from . import util - try: import copy_reg except ImportError: import copyreg as copy_reg +from .lemmatizer import Lemmatizer +from .attrs import intify_attrs +from . import util +from . import attrs +from . import symbols DEF MAX_VEC_SIZE = 100000 From 26445ee3046c185c182bc20280985017ff781898 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 15 Apr 2017 12:07:02 +0200 Subject: [PATCH 15/88] Add compat module for Python2/3 and platform compatibility --- spacy/compat.py | 55 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 spacy/compat.py diff --git a/spacy/compat.py b/spacy/compat.py new file mode 100644 index 000000000..6c2b00bae --- /dev/null +++ b/spacy/compat.py @@ -0,0 +1,55 @@ +# coding: utf8 +from __future__ import unicode_literals + +import six +import sys +import json + +try: + import cPickle as pickle +except ImportError: + import pickle + +try: + import copy_reg +except ImportError: + import copyreg as copy_reg + + +is_python2 = six.PY2 +is_python3 = six.PY3 +is_windows = sys.platform.startswith('win') +is_linux = sys.platform.startswith('linux') +is_osx = sys.platform == 'darwin' + + +if is_python2: + bytes_ = str + unicode_ = unicode + basestring_ = basestring + input_ = raw_input + json_dumps = lambda data: json.dumps(data, indent=2).decode('utf8') + +elif is_python3: + bytes_ = bytes + unicode_ = str + basestring_ = str + input_ = input + json_dumps = lambda data: json.dumps(data, indent=2) + + +def symlink_to(orig, dest): + if is_python3: + orig.symlink_to(dest) + + elif is_python2: + import subprocess + subprocess.call(['mklink', '/d', unicode(orig), unicode(dest)], shell=True) + + +def is_config(python2=None, python3=None, windows=None, linux=None, osx=None): + return ((python2 == None or python2 == is_python2) and + (python3 == None or python3 == is_python3) and + (windows == None or windows == is_windows) and + (linux == None or linux == is_linux) and + (osx == None or osx == is_osx)) From c05ec4b89a3e077d54a7802958f9af2b9777d711 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 15 Apr 2017 12:11:16 +0200 Subject: [PATCH 16/88] Add compat functions and remove old workarounds Add ensure_path util 
function to handle checking instance of path --- spacy/cli/link.py | 29 ++++++++++++----------------- spacy/cli/package.py | 21 +++++++-------------- spacy/deprecated.py | 10 +--------- spacy/language.py | 18 +++--------------- spacy/tagger.pyx | 3 ++- spacy/tokenizer.pyx | 4 +--- spacy/util.py | 37 +++++++++++-------------------------- spacy/vocab.pyx | 13 ++----------- 8 files changed, 39 insertions(+), 96 deletions(-) diff --git a/spacy/cli/link.py b/spacy/cli/link.py index a92d809f5..11d01ac11 100644 --- a/spacy/cli/link.py +++ b/spacy/cli/link.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import pip from pathlib import Path import importlib +from ..compat import unicode_, symlink_to from .. import util @@ -43,23 +44,17 @@ def symlink(model_path, link_name, force): elif link_path.exists(): link_path.unlink() - # Add workaround for Python 2 on Windows (see issue #909) - if util.is_python2() and util.is_windows(): - import subprocess - command = ['mklink', '/d', unicode(link_path), unicode(model_path)] - try: - subprocess.call(command, shell=True) - except: - # This is quite dirty, but just making sure other Windows-specific - # errors are caught so users at least see a proper error message. - util.sys_exit( - "Creating a symlink in spacy/data failed. You can still import " - "the model as a Python package and call its load() method, or " - "create the symlink manually:", - "{a} --> {b}".format(a=unicode(model_path), b=unicode(link_path)), - title="Error: Couldn't link model to '{l}'".format(l=link_name)) - else: - link_path.symlink_to(model_path) + try: + symlink_to(link_path, model_path) + except: + # This is quite dirty, but just making sure other errors are caught so + # users at least see a proper message. + util.sys_exit( + "Creating a symlink in spacy/data failed. You can still import " + "the model as a Python package and call its load() method, or " + "create the symlink manually:", + "{a} --> {b}".format(a=unicode_(model_path), b=unicode_(link_path)), + title="Error: Couldn't link model to '{l}'".format(l=link_name)) util.print_msg( "{a} --> {b}".format(a=model_path.as_posix(), b=link_path.as_posix()), diff --git a/spacy/cli/package.py b/spacy/cli/package.py index e88f91bf4..fb0de2cc5 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -1,20 +1,13 @@ # coding: utf8 from __future__ import unicode_literals -import json import shutil import requests from pathlib import Path -import six - -from .. import about +from ..compat import unicode_, json_dumps from .. 
import util -if six.PY2: - json_dumps = lambda data: json.dumps(data, indent=2).decode("utf8") -elif six.PY3: - json_dumps = lambda data: json.dumps(data, indent=2) def package(input_dir, output_dir, force): input_path = Path(input_dir) @@ -32,31 +25,31 @@ def package(input_dir, output_dir, force): package_path = main_path / model_name create_dirs(package_path, force) - shutil.copytree(input_path.as_posix(), (package_path / model_name_v).as_posix()) + shutil.copytree(unicode_(input_path), unicode_(package_path / model_name_v)) create_file(main_path / 'meta.json', json_dumps(meta)) create_file(main_path / 'setup.py', template_setup) create_file(main_path / 'MANIFEST.in', template_manifest) create_file(package_path / '__init__.py', template_init) util.print_msg( - main_path.as_posix(), + unicode_(main_path), "To build the package, run `python setup.py sdist` in that directory.", title="Successfully created package {p}".format(p=model_name_v)) def check_dirs(input_path, output_path): if not input_path.exists(): - util.sys_exit(input_path.as_poisx(), title="Model directory not found") + util.sys_exit(unicode_(input_path.as_poisx), title="Model directory not found") if not output_path.exists(): - util.sys_exit(output_path.as_posix(), title="Output directory not found") + util.sys_exit(unicode_(output_path), title="Output directory not found") def create_dirs(package_path, force): if package_path.exists(): if force: - shutil.rmtree(package_path.as_posix()) + shutil.rmtree(unicode_(package_path.as_posix)) else: - util.sys_exit(package_path.as_posix(), + util.sys_exit(unicode_(package_path.as_posix), "Please delete the directory and try again.", title="Package directory already exists") Path.mkdir(package_path, parents=True) diff --git a/spacy/deprecated.py b/spacy/deprecated.py index 069a4f736..3a00e7207 100644 --- a/spacy/deprecated.py +++ b/spacy/deprecated.py @@ -6,12 +6,6 @@ from .cli import download from .cli import link -try: - basestring -except NameError: - basestring = str - - def read_lang_data(package): tokenization = package.load_json(('tokenizer', 'specials.json')) with package.open(('tokenizer', 'prefix.txt'), default=None) as file_: @@ -73,9 +67,7 @@ def fix_glove_vectors_loading(overrides): if overrides.get('path') in (None, True): data_path = util.get_data_path() else: - path = overrides['path'] - if isinstance(path, basestring): - path = Path(path) + path = util.ensure_path(overrides['path']) data_path = path.parent vec_path = None if 'add_vectors' not in overrides: diff --git a/spacy/language.py b/spacy/language.py index 9c70fcbf3..4b6c3397d 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -4,17 +4,6 @@ from contextlib import contextmanager import shutil import ujson - -try: - basestring -except NameError: - basestring = str - -try: - unicode -except NameError: - unicode = str - from .tokenizer import Tokenizer from .vocab import Vocab from .tagger import Tagger @@ -26,6 +15,7 @@ from .syntax.nonproj import PseudoProjectivity from .pipeline import DependencyParser, EntityRecognizer from .syntax.arc_eager import ArcEager from .syntax.ner import BiluoPushDown +from .compat import unicode_ from .attrs import IS_STOP from . import attrs from . 
import orth @@ -205,7 +195,7 @@ class Language(object): directory.mkdir() with (directory / 'config.json').open('wb') as file_: data = ujson.dumps(config, indent=2) - if isinstance(data, unicode): + if isinstance(data, unicode_): data = data.encode('utf8') file_.write(data) if not (path / 'vocab').exists(): @@ -252,9 +242,7 @@ class Language(object): def __init__(self, **overrides): if 'data_dir' in overrides and 'path' not in overrides: raise ValueError("The argument 'data_dir' has been renamed to 'path'") - path = overrides.get('path', True) - if isinstance(path, basestring): - path = pathlib.Path(path) + path = util.ensure_path(overrides.get('path', True)) if path is True: path = util.get_data_path() / self.lang if not path.exists() and 'path' not in overrides: diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index a1a4f8886..82c183706 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -15,6 +15,7 @@ from .tokens.doc cimport Doc from .attrs cimport TAG from .gold cimport GoldParse from .attrs cimport * +from . import util cpdef enum: @@ -127,7 +128,7 @@ cdef class Tagger: """ # TODO: Change this to expect config.json when we don't have to # support old data. - path = path if not isinstance(path, basestring) else pathlib.Path(path) + path = util.ensure_path(path) if (path / 'templates.json').exists(): with (path / 'templates.json').open('r', encoding='utf8') as file_: templates = json.load(file_) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index beb450d4f..4312c72e6 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -48,10 +48,8 @@ cdef class Tokenizer: infix_finditer: Signature of re.compile(string).finditer Returns Tokenizer - if isinstance(path, basestring): - path = pathlib.Path(path) - """ + path = util.ensure_path(path) if rules is None: with (path / 'tokenizer' / 'specials.json').open('r', encoding='utf8') as file_: rules = json.load(file_) diff --git a/spacy/util.py b/spacy/util.py index c574cd2c3..dd5937f7b 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -8,17 +8,7 @@ from pathlib import Path import sys import textwrap - -try: - basestring -except NameError: - basestring = str - - -try: - raw_input -except NameError: # Python 3 - raw_input = input +from .compat import basestring_, unicode_, input_ LANGUAGES = {} @@ -46,9 +36,14 @@ def get_data_path(require_exists=True): def set_data_path(path): global _data_path - if isinstance(path, basestring): - path = pathlib.Path(path) - _data_path = path + _data_path = ensure_path(path) + + +def ensure_path(path): + if isinstance(path, basestring_): + return Path(path) + else: + return path def or_(val1, val2): @@ -94,7 +89,7 @@ def constraint_match(constraint_string, version): def read_regex(path): - path = path if not isinstance(path, basestring) else pathlib.Path(path) + path = ensure_path(path) with path.open() as file_: entries = file_.read().split('\n') expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()]) @@ -151,16 +146,6 @@ def check_renamed_kwargs(renamed, kwargs): raise TypeError("Keyword argument %s now renamed to %s" % (old, new)) -def is_windows(): - """Check if user is on Windows.""" - return sys.platform.startswith('win') - - -def is_python2(): - """Check if Python 2 is used.""" - return sys.version.startswith('2.') - - def parse_package_meta(package_path, package, require=True): location = package_path / package / 'meta.json' if location.is_file(): @@ -180,7 +165,7 @@ def get_raw_input(description, default=False): additional = ' (default: {d})'.format(d=default) 
if default else '' prompt = ' {d}{a}: '.format(d=description, a=additional) - user_input = raw_input(prompt) + user_input = input_(prompt) return user_input diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index d3eccb804..4089f65a3 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -5,11 +5,6 @@ import bz2 import ujson as json import re -try: - import cPickle as pickle -except ImportError: - import pickle - from libc.string cimport memset from libc.stdint cimport int32_t from libc.math cimport sqrt @@ -23,10 +18,7 @@ from .tokens.token cimport Token from .serialize.packer cimport Packer from .attrs cimport PROB, LANG -try: - import copy_reg -except ImportError: - import copyreg as copy_reg +from .compat import copy_reg, pickle from .lemmatizer import Lemmatizer from .attrs import intify_attrs from . import util @@ -69,8 +61,7 @@ cdef class Vocab: Returns: Vocab: The newly constructed vocab object. """ - if isinstance(path, basestring): - path = Path(path) + path = util.ensure_path(path) util.check_renamed_kwargs({'get_lex_attr': 'lex_attr_getters'}, deprecated_kwargs) if 'vectors' in deprecated_kwargs: raise AttributeError( From 956dc36785bb827d1b24d94839cfb77148b78457 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 15 Apr 2017 12:12:31 +0200 Subject: [PATCH 17/88] Move functions to deprecated --- spacy/deprecated.py | 21 +++++++++++++++++++-- spacy/util.py | 33 --------------------------------- 2 files changed, 19 insertions(+), 35 deletions(-) diff --git a/spacy/deprecated.py b/spacy/deprecated.py index 3a00e7207..dc7d3e1e4 100644 --- a/spacy/deprecated.py +++ b/spacy/deprecated.py @@ -53,6 +53,23 @@ def detokenize(token_rules, words): # Deprecated? return positions +def match_best_version(target_name, target_version, path): + path = util.ensure_path(path) + if path is None or not path.exists(): + return None + matches = [] + for data_name in path.iterdir(): + name, version = split_data_name(data_name.parts[-1]) + if name == target_name: + matches.append((tuple(float(v) for v in version.split('.')), data_name)) + if matches: + return Path(max(matches)[1]) + else: + return None + + +def split_data_name(name): + return name.split('-', 1) if '-' in name else (name, '') def fix_glove_vectors_loading(overrides): @@ -72,11 +89,11 @@ def fix_glove_vectors_loading(overrides): vec_path = None if 'add_vectors' not in overrides: if 'vectors' in overrides: - vec_path = util.match_best_version(overrides['vectors'], None, data_path) + vec_path = match_best_version(overrides['vectors'], None, data_path) if vec_path is None: return overrides else: - vec_path = util.match_best_version('en_glove_cc_300_1m_vectors', None, data_path) + vec_path = match_best_version('en_glove_cc_300_1m_vectors', None, data_path) if vec_path is not None: vec_path = vec_path / 'vocab' / 'vec.bin' if vec_path is not None: diff --git a/spacy/util.py b/spacy/util.py index dd5937f7b..8229d05cd 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -55,39 +55,6 @@ def or_(val1, val2): return val2 -def match_best_version(target_name, target_version, path): - path = path if not isinstance(path, basestring) else pathlib.Path(path) - if path is None or not path.exists(): - return None - matches = [] - for data_name in path.iterdir(): - name, version = split_data_name(data_name.parts[-1]) - if name == target_name and constraint_match(target_version, version): - matches.append((tuple(float(v) for v in version.split('.')), data_name)) - if matches: - return pathlib.Path(max(matches)[1]) - else: - return None - - -def split_data_name(name): - 
return name.split('-', 1) if '-' in name else (name, '') - - -def constraint_match(constraint_string, version): - # From http://github.com/spacy-io/sputnik - if not constraint_string: - return True - - constraints = [c.strip() for c in constraint_string.split(',') if c.strip()] - - for c in constraints: - if not re.match(r'[><=][=]?\d+(\.\d+)*', c): - raise ValueError('invalid constraint: %s' % c) - - return all(semver.match(version, c) for c in constraints) - - def read_regex(path): path = ensure_path(path) with path.open() as file_: From 958b12dec88538d54ca3e35bb78e1ea960c5e2b2 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 15 Apr 2017 12:13:00 +0200 Subject: [PATCH 18/88] Use pathlib instead of os.path --- spacy/gold.pyx | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 8edf68aab..a33ca69a4 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -139,9 +139,10 @@ def _min_edit_path(cand_words, gold_words): def read_json_file(loc, docs_filter=None): - if path.isdir(loc): - for filename in os.listdir(loc): - yield from read_json_file(path.join(loc, filename)) + loc = Path(loc) + if loc.is_dir(): + for filename in loc.iterdir(): + yield from read_json_file(loc / filename) else: with io.open(loc, 'r', encoding='utf8') as file_: docs = json.load(file_) From e1efd589c334d1820da3f3ee282c7dabb2307884 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 15 Apr 2017 12:13:34 +0200 Subject: [PATCH 19/88] Fix json imports and use ujson --- spacy/gold.pyx | 2 +- spacy/matcher.pyx | 10 +++------- spacy/tagger.pyx | 4 ++-- spacy/tokenizer.pyx | 10 ++-------- spacy/util.py | 4 ++-- spacy/vocab.pyx | 8 ++++---- 6 files changed, 14 insertions(+), 24 deletions(-) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index a33ca69a4..425ad0fe0 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -145,7 +145,7 @@ def read_json_file(loc, docs_filter=None): yield from read_json_file(loc / filename) else: with io.open(loc, 'r', encoding='utf8') as file_: - docs = json.load(file_) + docs = ujson.load(file_) for doc in docs: if docs_filter is not None and not docs_filter(doc): continue diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index 37c34f0ca..c9084c359 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -3,6 +3,8 @@ # coding: utf8 from __future__ import unicode_literals +import ujson + from .typedefs cimport attr_t from .typedefs cimport hash_t from .attrs cimport attr_id_t @@ -53,12 +55,6 @@ from .attrs import FLAG36 as L9_ENT from .attrs import FLAG35 as L10_ENT -try: - import ujson as json -except ImportError: - import json - - cpdef enum quantifier_t: _META ONE @@ -194,7 +190,7 @@ cdef class Matcher: """ if (path / 'gazetteer.json').exists(): with (path / 'gazetteer.json').open('r', encoding='utf8') as file_: - patterns = json.load(file_) + patterns = ujson.load(file_) else: patterns = {} return cls(vocab, patterns) diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 82c183706..59e8a2c66 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -1,7 +1,7 @@ -import json # coding: utf8 from __future__ import unicode_literals +import ujson from collections import defaultdict from cymem.cymem cimport Pool @@ -131,7 +131,7 @@ cdef class Tagger: path = util.ensure_path(path) if (path / 'templates.json').exists(): with (path / 'templates.json').open('r', encoding='utf8') as file_: - templates = json.load(file_) + templates = ujson.load(file_) elif require: raise IOError( "Required file %s/templates.json not found when loading Tagger" % str(path)) diff --git 
a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 4312c72e6..c094bea0d 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -2,16 +2,10 @@ # coding: utf8 from __future__ import unicode_literals +import ujson from cython.operator cimport dereference as deref from cython.operator cimport preincrement as preinc - -try: - import ujson as json -except ImportError: - import json - - from cymem.cymem cimport Pool from preshed.maps cimport PreshMap @@ -52,7 +46,7 @@ cdef class Tokenizer: path = util.ensure_path(path) if rules is None: with (path / 'tokenizer' / 'specials.json').open('r', encoding='utf8') as file_: - rules = json.load(file_) + rules = ujson.load(file_) if prefix_search in (None, True): with (path / 'tokenizer' / 'prefix.txt').open() as file_: entries = file_.read().split('\n') diff --git a/spacy/util.py b/spacy/util.py index 8229d05cd..c6946ce6e 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals, print_function import io -import json +import ujson import re from pathlib import Path import sys @@ -117,7 +117,7 @@ def parse_package_meta(package_path, package, require=True): location = package_path / package / 'meta.json' if location.is_file(): with io.open(location, encoding='utf8') as f: - meta = json.load(f) + meta = ujson.load(f) return meta elif require: raise IOError("Could not read meta.json from %s" % location) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 4089f65a3..4df97ddf0 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -2,7 +2,7 @@ from __future__ import unicode_literals import bz2 -import ujson as json +import ujson import re from libc.string cimport memset @@ -69,7 +69,7 @@ cdef class Vocab: "Install vectors after loading.") if tag_map is True and (path / 'vocab' / 'tag_map.json').exists(): with (path / 'vocab' / 'tag_map.json').open('r', encoding='utf8') as file_: - tag_map = json.load(file_) + tag_map = ujson.load(file_) elif tag_map is True: tag_map = None if lex_attr_getters is not None \ @@ -82,12 +82,12 @@ cdef class Vocab: lemmatizer = Lemmatizer.load(path) if serializer_freqs is True and (path / 'vocab' / 'serializer.json').exists(): with (path / 'vocab' / 'serializer.json').open('r', encoding='utf8') as file_: - serializer_freqs = json.load(file_) + serializer_freqs = ujson.load(file_) else: serializer_freqs = None with (path / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_: - strings_list = json.load(file_) + strings_list = ujson.load(file_) cdef Vocab self = cls(lex_attr_getters=lex_attr_getters, tag_map=tag_map, lemmatizer=lemmatizer, serializer_freqs=serializer_freqs, strings=strings_list) From 35fb4febe2eff62eee91d33b01f48f65f0650b69 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 15 Apr 2017 12:13:45 +0200 Subject: [PATCH 20/88] Fix whitespace --- spacy/cli/info.py | 3 --- spacy/cli/link.py | 1 - 2 files changed, 4 deletions(-) diff --git a/spacy/cli/info.py b/spacy/cli/info.py index eae0593a4..50844257f 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -18,7 +18,6 @@ def info(model=None, markdown=False): else: data['source'] = str(model_path) print_info(data, "model " + model, markdown) - else: data = get_spacy_data() print_info(data, "spaCy", markdown) @@ -26,10 +25,8 @@ def info(model=None, markdown=False): def print_info(data, title, markdown): title = "Info about {title}".format(title=title) - if markdown: util.print_markdown(data, title=title) - else: util.print_table(data, title=title) diff --git a/spacy/cli/link.py b/spacy/cli/link.py index 
11d01ac11..9abb7bfb4 100644 --- a/spacy/cli/link.py +++ b/spacy/cli/link.py @@ -21,7 +21,6 @@ def link_package(package_name, link_name, force=False): # Python's installation and import rules are very complicated. pkg = importlib.import_module(package_name) package_path = Path(pkg.__file__).parent.parent - meta = get_meta(package_path, package_name) model_name = package_name + '-' + meta['version'] model_path = package_path / package_name / model_name From fefe6684cd9e0d9d24a895e48a19f5ab633fe7af Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 15 Apr 2017 12:17:27 +0200 Subject: [PATCH 21/88] Fix symlink function to check for Windows --- spacy/compat.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/spacy/compat.py b/spacy/compat.py index 6c2b00bae..a42646959 100644 --- a/spacy/compat.py +++ b/spacy/compat.py @@ -39,12 +39,11 @@ elif is_python3: def symlink_to(orig, dest): - if is_python3: - orig.symlink_to(dest) - - elif is_python2: + if is_python2 and is_windows: import subprocess subprocess.call(['mklink', '/d', unicode(orig), unicode(dest)], shell=True) + else: + orig.symlink_to(dest) def is_config(python2=None, python3=None, windows=None, linux=None, osx=None): From 0739ae7b76b5fc6342618cf2e4746db5516edf89 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 15 Apr 2017 13:05:15 +0200 Subject: [PATCH 22/88] Tidy up and fix formatting and imports --- spacy/compat.py | 6 +- spacy/deprecated.py | 3 + spacy/syntax/_parse_features.pyx | 8 +-- spacy/syntax/arc_eager.pyx | 23 +++---- spacy/syntax/beam_parser.pyx | 31 ++------- spacy/syntax/iterators.pyx | 11 ++- spacy/syntax/ner.pyx | 15 ++--- spacy/syntax/nonproj.pyx | 5 +- spacy/syntax/parser.pyx | 80 +++++++++++----------- spacy/syntax/stateclass.pyx | 8 ++- spacy/syntax/transition_system.pyx | 5 +- spacy/syntax/util.py | 18 ----- spacy/tokens/doc.pyx | 93 +++++++++++++++----------- spacy/tokens/span.pyx | 72 ++++++++++++-------- spacy/tokens/token.pyx | 103 +++++++++++++++++------------ 15 files changed, 251 insertions(+), 230 deletions(-) delete mode 100644 spacy/syntax/util.py diff --git a/spacy/compat.py b/spacy/compat.py index a42646959..d216994cc 100644 --- a/spacy/compat.py +++ b/spacy/compat.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import six import sys -import json +import ujson try: import cPickle as pickle @@ -28,14 +28,14 @@ if is_python2: unicode_ = unicode basestring_ = basestring input_ = raw_input - json_dumps = lambda data: json.dumps(data, indent=2).decode('utf8') + json_dumps = lambda data: ujson.dumps(data, indent=2).decode('utf8') elif is_python3: bytes_ = bytes unicode_ = str basestring_ = str input_ = input - json_dumps = lambda data: json.dumps(data, indent=2) + json_dumps = lambda data: ujson.dumps(data, indent=2) def symlink_to(orig, dest): diff --git a/spacy/deprecated.py b/spacy/deprecated.py index dc7d3e1e4..f481a2502 100644 --- a/spacy/deprecated.py +++ b/spacy/deprecated.py @@ -1,3 +1,6 @@ +# coding: utf8 +from __future__ import unicode_literals + from pathlib import Path from . import about diff --git a/spacy/syntax/_parse_features.pyx b/spacy/syntax/_parse_features.pyx index 36a78c638..2e0db4877 100644 --- a/spacy/syntax/_parse_features.pyx +++ b/spacy/syntax/_parse_features.pyx @@ -7,17 +7,17 @@ out of "context") is in features/extractor.pyx The atomic feature names are listed in a big enum, so that the feature tuples can refer to them. 
""" -from libc.string cimport memset +# coding: utf-8 +from __future__ import unicode_literals +from libc.string cimport memset from itertools import combinations +from cymem.cymem cimport Pool from ..structs cimport TokenC - from .stateclass cimport StateClass from ._state cimport StateC -from cymem.cymem cimport Pool - cdef inline void fill_token(atom_t* context, const TokenC* token) nogil: if token is NULL: diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index eac71eaa8..974f62558 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -1,29 +1,26 @@ # cython: profile=True # cython: cdivision=True # cython: infer_types=True +# coding: utf-8 from __future__ import unicode_literals + from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF - import ctypes -import os - -from ..structs cimport TokenC +from libc.stdint cimport uint32_t +from libc.string cimport memcpy +from cymem.cymem cimport Pool +from .stateclass cimport StateClass +from ._state cimport StateC, is_space_token +from .nonproj import PseudoProjectivity +from .nonproj import is_nonproj_tree from .transition_system cimport do_func_t, get_cost_func_t from .transition_system cimport move_cost_func_t, label_cost_func_t from ..gold cimport GoldParse from ..gold cimport GoldParseC from ..attrs cimport TAG, HEAD, DEP, ENT_IOB, ENT_TYPE, IS_SPACE from ..lexeme cimport Lexeme - -from libc.stdint cimport uint32_t -from libc.string cimport memcpy - -from cymem.cymem cimport Pool -from .stateclass cimport StateClass -from ._state cimport StateC, is_space_token -from .nonproj import PseudoProjectivity -from .nonproj import is_nonproj_tree +from ..structs cimport TokenC DEF NON_MONOTONIC = True diff --git a/spacy/syntax/beam_parser.pyx b/spacy/syntax/beam_parser.pyx index 8bafda9e3..e96e28fcf 100644 --- a/spacy/syntax/beam_parser.pyx +++ b/spacy/syntax/beam_parser.pyx @@ -1,50 +1,34 @@ +""" +MALT-style dependency parser +""" # cython: profile=True # cython: experimental_cpp_class_def=True # cython: cdivision=True # cython: infer_types=True -""" -MALT-style dependency parser -""" -from __future__ import unicode_literals +# coding: utf-8 + +from __future__ import unicode_literals, print_function cimport cython from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF - from libc.stdint cimport uint32_t, uint64_t from libc.string cimport memset, memcpy from libc.stdlib cimport rand from libc.math cimport log, exp, isnan, isinf -import random -import os.path -from os import path -import shutil -import json -import math - from cymem.cymem cimport Pool, Address from murmurhash.mrmr cimport real_hash64 as hash64 from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t - - -from util import Config - from thinc.linear.features cimport ConjunctionExtracter from thinc.structs cimport FeatureC, ExampleC - -from thinc.extra.search cimport Beam -from thinc.extra.search cimport MaxViolation +from thinc.extra.search cimport Beam, MaxViolation from thinc.extra.eg cimport Example from thinc.extra.mb cimport Minibatch from ..structs cimport TokenC - from ..tokens.doc cimport Doc from ..strings cimport StringStore - from .transition_system cimport TransitionSystem, Transition - from ..gold cimport GoldParse - from . 
import _parse_features from ._parse_features cimport CONTEXT_SIZE from ._parse_features cimport fill_context @@ -266,4 +250,3 @@ def is_gold(StateClass state, GoldParse gold, StringStore strings): id_, word, tag, head, dep, ner = gold.orig_annot[gold.cand_to_gold[i]] truth.add((id_, head, dep)) return truth == predicted - diff --git a/spacy/syntax/iterators.pyx b/spacy/syntax/iterators.pyx index 0dc5f9ad2..c3ba2ca92 100644 --- a/spacy/syntax/iterators.pyx +++ b/spacy/syntax/iterators.pyx @@ -1,9 +1,14 @@ -from spacy.parts_of_speech cimport NOUN, PROPN, PRON +# coding: utf-8 +from __future__ import unicode_literals + +from ..parts_of_speech cimport NOUN, PROPN, PRON def english_noun_chunks(obj): - '''Detect base noun phrases from a dependency parse. - Works on both Doc and Span.''' + """ + Detect base noun phrases from a dependency parse. + Works on both Doc and Span. + """ labels = ['nsubj', 'dobj', 'nsubjpass', 'pcomp', 'pobj', 'attr', 'ROOT', 'root'] doc = obj.doc # Ensure works on both Doc and Span. diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx index 1090f546f..2758c242c 100644 --- a/spacy/syntax/ner.pyx +++ b/spacy/syntax/ner.pyx @@ -1,17 +1,16 @@ +# coding: utf-8 from __future__ import unicode_literals -from .transition_system cimport Transition -from .transition_system cimport do_func_t - -from ..structs cimport TokenC, Entity - from thinc.typedefs cimport weight_t -from ..gold cimport GoldParseC -from ..gold cimport GoldParse -from ..attrs cimport ENT_TYPE, ENT_IOB from .stateclass cimport StateClass from ._state cimport StateC +from .transition_system cimport Transition +from .transition_system cimport do_func_t +from ..structs cimport TokenC, Entity +from ..gold cimport GoldParseC +from ..gold cimport GoldParse +from ..attrs cimport ENT_TYPE, ENT_IOB cdef enum: diff --git a/spacy/syntax/nonproj.pyx b/spacy/syntax/nonproj.pyx index 1f4878247..881d8d480 100644 --- a/spacy/syntax/nonproj.pyx +++ b/spacy/syntax/nonproj.pyx @@ -1,8 +1,9 @@ +# coding: utf-8 from __future__ import unicode_literals from copy import copy from ..tokens.doc cimport Doc -from spacy.attrs import DEP, HEAD +from ..attrs import DEP, HEAD def ancestors(tokenid, heads): @@ -201,5 +202,3 @@ class PseudoProjectivity: filtered_sents.append(((ids,words,tags,heads,filtered_labels,iob), ctnts)) filtered.append((raw_text, filtered_sents)) return filtered - - diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 969c4ef06..79969d1f4 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -1,56 +1,44 @@ -# cython: infer_types=True """ MALT-style dependency parser """ +# coding: utf-8 +# cython: infer_types=True from __future__ import unicode_literals + +from collections import Counter +import ujson + cimport cython cimport cython.parallel from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF from cpython.exc cimport PyErr_CheckSignals - from libc.stdint cimport uint32_t, uint64_t from libc.string cimport memset, memcpy from libc.stdlib cimport malloc, calloc, free - -import os.path -from collections import Counter -from os import path -import shutil -import json -import sys -from .nonproj import PseudoProjectivity - -from cymem.cymem cimport Pool, Address -from murmurhash.mrmr cimport hash64 from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t from thinc.linear.avgtron cimport AveragedPerceptron from thinc.linalg cimport VecVec -from thinc.structs cimport SparseArrayC +from thinc.structs cimport SparseArrayC, FeatureC, ExampleC +from thinc.extra.eg cimport Example 
+from cymem.cymem cimport Pool, Address +from murmurhash.mrmr cimport hash64 from preshed.maps cimport MapStruct from preshed.maps cimport map_get -from thinc.structs cimport FeatureC -from thinc.structs cimport ExampleC -from thinc.extra.eg cimport Example - -from util import Config - -from ..structs cimport TokenC - -from ..tokens.doc cimport Doc -from ..strings cimport StringStore - -from .transition_system import OracleError -from .transition_system cimport TransitionSystem, Transition - -from ..gold cimport GoldParse - from . import _parse_features from ._parse_features cimport CONTEXT_SIZE from ._parse_features cimport fill_context from .stateclass cimport StateClass from ._state cimport StateC +from .nonproj import PseudoProjectivity +from .transition_system import OracleError +from .transition_system cimport TransitionSystem, Transition +from ..structs cimport TokenC +from ..tokens.doc cimport Doc +from ..strings cimport StringStore +from ..gold cimport GoldParse + USE_FTRL = False DEBUG = False @@ -80,7 +68,9 @@ cdef class ParserModel(AveragedPerceptron): return nr_feat def update(self, Example eg, itn=0): - '''Does regression on negative cost. Sort of cute?''' + """ + Does regression on negative cost. Sort of cute? + """ self.time += 1 cdef int best = arg_max_if_gold(eg.c.scores, eg.c.costs, eg.c.nr_class) cdef int guess = eg.guess @@ -132,10 +122,13 @@ cdef class ParserModel(AveragedPerceptron): cdef class Parser: - """Base class of the DependencyParser and EntityRecognizer.""" + """ + Base class of the DependencyParser and EntityRecognizer. + """ @classmethod def load(cls, path, Vocab vocab, TransitionSystem=None, require=False, **cfg): - """Load the statistical model from the supplied path. + """ + Load the statistical model from the supplied path. Arguments: path (Path): @@ -148,7 +141,7 @@ cdef class Parser: The newly constructed object. """ with (path / 'config.json').open() as file_: - cfg = json.load(file_) + cfg = ujson.load(file_) # TODO: remove this shim when we don't have to support older data if 'labels' in cfg and 'actions' not in cfg: cfg['actions'] = cfg.pop('labels') @@ -168,7 +161,8 @@ cdef class Parser: return self def __init__(self, Vocab vocab, TransitionSystem=None, ParserModel model=None, **cfg): - """Create a Parser. + """ + Create a Parser. Arguments: vocab (Vocab): @@ -198,7 +192,8 @@ cdef class Parser: return (Parser, (self.vocab, self.moves, self.model), None, None) def __call__(self, Doc tokens): - """Apply the entity recognizer, setting the annotations onto the Doc object. + """ + Apply the entity recognizer, setting the annotations onto the Doc object. Arguments: doc (Doc): The document to be processed. @@ -215,7 +210,8 @@ cdef class Parser: self.moves.finalize_doc(tokens) def pipe(self, stream, int batch_size=1000, int n_threads=2): - """Process a stream of documents. + """ + Process a stream of documents. Arguments: stream: The sequence of documents to process. @@ -303,7 +299,8 @@ cdef class Parser: return 0 def update(self, Doc tokens, GoldParse gold, itn=0): - """Update the statistical model. + """ + Update the statistical model. Arguments: doc (Doc): @@ -342,7 +339,8 @@ cdef class Parser: return loss def step_through(self, Doc doc, GoldParse gold=None): - """Set up a stepwise state, to introspect and control the transition sequence. + """ + Set up a stepwise state, to introspect and control the transition sequence. Arguments: doc (Doc): The document to step through. 
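# Illustrative sketch (not part of the patch): exercising the stepwise parsing
# API documented in the hunk above. Assumes a spaCy 1.x install with the
# English model loaded; the GoldParse keyword arguments are an assumption made
# for this example.
from spacy.en import English
from spacy.gold import GoldParse

nlp = English()
doc = nlp(u'They ate the pizza')
gold = GoldParse(doc, heads=[1, 1, 3, 1],
                 deps=[u'nsubj', u'ROOT', u'det', u'dobj'])
state = nlp.parser.step_through(doc, gold)
# `costs` maps each currently valid move name to its cost against the gold
# parse; zero-cost moves are the gold-consistent actions.
print(state.costs)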
@@ -426,7 +424,9 @@ cdef class StepwiseState: @property def costs(self): - '''Find the action-costs for the current state''' + """ + Find the action-costs for the current state. + """ self.parser.moves.set_costs(self.eg.c.is_valid, self.eg.c.costs, self.stcls, self.gold) costs = {} diff --git a/spacy/syntax/stateclass.pyx b/spacy/syntax/stateclass.pyx index df485933d..541df2509 100644 --- a/spacy/syntax/stateclass.pyx +++ b/spacy/syntax/stateclass.pyx @@ -1,5 +1,9 @@ +# coding: utf-8 +from __future__ import unicode_literals + from libc.string cimport memcpy, memset from libc.stdint cimport uint32_t + from ..vocab cimport EMPTY_LEXEME from ..structs cimport Entity from ..lexeme cimport Lexeme @@ -28,6 +32,6 @@ cdef class StateClass: top = words[self.S(0)] + '_%d' % self.S_(0).head second = words[self.S(1)] + '_%d' % self.S_(1).head third = words[self.S(2)] + '_%d' % self.S_(2).head - n0 = words[self.B(0)] - n1 = words[self.B(1)] + n0 = words[self.B(0)] + n1 = words[self.B(1)] return ' '.join((third, second, top, '|', n0, n1)) diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index e6a96062b..6b737f543 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -1,4 +1,8 @@ # cython: infer_types=True +# coding: utf-8 +from __future__ import unicode_literals + +from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF from cymem.cymem cimport Pool from thinc.typedefs cimport weight_t from collections import defaultdict @@ -6,7 +10,6 @@ from collections import defaultdict from ..structs cimport TokenC from .stateclass cimport StateClass from ..attrs cimport TAG, HEAD, DEP, ENT_TYPE, ENT_IOB -from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF cdef weight_t MIN_SCORE = -90000 diff --git a/spacy/syntax/util.py b/spacy/syntax/util.py deleted file mode 100644 index 87c1baab0..000000000 --- a/spacy/syntax/util.py +++ /dev/null @@ -1,18 +0,0 @@ -from os import path -import json - -class Config(object): - def __init__(self, **kwargs): - for key, value in kwargs.items(): - setattr(self, key, value) - - def get(self, attr, default=None): - return self.__dict__.get(attr, default) - - @classmethod - def write(cls, model_dir, name, **kwargs): - open(path.join(model_dir, '%s.json' % name), 'w').write(json.dumps(kwargs)) - - @classmethod - def read(cls, model_dir, name): - return cls(**json.load(open(path.join(model_dir, '%s.json' % name)))) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index d59317747..66c09072c 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1,15 +1,18 @@ +# coding: utf8 +from __future__ import unicode_literals + cimport cython +cimport numpy as np +import numpy +import numpy.linalg +import struct + from libc.string cimport memcpy, memset from libc.stdint cimport uint32_t from libc.math cimport sqrt -import numpy -import numpy.linalg -import struct -cimport numpy as np -import six -import warnings - +from .span cimport Span +from .token cimport Token from ..lexeme cimport Lexeme from ..lexeme cimport EMPTY_LEXEME from ..typedefs cimport attr_t, flags_t @@ -19,11 +22,10 @@ from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE from ..parts_of_speech cimport CCONJ, PUNCT, NOUN from ..parts_of_speech cimport univ_pos_t from ..lexeme cimport Lexeme -from .span cimport Span -from .token cimport Token from ..serialize.bits cimport BitArray from ..util import normalize_slice from ..syntax.iterators import CHUNKERS +from ..compat import is_config DEF PADDING = 5 @@ -76,7 +78,7 @@ 
cdef class Doc: """ def __init__(self, Vocab vocab, words=None, spaces=None, orths_and_spaces=None): - ''' + """ Create a Doc object. Aside: Implementation @@ -97,7 +99,7 @@ cdef class Doc: A list of boolean values, of the same length as words. True means that the word is followed by a space, False means it is not. If None, defaults to [True]*len(words) - ''' + """ self.vocab = vocab size = 20 self.mem = Pool() @@ -158,7 +160,7 @@ cdef class Doc: self.is_parsed = True def __getitem__(self, object i): - ''' + """ doc[i] Get the Token object at position i, where i is an integer. Negative indexing is supported, and follows the usual Python @@ -172,7 +174,7 @@ cdef class Doc: are not supported, as `Span` objects must be contiguous (cannot have gaps). You can use negative indices and open-ended ranges, which have their normal Python semantics. - ''' + """ if isinstance(i, slice): start, stop = normalize_slice(len(self), i.start, i.stop, i.step) return Span(self, start, stop, label=0) @@ -186,7 +188,7 @@ cdef class Doc: return Token.cinit(self.vocab, &self.c[i], i, self) def __iter__(self): - ''' + """ for token in doc Iterate over `Token` objects, from which the annotations can be easily accessed. This is the main way of accessing Token @@ -194,7 +196,7 @@ cdef class Doc: Python. If faster-than-Python speeds are required, you can instead access the annotations as a numpy array, or access the underlying C data directly from Cython. - ''' + """ cdef int i for i in range(self.length): if self._py_tokens[i] is not None: @@ -203,10 +205,10 @@ cdef class Doc: yield Token.cinit(self.vocab, &self.c[i], i, self) def __len__(self): - ''' + """ len(doc) The number of tokens in the document. - ''' + """ return self.length def __unicode__(self): @@ -216,7 +218,7 @@ cdef class Doc: return u''.join([t.text_with_ws for t in self]).encode('utf-8') def __str__(self): - if six.PY3: + if is_config(python3=True): return self.__unicode__() return self.__bytes__() @@ -228,7 +230,8 @@ cdef class Doc: return self def similarity(self, other): - '''Make a semantic similarity estimate. The default estimate is cosine + """ + Make a semantic similarity estimate. The default estimate is cosine similarity using an average of word vectors. Arguments: @@ -237,7 +240,7 @@ cdef class Doc: Return: score (float): A scalar similarity score. Higher is more similar. - ''' + """ if 'similarity' in self.user_hooks: return self.user_hooks['similarity'](self, other) if self.vector_norm == 0 or other.vector_norm == 0: @@ -245,9 +248,9 @@ cdef class Doc: return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) property has_vector: - ''' + """ A boolean value indicating whether a word vector is associated with the object. - ''' + """ def __get__(self): if 'has_vector' in self.user_hooks: return self.user_hooks['has_vector'](self) @@ -255,11 +258,11 @@ cdef class Doc: return any(token.has_vector for token in self) property vector: - ''' + """ A real-valued meaning representation. Defaults to an average of the token vectors. Type: numpy.ndarray[ndim=1, dtype='float32'] - ''' + """ def __get__(self): if 'vector' in self.user_hooks: return self.user_hooks['vector'](self) @@ -294,17 +297,21 @@ cdef class Doc: return self.text property text: - '''A unicode representation of the document text.''' + """ + A unicode representation of the document text. 
+ """ def __get__(self): return u''.join(t.text_with_ws for t in self) property text_with_ws: - '''An alias of Doc.text, provided for duck-type compatibility with Span and Token.''' + """ + An alias of Doc.text, provided for duck-type compatibility with Span and Token. + """ def __get__(self): return self.text property ents: - ''' + """ Yields named-entity `Span` objects, if the entity recognizer has been applied to the document. Iterate over the span to get individual Token objects, or access the label: @@ -318,7 +325,7 @@ cdef class Doc: assert ents[0].label_ == 'PERSON' assert ents[0].orth_ == 'Best' assert ents[0].text == 'Mr. Best' - ''' + """ def __get__(self): cdef int i cdef const TokenC* token @@ -382,13 +389,13 @@ cdef class Doc: self.c[start].ent_iob = 3 property noun_chunks: - ''' + """ Yields base noun-phrase #[code Span] objects, if the document has been syntactically parsed. A base noun phrase, or 'NP chunk', is a noun phrase that does not permit other NPs to be nested within it – so no NP-level coordination, no prepositional - phrases, and no relative clauses. For example: - ''' + phrases, and no relative clauses. + """ def __get__(self): if not self.is_parsed: raise ValueError( @@ -496,7 +503,8 @@ cdef class Doc: return output def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None): - """Produce a dict of {attribute (int): count (ints)} frequencies, keyed + """ + Produce a dict of {attribute (int): count (ints)} frequencies, keyed by the values of the given attribute ID. Example: @@ -563,8 +571,9 @@ cdef class Doc: self.c[i] = parsed[i] def from_array(self, attrs, array): - '''Write to a `Doc` object, from an `(M, N)` array of attributes. - ''' + """ + Write to a `Doc` object, from an `(M, N)` array of attributes. + """ cdef int i, col cdef attr_id_t attr_id cdef TokenC* tokens = self.c @@ -603,19 +612,23 @@ cdef class Doc: return self def to_bytes(self): - '''Serialize, producing a byte string.''' + """ + Serialize, producing a byte string. + """ byte_string = self.vocab.serializer.pack(self) cdef uint32_t length = len(byte_string) return struct.pack('I', length) + byte_string def from_bytes(self, data): - '''Deserialize, loading from bytes.''' + """ + Deserialize, loading from bytes. + """ self.vocab.serializer.unpack_into(data[4:], self) return self @staticmethod def read_bytes(file_): - ''' + """ A static method, used to read serialized #[code Doc] objects from a file. For example: @@ -630,7 +643,7 @@ cdef class Doc: for byte_string in Doc.read_bytes(file_): docs.append(Doc(nlp.vocab).from_bytes(byte_string)) assert len(docs) == 2 - ''' + """ keep_reading = True while keep_reading: try: @@ -644,7 +657,8 @@ cdef class Doc: yield n_bytes_str + data def merge(self, int start_idx, int end_idx, *args, **attributes): - """Retokenize the document, such that the span at doc.text[start_idx : end_idx] + """ + Retokenize the document, such that the span at doc.text[start_idx : end_idx] is merged into a single token. If start_idx and end_idx do not mark start and end token boundaries, the document remains unchanged. @@ -658,7 +672,6 @@ cdef class Doc: token (Token): The newly merged token, or None if the start and end indices did not fall at token boundaries. 
- """ cdef unicode tag, lemma, ent_type if len(args) == 3: diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index f43d47876..7e2f62171 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -1,26 +1,31 @@ +# coding: utf8 from __future__ import unicode_literals from collections import defaultdict + +cimport numpy as np import numpy import numpy.linalg -cimport numpy as np from libc.math cimport sqrt -import six +from .doc cimport token_by_start, token_by_end from ..structs cimport TokenC, LexemeC from ..typedefs cimport flags_t, attr_t, hash_t from ..attrs cimport attr_id_t from ..parts_of_speech cimport univ_pos_t from ..util import normalize_slice -from .doc cimport token_by_start, token_by_end from ..attrs cimport IS_PUNCT, IS_SPACE from ..lexeme cimport Lexeme +from ..compat import is_config cdef class Span: - """A slice from a Doc object.""" + """ + A slice from a Doc object. + """ def __cinit__(self, Doc doc, int start, int end, int label=0, vector=None, vector_norm=None): - '''Create a Span object from the slice doc[start : end] + """ + Create a Span object from the slice doc[start : end] Arguments: doc (Doc): The parent document. @@ -30,7 +35,7 @@ cdef class Span: vector (ndarray[ndim=1, dtype='float32']): A meaning representation of the span. Returns: Span The newly constructed object. - ''' + """ if not (0 <= start <= end <= len(doc)): raise IndexError @@ -68,7 +73,7 @@ cdef class Span: return self.end - self.start def __repr__(self): - if six.PY3: + if is_config(python3=True): return self.text return self.text.encode('utf-8') @@ -89,7 +94,8 @@ cdef class Span: yield self.doc[i] def merge(self, *args, **attributes): - """Retokenize the document, such that the span is merged into a single token. + """ + Retokenize the document, such that the span is merged into a single token. Arguments: **attributes: @@ -102,7 +108,8 @@ cdef class Span: return self.doc.merge(self.start_char, self.end_char, *args, **attributes) def similarity(self, other): - '''Make a semantic similarity estimate. The default estimate is cosine + """ + Make a semantic similarity estimate. The default estimate is cosine similarity using an average of word vectors. Arguments: @@ -111,7 +118,7 @@ cdef class Span: Return: score (float): A scalar similarity score. Higher is more similar. - ''' + """ if 'similarity' in self.doc.user_span_hooks: self.doc.user_span_hooks['similarity'](self, other) if self.vector_norm == 0.0 or other.vector_norm == 0.0: @@ -133,11 +140,12 @@ cdef class Span: self.end = end + 1 property sent: - '''The sentence span that this span is a part of. + """ + The sentence span that this span is a part of. Returns: Span The sentence this is part of. - ''' + """ def __get__(self): if 'sent' in self.doc.user_span_hooks: return self.doc.user_span_hooks['sent'](self) @@ -198,13 +206,13 @@ cdef class Span: return u''.join([t.text_with_ws for t in self]) property noun_chunks: - ''' + """ Yields base noun-phrase #[code Span] objects, if the document has been syntactically parsed. A base noun phrase, or 'NP chunk', is a noun phrase that does not permit other NPs to be nested within it – so no NP-level coordination, no prepositional phrases, and no relative clauses. For example: - ''' + """ def __get__(self): if not self.doc.is_parsed: raise ValueError( @@ -223,17 +231,16 @@ cdef class Span: yield span property root: - """The token within the span that's highest in the parse tree. If there's a tie, the earlist is prefered. 
+ """ + The token within the span that's highest in the parse tree. If there's a + tie, the earlist is prefered. Returns: Token: The root token. - i.e. has the - shortest path to the root of the sentence (or is the root itself). - - If multiple words are equally high in the tree, the first word is taken. - - For example: + i.e. has the shortest path to the root of the sentence (or is the root + itself). If multiple words are equally high in the tree, the first word + is taken. For example: >>> toks = nlp(u'I like New York in Autumn.') @@ -303,7 +310,8 @@ cdef class Span: return self.doc[root] property lefts: - """Tokens that are to the left of the span, whose head is within the Span. + """ + Tokens that are to the left of the span, whose head is within the Span. Yields: Token A left-child of a token of the span. """ @@ -314,7 +322,8 @@ cdef class Span: yield left property rights: - """Tokens that are to the right of the Span, whose head is within the Span. + """ + Tokens that are to the right of the Span, whose head is within the Span. Yields: Token A right-child of a token of the span. """ @@ -325,7 +334,8 @@ cdef class Span: yield right property subtree: - """Tokens that descend from tokens in the span, but fall outside it. + """ + Tokens that descend from tokens in the span, but fall outside it. Yields: Token A descendant of a token within the span. """ @@ -337,7 +347,9 @@ cdef class Span: yield from word.subtree property ent_id: - '''An (integer) entity ID. Usually assigned by patterns in the Matcher.''' + """ + An (integer) entity ID. Usually assigned by patterns in the Matcher. + """ def __get__(self): return self.root.ent_id @@ -345,9 +357,11 @@ cdef class Span: # TODO raise NotImplementedError( "Can't yet set ent_id from Span. Vote for this feature on the issue " - "tracker: http://github.com/spacy-io/spaCy") + "tracker: http://github.com/explosion/spaCy/issues") property ent_id_: - '''A (string) entity ID. Usually assigned by patterns in the Matcher.''' + """ + A (string) entity ID. Usually assigned by patterns in the Matcher. + """ def __get__(self): return self.root.ent_id_ @@ -355,7 +369,7 @@ cdef class Span: # TODO raise NotImplementedError( "Can't yet set ent_id_ from Span. Vote for this feature on the issue " - "tracker: http://github.com/spacy-io/spaCy") + "tracker: http://github.com/explosion/spaCy/issues") property orth_: def __get__(self): @@ -397,5 +411,5 @@ cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1: raise RuntimeError( "Array bounds exceeded while searching for root word. This likely " "means the parse tree is in an invalid state. Please report this " - "issue here: http://github.com/honnibal/spaCy/") + "issue here: http://github.com/explosion/spaCy/issues") return n diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index a89b35eee..94491614c 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -1,5 +1,5 @@ -# coding: utf8 # cython: infer_types=True +# coding: utf8 from __future__ import unicode_literals from libc.string cimport memcpy @@ -8,20 +8,15 @@ from cpython.mem cimport PyMem_Malloc, PyMem_Free from cython.view cimport array as cvarray cimport numpy as np np.import_array() - import numpy -import six - from ..typedefs cimport hash_t from ..lexeme cimport Lexeme from .. 
import parts_of_speech - from ..attrs cimport LEMMA from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER from ..attrs cimport POS, LEMMA, TAG, DEP from ..parts_of_speech cimport CCONJ, PUNCT - from ..attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE from ..attrs cimport IS_BRACKET from ..attrs cimport IS_QUOTE @@ -29,12 +24,13 @@ from ..attrs cimport IS_LEFT_PUNCT from ..attrs cimport IS_RIGHT_PUNCT from ..attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP from ..attrs cimport IS_OOV - from ..lexeme cimport Lexeme +from ..compat import is_config cdef class Token: - """An individual token --- i.e. a word, punctuation symbol, whitespace, etc. + """ + An individual token --- i.e. a word, punctuation symbol, whitespace, etc. """ def __cinit__(self, Vocab vocab, Doc doc, int offset): self.vocab = vocab @@ -46,7 +42,9 @@ cdef class Token: return hash((self.doc, self.i)) def __len__(self): - '''Number of unicode characters in token.text''' + """ + Number of unicode characters in token.text. + """ return self.c.lex.length def __unicode__(self): @@ -56,7 +54,7 @@ cdef class Token: return self.text.encode('utf8') def __str__(self): - if six.PY3: + if is_config(python3=True): return self.__unicode__() return self.__bytes__() @@ -83,27 +81,30 @@ cdef class Token: raise ValueError(op) cpdef bint check_flag(self, attr_id_t flag_id) except -1: - '''Check the value of a boolean flag. + """ + Check the value of a boolean flag. Arguments: flag_id (int): The ID of the flag attribute. Returns: is_set (bool): Whether the flag is set. - ''' + """ return Lexeme.c_check_flag(self.c.lex, flag_id) def nbor(self, int i=1): - '''Get a neighboring token. + """ + Get a neighboring token. Arguments: i (int): The relative position of the token to get. Defaults to 1. Returns: neighbor (Token): The token at position self.doc[self.i+i] - ''' + """ return self.doc[self.i+i] def similarity(self, other): - '''Compute a semantic similarity estimate. Defaults to cosine over vectors. + """ + Compute a semantic similarity estimate. Defaults to cosine over vectors. Arguments: other: @@ -111,7 +112,7 @@ cdef class Token: Token and Lexeme objects. Returns: score (float): A scalar similarity score. Higher is more similar. - ''' + """ if 'similarity' in self.doc.user_token_hooks: return self.doc.user_token_hooks['similarity'](self) if self.vector_norm == 0 or other.vector_norm == 0: @@ -209,9 +210,9 @@ cdef class Token: self.c.dep = label property has_vector: - ''' + """ A boolean value indicating whether a word vector is associated with the object. - ''' + """ def __get__(self): if 'has_vector' in self.doc.user_token_hooks: return self.doc.user_token_hooks['has_vector'](self) @@ -223,11 +224,11 @@ cdef class Token: return False property vector: - ''' + """ A real-valued meaning representation. Type: numpy.ndarray[ndim=1, dtype='float32'] - ''' + """ def __get__(self): if 'vector' in self.doc.user_token_hooks: return self.doc.user_token_hooks['vector'](self) @@ -245,6 +246,7 @@ cdef class Token: property repvec: def __get__(self): raise AttributeError("repvec was renamed to vector in v0.100") + property has_repvec: def __get__(self): raise AttributeError("has_repvec was renamed to has_vector in v0.100") @@ -265,7 +267,8 @@ cdef class Token: property lefts: def __get__(self): - """The leftward immediate children of the word, in the syntactic + """ + The leftward immediate children of the word, in the syntactic dependency parse. 
""" cdef int nr_iter = 0 @@ -282,8 +285,10 @@ cdef class Token: property rights: def __get__(self): - """The rightward immediate children of the word, in the syntactic - dependency parse.""" + """ + The rightward immediate children of the word, in the syntactic + dependency parse. + """ cdef const TokenC* ptr = self.c + (self.c.r_edge - self.i) tokens = [] cdef int nr_iter = 0 @@ -300,19 +305,21 @@ cdef class Token: yield t property children: - '''A sequence of the token's immediate syntactic children. + """ + A sequence of the token's immediate syntactic children. Yields: Token A child token such that child.head==self - ''' + """ def __get__(self): yield from self.lefts yield from self.rights property subtree: - '''A sequence of all the token's syntactic descendents. + """ + A sequence of all the token's syntactic descendents. Yields: Token A descendent token such that self.is_ancestor(descendent) - ''' + """ def __get__(self): for word in self.lefts: yield from word.subtree @@ -321,26 +328,29 @@ cdef class Token: yield from word.subtree property left_edge: - '''The leftmost token of this token's syntactic descendents. + """ + The leftmost token of this token's syntactic descendents. Returns: Token The first token such that self.is_ancestor(token) - ''' + """ def __get__(self): return self.doc[self.c.l_edge] property right_edge: - '''The rightmost token of this token's syntactic descendents. + """ + The rightmost token of this token's syntactic descendents. Returns: Token The last token such that self.is_ancestor(token) - ''' + """ def __get__(self): return self.doc[self.c.r_edge] property ancestors: - '''A sequence of this token's syntactic ancestors. + """ + A sequence of this token's syntactic ancestors. Yields: Token A sequence of ancestor tokens such that ancestor.is_ancestor(self) - ''' + """ def __get__(self): cdef const TokenC* head_ptr = self.c # guard against infinite loop, no token can have @@ -356,25 +366,29 @@ cdef class Token: return self.is_ancestor(descendant) def is_ancestor(self, descendant): - '''Check whether this token is a parent, grandparent, etc. of another + """ + Check whether this token is a parent, grandparent, etc. of another in the dependency tree. Arguments: descendant (Token): Another token. Returns: is_ancestor (bool): Whether this token is the ancestor of the descendant. - ''' + """ if self.doc is not descendant.doc: return False return any( ancestor.i == self.i for ancestor in descendant.ancestors ) property head: - '''The syntactic parent, or "governor", of this token. + """ + The syntactic parent, or "governor", of this token. Returns: Token - ''' + """ def __get__(self): - """The token predicted by the parser to be the head of the current token.""" + """ + The token predicted by the parser to be the head of the current token. + """ return self.doc[self.i + self.c.head] def __set__(self, Token new_head): # this function sets the head of self to new_head @@ -467,10 +481,11 @@ cdef class Token: self.c.head = rel_newhead_i property conjuncts: - '''A sequence of coordinated tokens, including the token itself. + """ + A sequence of coordinated tokens, including the token itself. Yields: Token A coordinated token - ''' + """ def __get__(self): """Get a list of conjoined words.""" cdef Token word @@ -501,7 +516,9 @@ cdef class Token: return iob_strings[self.c.ent_iob] property ent_id: - '''An (integer) entity ID. Usually assigned by patterns in the Matcher.''' + """ + An (integer) entity ID. Usually assigned by patterns in the Matcher. 
+ """ def __get__(self): return self.c.ent_id @@ -509,7 +526,9 @@ cdef class Token: self.c.ent_id = key property ent_id_: - '''A (string) entity ID. Usually assigned by patterns in the Matcher.''' + """ + A (string) entity ID. Usually assigned by patterns in the Matcher. + """ def __get__(self): return self.vocab.strings[self.c.ent_id] From 1a98e48b8e34d8a03cf8c91603a0a8c13e9e3381 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 15 Apr 2017 13:35:01 +0200 Subject: [PATCH 23/88] Fix Stepwisestate' --- spacy/syntax/parser.pyx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 79969d1f4..b3281fe14 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -383,7 +383,7 @@ cdef class StepwiseState: def __init__(self, Parser parser, Doc doc, GoldParse gold=None): self.parser = parser self.doc = doc - if gold: + if gold is not None: self.gold = gold else: self.gold = GoldParse(doc) @@ -427,6 +427,8 @@ cdef class StepwiseState: """ Find the action-costs for the current state. """ + if not self.gold: + raise ValueError("Can't set costs: No GoldParse provided") self.parser.moves.set_costs(self.eg.c.is_valid, self.eg.c.costs, self.stcls, self.gold) costs = {} From e6ee7e130f1d7487ea3aa08b552a2a4eec7696cc Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 15 Apr 2017 13:38:53 +0200 Subject: [PATCH 24/88] Fix parse package meta --- spacy/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/util.py b/spacy/util.py index c6946ce6e..219009f17 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -116,7 +116,7 @@ def check_renamed_kwargs(renamed, kwargs): def parse_package_meta(package_path, package, require=True): location = package_path / package / 'meta.json' if location.is_file(): - with io.open(location, encoding='utf8') as f: + with location.open('r', encoding='utf8') as f: meta = ujson.load(f) return meta elif require: From a7626bd7fd2c5bd1d445e75ad36575fbf27079c0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 15 Apr 2017 15:43:14 +0200 Subject: [PATCH 25/88] Tmp commit to example --- examples/train_ner_standalone.py | 78 ++++++++++++++++++++++++-------- 1 file changed, 60 insertions(+), 18 deletions(-) diff --git a/examples/train_ner_standalone.py b/examples/train_ner_standalone.py index 612e7bec3..abc6a0152 100644 --- a/examples/train_ner_standalone.py +++ b/examples/train_ner_standalone.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python '''Example of training a named entity recognition system from scratch using spaCy This example is written to be self-contained and reasonably transparent. 
@@ -31,6 +32,8 @@ from spacy.gold import GoldParse from spacy.gold import _iob_to_biluo as iob_to_biluo from spacy.scorer import Scorer +from deepsense import neptune + try: unicode except NameError: @@ -81,7 +84,7 @@ def load_vocab(path): def init_ner_model(vocab, features=None): if features is None: features = tuple(EntityRecognizer.feature_templates) - return BeamEntityRecognizer(vocab, features=features) + return EntityRecognizer(vocab, features=features) def save_ner_model(model, path): @@ -99,7 +102,7 @@ def save_ner_model(model, path): def load_ner_model(vocab, path): - return BeamEntityRecognizer.load(path, vocab) + return EntityRecognizer.load(path, vocab) class Pipeline(object): @@ -110,18 +113,21 @@ class Pipeline(object): raise IOError("Cannot load pipeline from %s\nDoes not exist" % path) if not path.is_dir(): raise IOError("Cannot load pipeline from %s\nNot a directory" % path) - vocab = load_vocab(path / 'vocab') + vocab = load_vocab(path) tokenizer = Tokenizer(vocab, {}, None, None, None) ner_model = load_ner_model(vocab, path / 'ner') return cls(vocab, tokenizer, ner_model) - def __init__(self, vocab=None, tokenizer=None, ner_model=None): + def __init__(self, vocab=None, tokenizer=None, entity=None): if vocab is None: - self.vocab = init_vocab() + vocab = init_vocab() if tokenizer is None: tokenizer = Tokenizer(vocab, {}, None, None, None) - if ner_model is None: - self.entity = init_ner_model(self.vocab) + if entity is None: + entity = init_ner_model(self.vocab) + self.vocab = vocab + self.tokenizer = tokenizer + self.entity = entity self.pipeline = [self.entity] def __call__(self, input_): @@ -173,7 +179,25 @@ class Pipeline(object): save_ner_model(self.entity, path / 'ner') -def train(nlp, train_examples, dev_examples, nr_epoch=5): +def train(nlp, train_examples, dev_examples, ctx, nr_epoch=5): + channels = {} + channels['loss'] = ctx.job.create_channel( + name='loss', + channel_type=neptune.ChannelType.NUMERIC) + + channels['f'] = ctx.job.create_channel( + name='F-Measure', + channel_type=neptune.ChannelType.NUMERIC) + channels['p'] = ctx.job.create_channel( + name='Precision', + channel_type=neptune.ChannelType.NUMERIC) + channels['r'] = ctx.job.create_channel( + name='Recall', + channel_type=neptune.ChannelType.NUMERIC) + channels['log'] = ctx.job.create_channel( + name='logs', + channel_type=neptune.ChannelType.TEXT) + next_epoch = train_examples print("Iter", "Loss", "P", "R", "F") for i in range(nr_epoch): @@ -186,14 +210,25 @@ def train(nlp, train_examples, dev_examples, nr_epoch=5): next_epoch.append((input_, annot)) random.shuffle(next_epoch) scores = nlp.evaluate(dev_examples) - precision = '%.2f' % scores['ents_p'] - recall = '%.2f' % scores['ents_r'] - f_measure = '%.2f' % scores['ents_f'] - print(i, int(loss), precision, recall, f_measure) + report_scores(channels, i, loss, scores) nlp.average_weights() scores = nlp.evaluate(dev_examples) - print("After averaging") - print(scores['ents_p'], scores['ents_r'], scores['ents_f']) + report_scores(channels, i+1, loss, scores) + + +def report_scores(channels, i, loss, scores): + precision = '%.2f' % scores['ents_p'] + recall = '%.2f' % scores['ents_r'] + f_measure = '%.2f' % scores['ents_f'] + print('%d %s %s %s' % (int(loss), precision, recall, f_measure)) + channels['log'].send(x=i, y='%d %s %s %s' % (int(loss), precision, recall, + f_measure)) + channels['f'].send(x=i, y=scores['ents_f']) + channels['p'].send(x=i, y=scores['ents_p']) + channels['r'].send(x=i, y=scores['ents_r']) + channels['loss'].send(x=i, 
y=loss) + + def read_examples(path): @@ -221,15 +256,22 @@ def read_examples(path): train_loc=("Path to your training data", "positional", None, Path), dev_loc=("Path to your development data", "positional", None, Path), ) -def main(model_dir, train_loc, dev_loc, nr_epoch=10): +def main(model_dir=Path('/home/matt/repos/spaCy/spacy/data/de-1.0.0'), + train_loc=None, dev_loc=None, nr_epoch=30): + ctx = neptune.Context() + + train_loc = Path(ctx.params.train_loc) + dev_loc = Path(ctx.params.dev_loc) + model_dir = model_dir.resolve() + train_examples = read_examples(train_loc) dev_examples = read_examples(dev_loc) - nlp = Pipeline() + nlp = Pipeline.load(model_dir) - train(nlp, train_examples, list(dev_examples), nr_epoch) + train(nlp, train_examples, list(dev_examples), ctx, nr_epoch) nlp.save(model_dir) if __name__ == '__main__': - plac.call(main) + main() From ed27ca7e217a23fe48ff1a8f4a61fd380ee3174a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 15 Apr 2017 15:43:38 +0200 Subject: [PATCH 26/88] Use python -m virtualenv in fabfile --- fabfile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fabfile.py b/fabfile.py index 588d42e9d..cfa80ead5 100644 --- a/fabfile.py +++ b/fabfile.py @@ -14,7 +14,7 @@ VENV_DIR = path.join(PWD, ENV) def env(lang='python2.7'): if path.exists(VENV_DIR): local('rm -rf {env}'.format(env=VENV_DIR)) - local('virtualenv -p {lang} {env}'.format(lang=lang, env=VENV_DIR)) + local('python -m virtualenv -p {lang} {env}'.format(lang=lang, env=VENV_DIR)) def install(): From 4884b2c113f1b729f8602adf617cb15dd2c501b6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 15 Apr 2017 16:00:28 +0200 Subject: [PATCH 27/88] Refix StepwiseState --- spacy/syntax/parser.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index b3281fe14..872677842 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -366,11 +366,11 @@ cdef class Parser: def add_label(self, label): # Doesn't set label into serializer -- subclasses override it to do that. for action in self.moves.action_types: - self.moves.add_action(action, label) - if 'actions' in self.cfg: + added = self.moves.add_action(action, label) + if added: # Important that the labels be stored as a list! 
We need the # order, or the model goes out of synch - self.cfg['actions'].setdefault(str(action), []).append(label) + self.cfg.setdefault('extra_labels', []).append(label) cdef class StepwiseState: @@ -385,11 +385,11 @@ cdef class StepwiseState: self.doc = doc if gold is not None: self.gold = gold + self.parser.moves.preprocess_gold(self.gold) else: self.gold = GoldParse(doc) self.stcls = StateClass.init(doc.c, doc.length) self.parser.moves.initialize_state(self.stcls.c) - self.parser.moves.preprocess_gold(gold) self.eg = Example( nr_class=self.parser.moves.n_moves, nr_atom=CONTEXT_SIZE, From c729d72fc6d31af4d6a2567cc705c78d42bdb54e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 15 Apr 2017 16:11:06 +0200 Subject: [PATCH 28/88] Add new example for training new entity types --- examples/training/train_new_entity_type.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py index 4e9b7c8a8..af98ef583 100644 --- a/examples/training/train_new_entity_type.py +++ b/examples/training/train_new_entity_type.py @@ -29,8 +29,8 @@ def train_ner(nlp, train_data, output_dir): doc = nlp.make_doc(raw_text) nlp.tagger(doc) loss = nlp.entity.update(doc, gold) + nlp.end_training() nlp.save_to_directory(output_dir) - #nlp.end_training(output_dir) def main(model_name, output_directory=None): From c76cb8af352fc64eb59539f6ffbb5df3e3799912 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 15 Apr 2017 16:11:26 +0200 Subject: [PATCH 29/88] Fix training for new labels --- spacy/syntax/parser.pyx | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 872677842..4473045e9 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -151,7 +151,6 @@ cdef class Parser: if isinstance(labels, dict): labels = list(sorted(labels.keys())) cfg['actions'][action_name] = labels - print(cfg['actions']) self = cls(vocab, TransitionSystem=TransitionSystem, model=None, **cfg) if (path / 'model').exists(): self.model.load(str(path / 'model')) @@ -187,6 +186,11 @@ cdef class Parser: self.model.learn_rate = cfg.get('learn_rate', 0.001) self.cfg = cfg + # TODO: This is a pretty hacky fix to the problem of adding more + # labels. 
The issue is they come in out of order, if labels are + # added during training + for label in cfg.get('extra_labels', []): + self.add_label(label) def __reduce__(self): return (Parser, (self.vocab, self.moves, self.model), None, None) From 45464d065e383ffc60bee4a1d122b89604d8a8cd Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 15 Apr 2017 16:11:43 +0200 Subject: [PATCH 30/88] Remove print statement --- spacy/syntax/transition_system.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index 6b737f543..5009bae6e 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -108,6 +108,5 @@ cdef class TransitionSystem: self.c = self.mem.realloc(self.c, self._size * sizeof(self.c[0])) self.c[self.n_moves] = self.init_transition(self.n_moves, action, label) - print("Add action", action, self.strings[label], self.n_moves) self.n_moves += 1 return 1 From b9c26aae11c0802c397e27db3e27c366186e196a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 15 Apr 2017 16:13:17 +0200 Subject: [PATCH 31/88] Remove neptune refs from new train example --- examples/train_ner_standalone.py | 37 ++------------------------------ 1 file changed, 2 insertions(+), 35 deletions(-) diff --git a/examples/train_ner_standalone.py b/examples/train_ner_standalone.py index abc6a0152..9591d1b71 100644 --- a/examples/train_ner_standalone.py +++ b/examples/train_ner_standalone.py @@ -32,8 +32,6 @@ from spacy.gold import GoldParse from spacy.gold import _iob_to_biluo as iob_to_biluo from spacy.scorer import Scorer -from deepsense import neptune - try: unicode except NameError: @@ -180,24 +178,6 @@ class Pipeline(object): def train(nlp, train_examples, dev_examples, ctx, nr_epoch=5): - channels = {} - channels['loss'] = ctx.job.create_channel( - name='loss', - channel_type=neptune.ChannelType.NUMERIC) - - channels['f'] = ctx.job.create_channel( - name='F-Measure', - channel_type=neptune.ChannelType.NUMERIC) - channels['p'] = ctx.job.create_channel( - name='Precision', - channel_type=neptune.ChannelType.NUMERIC) - channels['r'] = ctx.job.create_channel( - name='Recall', - channel_type=neptune.ChannelType.NUMERIC) - channels['log'] = ctx.job.create_channel( - name='logs', - channel_type=neptune.ChannelType.TEXT) - next_epoch = train_examples print("Iter", "Loss", "P", "R", "F") for i in range(nr_epoch): @@ -210,25 +190,17 @@ def train(nlp, train_examples, dev_examples, ctx, nr_epoch=5): next_epoch.append((input_, annot)) random.shuffle(next_epoch) scores = nlp.evaluate(dev_examples) - report_scores(channels, i, loss, scores) + report_scores(i, loss, scores) nlp.average_weights() scores = nlp.evaluate(dev_examples) report_scores(channels, i+1, loss, scores) -def report_scores(channels, i, loss, scores): +def report_scores(i, loss, scores): precision = '%.2f' % scores['ents_p'] recall = '%.2f' % scores['ents_r'] f_measure = '%.2f' % scores['ents_f'] print('%d %s %s %s' % (int(loss), precision, recall, f_measure)) - channels['log'].send(x=i, y='%d %s %s %s' % (int(loss), precision, recall, - f_measure)) - channels['f'].send(x=i, y=scores['ents_f']) - channels['p'].send(x=i, y=scores['ents_p']) - channels['r'].send(x=i, y=scores['ents_r']) - channels['loss'].send(x=i, y=loss) - - def read_examples(path): @@ -258,11 +230,6 @@ def read_examples(path): ) def main(model_dir=Path('/home/matt/repos/spaCy/spacy/data/de-1.0.0'), train_loc=None, dev_loc=None, nr_epoch=30): - ctx = neptune.Context() - - train_loc = 
Path(ctx.params.train_loc) - dev_loc = Path(ctx.params.dev_loc) - model_dir = model_dir.resolve() train_examples = read_examples(train_loc) dev_examples = read_examples(dev_loc) From 40e30242415723f01aeab281d6cda1d1c66a4189 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 15 Apr 2017 16:13:42 +0200 Subject: [PATCH 32/88] Move standalone NER training script into examples directory --- examples/{ => training}/train_ner_standalone.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename examples/{ => training}/train_ner_standalone.py (100%) diff --git a/examples/train_ner_standalone.py b/examples/training/train_ner_standalone.py similarity index 100% rename from examples/train_ner_standalone.py rename to examples/training/train_ner_standalone.py From 31fa73293a260a764fda000f72a167a56eab6330 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 13:03:28 +0200 Subject: [PATCH 33/88] Move read_json out to own util function --- spacy/util.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/spacy/util.py b/spacy/util.py index 219009f17..5fd9d563b 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -113,12 +113,15 @@ def check_renamed_kwargs(renamed, kwargs): raise TypeError("Keyword argument %s now renamed to %s" % (old, new)) +def read_json(location): + with location.open('r', encoding='utf8') as f: + return ujson.load(f) + + def parse_package_meta(package_path, package, require=True): location = package_path / package / 'meta.json' if location.is_file(): - with location.open('r', encoding='utf8') as f: - meta = ujson.load(f) - return meta + return read_json(location) elif require: raise IOError("Could not read meta.json from %s" % location) else: From 13c8a42d2b7f4ef630b33ede0239a70a1c0140bc Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 13:03:58 +0200 Subject: [PATCH 34/88] Fix typos --- spacy/cli/package.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index fb0de2cc5..d85f1a92a 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -47,9 +47,9 @@ def check_dirs(input_path, output_path): def create_dirs(package_path, force): if package_path.exists(): if force: - shutil.rmtree(unicode_(package_path.as_posix)) + shutil.rmtree(unicode_(package_path)) else: - util.sys_exit(unicode_(package_path.as_posix), + util.sys_exit(unicode_(package_path), "Please delete the directory and try again.", title="Package directory already exists") Path.mkdir(package_path, parents=True) From a7574b75728174c7fdab6c74a8f18f0c024489e4 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 13:06:02 +0200 Subject: [PATCH 35/88] Add more options to read in meta data in package command Add meta option to supply path to meta.json. If no meta path is set, check if meta.json exists in input directory and use it. Otherwise, prompt for details on the command line. 
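For illustration, the resulting look-up order can be sketched in Python along
these lines. This is only a sketch: the import path and the placeholder paths
are assumptions made for the example, not part of the patch.

    # assumed import path; package() is the CLI helper extended in this patch
    from spacy.cli.package import package

    # an explicit meta.json path (the new --meta option) takes precedence
    package('/path/to/model', '/path/to/output', '/path/to/meta.json', force=False)

    # if no meta path is given, a meta.json inside the input directory is used;
    # failing that, generate_meta() prompts for the details on the command line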
--- spacy/__main__.py | 5 +++-- spacy/cli/package.py | 17 +++++++++++++---- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/spacy/__main__.py b/spacy/__main__.py index 8d511d823..4c065a7e6 100644 --- a/spacy/__main__.py +++ b/spacy/__main__.py @@ -63,15 +63,16 @@ class CLI(object): @plac.annotations( input_dir=("directory with model data", "positional", None, str), output_dir=("output parent directory", "positional", None, str), + meta=("path to meta.json", "option", "m", str), force=("force overwriting of existing folder in output directory", "flag", "f", bool) ) - def package(self, input_dir, output_dir, force=False): + def package(self, input_dir, output_dir, meta=None, force=False): """ Generate Python package for model data, including meta and required installation files. A new directory will be created in the specified output directory, and model data will be copied over. """ - cli_package(input_dir, output_dir, force) + cli_package(input_dir, output_dir, meta, force) @plac.annotations( diff --git a/spacy/cli/package.py b/spacy/cli/package.py index d85f1a92a..26ce01a18 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -9,15 +9,22 @@ from ..compat import unicode_, json_dumps from .. import util -def package(input_dir, output_dir, force): +def package(input_dir, output_dir, meta_path, force): input_path = Path(input_dir) output_path = Path(output_dir) - check_dirs(input_path, output_path) + meta_path = util.ensure_path(meta_path) + check_dirs(input_path, output_path, meta_path) template_setup = get_template('setup.py') template_manifest = get_template('MANIFEST.in') template_init = get_template('en_model_name/__init__.py') - meta = generate_meta() + + meta_path = meta_path or input_path / 'meta.json' + if meta_path.is_file(): + util.print_msg(unicode_(meta_path), title="Reading meta.json from file") + meta = util.read_json(meta_path) + else: + meta = generate_meta() model_name = meta['lang'] + '_' + meta['name'] model_name_v = model_name + '-' + meta['version'] @@ -37,11 +44,13 @@ def package(input_dir, output_dir, force): title="Successfully created package {p}".format(p=model_name_v)) -def check_dirs(input_path, output_path): +def check_dirs(input_path, output_path, meta_path): if not input_path.exists(): util.sys_exit(unicode_(input_path.as_poisx), title="Model directory not found") if not output_path.exists(): util.sys_exit(unicode_(output_path), title="Output directory not found") + if meta_path and not meta_path.exists(): + util.sys_exit(unicode_(meta_path), title="meta.json not found") def create_dirs(package_path, force): From e3de035814fa5b5bd2561bbbadc037deb8c15298 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 13:13:17 +0200 Subject: [PATCH 36/88] Add meta validation to check for required settings Complain if no "lang", "name" or "version" is found (those settings are used in directory / package names). Package will still build without, but it'll inevitably fail somewhere down the line. 
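A minimal sketch of the check (the import path and the example values are
assumptions made for this illustration, not part of the patch):

    from spacy.cli.package import validate_meta

    meta = {'lang': 'en', 'name': 'example_model', 'version': '1.0.0'}
    validate_meta(meta, ['lang', 'name', 'version'])    # all keys set, passes

    # a missing or empty value triggers util.sys_exit() with a message like
    # 'No "name" setting found in meta.json'
    validate_meta({'lang': 'en', 'name': ''}, ['lang', 'name', 'version'])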
--- spacy/cli/package.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 26ce01a18..1abc36837 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -26,6 +26,7 @@ def package(input_dir, output_dir, meta_path, force): else: meta = generate_meta() + validate_meta(meta, ['lang', 'name', 'version']) model_name = meta['lang'] + '_' + meta['name'] model_name_v = model_name + '-' + meta['version'] main_path = output_path / model_name_v @@ -89,6 +90,14 @@ def generate_meta(): return meta +def validate_meta(meta, keys): + for key in keys: + if key not in meta or meta[key] == '': + util.sys_exit( + "This setting is required to build your package.", + title='No "{k}" setting found in meta.json'.format(k=key)) + + def get_template(filepath): url = 'https://raw.githubusercontent.com/explosion/spacy-dev-resources/master/templates/model/' r = requests.get(url + filepath) From a3ddbc0444c1f9ec49e59d8c58b1bb06fe595e1a Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 13:14:36 +0200 Subject: [PATCH 37/88] Add note about --force flag to error message --- spacy/cli/package.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 1abc36837..102b07472 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -60,7 +60,8 @@ def create_dirs(package_path, force): shutil.rmtree(unicode_(package_path)) else: util.sys_exit(unicode_(package_path), - "Please delete the directory and try again.", + "Please delete the directory and try again, or use the --force " + "flag to overwrite existing directories.", title="Package directory already exists") Path.mkdir(package_path, parents=True) From 8191e33cf1a59c20262d795b012fd7077f78f0e4 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 13:32:31 +0200 Subject: [PATCH 38/88] Update link error message with info on permissions --- spacy/cli/link.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/spacy/cli/link.py b/spacy/cli/link.py index 9abb7bfb4..781adda2c 100644 --- a/spacy/cli/link.py +++ b/spacy/cli/link.py @@ -49,9 +49,12 @@ def symlink(model_path, link_name, force): # This is quite dirty, but just making sure other errors are caught so # users at least see a proper message. util.sys_exit( - "Creating a symlink in spacy/data failed. You can still import " - "the model as a Python package and call its load() method, or " - "create the symlink manually:", + "Creating a symlink in spacy/data failed. 
Make sure you have the " + "required permissions and try re-running the command as admin, or " + "use a virtualenv to install spaCy in a user directory, instead of " + "doing a system installation.", + "You can still import the model as a Python package and call its " + "load() method, or create the symlink manually:", "{a} --> {b}".format(a=unicode_(model_path), b=unicode_(link_path)), title="Error: Couldn't link model to '{l}'".format(l=link_name)) From d29c825ca4af79c1da9e7140dc3730fa6dcc4383 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 13:37:24 +0200 Subject: [PATCH 39/88] Update docs for package command --- website/docs/usage/cli.jade | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/website/docs/usage/cli.jade b/website/docs/usage/cli.jade index ebd034bb8..5ad8a214d 100644 --- a/website/docs/usage/cli.jade +++ b/website/docs/usage/cli.jade @@ -249,14 +249,16 @@ p p | Generate a #[+a("/docs/usage/models#own-models") model Python package] - | from an existing model data directory. All data files are copied over, - | and the meta data can be entered directly from the command line. While - | this feature is still experimental, the required file templates are - | downloaded from #[+src(gh("spacy-dev-resources", "templates/model")) GitHub]. - | This means you need to be connected to the internet to use this command. + | from an existing model data directory. All data files are copied over. + | If the path to a meta.json is supplied, or a meta.json is found in the + | input directory, this file is used. Otherwise, the data can be entered + | directly from the command line. While this feature is still experimental, + | the required file templates are downloaded from + | #[+src(gh("spacy-dev-resources", "templates/model")) GitHub]. This means + | you need to be connected to the internet to use this command. +code(false, "bash"). - python -m spacy package [input_dir] [output_dir] [--force] + python -m spacy package [input_dir] [output_dir] [--meta] [--force] +table(["Argument", "Type", "Description"]) +row @@ -269,6 +271,11 @@ p +cell positional +cell Directory to create package folder in. + +row + +cell #[code meta] + +cell option + +cell Path to meta.json file (optional). + +row +cell #[code --force], #[code -f] +cell flag From d10bd0eaf9a1ce355dca11826edd0b9208c13523 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 13:42:34 +0200 Subject: [PATCH 40/88] Fix formatting --- spacy/util.py | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/spacy/util.py b/spacy/util.py index 5fd9d563b..f8af8baa3 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -129,10 +129,11 @@ def parse_package_meta(package_path, package, require=True): def get_raw_input(description, default=False): - """Get user input via raw_input / input and return input value. Takes a + """ + Get user input via raw_input / input and return input value. Takes a description for the prompt, and an optional default value that's displayed - with the prompt.""" - + with the prompt. + """ additional = ' (default: {d})'.format(d=default) if default else '' prompt = ' {d}{a}: '.format(d=description, a=additional) user_input = input_(prompt) @@ -140,9 +141,10 @@ def get_raw_input(description, default=False): def print_table(data, **kwargs): - """Print data in table format. Can either take a list of tuples or a - dictionary, which will be converted to a list of tuples.""" - + """ + Print data in table format. 
Can either take a list of tuples or a + dictionary, which will be converted to a list of tuples. + """ if type(data) == dict: data = list(data.items()) @@ -158,10 +160,11 @@ def print_table(data, **kwargs): def print_markdown(data, **kwargs): - """Print listed data in GitHub-flavoured Markdown format so it can be + """ + Print listed data in GitHub-flavoured Markdown format so it can be copy-pasted into issues. Can either take a list of tuples or a dictionary, - which will be converted to a list of tuples.""" - + which will be converted to a list of tuples. + """ def excl_value(value): # don't print value if it contains absolute path of directory (i.e. # personal info). Other conditions can be included here if necessary. @@ -178,16 +181,16 @@ def print_markdown(data, **kwargs): if 'title' in kwargs and kwargs['title']: print(tpl_title.format(msg=kwargs['title'])) - print(tpl_msg.format(msg=markdown)) def print_msg(*text, **kwargs): - """Print formatted message. Each positional argument is rendered as newline- + """ + Print formatted message. Each positional argument is rendered as newline- separated paragraph. If kwarg 'title' exist, title is printed above the text and highlighted (using ANSI escape sequences manually to avoid unnecessary - dependency).""" - + dependency). + """ message = '\n\n'.join([_wrap_text(t) for t in text]) tpl_msg = '\n{msg}\n' tpl_title = '\n\033[93m{msg}\033[0m' @@ -199,9 +202,10 @@ def print_msg(*text, **kwargs): def _wrap_text(text): - """Wrap text at given width using textwrap module. Indent should consist of - spaces. Its length is deducted from wrap width to ensure exact wrapping.""" - + """ + Wrap text at given width using textwrap module. Indent should consist of + spaces. Its length is deducted from wrap width to ensure exact wrapping. + """ wrap_max = 80 indent = ' ' wrap_width = wrap_max - len(indent) @@ -211,10 +215,11 @@ def _wrap_text(text): def sys_exit(*messages, **kwargs): - """Performs SystemExit. For modules used from the command line, like + """ + Performs SystemExit. For modules used from the command line, like download and link. To print message, use the same arguments as for - print_msg().""" - + print_msg(). + """ if messages: print_msg(*messages, **kwargs) sys.exit(0) From c7adca58a9c85423d97f859c06b6c92e8aee35ab Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 16:55:01 +0200 Subject: [PATCH 41/88] Tidy up example and only save/test if output_directory is not None --- examples/training/train_new_entity_type.py | 31 ++++++++++------------ 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py index af98ef583..cbe2963d3 100644 --- a/examples/training/train_new_entity_type.py +++ b/examples/training/train_new_entity_type.py @@ -1,22 +1,16 @@ from __future__ import unicode_literals, print_function -import json -import pathlib + import random +from pathlib import Path import spacy from spacy.pipeline import EntityRecognizer from spacy.gold import GoldParse from spacy.tagger import Tagger - -try: - unicode -except: - unicode = str - def train_ner(nlp, train_data, output_dir): - # Add new words to vocab. 
+ # Add new words to vocab for raw_text, _ in train_data: doc = nlp.make_doc(raw_text) for word in doc: @@ -30,11 +24,14 @@ def train_ner(nlp, train_data, output_dir): nlp.tagger(doc) loss = nlp.entity.update(doc, gold) nlp.end_training() - nlp.save_to_directory(output_dir) + if output_dir: + nlp.save_to_directory(output_dir) def main(model_name, output_directory=None): nlp = spacy.load(model_name) + if output_directory is not None: + output_directory = Path(output_directory) train_data = [ ( @@ -55,18 +52,18 @@ def main(model_name, output_directory=None): ) ] nlp.entity.add_label('ANIMAL') - if output_directory is not None: - output_directory = pathlib.Path(output_directory) ner = train_ner(nlp, train_data, output_directory) + # Test that the entity is recognized doc = nlp('Do you like horses?') for ent in doc.ents: print(ent.label_, ent.text) - nlp2 = spacy.load('en', path=output_directory) - nlp2.entity.add_label('ANIMAL') - doc2 = nlp2('Do you like horses?') - for ent in doc2.ents: - print(ent.label_, ent.text) + if output_directory: + nlp2 = spacy.load('en', path=output_directory) + nlp2.entity.add_label('ANIMAL') + doc2 = nlp2('Do you like horses?') + for ent in doc2.ents: + print(ent.label_, ent.text) if __name__ == '__main__': From 137b210bcfbb5eced0fd44d56ba9d5cf515b89f0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 16 Apr 2017 18:02:42 +0200 Subject: [PATCH 42/88] Restore use of FTRL training --- spacy/syntax/parser.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 4473045e9..0bc9cb4ef 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -40,7 +40,7 @@ from ..strings cimport StringStore from ..gold cimport GoldParse -USE_FTRL = False +USE_FTRL = True DEBUG = False def set_debug(val): global DEBUG From 6a4221a6decd92d7656c139f4b297095f3ab4158 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 16 Apr 2017 18:07:53 +0200 Subject: [PATCH 43/88] Allow lemma to be set from Python. Re #973 --- spacy/tokens/token.pyx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 94491614c..f146f5cd6 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -192,6 +192,8 @@ cdef class Token: property lemma: def __get__(self): return self.c.lemma + def __set__(self, int lemma): + self.c.lemma = lemma property pos: def __get__(self): @@ -570,6 +572,8 @@ cdef class Token: property lemma_: def __get__(self): return self.vocab.strings[self.c.lemma] + def __set__(self, unicode lemma_): + self.c.lemma = self.vocab.strings[lemma_] property pos_: def __get__(self): From 89a4f262fc7ec88b6deae3e65ca75d5c138a7352 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 16 Apr 2017 13:00:37 -0500 Subject: [PATCH 44/88] Fix training methods --- spacy/cli/train.py | 13 +++++++------ spacy/gold.pyx | 6 +++--- spacy/language.py | 9 ++++++--- 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index 489430634..3900c7f39 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -2,8 +2,8 @@ from __future__ import unicode_literals, division, print_function import json -from pathlib import Path +from ..util import ensure_path from ..scorer import Scorer from ..gold import GoldParse, merge_sents from ..gold import read_json_file as read_gold_json @@ -12,9 +12,9 @@ from .. 
import util def train(language, output_dir, train_data, dev_data, n_iter, tagger, parser, ner, parser_L1): - output_path = Path(output_dir) - train_path = Path(train_data) - dev_path = Path(dev_data) + output_path = ensure_path(output_dir) + train_path = ensure_path(train_data) + dev_path = ensure_path(dev_data) check_dirs(output_path, train_path, dev_path) lang = util.get_lang_class(language) @@ -43,7 +43,7 @@ def train(language, output_dir, train_data, dev_data, n_iter, tagger, parser, ne def train_config(config): - config_path = Path(config) + config_path = ensure_path(config) if not config_path.is_file(): util.sys_exit(config_path.as_posix(), title="Config file not found") config = json.load(config_path) @@ -57,7 +57,8 @@ def train_model(Language, train_data, dev_data, output_path, tagger_cfg, parser_ entity_cfg, n_iter): print("Itn.\tN weight\tN feats\tUAS\tNER F.\tTag %\tToken %") - with Language.train(output_path, train_data, tagger_cfg, parser_cfg, entity_cfg) as trainer: + with Language.train(output_path, train_data, + pos=tagger_cfg, deps=parser_cfg, ner=entity_cfg) as trainer: for itn, epoch in enumerate(trainer.epochs(n_iter, augment_data=None)): for doc, gold in epoch: trainer.update(doc, gold) diff --git a/spacy/gold.pyx b/spacy/gold.pyx index 425ad0fe0..1e55075c7 100644 --- a/spacy/gold.pyx +++ b/spacy/gold.pyx @@ -5,9 +5,9 @@ from __future__ import unicode_literals, print_function import io import re import ujson -from pathlib import Path from .syntax import nonproj +from .util import ensure_path def tags_to_entities(tags): @@ -139,12 +139,12 @@ def _min_edit_path(cand_words, gold_words): def read_json_file(loc, docs_filter=None): - loc = Path(loc) + loc = ensure_path(loc) if loc.is_dir(): for filename in loc.iterdir(): yield from read_json_file(loc / filename) else: - with io.open(loc, 'r', encoding='utf8') as file_: + with loc.open('r', encoding='utf8') as file_: docs = ujson.load(file_) for doc in docs: if docs_filter is not None and not docs_filter(doc): diff --git a/spacy/language.py b/spacy/language.py index 4b6c3397d..47408921c 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -204,15 +204,18 @@ class Language(object): @classmethod @contextmanager def train(cls, path, gold_tuples, **configs): - if parser_cfg['pseudoprojective']: + parser_cfg = configs.get('deps', {}) + if parser_cfg.get('pseudoprojective'): # preprocess training data here before ArcEager.get_labels() is called gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples) for subdir in ('deps', 'ner', 'pos'): if subdir not in configs: configs[subdir] = {} - configs['deps']['actions'] = ArcEager.get_actions(gold_parses=gold_tuples) - configs['ner']['actions'] = BiluoPushDown.get_actions(gold_parses=gold_tuples) + if parser_cfg: + configs['deps']['actions'] = ArcEager.get_actions(gold_parses=gold_tuples) + if 'ner' in configs: + configs['ner']['actions'] = BiluoPushDown.get_actions(gold_parses=gold_tuples) cls.setup_directory(path, **configs) From 90cf6b9429c336663a24b609da02368cd066217f Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 19:32:21 +0200 Subject: [PATCH 45/88] Add pytest to requirements.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index f1f26171b..6212ab3cd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,3 +11,4 @@ ujson>=1.35 dill>=0.2,<0.3 requests>=2.13.0,<3.0.0 regex==2017.4.5 +pytest>=3.0.6,<4.0.0 From 0084466a6695dea92a4b42f9a24b65682d443db0 Mon Sep 17 00:00:00 2001 From: ines 
Date: Sun, 16 Apr 2017 19:51:29 +0200 Subject: [PATCH 46/88] Remove unused utf8open util and replace os.path with ensure_path --- spacy/tests/tokenizer/test_tokenizer.py | 7 +++---- spacy/util.py | 4 ---- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py index 22afa1f43..da79b43a8 100644 --- a/spacy/tests/tokenizer/test_tokenizer.py +++ b/spacy/tests/tokenizer/test_tokenizer.py @@ -3,9 +3,8 @@ from __future__ import unicode_literals from ...vocab import Vocab from ...tokenizer import Tokenizer -from ...util import utf8open +from ... import util -from os import path import pytest @@ -75,8 +74,8 @@ Phasellus tincidunt, augue quis porta finibus, massa sapien consectetur augue, n @pytest.mark.parametrize('file_name', ["sun.txt"]) def test_tokenizer_handle_text_from_file(tokenizer, file_name): - loc = path.join(path.dirname(__file__), file_name) - text = utf8open(loc).read() + loc = util.ensure_path(__file__).parent / file_name + text = loc.open('r', encoding='utf8').read() assert len(text) != 0 tokens = tokenizer(text) assert len(tokens) > 100 diff --git a/spacy/util.py b/spacy/util.py index f8af8baa3..3318725ec 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -103,10 +103,6 @@ def normalize_slice(length, start, stop, step=None): return start, stop -def utf8open(loc, mode='r'): - return io.open(loc, mode, encoding='utf8') - - def check_renamed_kwargs(renamed, kwargs): for old, new in renamed.items(): if old in kwargs: From ed7e19ad685c3e7c615feb39793cd318f1e6580c Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 19:54:16 +0200 Subject: [PATCH 47/88] Remove unused import --- spacy/util.py | 1 - 1 file changed, 1 deletion(-) diff --git a/spacy/util.py b/spacy/util.py index 3318725ec..24c0d74f1 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1,7 +1,6 @@ # coding: utf8 from __future__ import unicode_literals, print_function -import io import ujson import re from pathlib import Path From d3759dfb3224a78a93663f1a74b810e4323be86c Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 20:34:37 +0200 Subject: [PATCH 48/88] Fix docstring --- spacy/deprecated.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/deprecated.py b/spacy/deprecated.py index f481a2502..65053089a 100644 --- a/spacy/deprecated.py +++ b/spacy/deprecated.py @@ -107,7 +107,7 @@ def fix_glove_vectors_loading(overrides): def resolve_model_name(name): """ If spaCy is loaded with 'de', check if symlink already exists. If - not, user have upgraded from older version and have old models installed. + not, user may have upgraded from older version and have old models installed. Check if old model directory exists and if so, return that instead and create shortcut link. If English model is found and no shortcut exists, raise error and tell user to install new model. From 7670c745b6370fafd29ccedcdcba875156ccd576 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 20:35:00 +0200 Subject: [PATCH 49/88] Update spacy.load() and fix path checks --- spacy/__init__.py | 47 +++++++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index bc668121f..06e9374ea 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -1,39 +1,38 @@ # coding: utf8 from __future__ import unicode_literals -from pathlib import Path - -from .util import set_lang_class, get_lang_class, parse_package_meta +from . 
import util from .deprecated import resolve_model_name from .cli import info from . import en, de, zh, es, it, hu, fr, pt, nl, sv, fi, bn, he -set_lang_class(en.English.lang, en.English) -set_lang_class(de.German.lang, de.German) -set_lang_class(es.Spanish.lang, es.Spanish) -set_lang_class(pt.Portuguese.lang, pt.Portuguese) -set_lang_class(fr.French.lang, fr.French) -set_lang_class(it.Italian.lang, it.Italian) -set_lang_class(hu.Hungarian.lang, hu.Hungarian) -set_lang_class(zh.Chinese.lang, zh.Chinese) -set_lang_class(nl.Dutch.lang, nl.Dutch) -set_lang_class(sv.Swedish.lang, sv.Swedish) -set_lang_class(fi.Finnish.lang, fi.Finnish) -set_lang_class(bn.Bengali.lang, bn.Bengali) -set_lang_class(he.Hebrew.lang, he.Hebrew) +_languages = (en.English, de.German, es.Spanish, pt.Portuguese, fr.French, + it.Italian, hu.Hungarian, zh.Chinese, nl.Dutch, sv.Swedish, + fi.Finnish, bn.Bengali, he.Hebrew) + + +for _lang in _languages: + util.set_lang_class(_lang.lang, _lang) def load(name, **overrides): - data_path = overrides.get('path', util.get_data_path()) - model_name = resolve_model_name(name) - meta = parse_package_meta(data_path, model_name, require=False) + if overrides.get('path') in (None, False, True): + data_path = util.get_data_path() + model_name = resolve_model_name(name) + model_path = data_path / model_name + if not model_path.exists(): + model_path = None + util.print_msg( + "Only loading the '{}' tokenizer.".format(name), + title="Warning: no model found for '{}'".format(name)) + else: + model_path = util.ensure_path(overrides['path']) + data_path = model_path.parent + meta = util.parse_package_meta(data_path, model_name, require=False) lang = meta['lang'] if meta and 'lang' in meta else name - cls = get_lang_class(lang) + cls = util.get_lang_class(lang) overrides['meta'] = meta - model_path = Path(data_path / model_name) - if model_path.exists(): - overrides['path'] = model_path - + overrides['path'] = model_path return cls(**overrides) From 1f9f867c70fc76ae1c0b307e38997c17e084c051 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 20:35:08 +0200 Subject: [PATCH 50/88] Remove unused util function --- spacy/util.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/spacy/util.py b/spacy/util.py index 24c0d74f1..573489682 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -45,15 +45,6 @@ def ensure_path(path): return path -def or_(val1, val2): - if val1 is not None: - return val1 - elif callable(val2): - return val2() - else: - return val2 - - def read_regex(path): path = ensure_path(path) with path.open() as file_: From 5cb17b9f33ae3803a8fc25305f71153acf71374f Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 20:35:47 +0200 Subject: [PATCH 51/88] Add NER training docs --- website/docs/usage/_data.json | 5 + website/docs/usage/training-ner.jade | 174 +++++++++++++++++++++++++++ 2 files changed, 179 insertions(+) create mode 100644 website/docs/usage/training-ner.jade diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index c8c85af1d..f81fb245f 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -22,6 +22,7 @@ "Custom tokenization": "customizing-tokenizer", "Training": "training", "Adding languages": "adding-languages" + "Training NER": "training-ner", }, "Examples": { "Tutorials": "tutorials", @@ -106,6 +107,10 @@ "training": { "title": "Training the tagger, parser and entity recognizer" + "training-ner": { + "title": "Training the Named Entity Recognizer", + "next": "saving-loading" + }, }, "pos-tagging": { 
diff --git a/website/docs/usage/training-ner.jade b/website/docs/usage/training-ner.jade new file mode 100644 index 000000000..78eb4905e --- /dev/null +++ b/website/docs/usage/training-ner.jade @@ -0,0 +1,174 @@ +include ../../_includes/_mixins + +p + | All #[+a("/docs/usage/models") spaCy models] support online learning, so + | you can update a pre-trained model with new examples. You can even add + | new classes to an existing model, to recognise a new entity type, + | part-of-speech, or syntactic relation. Updating an existing model is + | particularly useful as a "quick and dirty solution", if you have only a + | few corrections or annotations. + ++h(2, "improving-accuracy") Improving accuracy on existing entity types + +p + | To update the model, you first need to create an instance of + | #[+api("goldparse") #[code spacy.gold.GoldParse]], with the entity labels + | you want to learn. You will then pass this instance to the + | #[+api("entityrecognizer#update") #[code EntityRecognizer.update()]] + | method. For example: + ++code. + import spacy + from spacy.gold import GoldParse + + nlp = spacy.load('en') + doc = nlp.make_doc(u'Facebook released React in 2014') + gold = GoldParse(doc, entities=['U-ORG', 'O', 'U-TECHNOLOGY', 'O', 'U-DATE']) + nlp.entity.update(doc, gold) + +p + | You'll usually need to provide many examples to meaningfully improve the + | system — a few hundred is a good start, although more is better. You + | should avoid iterating over the same few examples multiple times, or the + | model is likely to "forget" how to annotate other examples. If you + | iterate over the same few examples, you're effectively changing the loss + | function. The optimizer will find a way to minimize the loss on your + | examples, without regard for the consequences on the examples it's no + | longer paying attention to. + +p + | One way to avoid this "catastrophic forgetting" problem is to "remind" + | the model of other examples by augmenting your annotations with sentences + | annotated with entities automatically recognised by the original model. + | Ultimately, this is an empirical process: you'll need to + | #[strong experiment on your own data] to find a solution that works best + | for you. + ++h(2, "adding") Adding a new entity type + +p + | You can add new entity types to an existing model. Let's say we want to + | recognise the category #[code TECHNOLOGY]. The new category will include + | programming languages, frameworks and platforms. First, we need to + | register the new entity type: + ++code. + nlp.entity.add_label('TECHNOLOGY') + +p + | Next, iterate over your examples, calling #[code entity.update()]. As + | above, we want to avoid iterating over only a small number of sentences. + | A useful compromise is to run the model over a number of plain-text + | sentences, and pass the entities to #[code GoldParse], as "true" + | annotations. This encourages the optimizer to find a solution that + | predicts the new category with minimal difference from the previous + | output. + ++h(2, "saving-loading") Saving and loading + +p + | After training our model, you'll usually want to save its state, and load + | it back later. You can do this with the #[code Language.save_to_directory()] + | method: + ++code. + nlp.save_to_directory('/home/me/data/en_technology') + +p + | To make the model more convenient to deploy, we recommend wrapping it as + | a Python package, so that you can install it via pip and load it as a + | module. 
spaCy comes with a handy #[+a("/docs/usage/cli#package") CLI command] + | to create all required files and directories. + ++code(false, "bash"). + python -m spacy package /home/me/data/en_technology /home/me/my_models + +p + | To build the package and create a #[code .tar.gz] archive, run + | #[code python setup.py sdist] from within its directory. + ++infobox("Saving and loading models") + | For more information and a detailed guide on how to package your model, + | see the documentation on + | #[+a("/docs/usage/saving-loading") saving and loading models]. + +p + | After you've generated and installed the package, you'll be able to + | load the model as follows: + ++code. + import en_technology + nlp = en_technology.load() + ++h(2, "example") Example: Adding and training an #[code ANIMAL] entity + +p + | This script shows how to add a new entity type to an existing pre-trained + | NER model. To keep the example short and simple, only four sentences are + | provided as examples. In practice, you'll need many more — + | #[strong a few hundred] would be a good start. You will also likely need + | to mix in #[strong examples of other entity types], which might be + | obtained by running the entity recognizer over unlabelled sentences, and + | adding their annotations to the training set. + +p + | For the full, runnable script of this example, see + | #[+src(gh("spacy", "examples/training/train_new_entity_type.py")) train_new_entity_type.py]. + ++code("Training the entity recognizer"). + import spacy + from spacy.pipeline import EntityRecognizer + from spacy.gold import GoldParse + from spacy.tagger import Tagger + import random + + model_name = 'en' + entity_label = 'ANIMAL' + output_directory = '/path/to/model' + train_data = [ + ("Horses are too tall and they pretend to care about your feelings", + [(0, 6, 'ANIMAL')]), + ("horses are too tall and they pretend to care about your feelings", + [(0, 6, 'ANIMAL')]), + ("horses pretend to care about your feelings", + [(0, 6, 'ANIMAL')]), + ("they pretend to care about your feelings, those horses", + [(48, 54, 'ANIMAL')]) + ] + + nlp = spacy.load(model_name) + nlp.entity.add_label(entity_label) + ner = train_ner(nlp, train_data, output_directory) + + def train_ner(nlp, train_data, output_dir): + # Add new words to vocab + for raw_text, _ in train_data: + doc = nlp.make_doc(raw_text) + for word in doc: + _ = nlp.vocab[word.orth] + + for itn in range(20): + random.shuffle(train_data) + for raw_text, entity_offsets in train_data: + gold = GoldParse(doc, entities=entity_offsets) + doc = nlp.make_doc(raw_text) + nlp.tagger(doc) + loss = nlp.entity.update(doc, gold) + nlp.end_training() + nlp.save_to_directory(output_dir) + +p + +button(gh("spaCy", "examples/training/train_new_entity_type.py"), false, "secondary") Full example + +p + | The actual training is performed by looping over the examples, and + | calling #[code nlp.entity.update()]. The #[code update()] method steps + | through the words of the input. At each word, it makes a prediction. It + | then consults the annotations provided on the #[code GoldParse] instance, + | to see whether it was right. If it was wrong, it adjusts its weights so + | that the correct action will score higher next time. + +p + | After training your model, you can + | #[+a("/docs/usage/saving-loading") save it to a directory]. We recommend wrapping + | models as Python packages, for ease of deployment. 
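The "catastrophic forgetting" discussion in the NER training workflow added above describes mixing model-predicted entities back into the update data, but stops short of showing that step in code. The following is a minimal editorial sketch of the idea, not taken from any of the patches: it assumes the same spaCy 1.x training API used in the examples above (`nlp.make_doc()`, `GoldParse`, `nlp.entity.update()`), and the unlabelled sentences, the `ANIMAL` label and the iteration count are purely illustrative.

```python
import random

import spacy
from spacy.gold import GoldParse

nlp = spacy.load('en')
nlp.entity.add_label('ANIMAL')

# Hand-annotated examples for the new entity type:
# (text, [(start_char, end_char, label)])
new_examples = [
    ("Horses are too tall and they pretend to care about your feelings",
     [(0, 6, 'ANIMAL')]),
]

# "Revision" examples: let the current model annotate unlabelled sentences and
# keep its predictions as gold, so the update doesn't erase existing entity types.
unlabelled = ["Facebook released React in 2014."]
revision_examples = []
for text in unlabelled:
    doc = nlp(text)
    entities = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    revision_examples.append((text, entities))

train_data = new_examples + revision_examples
for itn in range(20):
    random.shuffle(train_data)
    for raw_text, entity_offsets in train_data:
        doc = nlp.make_doc(raw_text)
        gold = GoldParse(doc, entities=entity_offsets)
        nlp.tagger(doc)
        nlp.entity.update(doc, gold)
nlp.end_training()
```

The ratio of new examples to revision examples is an empirical choice; the only point of the sketch is that the update loop keeps seeing the old entity types while it learns the new one.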
From b15bdb5279583ea648cad4aafa976747237b40da Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 20:35:56 +0200 Subject: [PATCH 52/88] Update training docs --- website/docs/usage/training.jade | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/website/docs/usage/training.jade b/website/docs/usage/training.jade index 39f524829..8a5c111bd 100644 --- a/website/docs/usage/training.jade +++ b/website/docs/usage/training.jade @@ -1,13 +1,10 @@ include ../../_includes/_mixins p - | This tutorial describes how to train new statistical models for spaCy's + | This workflow describes how to train new statistical models for spaCy's | part-of-speech tagger, named entity recognizer and dependency parser. - -p - | I'll start with some quick code examples, that describe how to train - | each model. I'll then provide a bit of background about the algorithms, - | and explain how the data and feature templates work. + | Once the model is trained, you can then + | #[+a("/docs/usage/saving-loading") save and load] it. +h(2, "train-pos-tagger") Training the part-of-speech tagger @@ -48,7 +45,21 @@ p p +button(gh("spaCy", "examples/training/train_ner.py"), false, "secondary") Full example -+h(2, "train-entity") Training the dependency parser ++h(2, "extend-entity") Extending the named entity recognizer + +p + | All #[+a("/docs/usage/models") spaCy models] support online learning, so + | you can update a pre-trained model with new examples. You can even add + | new classes to an existing model, to recognise a new entity type, + | part-of-speech, or syntactic relation. Updating an existing model is + | particularly useful as a "quick and dirty solution", if you have only a + | few corrections or annotations. + +p.o-inline-list + +button(gh("spaCy", "examples/training/train_new_entity_type.py"), true, "secondary") Full example + +button("/docs/usage/training-ner", false, "secondary") Usage Workflow + ++h(2, "train-dependency") Training the dependency parser +code. 
from spacy.vocab import Vocab @@ -67,7 +78,7 @@ p p +button(gh("spaCy", "examples/training/train_parser.py"), false, "secondary") Full example -+h(2, 'feature-templates') Customizing the feature extraction ++h(2, "feature-templates") Customizing the feature extraction p | spaCy currently uses linear models for the tagger, parser and entity From 17e974338860a231054fd3d214e4248e06b594ac Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 20:36:09 +0200 Subject: [PATCH 53/88] Add saving & loading models docs --- website/docs/usage/_data.json | 4 + website/docs/usage/saving-loading.jade | 108 +++++++++++++++++++++++++ 2 files changed, 112 insertions(+) create mode 100644 website/docs/usage/saving-loading.jade diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index f81fb245f..edb37bbad 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -23,6 +23,7 @@ "Training": "training", "Adding languages": "adding-languages" "Training NER": "training-ner", + "Saving & loading": "saving-loading" }, "Examples": { "Tutorials": "tutorials", @@ -111,6 +112,9 @@ "title": "Training the Named Entity Recognizer", "next": "saving-loading" }, + + "saving-loading": { + "title": "Saving and loading models" }, "pos-tagging": { diff --git a/website/docs/usage/saving-loading.jade b/website/docs/usage/saving-loading.jade new file mode 100644 index 000000000..063c5dc50 --- /dev/null +++ b/website/docs/usage/saving-loading.jade @@ -0,0 +1,108 @@ +include ../../_includes/_mixins + +p + | After training your model, you'll usually want to save its state, and load + | it back later. You can do this with the #[code Language.save_to_directory()] + | method: + ++code. + nlp.save_to_directory('/home/me/data/en_example_model') + +p + | The directory will be created if it doesn't exist, and the whole pipeline + | will be written out. To make the model more convenient to deploy, we + | recommend wrapping it as a Python package. + ++h(2, "generating") Generating a model package + ++infobox("Important note") + | The model packages are #[strong not suitable] for the public + | #[+a("https://pypi.python.org") pypi.python.org] directory, which is not + | designed for binary data and files over 50 MB. However, if your company + | is running an internal installation of pypi, publishing your models on + | there can be a convenient solution to share them with your team. + +p + | spaCy comes with a handy CLI command that will create all required files, + | and walk you through generating the meta data. You can also create the + | meta.json manually and place it in the model data directory, or supply a + | path to it using the #[code --meta] flag. For more info on this, see the + | #[+a("/docs/usage/cli/#package") #[code package] command] documentation. + ++aside-code("meta.json", "json"). + { + "name": "example_model", + "lang": "en", + "version": "1.0.0", + "spacy_version": ">=1.7.0,<2.0.0", + "description": "Example model for spaCy", + "author": "You", + "email": "you@example.com", + "license": "CC BY-SA 3.0" + } + ++code(false, "bash"). + python -m spacy package /home/me/data/en_example_model /home/me/my_models + +p This command will create a model package directory that should look like this: + ++code("Directory structure", "yaml"). 
+ └── / + ├── MANIFEST.in # to include meta.json + ├── meta.json # model meta data + ├── setup.py # setup file for pip installation + └── en_example_model # model directory + ├── __init__.py # init for pip installation + └── en_example_model-1.0.0 # model data + +p + | You can also find templates for all files in our + | #[+a(gh("spacy-dev-resouces", "templates/model")) spaCy dev resources]. + | If you're creating the package manually, keep in mind that the directories + | need to be named according to the naming conventions of + | #[code [language]_[type]] and #[code [language]_[type]-[version]]. The + | #[code lang] setting in the meta.json is also used to create the + | respective #[code Language] class in spaCy, which will later be returned + | by the model's #[code load()] method. + ++h(2, "building") Building a model package + +p + | To build the package, run the following command from within the + | directory. This will create a #[code .tar.gz] archive in a directory + | #[code /dist]. + ++code(false, "bash"). + python setup.py sdist + +p + | For more information on building Python packages, see the + | #[+a("https://setuptools.readthedocs.io/en/latest/") Python Setuptools documentation]. + + ++h(2, "loading") Loading a model package + +p + | Model packages can be installed by pointing pip to the model's + | #[code .tar.gz] archive: + ++code(false, "bash"). + pip install /path/to/en_example_model-1.0.0.tar.gz + +p You'll then be able to load the model as follows: + ++code. + import en_example_model + nlp = en_example_model.load() + +p + | To load the model via #[code spacy.load()], you can also + | create a #[+a("/docs/usage/models#usage") shortcut link] that maps the + | package name to a custom model name of your choice: + ++code(false, "bash"). + python -m spacy link en_example_model example + ++code. 
+ import spacy + nlp = spacy.load('example') From 5bbbb7674b93d82d7235a5b183da5506235a0951 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 20:36:22 +0200 Subject: [PATCH 54/88] Add training examples to tutorials --- website/docs/usage/_data.json | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index edb37bbad..dc71ef618 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -365,6 +365,18 @@ }, "code": { + "Training a new entity type": { + "url": "https://github.com/explosion/spaCy/blob/master/examples/training/train_new_entity_type.py", + "author": "Matthew Honnibal", + "tags": ["ner", "training"] + }, + + "Training an NER system from scratch": { + "url": "https://github.com/explosion/spaCy/blob/master/examples/training/train_ner_standalone.py", + "author": "Matthew Honnibal", + "tags": ["ner", "training"] + }, + "Information extraction": { "url": "https://github.com/explosion/spaCy/blob/master/examples/information_extraction.py", "author": "Matthew Honnibal", From c365795bf6ef0055364c41344fba56853ecc97ef Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 20:36:31 +0200 Subject: [PATCH 55/88] Update navigation --- website/docs/usage/_data.json | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/website/docs/usage/_data.json b/website/docs/usage/_data.json index dc71ef618..2ffbf9d68 100644 --- a/website/docs/usage/_data.json +++ b/website/docs/usage/_data.json @@ -20,8 +20,8 @@ "Word vectors": "word-vectors-similarities", "Deep learning": "deep-learning", "Custom tokenization": "customizing-tokenizer", + "Adding languages": "adding-languages", "Training": "training", - "Adding languages": "adding-languages" "Training NER": "training-ner", "Saving & loading": "saving-loading" }, @@ -103,11 +103,14 @@ "customizing-tokenizer": { "title": "Customizing the tokenizer", - "next": "training" + "next": "adding-languages" }, "training": { - "title": "Training the tagger, parser and entity recognizer" + "title": "Training spaCy's statistical models", + "next": "saving-loading" + }, + "training-ner": { "title": "Training the Named Entity Recognizer", "next": "saving-loading" From dea79224edcb536253c0e74803f112b2891ca4ed Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 20:36:51 +0200 Subject: [PATCH 56/88] Remove saving & loading docs and link to new workflow --- website/docs/usage/models.jade | 67 +++++----------------------------- 1 file changed, 9 insertions(+), 58 deletions(-) diff --git a/website/docs/usage/models.jade b/website/docs/usage/models.jade index 39c271df4..9d50dcbc0 100644 --- a/website/docs/usage/models.jade +++ b/website/docs/usage/models.jade @@ -235,62 +235,13 @@ p p | If you've trained your own model, for example for - | #[+a("/docs/usage/adding-languages") additional languages], you can - | create a shortuct link for it by pointing #[code spacy.link] to the - | model's data directory. To allow your model to be downloaded and - | installed via pip, you'll also need to generate a package for it. You can - | do this manually, or via the new - | #[+a("/docs/usage/cli#package") #[code spacy package] command] that will - | create all required files, and walk you through generating the meta data. + | #[+a("/docs/usage/adding-languages") additional languages] or + | #[+a("/docs/usage/train-ner") custom named entities], you can save its + | state using the #[code Language.save_to_directory()] method. 
To make the + | model more convenient to deploy, we recommend wrapping it as a Python + | package. - -+infobox("Important note") - | The model packages are #[strong not suitable] for the public - | #[+a("https://pypi.python.org") pypi.python.org] directory, which is not - | designed for binary data and files over 50 MB. However, if your company - | is running an internal installation of pypi, publishing your models on - | there can be a convenient solution to share them with your team. - -p The model directory should look like this: - -+code("Directory structure", "yaml"). - └── / - ├── MANIFEST.in # to include meta.json - ├── meta.json # model meta data - ├── setup.py # setup file for pip installation - └── en_core_web_md # model directory - ├── __init__.py # init for pip installation - └── en_core_web_md-1.2.0 # model data - -p - | You can find templates for all files in our - | #[+a(gh("spacy-dev-resouces", "templates/model")) spaCy dev resources]. - | Unless you want to customise installation and loading, the only file - | you'll need to modify is #[code meta.json], which includes the model's - | meta data. It will later be copied into the package and data directory. - -+code("meta.json", "json"). - { - "name": "core_web_md", - "lang": "en", - "version": "1.2.0", - "spacy_version": "1.7.0", - "description": "English model for spaCy", - "author": "Explosion AI", - "email": "contact@explosion.ai", - "license": "MIT" - } - -p - | Keep in mind that the directories need to be named according to the - | naming conventions. The #[code lang] setting is also used to create the - | respective #[code Language] class in spaCy, which will later be returned - | by the model's #[code load()] method. - -p - | To generate the package, run the following command from within the - | directory. This will create a #[code .tar.gz] archive in a directory - | #[code /dist]. - -+code(false, "bash"). - python setup.py sdist ++infobox("Saving and loading models") + | For more information and a detailed guide on how to package your model, + | see the documentation on + | #[+a("/docs/usage/saving-loading") saving and loading models]. From e4dd645c378acd32a5e91b38b51b94427abf3d46 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 20:36:59 +0200 Subject: [PATCH 57/88] Update link --- website/docs/usage/cli.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/cli.jade b/website/docs/usage/cli.jade index 5ad8a214d..e4d762615 100644 --- a/website/docs/usage/cli.jade +++ b/website/docs/usage/cli.jade @@ -248,7 +248,7 @@ p +tag experimental p - | Generate a #[+a("/docs/usage/models#own-models") model Python package] + | Generate a #[+a("/docs/usage/saving-loading#generating") model Python package] | from an existing model data directory. All data files are copied over. | If the path to a meta.json is supplied, or a meta.json is found in the | input directory, this file is used. 
Otherwise, the data can be entered From 264af6cd17d734efa5cb958f4cded7d852f80237 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 20:37:29 +0200 Subject: [PATCH 58/88] Add documentation --- examples/training/train_new_entity_type.py | 29 ++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py index cbe2963d3..ef4070153 100644 --- a/examples/training/train_new_entity_type.py +++ b/examples/training/train_new_entity_type.py @@ -1,3 +1,32 @@ +#!/usr/bin/env python +""" +Example of training and additional entity type + +This script shows how to add a new entity type to an existing pre-trained NER +model. To keep the example short and simple, only four sentences are provided +as examples. In practice, you'll need many more — a few hundred would be a +good start. You will also likely need to mix in examples of other entity +types, which might be obtained by running the entity recognizer over unlabelled +sentences, and adding their annotations to the training set. + +The actual training is performed by looping over the examples, and calling +`nlp.entity.update()`. The `update()` method steps through the words of the +input. At each word, it makes a prediction. It then consults the annotations +provided on the GoldParse instance, to see whether it was right. If it was +wrong, it adjusts its weights so that the correct action will score higher +next time. + +After training your model, you can save it to a directory. We recommend +wrapping models as Python packages, for ease of deployment. + +For more details, see the documentation: +* Training the Named Entity Recognizer: https://spacy.io/docs/usage/train-ner +* Saving and loading models: https://spacy.io/docs/usage/saving-loading + +Developed for: spaCy 1.7.6 +Last tested for: spaCy 1.7.6 +""" +# coding: utf8 from __future__ import unicode_literals, print_function import random From 6145b7c15334f43d093c6bcd7fb2dd145ec0df98 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 20:53:25 +0200 Subject: [PATCH 59/88] Remove redundant Path --- spacy/cli/link.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/link.py b/spacy/cli/link.py index 781adda2c..e5d590e5a 100644 --- a/spacy/cli/link.py +++ b/spacy/cli/link.py @@ -29,7 +29,7 @@ def link_package(package_name, link_name, force=False): def symlink(model_path, link_name, force): model_path = Path(model_path) - if not Path(model_path).exists(): + if not model_path.exists(): util.sys_exit( "The data should be located in {p}".format(p=model_path), title="Can't locate model data") From 4931c56afc7ec607b76d312b28ee4ea00cb77002 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 16 Apr 2017 13:59:38 -0500 Subject: [PATCH 60/88] Increment version --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index ecdbc1fef..b9b7444a8 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -3,7 +3,7 @@ # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py __title__ = 'spacy' -__version__ = '1.7.5' +__version__ = '1.7.6' __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython' __uri__ = 'https://spacy.io' __author__ = 'Matthew Honnibal' From 13d30b6c01d25472f9b222be3788ef9e1a8ce9e3 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 21:18:39 +0200 Subject: [PATCH 61/88] xfail lemmatizer test that's causing problems (see #546) --- 
spacy/tests/tagger/test_lemmatizer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/tests/tagger/test_lemmatizer.py b/spacy/tests/tagger/test_lemmatizer.py index 3e2933fcd..5db0d0b2c 100644 --- a/spacy/tests/tagger/test_lemmatizer.py +++ b/spacy/tests/tagger/test_lemmatizer.py @@ -16,6 +16,7 @@ def test_tagger_lemmatizer_noun_lemmas(lemmatizer, text, lemmas): assert lemmatizer.noun(text) == set(lemmas) +@pytest.mark.xfail @pytest.mark.models def test_tagger_lemmatizer_base_forms(lemmatizer): if lemmatizer is None: From 5c5f8c0a72f43ba2139185786b4e08884096b8fa Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 22:14:38 +0200 Subject: [PATCH 62/88] Check if full string is found in lang classes first This allows users to set arbitrary strings. (Otherwise, custom lang class "my_custom_class" would always load Burmese "my" tokenizer if one was available.) --- spacy/util.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/util.py b/spacy/util.py index 573489682..f807dae9e 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -20,9 +20,11 @@ def set_lang_class(name, cls): def get_lang_class(name): + if name in LANGUAGES: + return LANGUAGES[name] lang = re.split('[^a-zA-Z0-9]', name, 1)[0] if lang not in LANGUAGES: - raise RuntimeError('Language not supported: %s' % lang) + raise RuntimeError('Language not supported: %s' % name) return LANGUAGES[lang] From 97647c46cdbd623e34da4c162ceecd4b97b0946e Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 22:14:45 +0200 Subject: [PATCH 63/88] Add docstring and todo note --- spacy/util.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/spacy/util.py b/spacy/util.py index f807dae9e..0ccdfbd72 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -107,6 +107,13 @@ def read_json(location): def parse_package_meta(package_path, package, require=True): + """ + Check if a meta.json exists in a package and return its contents as a + dictionary. If require is set to True, raise an error if no meta.json found. + """ + # TODO: Allow passing in full model path and only require one argument + # instead of path and package name. This lets us avoid passing in an awkward + # empty string in spacy.load() if user supplies full model path. location = package_path / package / 'meta.json' if location.is_file(): return read_json(location) From ad168ba88c8d18b3755f3c49ced4cb6c34248fc7 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 22:15:51 +0200 Subject: [PATCH 64/88] Set model name to empty string if path override exists Required for parse_package_meta, which composes path of data_path and model_name (needs to be fixed in the future) --- spacy/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/__init__.py b/spacy/__init__.py index 06e9374ea..22f406771 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -30,6 +30,7 @@ def load(name, **overrides): else: model_path = util.ensure_path(overrides['path']) data_path = model_path.parent + model_name = '' meta = util.parse_package_meta(data_path, model_name, require=False) lang = meta['lang'] if meta and 'lang' in meta else name cls = util.get_lang_class(lang) From 5610fdcc064877174f383654b615c1c97b2ff96d Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 22:16:47 +0200 Subject: [PATCH 65/88] Get language name first if no model path exists Makes sure spaCy fails early if no tokenizer exists, and allows printing better error message. 
--- spacy/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index 22f406771..efd6c00c0 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -23,9 +23,10 @@ def load(name, **overrides): model_name = resolve_model_name(name) model_path = data_path / model_name if not model_path.exists(): + lang_name = util.get_lang_class(name).lang model_path = None util.print_msg( - "Only loading the '{}' tokenizer.".format(name), + "Only loading the '{}' tokenizer.".format(lang_name), title="Warning: no model found for '{}'".format(name)) else: model_path = util.ensure_path(overrides['path']) From 17c9fffb9e1a5318cc9e5de320f1ea38d51ee1a9 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 16 Apr 2017 15:28:16 -0500 Subject: [PATCH 66/88] Fix naked except --- spacy/cli/link.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/cli/link.py b/spacy/cli/link.py index e5d590e5a..f2d2fd436 100644 --- a/spacy/cli/link.py +++ b/spacy/cli/link.py @@ -48,7 +48,7 @@ def symlink(model_path, link_name, force): except: # This is quite dirty, but just making sure other errors are caught so # users at least see a proper message. - util.sys_exit( + util.print_msg( "Creating a symlink in spacy/data failed. Make sure you have the " "required permissions and try re-running the command as admin, or " "use a virtualenv to install spaCy in a user directory, instead of " @@ -57,6 +57,7 @@ def symlink(model_path, link_name, force): "load() method, or create the symlink manually:", "{a} --> {b}".format(a=unicode_(model_path), b=unicode_(link_path)), title="Error: Couldn't link model to '{l}'".format(l=link_name)) + raise util.print_msg( "{a} --> {b}".format(a=model_path.as_posix(), b=link_path.as_posix()), From 4efd6fb9d60f2d36a1f9ae6892c2d6bd4021140c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 16 Apr 2017 15:28:27 -0500 Subject: [PATCH 67/88] Fix training --- spacy/language.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 47408921c..f47b1d0cc 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -239,8 +239,7 @@ class Language(object): self.pipeline = self.Defaults.create_pipeline(self) yield Trainer(self, gold_tuples) self.end_training() - self.save_to_directory(path, deps=self.parser.cfg, ner=self.entity.cfg, - pos=self.tagger.cfg) + self.save_to_directory(path) def __init__(self, **overrides): if 'data_dir' in overrides and 'path' not in overrides: From de5062711b5b76d481bf9360a9cad99adf5f080b Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 22:26:24 +0200 Subject: [PATCH 68/88] Update adding languages workflow to reflect changes in __init__.py --- website/docs/usage/adding-languages.jade | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/website/docs/usage/adding-languages.jade b/website/docs/usage/adding-languages.jade index d1541bc87..0c98cc5ca 100644 --- a/website/docs/usage/adding-languages.jade +++ b/website/docs/usage/adding-languages.jade @@ -63,14 +63,16 @@ p tag_map = TAG_MAP stop_words = STOP_WORDS -p Additionally, the new #[code Language] class needs to be registered in #[+src(gh("spaCy", "spacy/__init__.py")) spacy/__init__.py] using the #[code set_lang_class()] function, so that you can use #[code spacy.load()]. +p + | Additionally, the new #[code Language] class needs to be added to the + | list of available languages in #[+src(gh("spaCy", "spacy/__init__.py")) __init__.py]. 
+ | The languages are then registered using the #[code set_lang_class()] function. +code("spacy/__init__.py"). from . import en from . import xx - set_lang_class(en.English.lang, en.English) - set_lang_class(xx.Xxxxx.lang, xx.Xxxxx) + _languages = (en.English, ..., xx.Xxxxx) p You'll also need to list the new package in #[+src(gh("spaCy", "spacy/setup.py")) setup.py]: From 16a8521efa4275fc8f87c28e00f06745e5cd40b4 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 22:38:38 +0200 Subject: [PATCH 69/88] Increment version --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index b9b7444a8..5e438c7af 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -3,7 +3,7 @@ # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py __title__ = 'spacy' -__version__ = '1.7.6' +__version__ = '1.8.0' __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython' __uri__ = 'https://spacy.io' __author__ = 'Matthew Honnibal' From 02e7512b914316a5f96ed2eb1a65bcf4a3f6281a Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 22:39:58 +0200 Subject: [PATCH 70/88] Increment version --- website/_harp.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/_harp.json b/website/_harp.json index d117f74af..03fcbb956 100644 --- a/website/_harp.json +++ b/website/_harp.json @@ -12,7 +12,7 @@ "COMPANY_URL": "https://explosion.ai", "DEMOS_URL": "https://demos.explosion.ai", - "SPACY_VERSION": "1.7", + "SPACY_VERSION": "1.8", "LATEST_NEWS": { "url": "https://survey.spacy.io/", "title": "Take the spaCy user survey and help us improve the library!" From db7e046faa693ff759afb8449e720043157617c9 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 23:23:59 +0200 Subject: [PATCH 71/88] Update version --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index b9cd1d5ce..67a6a6c66 100644 --- a/README.rst +++ b/README.rst @@ -10,7 +10,7 @@ open-source software, released under the MIT license. 📊 **Help us improve the library!** `Take the spaCy user survey `_. -💫 **Version 1.7 out now!** `Read the release notes here. `_ +💫 **Version 1.8 out now!** `Read the release notes here. `_ .. image:: https://img.shields.io/travis/explosion/spaCy/master.svg?style=flat-square :target: https://travis-ci.org/explosion/spaCy From cffaf521520c7bf3643475081539713831f49acd Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 16 Apr 2017 23:34:14 +0200 Subject: [PATCH 72/88] Update README.rst --- README.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.rst b/README.rst index 67a6a6c66..d08860fb8 100644 --- a/README.rst +++ b/README.rst @@ -320,6 +320,7 @@ and ``--model`` are optional and enable additional tests: =========== ============== =========== Version Date Description =========== ============== =========== +`v1.8.0`_ ``2017-04-16`` Better NER training, saving and loading `v1.7.5`_ ``2017-04-07`` Bug fixes and new CLI commands `v1.7.3`_ ``2017-03-26`` Alpha support for Hebrew, new CLI commands and bug fixes `v1.7.2`_ ``2017-03-20`` Small fixes to beam parser and model linking @@ -350,6 +351,7 @@ Version Date Description `v0.93`_ ``2015-09-22`` Bug fixes to word vectors =========== ============== =========== +.. _v1.8.0: https://github.com/explosion/spaCy/releases/tag/v1.8.0 .. _v1.7.5: https://github.com/explosion/spaCy/releases/tag/v1.7.5 .. _v1.7.3: https://github.com/explosion/spaCy/releases/tag/v1.7.3 .. 
_v1.7.2: https://github.com/explosion/spaCy/releases/tag/v1.7.2 From 734b0a4e4ae0634a24e8f44303d7bccd49fbc31f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 16 Apr 2017 23:42:16 +0200 Subject: [PATCH 73/88] Update train_new_entity_type.py --- examples/training/train_new_entity_type.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py index ef4070153..d5d9492f1 100644 --- a/examples/training/train_new_entity_type.py +++ b/examples/training/train_new_entity_type.py @@ -1,6 +1,6 @@ #!/usr/bin/env python """ -Example of training and additional entity type +Example of training an additional entity type This script shows how to add a new entity type to an existing pre-trained NER model. To keep the example short and simple, only four sentences are provided From e7ae3b7cc20ea29d02aadac6bc635102f00c4bc9 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 16 Apr 2017 23:56:12 +0200 Subject: [PATCH 74/88] Fix formatting and typo (closes #967) --- examples/pos_tag.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/examples/pos_tag.py b/examples/pos_tag.py index c61d29636..1dd6add0f 100644 --- a/examples/pos_tag.py +++ b/examples/pos_tag.py @@ -1,7 +1,8 @@ -'''Print part-of-speech tagged, true-cased, (very roughly) sentence-separated +""" +Print part-of-speech tagged, true-cased, (very roughly) sentence-separated text, with each "sentence" on a newline, and spaces between tokens. Supports multi-processing. -''' +""" from __future__ import print_function, unicode_literals, division import io import bz2 @@ -22,14 +23,14 @@ def parallelize(func, iterator, n_jobs, extra): def iter_texts_from_json_bz2(loc): - ''' + """ Iterator of unicode strings, one per document (here, a comment). Expects a a path to a BZ2 file, which should be new-line delimited JSON. The document text should be in a string field titled 'body'. This is the data format of the Reddit comments corpus. 
- ''' + """ with bz2.BZ2File(loc) as file_: for i, line in enumerate(file_): yield ujson.loads(line)['body'] @@ -80,7 +81,7 @@ def is_sent_begin(word): def main(in_loc, out_dir, n_workers=4, batch_size=100000): if not path.exists(out_dir): path.join(out_dir) - texts = partition(batch_size, iter_texts(in_loc)) + texts = partition(batch_size, iter_texts_from_json_bz2(in_loc)) parallelize(transform_texts, enumerate(texts), n_workers, [out_dir]) From c6c3162c5031933fc26932668772a501fc7869de Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 17 Apr 2017 00:00:27 +0200 Subject: [PATCH 75/88] Fix lightning tour example (closes #889) --- website/docs/usage/lightning-tour.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/lightning-tour.jade b/website/docs/usage/lightning-tour.jade index 31982d516..967d0c61e 100644 --- a/website/docs/usage/lightning-tour.jade +++ b/website/docs/usage/lightning-tour.jade @@ -137,7 +137,7 @@ p return word.ent_type != 0 def count_parent_verb_by_person(docs): - counts = defaultdict(defaultdict(int)) + counts = defaultdict(lambda: defaultdict(int)) for doc in docs: for ent in doc.ents: if ent.label_ == 'PERSON' and ent.root.head.pos == VERB: From 82f5f1f98fe572910b5c5c1762ed73ac8ba677e6 Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 17 Apr 2017 01:29:54 +0200 Subject: [PATCH 76/88] Replace str with compat.unicode_ --- spacy/cli/info.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/spacy/cli/info.py b/spacy/cli/info.py index 50844257f..1d1bc2f51 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import platform from pathlib import Path +from ..compat import unicode_ from .. import about from .. import util @@ -13,10 +14,10 @@ def info(model=None, markdown=False): data = util.parse_package_meta(util.get_data_path(), model, require=True) model_path = Path(__file__).parent / util.get_data_path() / model if model_path.resolve() != model_path: - data['link'] = str(model_path) - data['source'] = str(model_path.resolve()) + data['link'] = unicode_(model_path) + data['source'] = unicode_(model_path.resolve()) else: - data['source'] = str(model_path) + data['source'] = unicode_(model_path) print_info(data, "model " + model, markdown) else: data = get_spacy_data() @@ -34,7 +35,7 @@ def print_info(data, title, markdown): def get_spacy_data(): return { 'spaCy version': about.__version__, - 'Location': str(Path(__file__).parent.parent), + 'Location': unicode_(Path(__file__).parent.parent), 'Platform': platform.platform(), 'Python version': platform.python_version(), 'Installed models': ', '.join(list_models()) From e2299dc389bbf84ee1bd56edc23202ec5f9249e2 Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 17 Apr 2017 01:40:14 +0200 Subject: [PATCH 77/88] Ensure path in save_to_directory --- spacy/language.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/language.py b/spacy/language.py index f47b1d0cc..b356f4d8c 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -350,6 +350,7 @@ class Language(object): 'ner': self.entity.cfg if self.entity else {}, } + path = util.ensure_path(path) self.setup_directory(path, **configs) strings_loc = path / 'vocab' / 'strings.json' From 8e83f8e2fabef373faece4802737567b2768a357 Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 17 Apr 2017 01:40:26 +0200 Subject: [PATCH 78/88] Update docstrings --- spacy/language.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/spacy/language.py 
b/spacy/language.py index b356f4d8c..de97f7a63 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -188,6 +188,9 @@ class Language(object): @classmethod def setup_directory(cls, path, **configs): + """ + Initialise a model directory. + """ for name, config in configs.items(): directory = path / name if directory.exists(): @@ -295,7 +298,7 @@ class Language(object): and can contain arbtrary whitespace. Alignment into the original string is preserved. - Args: + Argsuments: text (unicode): The text to be processed. Returns: @@ -344,6 +347,12 @@ class Language(object): yield doc def save_to_directory(self, path): + """ + Save the Vocab, StringStore and pipeline to a directory. + + Arguments: + path (string or pathlib path): Path to save the model. + """ configs = { 'pos': self.tagger.cfg if self.tagger else {}, 'deps': self.parser.cfg if self.parser else {}, From aad80a291f481fdbc75f9def859f146d2921da81 Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 17 Apr 2017 01:40:34 +0200 Subject: [PATCH 79/88] Add save_to_directory method to API docs --- website/docs/api/language.jade | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/website/docs/api/language.jade b/website/docs/api/language.jade index 93e7ff213..e221b9142 100644 --- a/website/docs/api/language.jade +++ b/website/docs/api/language.jade @@ -136,3 +136,19 @@ p +cell yield +cell #[code Doc] +cell Containers for accessing the linguistic annotations. + ++h(2, "save_to_directory") Language.save_to_directory + +tag method + +p Save the #[code Vocab], #[code StringStore] and pipeline to a directory. + ++table(["Name", "Type", "Description"]) + +row + +cell #[code path] + +cell string or pathlib path + +cell Path to save the model. + + +footrow + +cell return + +cell #[code None] + +cell - From 7f776258f06bc88e908f725c1b0cf7b88ad4c73c Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 17 Apr 2017 01:41:46 +0200 Subject: [PATCH 80/88] Add link to API docs --- website/docs/usage/saving-loading.jade | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/website/docs/usage/saving-loading.jade b/website/docs/usage/saving-loading.jade index 063c5dc50..9fa23aaa7 100644 --- a/website/docs/usage/saving-loading.jade +++ b/website/docs/usage/saving-loading.jade @@ -2,7 +2,8 @@ include ../../_includes/_mixins p | After training your model, you'll usually want to save its state, and load - | it back later. You can do this with the #[code Language.save_to_directory()] + | it back later. You can do this with the + | #[+api("language#save_to_directory") #[code Language.save_to_directory()]] | method: +code. From 2ab394d655677d56dcd508fdd520598c3b25093b Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 17 Apr 2017 01:45:00 +0200 Subject: [PATCH 81/88] Fix whitespace --- website/docs/usage/saving-loading.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/saving-loading.jade b/website/docs/usage/saving-loading.jade index 9fa23aaa7..a3edfce50 100644 --- a/website/docs/usage/saving-loading.jade +++ b/website/docs/usage/saving-loading.jade @@ -24,7 +24,7 @@ p | there can be a convenient solution to share them with your team. p - | spaCy comes with a handy CLI command that will create all required files, + | spaCy comes with a handy CLI command that will create all required files, | and walk you through generating the meta data. You can also create the | meta.json manually and place it in the model data directory, or supply a | path to it using the #[code --meta] flag. 
For more info on this, see the From f62b740961dbd7635206c2d93c54d2fe952d5822 Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 17 Apr 2017 01:46:14 +0200 Subject: [PATCH 82/88] Use compat.json_dumps --- spacy/language.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index de97f7a63..9ce90fdef 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -2,7 +2,6 @@ from __future__ import absolute_import, unicode_literals from contextlib import contextmanager import shutil -import ujson from .tokenizer import Tokenizer from .vocab import Vocab @@ -15,7 +14,7 @@ from .syntax.nonproj import PseudoProjectivity from .pipeline import DependencyParser, EntityRecognizer from .syntax.arc_eager import ArcEager from .syntax.ner import BiluoPushDown -from .compat import unicode_ +from .compat import json_dumps from .attrs import IS_STOP from . import attrs from . import orth @@ -197,9 +196,7 @@ class Language(object): shutil.rmtree(str(directory)) directory.mkdir() with (directory / 'config.json').open('wb') as file_: - data = ujson.dumps(config, indent=2) - if isinstance(data, unicode_): - data = data.encode('utf8') + data = json_dumps(config) file_.write(data) if not (path / 'vocab').exists(): (path / 'vocab').mkdir() From ddd5194088dbc229de0caf4f2f5128f8c974f5ee Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 17 Apr 2017 01:52:13 +0200 Subject: [PATCH 83/88] Update Language docs and docstrings --- spacy/language.py | 9 +++++++++ website/docs/api/language.jade | 4 ++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 9ce90fdef..854b0ebeb 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -242,6 +242,15 @@ class Language(object): self.save_to_directory(path) def __init__(self, **overrides): + """ + Create or load the pipeline. + + Arguments: + **overrides: Keyword arguments indicating which defaults to override. + + Returns: + Language: The newly constructed object. + """ if 'data_dir' in overrides and 'path' not in overrides: raise ValueError("The argument 'data_dir' has been renamed to 'path'") path = util.ensure_path(overrides.get('path', True)) diff --git a/website/docs/api/language.jade b/website/docs/api/language.jade index e221b9142..d7090c870 100644 --- a/website/docs/api/language.jade +++ b/website/docs/api/language.jade @@ -55,14 +55,14 @@ p Create or load the pipeline. +table(["Name", "Type", "Description"]) +row - +cell #[code **kwrags] + +cell #[code **overrides] +cell - +cell Keyword arguments indicating which defaults to override. +footrow +cell return +cell #[code Language] - +cell #[code self] + +cell The newly constructed object. 
+h(2, "call") Language.__call__ +tag method From 2f84626417b339b42fd1485b53abff8858a7ded3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Tue, 18 Apr 2017 13:47:36 +0200 Subject: [PATCH 84/88] Fix train_new_entity_type example --- examples/training/train_new_entity_type.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py index cbe2963d3..23cb86596 100644 --- a/examples/training/train_new_entity_type.py +++ b/examples/training/train_new_entity_type.py @@ -4,7 +4,6 @@ import random from pathlib import Path import spacy -from spacy.pipeline import EntityRecognizer from spacy.gold import GoldParse from spacy.tagger import Tagger @@ -25,10 +24,13 @@ def train_ner(nlp, train_data, output_dir): loss = nlp.entity.update(doc, gold) nlp.end_training() if output_dir: + if not output_dir.exists(): + output_dir.mkdir() nlp.save_to_directory(output_dir) def main(model_name, output_directory=None): + print("Loading initial model", model_name) nlp = spacy.load(model_name) if output_directory is not None: output_directory = Path(output_directory) @@ -52,13 +54,14 @@ def main(model_name, output_directory=None): ) ] nlp.entity.add_label('ANIMAL') - ner = train_ner(nlp, train_data, output_directory) + train_ner(nlp, train_data, output_directory) # Test that the entity is recognized doc = nlp('Do you like horses?') for ent in doc.ents: print(ent.label_, ent.text) if output_directory: + print("Loading from", output_directory) nlp2 = spacy.load('en', path=output_directory) nlp2.entity.add_label('ANIMAL') doc2 = nlp2('Do you like horses?') From 48da244058eac217aed80d59583fb77cc276bd96 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 19 Apr 2017 11:50:33 +0200 Subject: [PATCH 85/88] Use spacy.compat.json_dumps for Python 2/3 compatibility (resolves #991) --- spacy/cli/converters/conllu2json.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py index 3c5ebb0e4..cf473b4a0 100644 --- a/spacy/cli/converters/conllu2json.py +++ b/spacy/cli/converters/conllu2json.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import json +from ...compat import json_dumps from ... import util @@ -29,7 +30,8 @@ def conllu2json(input_path, output_path, n_sents=10, use_morphology=False): output_filename = input_path.parts[-1].replace(".conllu", ".json") output_file = output_path / output_filename - json.dump(docs, output_file.open('w', encoding='utf-8'), indent=2) + with output_file.open('w', encoding='utf-8') as f: + f.write(json_dumps(docs)) util.print_msg("Created {} documents".format(len(docs)), title="Generated output file {}".format(output_file)) From b763e9b66d1d578b71397c17888f16e9ac485194 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 19 Apr 2017 12:00:12 +0200 Subject: [PATCH 86/88] Add note about variable naming --- CONTRIBUTING.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 327f3d58e..c8bed34ed 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -95,6 +95,8 @@ At the time of writing (v1.7), spaCy's serialization and deserialization functio Although spaCy uses a lot of classes, inheritance is viewed with some suspicion — it's seen as a mechanism of last resort. You should discuss plans to extend the class hierarchy before implementing. +We have a number of conventions around variable naming that are still being documented, and aren't 100% strict. 
A general policy is that instances of the class `Doc` should by default be called `doc`, `Token` `token`, `Lexeme` `lex`, `Vocab` `vocab` and `Language` `nlp`. You should avoid naming variables that are of other types these names. For instance, don't name a text string `doc` --- you should usually call this `text`. Two general code style preferences further help with naming. First, lean away from introducing temporary variables, as these clutter your namespace. This is one reason why comprehension expressions are often preferred. Second, keep your functions shortish, so that can work in a smaller scope. Of course, this is a question of trade-offs. + ### Cython conventions spaCy's core data structures are implemented as [Cython](http://cython.org/) `cdef` classes. Memory is managed through the `cymem.cymem.Pool` class, which allows you to allocate memory which will be freed when the `Pool` object is garbage collected. This means you usually don't have to worry about freeing memory. You just have to decide which Python object owns the memory, and make it own the `Pool`. When that object goes out of scope, the memory will be freed. You do have to take care that no pointers outlive the object that owns them — but this is generally quite easy. From 275fc9f78a9a1e9069a234fbb48da9bbe5e26532 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 19 Apr 2017 12:09:10 +0200 Subject: [PATCH 87/88] Update CONTRIBUTING.md --- CONTRIBUTING.md | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c8bed34ed..f1053405e 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -87,7 +87,16 @@ Code should loosely follow [pep8](https://www.python.org/dev/peps/pep-0008/). Re ### Python conventions -All Python code must be written in an **intersection of Python 2 and Python 3**. This is easy in Cython, but somewhat ugly in Python. We could use some extra utilities for this. Please pay particular attention to code that serialises json objects. +All Python code must be written in an **intersection of Python 2 and Python 3**. This is easy in Cython, but somewhat ugly in Python. Logic that deals with Python or platform compatibility should only live in [`spacy.compat`](spacy/compat.py). To distinguish them from the builtin functions, replacement functions are suffixed with an undersocre, for example `unicode_`. If you need to access the user's version or platform information, for example to show more specific error messages, you can use the `is_config()` helper function. + +```python +from .compat import unicode_, json_dumps, is_config + +compatible_unicode = unicode_('hello world') +compatible_json = json_dumps({'key': 'value'}) +if is_config(windows=True, python2=True): + print("You are using Python 2 on Windows.") +``` Code that interacts with the file-system should accept objects that follow the `pathlib.Path` API, without assuming that the object inherits from `pathlib.Path`. If the function is user-facing and takes a path as an argument, it should check whether the path is provided as a string. Strings should be converted to `pathlib.Path` objects. @@ -95,7 +104,7 @@ At the time of writing (v1.7), spaCy's serialization and deserialization functio Although spaCy uses a lot of classes, inheritance is viewed with some suspicion — it's seen as a mechanism of last resort. You should discuss plans to extend the class hierarchy before implementing. 
-We have a number of conventions around variable naming that are still being documented, and aren't 100% strict. A general policy is that instances of the class `Doc` should by default be called `doc`, `Token` `token`, `Lexeme` `lex`, `Vocab` `vocab` and `Language` `nlp`. You should avoid naming variables that are of other types these names. For instance, don't name a text string `doc` --- you should usually call this `text`. Two general code style preferences further help with naming. First, lean away from introducing temporary variables, as these clutter your namespace. This is one reason why comprehension expressions are often preferred. Second, keep your functions shortish, so that can work in a smaller scope. Of course, this is a question of trade-offs. +We have a number of conventions around variable naming that are still being documented, and aren't 100% strict. A general policy is that instances of the class `Doc` should by default be called `doc`, `Token` `token`, `Lexeme` `lex`, `Vocab` `vocab` and `Language` `nlp`. You should avoid naming variables that are of other types these names. For instance, don't name a text string `doc` — you should usually call this `text`. Two general code style preferences further help with naming. First, lean away from introducing temporary variables, as these clutter your namespace. This is one reason why comprehension expressions are often preferred. Second, keep your functions shortish, so that can work in a smaller scope. Of course, this is a question of trade-offs. ### Cython conventions @@ -128,7 +137,7 @@ cdef int c_total(const int* int_array, int length) nogil: return total ``` -If this is confusing, consider that the compiler couldn't deal with `for item in int_array:` — there's no length attached to a raw pointer, so how could we figure out where to stop? The length is provided in the slice notation as a solution to this. Note that we don't have to declare the type of `item` in the code above -- the compiler can easily infer it. This gives us tidy code that looks quite like Python, but is exactly as fast as C — because we've made sure the compilation to C is trivial. +If this is confusing, consider that the compiler couldn't deal with `for item in int_array:` — there's no length attached to a raw pointer, so how could we figure out where to stop? The length is provided in the slice notation as a solution to this. Note that we don't have to declare the type of `item` in the code above — the compiler can easily infer it. This gives us tidy code that looks quite like Python, but is exactly as fast as C — because we've made sure the compilation to C is trivial. Your functions cannot be declared `nogil` if they need to create Python objects or call Python functions. This is perfectly okay — you shouldn't torture your code just to get `nogil` functions. However, if your function isn't `nogil`, you should compile your module with `cython -a --cplus my_module.pyx` and open the resulting `my_module.html` file in a browser. This will let you see how Cython is compiling your code. Calls into the Python run-time will be in bright yellow. This lets you easily see whether Cython is able to correctly type your code, or whether there are unexpected problems. 
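To make the naming policy in the CONTRIBUTING.md changes above concrete, the following sketch shows the preferred name for each object type and the preference for comprehensions over temporary variables. It is an illustration only, not a snippet from the spaCy codebase; the model name `'en'` and the example text are arbitrary.

```python
import spacy

nlp = spacy.load('en')                   # a Language instance is called `nlp`
text = u'spaCy is written in Cython.'    # a plain string is `text`, never `doc`
doc = nlp(text)                          # a Doc instance is called `doc`
vocab = doc.vocab                        # a Vocab instance is called `vocab`
lex = vocab[u'Cython']                   # a Lexeme instance is called `lex`

# Prefer a comprehension over a temporary accumulator variable.
token_texts = [token.text for token in doc]   # each Token is called `token`
```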
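The same guidelines ask that file-system code accept any object following the `pathlib.Path` API and convert plain strings to `Path` objects. Below is a minimal sketch of that convention; the helper name and its exact behaviour are illustrative assumptions, not necessarily how `spacy.util.ensure_path` is actually implemented.

```python
from pathlib import Path

def ensure_path(path):
    """Convert plain strings to pathlib.Path; leave Path-like objects untouched."""
    # Sketch only: on Python 2 you would also need to accept `unicode`,
    # e.g. via a compat alias, to stay in the Python 2/3 intersection.
    if isinstance(path, str):
        return Path(path)
    return path

model_dir = ensure_path('/tmp/my_model')        # str is converted to Path
model_dir = ensure_path(Path('/tmp/my_model'))  # already Path-like, returned unchanged
```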
From 2bd89e7ade7bfb01a82380f65caa30d1d9fc9fb2 Mon Sep 17 00:00:00 2001 From: ines Date: Wed, 19 Apr 2017 19:28:00 +0200 Subject: [PATCH 88/88] Tidy up Hebrew tests and test for punctuation (see #995) --- spacy/tests/he/test_tokenizer.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/spacy/tests/he/test_tokenizer.py b/spacy/tests/he/test_tokenizer.py index a6c65805a..c2504a0e7 100644 --- a/spacy/tests/he/test_tokenizer.py +++ b/spacy/tests/he/test_tokenizer.py @@ -3,15 +3,21 @@ from __future__ import unicode_literals import pytest -ABBREVIATION_TESTS = [ - ('פייתון היא שפת תכנות דינמית', ['פייתון', 'היא', 'שפת', 'תכנות', 'דינמית']) -] -TESTCASES = ABBREVIATION_TESTS - - -@pytest.mark.parametrize('text,expected_tokens', TESTCASES) -def test_tokenizer_handles_testcases(he_tokenizer, text, expected_tokens): +@pytest.mark.parametrize('text,expected_tokens', + [('פייתון היא שפת תכנות דינמית', ['פייתון', 'היא', 'שפת', 'תכנות', 'דינמית'])]) +def test_tokenizer_handles_abbreviation(he_tokenizer, text, expected_tokens): tokens = he_tokenizer(text) token_list = [token.text for token in tokens if not token.is_space] - assert expected_tokens == token_list \ No newline at end of file + assert expected_tokens == token_list + + +@pytest.mark.parametrize('text,expected_tokens', [ + pytest.mark.xfail(('עקבת אחריו בכל רחבי המדינה.', ['עקבת', 'אחריו', 'בכל', 'רחבי', 'המדינה', '.'])), + ('עקבת אחריו בכל רחבי המדינה?', ['עקבת', 'אחריו', 'בכל', 'רחבי', 'המדינה', '?']), + ('עקבת אחריו בכל רחבי המדינה!', ['עקבת', 'אחריו', 'בכל', 'רחבי', 'המדינה', '!']), + ('עקבת אחריו בכל רחבי המדינה..', ['עקבת', 'אחריו', 'בכל', 'רחבי', 'המדינה', '..']), + ('עקבת אחריו בכל רחבי המדינה...', ['עקבת', 'אחריו', 'בכל', 'רחבי', 'המדינה', '...'])]) +def test_tokenizer_handles_punct(he_tokenizer, text, expected_tokens): + tokens = he_tokenizer(text) + assert expected_tokens == [token.text for token in tokens]
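The punctuation tests added above wrap the expected-failure case in `pytest.mark.xfail(...)` directly inside the `parametrize` list, which is the idiom pytest supported at the time. For reference, here is a sketch of the same case written with the newer `pytest.param` API (assumed to be available in pytest 3.1 and later); behaviour should be equivalent, and the remaining cases stay as plain tuples.

```python
import pytest

@pytest.mark.parametrize('text,expected_tokens', [
    # Marked xfail in the patch above: the sentence-final period case does not pass yet.
    pytest.param('עקבת אחריו בכל רחבי המדינה.',
                 ['עקבת', 'אחריו', 'בכל', 'רחבי', 'המדינה', '.'],
                 marks=pytest.mark.xfail),
    ('עקבת אחריו בכל רחבי המדינה?',
     ['עקבת', 'אחריו', 'בכל', 'רחבי', 'המדינה', '?'])])
def test_tokenizer_handles_punct(he_tokenizer, text, expected_tokens):
    tokens = he_tokenizer(text)
    assert expected_tokens == [token.text for token in tokens]
```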