From 98c29b79127ae62e9d8b69d9513cdded7a81ceb2 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 4 Nov 2017 00:23:23 +0100 Subject: [PATCH 1/9] Add padding vector in parser, to make gradient more correct --- spacy/syntax/nn_parser.pyx | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 68301238d..554addd53 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -532,7 +532,9 @@ cdef class Parser: return None backprops = [] - d_tokvecs = state2vec.ops.allocate(tokvecs.shape) + # Add a padding vector to the d_tokvecs gradient, so that missing + # values don't affect the real gradient. + d_tokvecs = state2vec.ops.allocate((tokvecs.shape[0]+1, tokvecs.shape[1])) cdef float loss = 0. n_steps = 0 while todo: @@ -615,7 +617,9 @@ cdef class Parser: bp_vectors)) else: backprop_lower.append((ids, d_vector, bp_vectors)) - d_tokvecs = self.model[0].ops.allocate(tokvecs.shape) + # Add a padding vector to the d_tokvecs gradient, so that missing + # values don't affect the real gradient. 
+ d_tokvecs = state2vec.ops.allocate((tokvecs.shape[0]+1, tokvecs.shape[1])) self._make_updates(d_tokvecs, bp_tokvecs, backprop_lower, sgd, cuda_stream) @@ -668,7 +672,8 @@ cdef class Parser: (ids.size, d_state_features.shape[2])) self.model[0].ops.scatter_add(d_tokvecs, ids, d_state_features) - bp_tokvecs(d_tokvecs, sgd=sgd) + # Padded -- see update() + bp_tokvecs(d_tokvecs[:-1], sgd=sgd) @property def move_names(self): From e4ec4be9485c2293fd15e3deb4fe27f6bb72d334 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 4 Nov 2017 00:23:45 +0100 Subject: [PATCH 2/9] Fix parser test --- spacy/tests/test_misc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py index 5c69dae3e..fa571ce90 100644 --- a/spacy/tests/test_misc.py +++ b/spacy/tests/test_misc.py @@ -69,7 +69,7 @@ def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2): Y, get_dX = model.begin_update(tensor) assert Y.shape == (tensor.shape[0]+1, nF, nO, nP) assert model.d_pad.shape == (1, nF, nO, nP) - dY = model.ops.allocate((15, nF, nO, nP)) + dY = model.ops.allocate((15, nO, nP)) ids = model.ops.allocate((15, nF)) ids[1,2] = -1 dY[1,2] = 1 From a2162b89086aeecf8b92891ca516f40fc666efb1 Mon Sep 17 00:00:00 2001 From: uwol Date: Sun, 5 Nov 2017 12:25:10 +0100 Subject: [PATCH 3/9] tensorizer return parameter fix --- spacy/pipeline.pyx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index a159fad50..5a72dc946 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -318,7 +318,7 @@ class Tensorizer(Pipe): loss, d_scores = self.get_loss(docs, golds, scores) d_inputs = bp_scores(d_scores, sgd=sgd) d_inputs = self.model.ops.xp.split(d_inputs, len(self.input_models), axis=1) - for d_input, bp_input in zip(d_inputs, bp_inputs): + for d_input, bp_input in zip(d_inputs, bp_inputs): bp_input(d_input, sgd=sgd) if losses is not None: losses.setdefault(self.name, 0.) 
@@ -777,7 +777,8 @@ class TextCategorizer(Pipe): def predict(self, docs): scores = self.model(docs) scores = self.model.ops.asarray(scores) - return scores + tensors = [doc.tensor for doc in docs] + return scores, tensors def set_annotations(self, docs, scores, tensors=None): for i, doc in enumerate(docs): From 9c9ed7890a57eccd6390632a73f13fc33565b513 Mon Sep 17 00:00:00 2001 From: uwol Date: Sun, 5 Nov 2017 12:33:43 +0100 Subject: [PATCH 4/9] added contributor agreement --- .github/contributors/uwol.md | 106 +++++++++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 .github/contributors/uwol.md diff --git a/.github/contributors/uwol.md b/.github/contributors/uwol.md new file mode 100644 index 000000000..ddc82d220 --- /dev/null +++ b/.github/contributors/uwol.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. 
The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made) will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. 
With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statements below. 
Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Ulrich Wolffgang | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2017-11-05 | +| GitHub username | uwol | +| Website (optional) | https://uwol.github.io/ | From 0d4bd6414e011ff16b9987cf914978e91de91085 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 5 Nov 2017 14:11:03 +0100 Subject: [PATCH 5/9] Fix Italian tag map --- spacy/lang/it/tag_map.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lang/it/tag_map.py b/spacy/lang/it/tag_map.py index ef4fcf1da..420165f24 100644 --- a/spacy/lang/it/tag_map.py +++ b/spacy/lang/it/tag_map.py @@ -316,5 +316,5 @@ TAG_MAP = { "V__VerbForm=Ger": {"pos": "VERB"}, "V__VerbForm=Inf": {"pos": "VERB"}, "X___": {"pos": "X"}, - "_SP": {"pos": "_SP"} + "_SP": {"pos": "SPACE"} } From 00435d8f0cc906878cd6084c78c17cbc5a49b66e Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 5 Nov 2017 14:39:57 +0100 Subject: [PATCH 6/9] Add extra beam parsing test --- spacy/tests/parser/test_beam_parse.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/spacy/tests/parser/test_beam_parse.py b/spacy/tests/parser/test_beam_parse.py index dd77c6805..59e307bcb 100644 --- a/spacy/tests/parser/test_beam_parse.py +++ b/spacy/tests/parser/test_beam_parse.py @@ -2,10 +2,22 @@ from __future__ import unicode_literals import pytest +from ...language import Language +from ...pipeline import DependencyParser @pytest.mark.models('en') -def test_beam_parse(EN): +def test_beam_parse_en(EN): doc = 
EN(u'Australia is a country', disable=['ner']) ents = EN.entity(doc, beam_width=2) print(ents) + + +def test_beam_parse(): + nlp = Language() + nlp.add_pipe(DependencyParser(nlp.vocab), name='parser') + nlp.parser.add_label('nsubj') + nlp.begin_training() + + doc = nlp.make_doc(u'Australia is a country') + nlp.parser(doc, beam_width=2) From 225cc249c920471ff677cb69d8eefa4f289dd7c6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 5 Nov 2017 14:42:46 +0100 Subject: [PATCH 7/9] Pass string path to numpy, to fix #1479 --- spacy/vectors.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index a96913109..8b85bba9c 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -358,7 +358,7 @@ cdef class Vectors: def load_vectors(path): xp = Model.ops.xp if path.exists(): - self.data = xp.load(path) + self.data = xp.load(str(path)) serializers = OrderedDict(( ('key2row', load_key2row), From 6f438b17c1ba27d7122ab53e7dd4633114be382f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 5 Nov 2017 14:43:36 +0100 Subject: [PATCH 8/9] Increment version to v2.0.0a19 --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index 6f029bd9d..e9614eb40 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -3,7 +3,7 @@ # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py __title__ = 'spacy-nightly' -__version__ = '2.0.0a18' +__version__ = '2.0.0a19' __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython' __uri__ = 'https://spacy.io' __author__ = 'Explosion AI' From 2b35bb76addc664d722cff0d00a2cf597610c347 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 5 Nov 2017 15:34:40 +0100 Subject: [PATCH 9/9] Fix tensorizer on GPU --- spacy/pipeline.pyx | 6 +++++- spacy/syntax/nn_parser.pyx | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx 
index 5a72dc946..f3defeeb9 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -415,7 +415,11 @@ class Tagger(Pipe): vocab.morphology.assign_tag_id(&doc.c[j], tag_id) idx += 1 if tensors is not None: - doc.extend_tensor(tensors[i]) + if isinstance(doc.tensor, numpy.ndarray) \ + and not isinstance(tensors[i], numpy.ndarray): + doc.extend_tensor(tensors[i].get()) + else: + doc.extend_tensor(tensors[i]) doc.is_tagged = True def update(self, docs, golds, drop=0., sgd=None, losses=None): diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 6bfd729eb..08b01a88f 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -751,7 +751,11 @@ cdef class Parser: for j in range(doc.length): doc.c[j] = state.c._sent[j] if tensors is not None: - doc.extend_tensor(tensors[i]) + if isinstance(doc.tensor, numpy.ndarray) \ + and not isinstance(tensors[i], numpy.ndarray): + doc.extend_tensor(tensors[i].get()) + else: + doc.extend_tensor(tensors[i]) self.moves.finalize_doc(doc) for hook in self.postprocesses: