diff --git a/.github/contributors/ramananbalakrishnan.md b/.github/contributors/ramananbalakrishnan.md new file mode 100644 index 000000000..804c41f56 --- /dev/null +++ b/.github/contributors/ramananbalakrishnan.md @@ -0,0 +1,106 @@ +# spaCy contributor agreement + +This spaCy Contributor Agreement (**"SCA"**) is based on the +[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf). +The SCA applies to any contribution that you make to any product or project +managed by us (the **"project"**), and sets out the intellectual property rights +you grant to us in the contributed materials. The term **"us"** shall mean +[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term +**"you"** shall mean the person or entity identified below. + +If you agree to be bound by these terms, fill in the information requested +below and include the filled-in version with your first pull request, under the +folder [`.github/contributors/`](/.github/contributors/). The name of the file +should be your GitHub username, with the extension `.md`. For example, the user +example_user would create the file `.github/contributors/example_user.md`. + +Read this agreement carefully before signing. These terms and conditions +constitute a binding legal agreement. + +## Contributor Agreement + +1. The term "contribution" or "contributed materials" means any source code, +object code, patch, tool, sample, graphic, specification, manual, +documentation, or any other material posted or submitted by you to the project. + +2. With respect to any worldwide copyrights, or copyright applications and +registrations, in your contribution: + + * you hereby assign to us joint ownership, and to the extent that such + assignment is or becomes invalid, ineffective or unenforceable, you hereby + grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge, + royalty-free, unrestricted license to exercise all rights under those + copyrights. This includes, at our option, the right to sublicense these same + rights to third parties through multiple levels of sublicensees or other + licensing arrangements; + + * you agree that each of us can do all things in relation to your + contribution as if each of us were the sole owners, and if one of us makes + a derivative work of your contribution, the one who makes the derivative + work (or has it made will be the sole owner of that derivative work; + + * you agree that you will not assert any moral rights in your contribution + against us, our licensees or transferees; + + * you agree that we may register a copyright in your contribution and + exercise all ownership rights associated with it; and + + * you agree that neither of us has any duty to consult with, obtain the + consent of, pay or render an accounting to the other for any use or + distribution of your contribution. + +3. With respect to any patents you own, or that you can license without payment +to any third party, you hereby grant to us a perpetual, irrevocable, +non-exclusive, worldwide, no-charge, royalty-free license to: + + * make, have made, use, sell, offer to sell, import, and otherwise transfer + your contribution in whole or in part, alone or in combination with or + included in any product, work or materials arising out of the project to + which your contribution was submitted, and + + * at our option, to sublicense these same rights to third parties through + multiple levels of sublicensees or other licensing arrangements. + +4. 
Except as set out above, you keep all right, title, and interest in your +contribution. The rights that you grant to us under these terms are effective +on the date you first submitted a contribution to us, even if your submission +took place before the date you sign these terms. + +5. You covenant, represent, warrant and agree that: + + * Each contribution that you submit is and shall be an original work of + authorship and you can legally grant the rights set out in this SCA; + + * to the best of your knowledge, each contribution will not violate any + third party's copyrights, trademarks, patents, or other intellectual + property rights; and + + * each contribution shall be in compliance with U.S. export control laws and + other applicable export and import laws. You agree to notify us if you + become aware of any circumstance which would make any of the foregoing + representations inaccurate in any respect. We may publicly disclose your + participation in the project, including the fact that you have signed the SCA. + +6. This SCA is governed by the laws of the State of California and applicable +U.S. Federal law. Any choice of law rules will not apply. + +7. Please place an “x” on one of the applicable statement below. Please do NOT +mark both statements: + + * [x] I am signing on behalf of myself as an individual and no other person + or entity, including my employer, has or will have rights with respect to my + contributions. + + * [ ] I am signing on behalf of my employer or a legal entity and I have the + actual authority to contractually bind that entity. + +## Contributor Details + +| Field | Entry | +|------------------------------- | -------------------- | +| Name | Ramanan Balakrishnan | +| Company name (if applicable) | | +| Title or role (if applicable) | | +| Date | 2017-10-19 | +| GitHub username | ramananbalakrishnan | +| Website (optional) | | diff --git a/examples/training/train_new_entity_type.py b/examples/training/train_new_entity_type.py index ab69285a6..5f10beebc 100644 --- a/examples/training/train_new_entity_type.py +++ b/examples/training/train_new_entity_type.py @@ -56,8 +56,7 @@ def train_ner(nlp, train_data, output_dir): losses = {} for batch in minibatch(get_gold_parses(nlp.make_doc, train_data), size=3): docs, golds = zip(*batch) - nlp.update(docs, golds, losses=losses, sgd=optimizer, update_shared=True, - drop=0.35) + nlp.update(docs, golds, losses=losses, sgd=optimizer, drop=0.35) print(losses) if not output_dir: return @@ -100,9 +99,10 @@ def main(model_name, output_directory=None): ) ] - nlp.pipeline.append(TokenVectorEncoder(nlp.vocab)) - nlp.pipeline.append(NeuralEntityRecognizer(nlp.vocab)) - nlp.pipeline[-1].add_label('ANIMAL') + nlp.add_pipe(TokenVectorEncoder(nlp.vocab)) + ner = NeuralEntityRecognizer(nlp.vocab) + ner.add_label('ANIMAL') + nlp.add_pipe(ner) train_ner(nlp, train_data, output_directory) # Test that the entity is recognized diff --git a/examples/training/training-data.json b/examples/training/training-data.json new file mode 100644 index 000000000..7737b9a14 --- /dev/null +++ b/examples/training/training-data.json @@ -0,0 +1,641 @@ +[ + { + "id": "wsj_0200", + "paragraphs": [ + { + "raw": "In an Oct. 19 review of \"The Misanthrope\" at Chicago's Goodman Theatre (\"Revitalized Classics Take the Stage in Windy City,\" Leisure & Arts), the role of Celimene, played by Kim Cattrall, was mistakenly attributed to Christina Haag. Ms. 
Haag plays Elianti.", + "sentences": [ + { + "tokens": [ + { + "head": 44, + "dep": "prep", + "tag": "IN", + "orth": "In", + "ner": "O", + "id": 0 + }, + { + "head": 3, + "dep": "det", + "tag": "DT", + "orth": "an", + "ner": "O", + "id": 1 + }, + { + "head": 2, + "dep": "nmod", + "tag": "NNP", + "orth": "Oct.", + "ner": "B-DATE", + "id": 2 + }, + { + "head": -1, + "dep": "nummod", + "tag": "CD", + "orth": "19", + "ner": "L-DATE", + "id": 3 + }, + { + "head": -4, + "dep": "pobj", + "tag": "NN", + "orth": "review", + "ner": "O", + "id": 4 + }, + { + "head": -1, + "dep": "prep", + "tag": "IN", + "orth": "of", + "ner": "O", + "id": 5 + }, + { + "head": 2, + "dep": "punct", + "tag": "``", + "orth": "``", + "ner": "O", + "id": 6 + }, + { + "head": 1, + "dep": "det", + "tag": "DT", + "orth": "The", + "ner": "B-WORK_OF_ART", + "id": 7 + }, + { + "head": -3, + "dep": "pobj", + "tag": "NN", + "orth": "Misanthrope", + "ner": "L-WORK_OF_ART", + "id": 8 + }, + { + "head": -1, + "dep": "punct", + "tag": "''", + "orth": "''", + "ner": "O", + "id": 9 + }, + { + "head": -2, + "dep": "prep", + "tag": "IN", + "orth": "at", + "ner": "O", + "id": 10 + }, + { + "head": 3, + "dep": "poss", + "tag": "NNP", + "orth": "Chicago", + "ner": "U-GPE", + "id": 11 + }, + { + "head": -1, + "dep": "case", + "tag": "POS", + "orth": "'s", + "ner": "O", + "id": 12 + }, + { + "head": 1, + "dep": "compound", + "tag": "NNP", + "orth": "Goodman", + "ner": "B-FAC", + "id": 13 + }, + { + "head": -4, + "dep": "pobj", + "tag": "NNP", + "orth": "Theatre", + "ner": "L-FAC", + "id": 14 + }, + { + "head": 4, + "dep": "punct", + "tag": "-LRB-", + "orth": "(", + "ner": "O", + "id": 15 + }, + { + "head": 3, + "dep": "punct", + "tag": "``", + "orth": "``", + "ner": "O", + "id": 16 + }, + { + "head": 1, + "dep": "amod", + "tag": "VBN", + "orth": "Revitalized", + "ner": "B-WORK_OF_ART", + "id": 17 + }, + { + "head": 1, + "dep": "nsubj", + "tag": "NNS", + "orth": "Classics", + "ner": "I-WORK_OF_ART", + "id": 18 + }, + { + "head": -15, + "dep": "appos", + "tag": "VBP", + "orth": "Take", + "ner": "I-WORK_OF_ART", + "id": 19 + }, + { + "head": 1, + "dep": "det", + "tag": "DT", + "orth": "the", + "ner": "I-WORK_OF_ART", + "id": 20 + }, + { + "head": -2, + "dep": "dobj", + "tag": "NN", + "orth": "Stage", + "ner": "I-WORK_OF_ART", + "id": 21 + }, + { + "head": -3, + "dep": "prep", + "tag": "IN", + "orth": "in", + "ner": "I-WORK_OF_ART", + "id": 22 + }, + { + "head": 1, + "dep": "compound", + "tag": "NNP", + "orth": "Windy", + "ner": "I-WORK_OF_ART", + "id": 23 + }, + { + "head": -2, + "dep": "pobj", + "tag": "NNP", + "orth": "City", + "ner": "L-WORK_OF_ART", + "id": 24 + }, + { + "head": -6, + "dep": "punct", + "tag": ",", + "orth": ",", + "ner": "O", + "id": 25 + }, + { + "head": -7, + "dep": "punct", + "tag": "''", + "orth": "''", + "ner": "O", + "id": 26 + }, + { + "head": -8, + "dep": "npadvmod", + "tag": "NN", + "orth": "Leisure", + "ner": "B-ORG", + "id": 27 + }, + { + "head": -1, + "dep": "cc", + "tag": "CC", + "orth": "&", + "ner": "I-ORG", + "id": 28 + }, + { + "head": -2, + "dep": "conj", + "tag": "NNS", + "orth": "Arts", + "ner": "L-ORG", + "id": 29 + }, + { + "head": -11, + "dep": "punct", + "tag": "-RRB-", + "orth": ")", + "ner": "O", + "id": 30 + }, + { + "head": 13, + "dep": "punct", + "tag": ",", + "orth": ",", + "ner": "O", + "id": 31 + }, + { + "head": 1, + "dep": "det", + "tag": "DT", + "orth": "the", + "ner": "O", + "id": 32 + }, + { + "head": 11, + "dep": "nsubjpass", + "tag": "NN", + "orth": "role", + "ner": "O", + "id": 33 + }, 
+ { + "head": -1, + "dep": "prep", + "tag": "IN", + "orth": "of", + "ner": "O", + "id": 34 + }, + { + "head": -1, + "dep": "pobj", + "tag": "NNP", + "orth": "Celimene", + "ner": "U-PERSON", + "id": 35 + }, + { + "head": -3, + "dep": "punct", + "tag": ",", + "orth": ",", + "ner": "O", + "id": 36 + }, + { + "head": -4, + "dep": "acl", + "tag": "VBN", + "orth": "played", + "ner": "O", + "id": 37 + }, + { + "head": -1, + "dep": "agent", + "tag": "IN", + "orth": "by", + "ner": "O", + "id": 38 + }, + { + "head": 1, + "dep": "compound", + "tag": "NNP", + "orth": "Kim", + "ner": "B-PERSON", + "id": 39 + }, + { + "head": -2, + "dep": "pobj", + "tag": "NNP", + "orth": "Cattrall", + "ner": "L-PERSON", + "id": 40 + }, + { + "head": -8, + "dep": "punct", + "tag": ",", + "orth": ",", + "ner": "O", + "id": 41 + }, + { + "head": 2, + "dep": "auxpass", + "tag": "VBD", + "orth": "was", + "ner": "O", + "id": 42 + }, + { + "head": 1, + "dep": "advmod", + "tag": "RB", + "orth": "mistakenly", + "ner": "O", + "id": 43 + }, + { + "head": 0, + "dep": "root", + "tag": "VBN", + "orth": "attributed", + "ner": "O", + "id": 44 + }, + { + "head": -1, + "dep": "prep", + "tag": "IN", + "orth": "to", + "ner": "O", + "id": 45 + }, + { + "head": 1, + "dep": "compound", + "tag": "NNP", + "orth": "Christina", + "ner": "B-PERSON", + "id": 46 + }, + { + "head": -2, + "dep": "pobj", + "tag": "NNP", + "orth": "Haag", + "ner": "L-PERSON", + "id": 47 + }, + { + "head": -4, + "dep": "punct", + "tag": ".", + "orth": ".", + "ner": "O", + "id": 48 + } + ], + "brackets": [ + { + "first": 2, + "last": 3, + "label": "NML" + }, + { + "first": 1, + "last": 4, + "label": "NP" + }, + { + "first": 7, + "last": 8, + "label": "NP-TTL" + }, + { + "first": 11, + "last": 12, + "label": "NP" + }, + { + "first": 11, + "last": 14, + "label": "NP" + }, + { + "first": 10, + "last": 14, + "label": "PP-LOC" + }, + { + "first": 6, + "last": 14, + "label": "NP" + }, + { + "first": 5, + "last": 14, + "label": "PP" + }, + { + "first": 1, + "last": 14, + "label": "NP" + }, + { + "first": 17, + "last": 18, + "label": "NP-SBJ" + }, + { + "first": 20, + "last": 21, + "label": "NP" + }, + { + "first": 23, + "last": 24, + "label": "NP" + }, + { + "first": 22, + "last": 24, + "label": "PP-LOC" + }, + { + "first": 19, + "last": 24, + "label": "VP" + }, + { + "first": 17, + "last": 24, + "label": "S-HLN" + }, + { + "first": 27, + "last": 29, + "label": "NP-TMP" + }, + { + "first": 15, + "last": 30, + "label": "NP" + }, + { + "first": 1, + "last": 30, + "label": "NP" + }, + { + "first": 0, + "last": 30, + "label": "PP-LOC" + }, + { + "first": 32, + "last": 33, + "label": "NP" + }, + { + "first": 35, + "last": 35, + "label": "NP" + }, + { + "first": 34, + "last": 35, + "label": "PP" + }, + { + "first": 32, + "last": 35, + "label": "NP" + }, + { + "first": 39, + "last": 40, + "label": "NP-LGS" + }, + { + "first": 38, + "last": 40, + "label": "PP" + }, + { + "first": 37, + "last": 40, + "label": "VP" + }, + { + "first": 32, + "last": 41, + "label": "NP-SBJ-2" + }, + { + "first": 43, + "last": 43, + "label": "ADVP-MNR" + }, + { + "first": 46, + "last": 47, + "label": "NP" + }, + { + "first": 45, + "last": 47, + "label": "PP-CLR" + }, + { + "first": 44, + "last": 47, + "label": "VP" + }, + { + "first": 42, + "last": 47, + "label": "VP" + }, + { + "first": 0, + "last": 48, + "label": "S" + } + ] + }, + { + "tokens": [ + { + "head": 1, + "dep": "compound", + "tag": "NNP", + "orth": "Ms.", + "ner": "O", + "id": 0 + }, + { + "head": 1, + "dep": "nsubj", + "tag": "NNP", + "orth": 
"Haag", + "ner": "U-PERSON", + "id": 1 + }, + { + "head": 0, + "dep": "root", + "tag": "VBZ", + "orth": "plays", + "ner": "O", + "id": 2 + }, + { + "head": -1, + "dep": "dobj", + "tag": "NNP", + "orth": "Elianti", + "ner": "U-PERSON", + "id": 3 + }, + { + "head": -2, + "dep": "punct", + "tag": ".", + "orth": ".", + "ner": "O", + "id": 4 + } + ], + "brackets": [ + { + "first": 0, + "last": 1, + "label": "NP-SBJ" + }, + { + "first": 3, + "last": 3, + "label": "NP" + }, + { + "first": 2, + "last": 3, + "label": "VP" + }, + { + "first": 0, + "last": 4, + "label": "S" + } + ] + } + ] + } + ] + } + ] diff --git a/spacy/_ml.py b/spacy/_ml.py index 8d1b81048..644a27d9f 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -112,9 +112,10 @@ def _preprocess_doc(docs, drop=0.): nO=Dimension("Output size"), nP=Dimension("Maxout pieces"), W=Synapses("Weights matrix", - lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI)), + lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI) if obj.nP >= 2 + else (obj.nF, obj.nO, obj.nI)), b=Biases("Bias vector", - lambda obj: (obj.nO, obj.nP)), + lambda obj: (obj.nO, obj.nP) if obj.nP >= 2 else (obj.nO,)), d_W=Gradient("W"), d_b=Gradient("b") ) @@ -129,17 +130,24 @@ class PrecomputableAffine(Model): def begin_update(self, X, drop=0.): tensordot = self.ops.xp.tensordot ascontiguous = self.ops.xp.ascontiguousarray - - Yf = tensordot(X, self.W, axes=[[1], [3]]) + if self.nP == 1: + Yf = tensordot(X, self.W, axes=[[1], [2]]) + else: + Yf = tensordot(X, self.W, axes=[[1], [3]]) def backward(dY_ids, sgd=None): dY, ids = dY_ids Xf = X[ids] - - dXf = tensordot(dY, self.W, axes=[[1,2], [1,2]]) + if self.nP == 1: + dXf = tensordot(dY, self.W, axes=[[1], [1]]) + else: + dXf = tensordot(dY, self.W, axes=[[1,2], [1,2]]) dW = tensordot(dY, Xf, axes=[[0], [0]]) # (o, p, f, i) --> (f, o, p, i) - self.d_W += dW.transpose((2, 0, 1, 3)) + if self.nP == 1: + self.d_W += dW.transpose((1, 0, 2)) + else: + self.d_W += dW.transpose((2, 0, 1, 3)) self.d_b += dY.sum(axis=0) if sgd is not None: @@ -169,7 +177,10 @@ class PrecomputableAffine(Model): def predict(ids, tokvecs): hiddens = model(tokvecs) - vector = model.ops.allocate((hiddens.shape[0], model.nO, model.nP)) + if model.nP == 1: + vector = model.ops.allocate((hiddens.shape[0], model.nO)) + else: + vector = model.ops.allocate((hiddens.shape[0], model.nO, model.nP)) model.ops.scatter_add(vector, ids, hiddens) vector += model.b if model.nP >= 2: diff --git a/spacy/about.py b/spacy/about.py index 699b61aff..45b91955a 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -3,7 +3,7 @@ # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py __title__ = 'spacy-nightly' -__version__ = '2.0.0a17' +__version__ = '2.0.0a18' __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython' __uri__ = 'https://spacy.io' __author__ = 'Explosion AI' diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 5ffc493c3..83d4917f6 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -101,7 +101,7 @@ def generate_meta(): def generate_pipeline(): prints("If set to 'True', the default pipeline is used. If set to 'False', " "the pipeline will be disabled. Components should be specified as a " - "comma-separated list of component names, e.g. tensorizer, tagger, " + "comma-separated list of component names, e.g. tagger, " "parser, ner. 
For more information, see the docs on processing pipelines.", title="Enter your model's pipeline components") pipeline = util.get_raw_input("Pipeline components", True) diff --git a/spacy/lang/de/tag_map.py b/spacy/lang/de/tag_map.py index d16bd17e0..730c15cfc 100644 --- a/spacy/lang/de/tag_map.py +++ b/spacy/lang/de/tag_map.py @@ -62,5 +62,5 @@ TAG_MAP = { "VVIZU": {POS: VERB, "VerbForm": "inf"}, "VVPP": {POS: VERB, "Aspect": "perf", "VerbForm": "part"}, "XY": {POS: X}, - "SP": {POS: SPACE} + "_SP": {POS: SPACE} } diff --git a/spacy/lang/en/tag_map.py b/spacy/lang/en/tag_map.py index a674c17e3..fc3d2cc93 100644 --- a/spacy/lang/en/tag_map.py +++ b/spacy/lang/en/tag_map.py @@ -42,6 +42,7 @@ TAG_MAP = { "RBR": {POS: ADV, "Degree": "comp"}, "RBS": {POS: ADV, "Degree": "sup"}, "RP": {POS: PART}, + "SP": {POS: SPACE}, "SYM": {POS: SYM}, "TO": {POS: PART, "PartType": "inf", "VerbForm": "inf"}, "UH": {POS: INTJ}, @@ -55,11 +56,11 @@ TAG_MAP = { "WP": {POS: NOUN, "PronType": "int|rel"}, "WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"}, "WRB": {POS: ADV, "PronType": "int|rel"}, - "SP": {POS: SPACE}, "ADD": {POS: X}, "NFP": {POS: PUNCT}, "GW": {POS: X}, "XX": {POS: X}, "BES": {POS: VERB}, - "HVS": {POS: VERB} + "HVS": {POS: VERB}, + "_SP": {POS: SPACE}, } diff --git a/spacy/lang/es/tag_map.py b/spacy/lang/es/tag_map.py index 86dd48620..2095d23b1 100644 --- a/spacy/lang/es/tag_map.py +++ b/spacy/lang/es/tag_map.py @@ -303,5 +303,5 @@ TAG_MAP = { "VERB__VerbForm=Ger": {"morph": "VerbForm=Ger", "pos": "VERB"}, "VERB__VerbForm=Inf": {"morph": "VerbForm=Inf", "pos": "VERB"}, "X___": {"morph": "_", "pos": "X"}, - "SP": {"morph": "_", "pos": "SPACE"}, + "_SP": {"morph": "_", "pos": "SPACE"}, } diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index 3a9c58fca..04cc013a4 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -33,8 +33,7 @@ class Japanese(Language): Defaults = JapaneseDefaults def make_doc(self, text): - words = self.tokenizer(text) - return Doc(self.vocab, words=words, spaces=[False]*len(words)) + return self.tokenizer(text) __all__ = ['Japanese'] diff --git a/spacy/lang/ja/examples.py b/spacy/lang/ja/examples.py new file mode 100644 index 000000000..623609205 --- /dev/null +++ b/spacy/lang/ja/examples.py @@ -0,0 +1,18 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. 
+ +>>> from spacy.lang.ja.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + 'アップルがイギリスの新興企業を10億ドルで購入を検討', + '自動運転車の損害賠償責任、自動車メーカーに一定の負担を求める', + '歩道を走る自動配達ロボ、サンフランシスコ市が走行禁止を検討', + 'ロンドンはイギリスの大都市です。' +] diff --git a/spacy/lang/th/tag_map.py b/spacy/lang/th/tag_map.py index 40e5ac44c..570871820 100644 --- a/spacy/lang/th/tag_map.py +++ b/spacy/lang/th/tag_map.py @@ -19,63 +19,64 @@ TAG_MAP = { "NPRP": {POS: PRON}, # ADJ "ADJ": {POS: ADJ}, - "NONM": {POS: ADJ}, - "VATT": {POS: ADJ}, - "DONM": {POS: ADJ}, + "NONM": {POS: ADJ}, + "VATT": {POS: ADJ}, + "DONM": {POS: ADJ}, # ADV "ADV": {POS: ADV}, - "ADVN": {POS: ADV}, - "ADVI": {POS: ADV}, - "ADVP": {POS: ADV}, - "ADVS": {POS: ADV}, + "ADVN": {POS: ADV}, + "ADVI": {POS: ADV}, + "ADVP": {POS: ADV}, + "ADVS": {POS: ADV}, # INT "INT": {POS: INTJ}, # PRON "PROPN": {POS: PROPN}, - "PPRS": {POS: PROPN}, - "PDMN": {POS: PROPN}, - "PNTR": {POS: PROPN}, + "PPRS": {POS: PROPN}, + "PDMN": {POS: PROPN}, + "PNTR": {POS: PROPN}, # DET "DET": {POS: DET}, - "DDAN": {POS: DET}, - "DDAC": {POS: DET}, - "DDBQ": {POS: DET}, - "DDAQ": {POS: DET}, - "DIAC": {POS: DET}, - "DIBQ": {POS: DET}, - "DIAQ": {POS: DET}, - "DCNM": {POS: DET}, + "DDAN": {POS: DET}, + "DDAC": {POS: DET}, + "DDBQ": {POS: DET}, + "DDAQ": {POS: DET}, + "DIAC": {POS: DET}, + "DIBQ": {POS: DET}, + "DIAQ": {POS: DET}, + "DCNM": {POS: DET}, # NUM "NUM": {POS: NUM}, - "NCNM": {POS: NUM}, - "NLBL": {POS: NUM}, - "DCNM": {POS: NUM}, + "NCNM": {POS: NUM}, + "NLBL": {POS: NUM}, + "DCNM": {POS: NUM}, # AUX "AUX": {POS: AUX}, - "XVBM": {POS: AUX}, - "XVAM": {POS: AUX}, - "XVMM": {POS: AUX}, - "XVBB": {POS: AUX}, - "XVAE": {POS: AUX}, + "XVBM": {POS: AUX}, + "XVAM": {POS: AUX}, + "XVMM": {POS: AUX}, + "XVBB": {POS: AUX}, + "XVAE": {POS: AUX}, # ADP "ADP": {POS: ADP}, - "RPRE": {POS: ADP}, + "RPRE": {POS: ADP}, # CCONJ "CCONJ": {POS: CCONJ}, - "JCRG": {POS: CCONJ}, + "JCRG": {POS: CCONJ}, # SCONJ "SCONJ": {POS: SCONJ}, - "PREL": {POS: SCONJ}, - "JSBR": {POS: SCONJ}, - "JCMP": {POS: SCONJ}, + "PREL": {POS: SCONJ}, + "JSBR": {POS: SCONJ}, + "JCMP": {POS: SCONJ}, # PART - "PART": {POS: PART}, - "FIXN": {POS: PART}, - "FIXV": {POS: PART}, - "EAFF": {POS: PART}, - "AITT": {POS: PART}, - "NEG": {POS: PART}, + "PART": {POS: PART}, + "FIXN": {POS: PART}, + "FIXV": {POS: PART}, + "EAFF": {POS: PART}, + "AITT": {POS: PART}, + "NEG": {POS: PART}, # PUNCT "PUNCT": {POS: PUNCT}, - "PUNC": {POS: PUNCT} + "PUNC": {POS: PUNCT}, + "_SP": {POS: SPACE} } diff --git a/spacy/lang/zh/examples.py b/spacy/lang/zh/examples.py new file mode 100644 index 000000000..5e8a36119 --- /dev/null +++ b/spacy/lang/zh/examples.py @@ -0,0 +1,18 @@ +# coding: utf8 +from __future__ import unicode_literals + + +""" +Example sentences to test spaCy and its language models. 
+ +>>> from spacy.lang.zh.examples import sentences +>>> docs = nlp.pipe(sentences) +""" + + +sentences = [ + "蘋果公司正考量用一億元買下英國的新創公司", + "自駕車將保險責任歸屬轉移至製造商", + "舊金山考慮禁止送貨機器人在人行道上行駛", + "倫敦是英國的大城市" +] diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index bd2ca766a..f3327a1d7 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -7,8 +7,8 @@ from .symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos class Lemmatizer(object): @classmethod - def load(cls, path, index=None, exc=None, rules=None): - return cls(index or {}, exc or {}, rules or {}) + def load(cls, path, index=None, exc=None, rules=None, lookup=None): + return cls(index or {}, exc or {}, rules or {}, lookup or {}) def __init__(self, index=None, exceptions=None, rules=None, lookup=None): self.index = index if index is not None else {} @@ -26,10 +26,10 @@ class Lemmatizer(object): elif univ_pos in (PUNCT, 'PUNCT', 'punct'): univ_pos = 'punct' else: - return set([string.lower()]) + return list(set([string.lower()])) # See Issue #435 for example of where this logic is requied. if self.is_base_form(univ_pos, morphology): - return set([string.lower()]) + return list(set([string.lower()])) lemmas = lemmatize(string, self.index.get(univ_pos, {}), self.exc.get(univ_pos, {}), self.rules.get(univ_pos, [])) @@ -108,4 +108,4 @@ def lemmatize(string, index, exceptions, rules): forms.extend(oov_forms) if not forms: forms.append(string) - return set(forms) + return list(set(forms)) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index 24d0a9836..a0c69f4bf 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -69,6 +69,7 @@ cdef enum action_t: REPEAT ACCEPT ADVANCE_ZERO + ACCEPT_PREV PANIC # A "match expression" conists of one or more token patterns @@ -120,24 +121,27 @@ cdef attr_t get_pattern_key(const TokenPatternC* pattern) except 0: cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil: + lookahead = &pattern[1] for attr in pattern.attrs[:pattern.nr_attr]: if get_token_attr(token, attr.attr) != attr.value: if pattern.quantifier == ONE: return REJECT elif pattern.quantifier == ZERO: - return ACCEPT if (pattern+1).nr_attr == 0 else ADVANCE + return ACCEPT if lookahead.nr_attr == 0 else ADVANCE elif pattern.quantifier in (ZERO_ONE, ZERO_PLUS): - return ACCEPT if (pattern+1).nr_attr == 0 else ADVANCE_ZERO + return ACCEPT_PREV if lookahead.nr_attr == 0 else ADVANCE_ZERO else: return PANIC if pattern.quantifier == ZERO: return REJECT + elif lookahead.nr_attr == 0: + return ACCEPT elif pattern.quantifier in (ONE, ZERO_ONE): - return ACCEPT if (pattern+1).nr_attr == 0 else ADVANCE + return ADVANCE elif pattern.quantifier == ZERO_PLUS: # This is a bandaid over the 'shadowing' problem described here: # https://github.com/explosion/spaCy/issues/864 - next_action = get_action(pattern+1, token) + next_action = get_action(lookahead, token) if next_action is REJECT: return REPEAT else: @@ -345,6 +349,9 @@ cdef class Matcher: while action == ADVANCE_ZERO: state.second += 1 action = get_action(state.second, token) + if action == PANIC: + raise Exception("Error selecting action in matcher") + if action == REPEAT: # Leave the state in the queue, and advance to next slot # (i.e. we don't overwrite -- we want to greedily match more @@ -356,14 +363,15 @@ cdef class Matcher: partials[q] = state partials[q].second += 1 q += 1 - elif action == ACCEPT: + elif action in (ACCEPT, ACCEPT_PREV): # TODO: What to do about patterns starting with ZERO? Need to # adjust the start position. 
start = state.first - end = token_i+1 + end = token_i+1 if action == ACCEPT else token_i ent_id = state.second[1].attrs[0].value label = state.second[1].attrs[1].value matches.append((ent_id, start, end)) + partials.resize(q) # Check whether we open any new patterns on this token for pattern in self.patterns: @@ -383,15 +391,15 @@ cdef class Matcher: state.first = token_i state.second = pattern + 1 partials.push_back(state) - elif action == ACCEPT: + elif action in (ACCEPT, ACCEPT_PREV): start = token_i - end = token_i+1 + end = token_i+1 if action == ACCEPT else token_i ent_id = pattern[1].attrs[0].value label = pattern[1].attrs[1].value matches.append((ent_id, start, end)) # Look for open patterns that are actually satisfied for state in partials: - while state.second.quantifier in (ZERO, ZERO_PLUS): + while state.second.quantifier in (ZERO, ZERO_ONE, ZERO_PLUS): state.second += 1 if state.second.nr_attr == 0: start = state.first diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index be6711bfd..9192f351f 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -44,7 +44,7 @@ cdef class Morphology: cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1 -cpdef enum univ_morph_t: +cdef enum univ_morph_t: NIL = 0 Animacy_anim = symbols.Animacy_anim Animacy_inam diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 4a1a0aa54..91befaa1b 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -4,7 +4,7 @@ from __future__ import unicode_literals from libc.string cimport memset -from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT +from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT, SPACE from .attrs cimport POS, IS_SPACE from .parts_of_speech import IDS as POS_IDS from .lexeme cimport Lexeme @@ -36,14 +36,22 @@ cdef class Morphology: def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None): self.mem = Pool() self.strings = string_store + # Add special space symbol. We prefix with underscore, to make sure it + # always sorts to the end. + space_attrs = tag_map.get('SP', {POS: SPACE}) + if '_SP' not in tag_map: + self.strings.add('_SP') + tag_map = dict(tag_map) + tag_map['_SP'] = space_attrs + self.tag_names = tuple(sorted(tag_map.keys())) self.tag_map = {} self.lemmatizer = lemmatizer self.n_tags = len(tag_map) - self.tag_names = tuple(sorted(tag_map.keys())) self.reverse_index = {} self.rich_tags = self.mem.alloc(self.n_tags+1, sizeof(RichTagC)) for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())): + self.strings.add(tag_str) self.tag_map[tag_str] = dict(attrs) attrs = _normalize_props(attrs) attrs = intify_attrs(attrs, self.strings, _do_deprecated=True) @@ -93,7 +101,7 @@ cdef class Morphology: # the statistical model fails. 
# Related to Issue #220 if Lexeme.c_check_flag(token.lex, IS_SPACE): - tag_id = self.reverse_index[self.strings.add('SP')] + tag_id = self.reverse_index[self.strings.add('_SP')] rich_tag = self.rich_tags[tag_id] analysis = self._cache.get(tag_id, token.lex.orth) if analysis is NULL: @@ -164,7 +172,7 @@ cdef class Morphology: cdef unicode py_string = self.strings[orth] if self.lemmatizer is None: return self.strings.add(py_string.lower()) - cdef set lemma_strings + cdef list lemma_strings cdef unicode lemma_string lemma_strings = self.lemmatizer(py_string, univ_pos, morphology) lemma_string = sorted(lemma_strings)[0] @@ -426,3 +434,7 @@ IDS = { NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])] +# Unfortunate hack here, to work around problem with long cpdef enum +# (which is generating an enormous amount of C++ in Cython 0.24+) +# We keep the enum cdef, and just make sure the names are available to Python +locals().update(IDS) diff --git a/spacy/symbols.pxd b/spacy/symbols.pxd index e981de6ae..6960681a3 100644 --- a/spacy/symbols.pxd +++ b/spacy/symbols.pxd @@ -13,12 +13,12 @@ cdef enum symbol_t: LIKE_EMAIL IS_STOP IS_OOV + IS_BRACKET + IS_QUOTE + IS_LEFT_PUNCT + IS_RIGHT_PUNCT - FLAG14 = 14 - FLAG15 - FLAG16 - FLAG17 - FLAG18 + FLAG18 = 18 FLAG19 FLAG20 FLAG21 @@ -455,15 +455,5 @@ cdef enum symbol_t: root xcomp -# Move these up to FLAG14--FLAG18 once we finish the functionality -# and are ready to regenerate the model. -#IS_BRACKET -#IS_QUOTE -#IS_LEFT_PUNCT -#IS_RIGHT_PUNCT - -# These symbols are currently missing. However, if we add them currently, -# we'll throw off the integer index and the model will have to be retrained. -# We therefore wait until the next data version to add them. -# acl - + acl + LAW diff --git a/spacy/symbols.pyx b/spacy/symbols.pyx index b7f1f4556..0e0337b6e 100644 --- a/spacy/symbols.pyx +++ b/spacy/symbols.pyx @@ -18,10 +18,11 @@ IDS = { "LIKE_EMAIL": LIKE_EMAIL, "IS_STOP": IS_STOP, "IS_OOV": IS_OOV, - "FLAG14": FLAG14, - "FLAG15": FLAG15, - "FLAG16": FLAG16, - "FLAG17": FLAG17, + "IS_BRACKET": IS_BRACKET, + "IS_QUOTE": IS_QUOTE, + "IS_LEFT_PUNCT": IS_LEFT_PUNCT, + "IS_RIGHT_PUNCT": IS_RIGHT_PUNCT, + "FLAG18": FLAG18, "FLAG19": FLAG19, "FLAG20": FLAG20, @@ -457,7 +458,10 @@ IDS = { "quantmod": quantmod, "rcmod": rcmod, "root": root, - "xcomp": xcomp + "xcomp": xcomp, + + "acl": acl, + "LAW": LAW } def sort_nums(x): diff --git a/spacy/syntax/_state.pxd b/spacy/syntax/_state.pxd index 4675d887e..803348b53 100644 --- a/spacy/syntax/_state.pxd +++ b/spacy/syntax/_state.pxd @@ -2,6 +2,8 @@ from libc.string cimport memcpy, memset, memmove from libc.stdlib cimport malloc, calloc, free from libc.stdint cimport uint32_t, uint64_t +from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno + from murmurhash.mrmr cimport hash64 from ..vocab cimport EMPTY_LEXEME @@ -55,6 +57,11 @@ cdef cppclass StateC: this.shifted = calloc(length + (PADDING * 2), sizeof(bint)) this._sent = calloc(length + (PADDING * 2), sizeof(TokenC)) this._ents = calloc(length + (PADDING * 2), sizeof(Entity)) + if not (this._buffer and this._stack and this.shifted + and this._sent and this._ents): + with gil: + PyErr_SetFromErrno(MemoryError) + PyErr_CheckSignals() memset(&this._hist, 0, sizeof(this._hist)) this.offset = 0 cdef int i diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx index 9770383d1..8adb8e52c 100644 --- a/spacy/syntax/arc_eager.pyx +++ b/spacy/syntax/arc_eager.pyx @@ -212,7 +212,8 @@ cdef class LeftArc: cdef class RightArc: 
@staticmethod cdef bint is_valid(const StateC* st, attr_t label) nogil: - return st.B_(0).sent_start != 1 + # If there's (perhaps partial) parse pre-set, don't allow cycle. + return st.B_(0).sent_start != 1 and st.H(st.S(0)) != st.B(0) @staticmethod cdef int transition(StateC* st, attr_t label) nogil: @@ -446,14 +447,19 @@ cdef class ArcEager(TransitionSystem): cdef int initialize_state(self, StateC* st) nogil: for i in range(st.length): - st._sent[i].l_edge = i - st._sent[i].r_edge = i + if st._sent[i].dep == 0: + st._sent[i].l_edge = i + st._sent[i].r_edge = i + st._sent[i].head = 0 + st._sent[i].dep = 0 + st._sent[i].l_kids = 0 + st._sent[i].r_kids = 0 st.fast_forward() cdef int finalize_state(self, StateC* st) nogil: cdef int i for i in range(st.length): - if st._sent[i].head == 0 and st._sent[i].dep == 0: + if st._sent[i].head == 0: st._sent[i].dep = self.root_label def finalize_doc(self, doc): diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index eb33d4a7b..04b2feb1c 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -1,5 +1,4 @@ # cython: infer_types=True -# cython: profile=True # cython: cdivision=True # cython: boundscheck=False # coding: utf-8 @@ -22,7 +21,7 @@ cimport numpy as np from libcpp.vector cimport vector from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF -from cpython.exc cimport PyErr_CheckSignals +from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno from libc.stdint cimport uint32_t, uint64_t from libc.string cimport memset, memcpy from libc.stdlib cimport malloc, calloc, free @@ -440,6 +439,7 @@ cdef class Parser: self._parseC(states[i], feat_weights, bias, hW, hb, nr_class, nr_hidden, nr_feat, nr_piece) + PyErr_CheckSignals() return state_objs cdef void _parseC(self, StateC* state, @@ -450,7 +450,11 @@ cdef class Parser: is_valid = calloc(nr_class, sizeof(int)) vectors = calloc(nr_hidden * nr_piece, sizeof(float)) scores = calloc(nr_class, sizeof(float)) - + if not (token_ids and is_valid and vectors and scores): + with gil: + PyErr_SetFromErrno(MemoryError) + PyErr_CheckSignals() + while not state.is_final(): state.set_context_tokens(token_ids, nr_feat) memset(vectors, 0, nr_hidden * nr_piece * sizeof(float)) diff --git a/spacy/tests/doc/test_array.py b/spacy/tests/doc/test_array.py index dd87aa763..ff10394d1 100644 --- a/spacy/tests/doc/test_array.py +++ b/spacy/tests/doc/test_array.py @@ -17,6 +17,26 @@ def test_doc_array_attr_of_token(en_tokenizer, en_vocab): assert feats_array[0][0] != feats_array[0][1] +def test_doc_stringy_array_attr_of_token(en_tokenizer, en_vocab): + text = "An example sentence" + tokens = en_tokenizer(text) + example = tokens.vocab["example"] + assert example.orth != example.shape + feats_array = tokens.to_array((ORTH, SHAPE)) + feats_array_stringy = tokens.to_array(("ORTH", "SHAPE")) + assert feats_array_stringy[0][0] == feats_array[0][0] + assert feats_array_stringy[0][1] == feats_array[0][1] + + +def test_doc_scalar_attr_of_token(en_tokenizer, en_vocab): + text = "An example sentence" + tokens = en_tokenizer(text) + example = tokens.vocab["example"] + assert example.orth != example.shape + feats_array = tokens.to_array(ORTH) + assert feats_array.shape == (3,) + + def test_doc_array_tag(en_tokenizer): text = "A nice sentence." 
pos = ['DET', 'ADJ', 'NOUN', 'PUNCT'] diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index cbe1bbc66..46c615973 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -2,6 +2,8 @@ from __future__ import unicode_literals from ..util import get_doc +from ...tokens import Doc +from ...vocab import Vocab import pytest import numpy @@ -204,19 +206,20 @@ def test_doc_api_right_edge(en_tokenizer): assert doc[6].right_edge.text == ',' -@pytest.mark.xfail -@pytest.mark.parametrize('text,vectors', [ - ("apple orange pear", ["apple -1 -1 -1", "orange -1 -1 0", "pear -1 0 -1"]) -]) -def test_doc_api_has_vector(en_tokenizer, text_file, text, vectors): - text_file.write('\n'.join(vectors)) - text_file.seek(0) - vector_length = en_tokenizer.vocab.load_vectors(text_file) - assert vector_length == 3 - - doc = en_tokenizer(text) +def test_doc_api_has_vector(): + vocab = Vocab() + vocab.clear_vectors(2) + vocab.vectors.add('kitten', numpy.asarray([0., 2.], dtype='f')) + doc = Doc(vocab, words=['kitten']) assert doc.has_vector +def test_lowest_common_ancestor(en_tokenizer): + tokens = en_tokenizer('the lazy dog slept') + doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=[2, 1, 1, 0]) + lca = doc.get_lca_matrix() + assert(lca[1, 1] == 1) + assert(lca[0, 1] == 2) + assert(lca[1, 2] == 2) def test_parse_tree(en_tokenizer): """Tests doc.print_tree() method.""" diff --git a/spacy/tests/doc/test_token_api.py b/spacy/tests/doc/test_token_api.py index 00caa1445..0ab723f7a 100644 --- a/spacy/tests/doc/test_token_api.py +++ b/spacy/tests/doc/test_token_api.py @@ -3,6 +3,8 @@ from __future__ import unicode_literals from ...attrs import IS_ALPHA, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_TITLE, IS_STOP from ..util import get_doc +from ...vocab import Vocab +from ...tokens import Doc import pytest import numpy @@ -68,26 +70,21 @@ def test_doc_token_api_is_properties(en_vocab): assert doc[5].like_email -@pytest.mark.xfail -@pytest.mark.parametrize('text,vectors', [ - ("apples oranges ldskbjls", ["apples -1 -1 -1", "oranges -1 -1 0"]) -]) -def test_doc_token_api_vectors(en_tokenizer, text_file, text, vectors): - text_file.write('\n'.join(vectors)) - text_file.seek(0) - vector_length = en_tokenizer.vocab.load_vectors(text_file) - assert vector_length == 3 +def test_doc_token_api_vectors(): + vocab = Vocab() + vocab.clear_vectors(2) + vocab.vectors.add('apples', numpy.asarray([0., 2.], dtype='f')) + vocab.vectors.add('oranges', numpy.asarray([0., 1.], dtype='f')) + doc = Doc(vocab, words=['apples', 'oranges', 'oov']) + assert doc.has_vector - tokens = en_tokenizer(text) - assert tokens[0].has_vector - assert tokens[1].has_vector - assert not tokens[2].has_vector - assert tokens[0].similarity(tokens[1]) > tokens[0].similarity(tokens[2]) - assert tokens[0].similarity(tokens[1]) == tokens[1].similarity(tokens[0]) - assert sum(tokens[0].vector) != sum(tokens[1].vector) - assert numpy.isclose( - tokens[0].vector_norm, - numpy.sqrt(numpy.dot(tokens[0].vector, tokens[0].vector))) + assert doc[0].has_vector + assert doc[1].has_vector + assert not doc[2].has_vector + apples_norm = (0*0 + 2*2) ** 0.5 + oranges_norm = (0*0 + 1*1) ** 0.5 + cosine = ((0*0) + (2*1)) / (apples_norm * oranges_norm) + assert doc[0].similarity(doc[1]) == cosine def test_doc_token_api_ancestors(en_tokenizer): diff --git a/spacy/tests/regression/test_issue1242.py b/spacy/tests/regression/test_issue1242.py new file mode 100644 index 000000000..50dc8c37e --- /dev/null +++ 
b/spacy/tests/regression/test_issue1242.py @@ -0,0 +1,23 @@ +from __future__ import unicode_literals +import pytest +from ...lang.en import English +from ...util import load_model + + +def test_issue1242_empty_strings(): + nlp = English() + doc = nlp('') + assert len(doc) == 0 + docs = list(nlp.pipe(['', 'hello'])) + assert len(docs[0]) == 0 + assert len(docs[1]) == 1 + + +@pytest.mark.models('en') +def test_issue1242_empty_strings_en_core_web_sm(): + nlp = load_model('en_core_web_sm') + doc = nlp('') + assert len(doc) == 0 + docs = list(nlp.pipe(['', 'hello'])) + assert len(docs[0]) == 0 + assert len(docs[1]) == 1 diff --git a/spacy/tests/regression/test_issue1250.py b/spacy/tests/regression/test_issue1250.py new file mode 100644 index 000000000..3b6e0bbf2 --- /dev/null +++ b/spacy/tests/regression/test_issue1250.py @@ -0,0 +1,13 @@ +from __future__ import unicode_literals +from ...tokenizer import Tokenizer +from ...symbols import ORTH, LEMMA, POS +from ...lang.en import English + +def test_issue1250_cached_special_cases(): + nlp = English() + nlp.tokenizer.add_special_case(u'reimbur', [{ORTH: u'reimbur', LEMMA: u'reimburse', POS: u'VERB'}]) + + lemmas = [w.lemma_ for w in nlp(u'reimbur, reimbur...')] + assert lemmas == ['reimburse', ',', 'reimburse', '...'] + lemmas = [w.lemma_ for w in nlp(u'reimbur, reimbur...')] + assert lemmas == ['reimburse', ',', 'reimburse', '...'] diff --git a/spacy/tests/regression/test_issue1253.py b/spacy/tests/regression/test_issue1253.py new file mode 100644 index 000000000..2fe77d6d8 --- /dev/null +++ b/spacy/tests/regression/test_issue1253.py @@ -0,0 +1,20 @@ +from __future__ import unicode_literals +import pytest +import spacy + + +def ss(tt): + for i in range(len(tt)-1): + for j in range(i+1, len(tt)): + tt[i:j].root + + +@pytest.mark.models('en') +def test_access_parse_for_merged(): + nlp = spacy.load('en_core_web_sm') + t_t = nlp.tokenizer("Highly rated - I'll definitely") + nlp.tagger(t_t) + nlp.parser(t_t) + nlp.parser(t_t) + ss(t_t) + diff --git a/spacy/tests/regression/test_issue1305.py b/spacy/tests/regression/test_issue1305.py index e123ce0ba..d1d5eb93d 100644 --- a/spacy/tests/regression/test_issue1305.py +++ b/spacy/tests/regression/test_issue1305.py @@ -1,8 +1,11 @@ import pytest +import spacy -@pytest.mark.models('en') -def test_issue1305(EN): +#@pytest.mark.models('en') +def test_issue1305(): '''Test lemmatization of English VBZ''' - assert EN.vocab.morphology.lemmatizer('works', 'verb') == set(['work']) - doc = EN(u'This app works well') + nlp = spacy.load('en_core_web_sm') + assert nlp.vocab.morphology.lemmatizer('works', 'verb') == ['work'] + doc = nlp(u'This app works well') + print([(w.text, w.tag_) for w in doc]) assert doc[2].lemma_ == 'work' diff --git a/spacy/tests/regression/test_issue1375.py b/spacy/tests/regression/test_issue1375.py new file mode 100644 index 000000000..6f74d9a6d --- /dev/null +++ b/spacy/tests/regression/test_issue1375.py @@ -0,0 +1,16 @@ +from __future__ import unicode_literals +import pytest +from ...vocab import Vocab +from ...tokens.doc import Doc + + +def test_issue1375(): + '''Test that token.nbor() raises IndexError for out-of-bounds access.''' + doc = Doc(Vocab(), words=['0', '1', '2']) + with pytest.raises(IndexError): + assert doc[0].nbor(-1) + assert doc[1].nbor(-1).text == '0' + with pytest.raises(IndexError): + assert doc[2].nbor(1) + assert doc[1].nbor(1).text == '2' + diff --git a/spacy/tests/regression/test_issue1434.py b/spacy/tests/regression/test_issue1434.py new file mode 100644 index 
000000000..fc88cc3e6 --- /dev/null +++ b/spacy/tests/regression/test_issue1434.py @@ -0,0 +1,22 @@ +from __future__ import unicode_literals + +from ...vocab import Vocab +from ...lang.lex_attrs import LEX_ATTRS +from ...tokens import Doc +from ...matcher import Matcher + + +def test_issue1434(): + '''Test matches occur when optional element at end of short doc''' + vocab = Vocab(lex_attr_getters=LEX_ATTRS) + hello_world = Doc(vocab, words=['Hello', 'World']) + hello = Doc(vocab, words=['Hello']) + + matcher = Matcher(vocab) + matcher.add('MyMatcher', None, + [ {'ORTH': 'Hello' }, {'IS_ALPHA': True, 'OP': '?'} ]) + + matches = matcher(hello_world) + assert matches + matches = matcher(hello) + assert matches diff --git a/spacy/tests/regression/test_issue1450.py b/spacy/tests/regression/test_issue1450.py new file mode 100644 index 000000000..6f1d4f568 --- /dev/null +++ b/spacy/tests/regression/test_issue1450.py @@ -0,0 +1,58 @@ +from __future__ import unicode_literals +import pytest + +from ...matcher import Matcher +from ...tokens import Doc +from ...vocab import Vocab + + +@pytest.mark.parametrize( + 'string,start,end', + [ + ('a', 0, 1), + ('a b', 0, 2), + ('a c', 0, 1), + ('a b c', 0, 2), + ('a b b c', 0, 2), + ('a b b', 0, 2), + ] +) +def test_issue1450_matcher_end_zero_plus(string, start, end): + '''Test matcher works when patterns end with * operator. + + Original example (rewritten to avoid model usage) + + nlp = spacy.load('en_core_web_sm') + matcher = Matcher(nlp.vocab) + matcher.add( + "TSTEND", + on_match_1, + [ + {TAG: "JJ", LOWER: "new"}, + {TAG: "NN", 'OP': "*"} + ] + ) + doc = nlp(u'Could you create a new ticket for me?') + print([(w.tag_, w.text, w.lower_) for w in doc]) + matches = matcher(doc) + print(matches) + assert len(matches) == 1 + assert matches[0][1] == 4 + assert matches[0][2] == 5 + ''' + matcher = Matcher(Vocab()) + matcher.add( + "TSTEND", + None, + [ + {'ORTH': "a"}, + {'ORTH': "b", 'OP': "*"} + ] + ) + doc = Doc(Vocab(), words=string.split()) + matches = matcher(doc) + if start is None or end is None: + assert matches == [] + + assert matches[0][1] == start + assert matches[0][2] == end diff --git a/spacy/tests/regression/test_issue781.py b/spacy/tests/regression/test_issue781.py index e3f391a37..2c77e68cd 100644 --- a/spacy/tests/regression/test_issue781.py +++ b/spacy/tests/regression/test_issue781.py @@ -9,4 +9,4 @@ import pytest @pytest.mark.parametrize('word,lemmas', [("chromosomes", ["chromosome"]), ("endosomes", ["endosome"]), ("colocalizes", ["colocalize", "colocaliz"])]) def test_issue781(EN, word, lemmas): lemmatizer = EN.Defaults.create_lemmatizer() - assert lemmatizer(word, 'noun', morphology={'number': 'plur'}) == set(lemmas) + assert lemmatizer(word, 'noun', morphology={'number': 'plur'}) == lemmas diff --git a/spacy/tests/spans/test_span.py b/spacy/tests/spans/test_span.py index 7ed9333b8..4050809b5 100644 --- a/spacy/tests/spans/test_span.py +++ b/spacy/tests/spans/test_span.py @@ -55,6 +55,17 @@ def test_spans_span_sent(doc): assert doc[6:7].sent.root.left_edge.text == 'This' +def test_spans_lca_matrix(en_tokenizer): + """Test span's lca matrix generation""" + tokens = en_tokenizer('the lazy dog slept') + doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=[2, 1, 1, 0]) + lca = doc[:2].get_lca_matrix() + assert(lca[0, 0] == 0) + assert(lca[0, 1] == -1) + assert(lca[1, 0] == -1) + assert(lca[1, 1] == 1) + + def test_spans_default_sentiment(en_tokenizer): """Test span.sentiment property's default averaging behaviour""" text = "good 
stuff bad stuff" @@ -89,7 +100,7 @@ def test_spans_are_hashable(en_tokenizer): assert hash(span1) != hash(span2) span3 = tokens[0:2] assert hash(span3) == hash(span1) - + def test_spans_by_character(doc): span1 = doc[1:-2] @@ -106,3 +117,9 @@ def test_span_to_array(doc): assert arr[0, 0] == span[0].orth assert arr[0, 1] == len(span[0]) + +@pytest.mark.xfail +def test_span_as_doc(doc): + span = doc[4:10] + span_doc = span.as_doc() + assert span.text == span_doc.text diff --git a/spacy/tests/test_matcher.py b/spacy/tests/test_matcher.py index 9fcb47305..5b08ede39 100644 --- a/spacy/tests/test_matcher.py +++ b/spacy/tests/test_matcher.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from ..matcher import Matcher, PhraseMatcher from .util import get_doc +from ..tokens import Doc import pytest @@ -212,3 +213,24 @@ def test_operator_combos(matcher): assert matches, (string, pattern_str) else: assert not matches, (string, pattern_str) + + +def test_matcher_end_zero_plus(matcher): + '''Test matcher works when patterns end with * operator. (issue 1450)''' + matcher = Matcher(matcher.vocab) + matcher.add( + "TSTEND", + None, + [ + {'ORTH': "a"}, + {'ORTH': "b", 'OP': "*"} + ] + ) + nlp = lambda string: Doc(matcher.vocab, words=string.split()) + assert len(matcher(nlp(u'a'))) == 1 + assert len(matcher(nlp(u'a b'))) == 1 + assert len(matcher(nlp(u'a b'))) == 1 + assert len(matcher(nlp(u'a c'))) == 1 + assert len(matcher(nlp(u'a b c'))) == 1 + assert len(matcher(nlp(u'a b b c'))) == 1 + assert len(matcher(nlp(u'a b b'))) == 1 diff --git a/spacy/tests/vectors/test_vectors.py b/spacy/tests/vectors/test_vectors.py index 798871edd..74ac26a10 100644 --- a/spacy/tests/vectors/test_vectors.py +++ b/spacy/tests/vectors/test_vectors.py @@ -35,18 +35,18 @@ def vocab(en_vocab, vectors): def test_init_vectors_with_data(strings, data): - v = Vectors(strings, data) + v = Vectors(strings, data=data) assert v.shape == data.shape def test_init_vectors_with_width(strings): - v = Vectors(strings, 3) + v = Vectors(strings, width=3) for string in strings: v.add(string) assert v.shape == (len(strings), 3) def test_get_vector(strings, data): - v = Vectors(strings, data) + v = Vectors(strings, data=data) for string in strings: v.add(string) assert list(v[strings[0]]) == list(data[0]) @@ -56,7 +56,7 @@ def test_get_vector(strings, data): def test_set_vector(strings, data): orig = data.copy() - v = Vectors(strings, data) + v = Vectors(strings, data=data) for string in strings: v.add(string) assert list(v[strings[0]]) == list(orig[0]) diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index 1a3e86b49..919b0928b 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -27,8 +27,9 @@ cdef class Tokenizer: cdef int _try_cache(self, hash_t key, Doc tokens) except -1 cdef int _tokenize(self, Doc tokens, unicode span, hash_t key) except -1 cdef unicode _split_affixes(self, Pool mem, unicode string, vector[LexemeC*] *prefixes, - vector[LexemeC*] *suffixes) + vector[LexemeC*] *suffixes, int* has_special) cdef int _attach_tokens(self, Doc tokens, unicode string, vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1 - cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1 + cdef int _save_cached(self, const TokenC* tokens, hash_t key, int has_special, + int n) except -1 diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 692357c8a..bc09129de 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -20,7 +20,8 @@ cdef class Tokenizer: """Segment text, and create Doc 
objects with the discovered segment boundaries. """ - def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer, token_match=None): + def __init__(self, Vocab vocab, rules=None, prefix_search=None, + suffix_search=None, infix_finditer=None, token_match=None): """Create a `Tokenizer`, to create `Doc` objects given unicode text. vocab (Vocab): A storage container for lexical types. @@ -48,8 +49,9 @@ cdef class Tokenizer: self.infix_finditer = infix_finditer self.vocab = vocab self._rules = {} - for chunk, substrings in sorted(rules.items()): - self.add_special_case(chunk, substrings) + if rules is not None: + for chunk, substrings in sorted(rules.items()): + self.add_special_case(chunk, substrings) def __reduce__(self): args = (self.vocab, @@ -148,14 +150,18 @@ cdef class Tokenizer: cdef vector[LexemeC*] prefixes cdef vector[LexemeC*] suffixes cdef int orig_size + cdef int has_special orig_size = tokens.length - span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes) + span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes, + &has_special) self._attach_tokens(tokens, span, &prefixes, &suffixes) - self._save_cached(&tokens.c[orig_size], orig_key, tokens.length - orig_size) + self._save_cached(&tokens.c[orig_size], orig_key, has_special, + tokens.length - orig_size) cdef unicode _split_affixes(self, Pool mem, unicode string, vector[const LexemeC*] *prefixes, - vector[const LexemeC*] *suffixes): + vector[const LexemeC*] *suffixes, + int* has_special): cdef size_t i cdef unicode prefix cdef unicode suffix @@ -174,6 +180,7 @@ cdef class Tokenizer: if minus_pre and self._specials.get(hash_string(minus_pre)) != NULL: string = minus_pre prefixes.push_back(self.vocab.get(mem, prefix)) + has_special[0] = 1 break if self.token_match and self.token_match(string): break @@ -185,6 +192,7 @@ cdef class Tokenizer: if minus_suf and (self._specials.get(hash_string(minus_suf)) != NULL): string = minus_suf suffixes.push_back(self.vocab.get(mem, suffix)) + has_special[0] = 1 break if pre_len and suf_len and (pre_len + suf_len) <= len(string): string = string[pre_len:-suf_len] @@ -197,6 +205,7 @@ cdef class Tokenizer: string = minus_suf suffixes.push_back(self.vocab.get(mem, suffix)) if string and (self._specials.get(hash_string(string)) != NULL): + has_special[0] = 1 break return string @@ -256,11 +265,15 @@ cdef class Tokenizer: preinc(it) tokens.push_back(lexeme, False) - cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1: + cdef int _save_cached(self, const TokenC* tokens, hash_t key, + int has_special, int n) except -1: cdef int i for i in range(n): if tokens[i].lex.id == 0: return 0 + # See https://github.com/explosion/spaCy/issues/1250 + if has_special: + return 0 cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached)) cached.length = n cached.is_lex = True diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 809f178f8..1bd61b256 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -21,7 +21,7 @@ from .token cimport Token from .printers import parse_tree from ..lexeme cimport Lexeme, EMPTY_LEXEME from ..typedefs cimport attr_t, flags_t -from ..attrs import intify_attrs +from ..attrs import intify_attrs, IDS from ..attrs cimport attr_id_t from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE @@ -536,11 +536,15 @@ cdef class Doc: @cython.boundscheck(False) cpdef np.ndarray to_array(self, object 
py_attr_ids): - """Given a list of M attribute IDs, export the tokens to a numpy - `ndarray` of shape `(N, M)`, where `N` is the length of the document. - The values will be 32-bit integers. + """Export given token attributes to a numpy `ndarray`. - attr_ids (list[int]): A list of attribute ID ints. + If `attr_ids` is a sequence of M attributes, the output array will + be of shape `(N, M)`, where N is the length of the `Doc` + (in tokens). If `attr_ids` is a single attribute, the output shape will + be (N,). You can specify attributes by integer ID (e.g. spacy.attrs.LEMMA) + or string name (e.g. 'LEMMA' or 'lemma'). + + attr_ids (list[]): A list of attributes (int IDs or string names). RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row per word, and one column per attribute indicated in the input `attr_ids`. @@ -553,15 +557,25 @@ cdef class Doc: """ cdef int i, j cdef attr_id_t feature + cdef np.ndarray[attr_t, ndim=1] attr_ids cdef np.ndarray[attr_t, ndim=2] output + # Handle scalar/list inputs of strings/ints for py_attr_ids + if not hasattr(py_attr_ids, '__iter__'): + py_attr_ids = [py_attr_ids] + + # Allow strings, e.g. 'lemma' or 'LEMMA' + py_attr_ids = [(IDS[id_.upper()] if hasattr(id_, 'upper') else id_) + for id_ in py_attr_ids] # Make an array from the attributes --- otherwise our inner loop is Python # dict iteration. - cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64) + attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64) output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.uint64) for i in range(self.length): for j, feature in enumerate(attr_ids): output[i, j] = get_token_attr(&self.c[i], feature) - return output + # Handle 1d case + return output if len(attr_ids) >= 2 else output.reshape((self.length,)) + def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None): """Count the frequencies of a given attribute. Produces a dict of @@ -660,6 +674,54 @@ cdef class Doc: self.is_tagged = bool(TAG in attrs or POS in attrs) return self + def get_lca_matrix(self): + ''' + Calculates the lowest common ancestor matrix + for a given Spacy doc. + Returns LCA matrix containing the integer index + of the ancestor, or -1 if no common ancestor is + found (ex if span excludes a necessary ancestor). + Apologies about the recursion, but the + impact on performance is negligible given + the natural limitations on the depth of a typical human sentence. + ''' + # Efficiency notes: + # + # We can easily improve the performance here by iterating in Cython. + # To loop over the tokens in Cython, the easiest way is: + # for token in doc.c[:doc.c.length]: + # head = token + token.head + # Both token and head will be TokenC* here. The token.head attribute + # is an integer offset. 
+ def __pairwise_lca(token_j, token_k, lca_matrix): + if lca_matrix[token_j.i][token_k.i] != -2: + return lca_matrix[token_j.i][token_k.i] + elif token_j == token_k: + lca_index = token_j.i + elif token_k.head == token_j: + lca_index = token_j.i + elif token_j.head == token_k: + lca_index = token_k.i + elif (token_j.head == token_j) and (token_k.head == token_k): + lca_index = -1 + else: + lca_index = __pairwise_lca(token_j.head, token_k.head, lca_matrix) + lca_matrix[token_j.i][token_k.i] = lca_index + lca_matrix[token_k.i][token_j.i] = lca_index + + return lca_index + + lca_matrix = numpy.empty((len(self), len(self)), dtype=numpy.int32) + lca_matrix.fill(-2) + for j in range(len(self)): + token_j = self[j] + for k in range(j, len(self)): + token_k = self[k] + lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix) + lca_matrix[k][j] = lca_matrix[j][k] + + return lca_matrix + def to_disk(self, path, **exclude): """Save the current state to a directory. diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx index 3b31c50c0..963292fdb 100644 --- a/spacy/tokens/span.pyx +++ b/spacy/tokens/span.pyx @@ -129,6 +129,7 @@ cdef class Span: def _(self): return Underscore(Underscore.span_extensions, self, start=self.start_char, end=self.end_char) + def as_doc(self): '''Create a Doc object view of the Span's data. @@ -177,6 +178,56 @@ cdef class Span: return 0.0 return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) + def get_lca_matrix(self): + ''' + Calculates the lowest common ancestor matrix + for a given Spacy span. + Returns LCA matrix containing the integer index + of the ancestor, or -1 if no common ancestor is + found (ex if span excludes a necessary ancestor). + Apologies about the recursion, but the + impact on performance is negligible given + the natural limitations on the depth of a typical human sentence. + ''' + + def __pairwise_lca(token_j, token_k, lca_matrix, margins): + offset = margins[0] + token_k_head = token_k.head if token_k.head.i in range(*margins) else token_k + token_j_head = token_j.head if token_j.head.i in range(*margins) else token_j + token_j_i = token_j.i - offset + token_k_i = token_k.i - offset + + if lca_matrix[token_j_i][token_k_i] != -2: + return lca_matrix[token_j_i][token_k_i] + elif token_j == token_k: + lca_index = token_j_i + elif token_k_head == token_j: + lca_index = token_j_i + elif token_j_head == token_k: + lca_index = token_k_i + elif (token_j_head == token_j) and (token_k_head == token_k): + lca_index = -1 + else: + lca_index = __pairwise_lca(token_j_head, token_k_head, lca_matrix, margins) + + lca_matrix[token_j_i][token_k_i] = lca_index + lca_matrix[token_k_i][token_j_i] = lca_index + + return lca_index + + lca_matrix = numpy.empty((len(self), len(self)), dtype=numpy.int32) + lca_matrix.fill(-2) + margins = [self.start, self.end] + + for j in range(len(self)): + token_j = self[j] + for k in range(len(self)): + token_k = self[k] + lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix, margins) + lca_matrix[k][j] = lca_matrix[j][k] + + return lca_matrix + cpdef np.ndarray to_array(self, object py_attr_ids): """Given a list of M attribute IDs, export the tokens to a numpy `ndarray` of shape `(N, M)`, where `N` is the length of the document. diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 9ff59eabe..514934ca7 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -127,6 +127,9 @@ cdef class Token: i (int): The relative position of the token to get. Defaults to 1. 
RETURNS (Token): The token at position `self.doc[self.i+i]`. """ + if self.i+i < 0 or (self.i+i >= len(self.doc)): + msg = "Error accessing doc[%d].nbor(%d), for doc of length %d" + raise IndexError(msg % (self.i, i, len(self.doc))) return self.doc[self.i+i] def similarity(self, other): diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index 5512279ae..fa5fcf624 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -32,22 +32,24 @@ cdef class Vectors: cdef public object keys cdef public int i - def __init__(self, strings, data_or_width=0): + def __init__(self, strings, width=0, data=None): if isinstance(strings, StringStore): self.strings = strings else: self.strings = StringStore() for string in strings: self.strings.add(string) - if isinstance(data_or_width, int): - self.data = data = numpy.zeros((len(strings), data_or_width), - dtype='f') + if data is not None: + self.data = numpy.asarray(data, dtype='f') else: - data = data_or_width + self.data = numpy.zeros((len(self.strings), width), dtype='f') self.i = 0 - self.data = data self.key2row = {} - self.keys = np.ndarray((self.data.shape[0],), dtype='uint64') + self.keys = numpy.zeros((self.data.shape[0],), dtype='uint64') + for i, string in enumerate(self.strings): + if i >= self.data.shape[0]: + break + self.add(self.strings[string], self.data[i]) def __reduce__(self): return (Vectors, (self.strings, self.data)) diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 205e5a2af..bcd1f3c10 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -62,12 +62,9 @@ cdef class Vocab: if strings: for string in strings: _ = self[string] - for name in tag_map.keys(): - if name: - self.strings.add(name) self.lex_attr_getters = lex_attr_getters self.morphology = Morphology(self.strings, tag_map, lemmatizer) - self.vectors = Vectors(self.strings) + self.vectors = Vectors(self.strings, width=0) property lang: def __get__(self): @@ -255,7 +252,7 @@ cdef class Vocab: """ if new_dim is None: new_dim = self.vectors.data.shape[1] - self.vectors = Vectors(self.strings, new_dim) + self.vectors = Vectors(self.strings, width=new_dim) def get_vector(self, orth): """Retrieve a vector for a word in the vocabulary. 
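The `Doc`, `Span` and `Token` changes above are easiest to see in use. The following is a minimal usage sketch, not part of the patch itself: it assumes an installed English model (the name `en_core_web_sm` and the sample sentence are placeholders), and only asserts properties that don't depend on the particular parse.

```python
import spacy
from spacy.attrs import LOWER, IS_ALPHA

nlp = spacy.load('en_core_web_sm')   # placeholder model name
doc = nlp(u'I like New York in Autumn.')

# Doc.to_array: a sequence of attributes (int IDs and string names can be
# mixed) gives a 2D uint64 array; a single attribute gives a 1D array.
arr_2d = doc.to_array([LOWER, IS_ALPHA, 'POS'])
arr_1d = doc.to_array('lemma')
assert arr_2d.shape == (len(doc), 3)
assert arr_1d.shape == (len(doc),)
assert arr_1d.dtype == 'uint64'

# Doc.get_lca_matrix: lca[i, j] is the index of the lowest common ancestor
# of doc[i] and doc[j]; the diagonal holds each token's own index.
lca = doc.get_lca_matrix()
assert lca.shape == (len(doc), len(doc))
assert all(lca[i, i] == i for i in range(len(doc)))

# Span.get_lca_matrix: indices are span-relative, and -1 marks pairs whose
# lowest common ancestor falls outside the span.
span = doc[2:5]
assert span.get_lca_matrix().shape == (len(span), len(span))

# Token.nbor: out-of-range neighbours now raise a descriptive IndexError.
assert doc[0].nbor().text == doc[1].text
try:
    doc[len(doc) - 1].nbor()   # there is no token after the last one
except IndexError as err:
    print(err)   # e.g. "Error accessing doc[6].nbor(1), for doc of length 7"
```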
@@ -338,7 +335,7 @@ cdef class Vocab: if self.vectors is None: return None else: - return self.vectors.to_bytes(exclude='strings.json') + return self.vectors.to_bytes() getters = OrderedDict(( ('strings', lambda: self.strings.to_bytes()), @@ -358,7 +355,7 @@ cdef class Vocab: if self.vectors is None: return None else: - return self.vectors.from_bytes(b, exclude='strings') + return self.vectors.from_bytes(b) setters = OrderedDict(( ('strings', lambda b: self.strings.from_bytes(b)), ('lexemes', lambda b: self.lexemes_from_bytes(b)), @@ -400,6 +397,7 @@ cdef class Vocab: cdef int j = 0 cdef SerializedLexemeC lex_data chunk_size = sizeof(lex_data.data) + cdef void* ptr cdef unsigned char* bytes_ptr = bytes_data for i in range(0, len(bytes_data), chunk_size): lexeme = self.mem.alloc(1, sizeof(LexemeC)) @@ -407,6 +405,9 @@ cdef class Vocab: lex_data.data[j] = bytes_ptr[i+j] Lexeme.c_from_bytes(lexeme, lex_data) + ptr = self.strings._map.get(lexeme.orth) + if ptr == NULL: + continue py_str = self.strings[lexeme.orth] assert self.strings[py_str] == lexeme.orth, (py_str, lexeme.orth) key = hash_string(py_str) diff --git a/website/_includes/_mixins.jade b/website/_includes/_mixins.jade index 414ee809e..b7375e2e0 100644 --- a/website/_includes/_mixins.jade +++ b/website/_includes/_mixins.jade @@ -181,7 +181,7 @@ mixin codepen(slug, height, default_tab) alt_file - [string] alternative file path used in footer and link button height - [integer] height of code preview in px -mixin github(repo, file, alt_file, height) +mixin github(repo, file, alt_file, height, language) - var branch = ALPHA ? "develop" : "master" - var height = height || 250 diff --git a/website/api/_annotation/_named-entities.jade b/website/api/_annotation/_named-entities.jade index 476659d4a..93e705c72 100644 --- a/website/api/_annotation/_named-entities.jade +++ b/website/api/_annotation/_named-entities.jade @@ -37,6 +37,10 @@ +cell #[code WORK_OF_ART] +cell Titles of books, songs, etc. + +row + +cell #[code LAW] + +cell Named documents made into laws. + +row +cell #[code LANGUAGE] +cell Any named language. diff --git a/website/api/_annotation/_training.jade b/website/api/_annotation/_training.jade new file mode 100644 index 000000000..3b11eb2f5 --- /dev/null +++ b/website/api/_annotation/_training.jade @@ -0,0 +1,46 @@ +//- 💫 DOCS > API > ANNOTATION > TRAINING + +p + | spaCy takes training data in JSON format. The built-in + | #[+api("cli#convert") #[code convert]] command helps you convert the + | #[code .conllu] format used by the + | #[+a("https://github.com/UniversalDependencies") Universal Dependencies corpora] + | to spaCy's training format. + ++aside("Annotating entities") + | Named entities are provided in the #[+a("/api/annotation#biluo") BILUO] + | notation. Tokens outside an entity are set to #[code "O"] and tokens + | that are part of an entity are set to the entity label, prefixed by the + | BILUO marker. For example #[code "B-ORG"] describes the first token of + | a multi-token #[code ORG] entity and #[code "U-PERSON"] a single + | token representing a #[code PERSON] entity + ++code("Example structure"). 
+    [{
+        "id": int,                  # ID of the document within the corpus
+        "paragraphs": [{            # list of paragraphs in the corpus
+            "raw": string,          # raw text of the paragraph
+            "sentences": [{         # list of sentences in the paragraph
+                "tokens": [{        # list of tokens in the sentence
+                    "id": int,      # index of the token in the document
+                    "dep": string,  # dependency label
+                    "head": int,    # offset of token head relative to token index
+                    "tag": string,  # part-of-speech tag
+                    "orth": string, # verbatim text of the token
+                    "ner": string   # BILUO label, e.g. "O" or "B-ORG"
+                }],
+                "brackets": [{      # phrase structure (NOT USED by current models)
+                    "first": int,   # index of first token
+                    "last": int,    # index of last token
+                    "label": string # phrase label
+                }]
+            }]
+        }]
+    }]
+
+p
+    | Here's an example of dependencies, part-of-speech tags and named
+    | entities, taken from the English Wall Street Journal portion of the Penn
+    | Treebank:
+
++github("spacy", "examples/training/training-data.json", false, false, "json")
diff --git a/website/api/_data.json b/website/api/_data.json
index d85b103dc..ba7997690 100644
--- a/website/api/_data.json
+++ b/website/api/_data.json
@@ -154,13 +154,16 @@
     "tokenizer": {
         "title": "Tokenizer",
+        "teaser": "Segment text into words, punctuation marks etc.",
         "tag": "class",
         "source": "spacy/tokenizer.pyx"
     },
 
     "lemmatizer": {
         "title": "Lemmatizer",
-        "tag": "class"
+        "teaser": "Assign the base forms of words.",
+        "tag": "class",
+        "source": "spacy/lemmatizer.py"
     },
 
     "tagger": {
diff --git a/website/api/annotation.jade b/website/api/annotation.jade
index efada23d7..c65cd3983 100644
--- a/website/api/annotation.jade
+++ b/website/api/annotation.jade
@@ -101,31 +101,4 @@ p This document describes the target annotations spaCy is trained to predict.
 +section("training")
     +h(2, "json-input") JSON input format for training
 
-    +under-construction
-
-    p spaCy takes training data in the following format:
-
-    +code("Example structure").
-        doc: {
-            id: string,
-            paragraphs: [{
-                raw: string,
-                sents: [int],
-                tokens: [{
-                    start: int,
-                    tag: string,
-                    head: int,
-                    dep: string
-                }],
-                ner: [{
-                    start: int,
-                    end: int,
-                    label: string
-                }],
-                brackets: [{
-                    start: int,
-                    end: int,
-                    label: string
-                }]
-            }]
-        }
+    include _annotation/_training
diff --git a/website/api/doc.jade b/website/api/doc.jade
index dce6b89e0..ceb564c7a 100644
--- a/website/api/doc.jade
+++ b/website/api/doc.jade
@@ -336,28 +336,40 @@ p
     +tag method
 
 p
-    | Export the document annotations to a numpy array of shape #[code N*M]
-    | where #[code N] is the length of the document and #[code M] is the number
-    | of attribute IDs to export. The values will be 32-bit integers.
+    | Export given token attributes to a numpy #[code ndarray].
+    | If #[code attr_ids] is a sequence of #[code M] attributes,
+    | the output array will be of shape #[code (N, M)], where #[code N]
+    | is the length of the #[code Doc] (in tokens). If #[code attr_ids] is
+    | a single attribute, the output shape will be #[code (N,)]. You can
+    | specify attributes by integer ID (e.g. #[code spacy.attrs.LEMMA])
+    | or string name (e.g. 'LEMMA' or 'lemma'). The values will be 64-bit
+    | integers.
 
 +aside-code("Example").
     from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
     doc = nlp(text)
     # All strings mapped to integers, for easy export to numpy
     np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
+    np_array = doc.to_array("POS")
 
 +table(["Name", "Type", "Description"])
     +row
         +cell #[code attr_ids]
-        +cell list
-        +cell A list of attribute ID ints.
+ +cell list or int or string + +cell + | A list of attributes (int IDs or string names) or + | a single attribute (int ID or string name) +row("foot") +cell returns - +cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']] + +cell + | #[code.u-break numpy.ndarray[ndim=2, dtype='uint64']] or + | #[code.u-break numpy.ndarray[ndim=1, dtype='uint64']] or +cell | The exported attributes as a 2D numpy array, with one row per - | token and one column per attribute. + | token and one column per attribute (when #[code attr_ids] is a + | list), or as a 1D numpy array, with one item per attribute (when + | #[code attr_ids] is a single value). +h(2, "from_array") Doc.from_array +tag method diff --git a/website/api/lemmatizer.jade b/website/api/lemmatizer.jade index 9699395b1..eb061f10a 100644 --- a/website/api/lemmatizer.jade +++ b/website/api/lemmatizer.jade @@ -2,4 +2,159 @@ include ../_includes/_mixins -+under-construction +p + | The #[code Lemmatizer] supports simple part-of-speech-sensitive suffix + | rules and lookup tables. + ++h(2, "init") Lemmatizer.__init__ + +tag method + +p Create a #[code Lemmatizer]. + ++aside-code("Example"). + from spacy.lemmatizer import Lemmatizer + lemmatizer = Lemmatizer() + ++table(["Name", "Type", "Description"]) + +row + +cell #[code index] + +cell dict / #[code None] + +cell Inventory of lemmas in the language. + + +row + +cell #[code exceptions] + +cell dict / #[code None] + +cell Mapping of string forms to lemmas that bypass the #[code rules]. + + +row + +cell #[code rules] + +cell dict / #[code None] + +cell List of suffix rewrite rules. + + +row + +cell #[code lookup] + +cell dict / #[code None] + +cell Lookup table mapping string to their lemmas. + + +row("foot") + +cell returns + +cell #[code Lemmatizer] + +cell The newly created object. + ++h(2, "call") Lemmatizer.__call__ + +tag method + +p Lemmatize a string. + ++aside-code("Example"). + from spacy.lemmatizer import Lemmatizer + from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES + lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES) + lemmas = lemmatizer(u'ducks', u'NOUN') + assert lemmas == [u'duck'] + ++table(["Name", "Type", "Description"]) + +row + +cell #[code string] + +cell unicode + +cell The string to lemmatize, e.g. the token text. + + +row + +cell #[code univ_pos] + +cell unicode / int + +cell The token's universal part-of-speech tag. + + +row + +cell #[code morphology] + +cell dict / #[code None] + +cell + | Morphological features following the + | #[+a("http://universaldependencies.org/") Universal Dependencies] + | scheme. + + +row("foot") + +cell returns + +cell list + +cell The available lemmas for the string. + ++h(2, "lookup") Lemmatizer.lookup + +tag method + +tag-new(2) + +p + | Look up a lemma in the lookup table, if available. If no lemma is found, + | the original string is returned. Languages can provide a + | #[+a("/usage/adding-languages#lemmatizer") lookup table] via the + | #[code lemma_lookup] variable, set on the individual #[code Language] + | class. + ++aside-code("Example"). + lookup = {u'going': u'go'} + lemmatizer = Lemmatizer(lookup=lookup) + assert lemmatizer.lookup(u'going') == u'go' + ++table(["Name", "Type", "Description"]) + +row + +cell #[code string] + +cell unicode + +cell The string to look up. + + +row("foot") + +cell returns + +cell unicode + +cell The lemma if the string was found, otherwise the original string. 
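As a small addition to the `lookup` documentation above, here is a sketch of the fallback behaviour it describes (the tiny lookup table is purely illustrative):

```python
from spacy.lemmatizer import Lemmatizer

lemmatizer = Lemmatizer(lookup={u'going': u'go'})
assert lemmatizer.lookup(u'going') == u'go'
# Strings missing from the table are returned unchanged.
assert lemmatizer.lookup(u'gone') == u'gone'
```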
+ ++h(2, "is_base_form") Lemmatizer.is_base_form + +tag method + +p + | Check whether we're dealing with an uninflected paradigm, so we can + | avoid lemmatization entirely. + ++aside-code("Example"). + pos = 'verb' + morph = {'VerbForm': 'inf'} + is_base_form = lemmatizer.is_base_form(pos, morph) + assert is_base_form == True + ++table(["Name", "Type", "Description"]) + +row + +cell #[code univ_pos] + +cell unicode / int + +cell The token's universal part-of-speech tag. + + +row + +cell #[code morphology] + +cell dict + +cell The token's morphological features. + + +row("foot") + +cell returns + +cell bool + +cell + | Whether the token's part-of-speech tag and morphological features + | describe a base form. + ++h(2, "attributes") Attributes + ++table(["Name", "Type", "Description"]) + +row + +cell #[code index] + +cell dict / #[code None] + +cell Inventory of lemmas in the language. + + +row + +cell #[code exc] + +cell dict / #[code None] + +cell Mapping of string forms to lemmas that bypass the #[code rules]. + + +row + +cell #[code rules] + +cell dict / #[code None] + +cell List of suffix rewrite rules. + + +row + +cell #[code lookup_table] + +tag-new(2) + +cell dict / #[code None] + +cell The lemma lookup table, if available. diff --git a/website/api/span.jade b/website/api/span.jade index 6bff45a9b..2a55409f1 100644 --- a/website/api/span.jade +++ b/website/api/span.jade @@ -284,7 +284,7 @@ p Retokenize the document, such that the span is merged into a single token. +aside-code("Example"). doc = nlp(u'I like New York in Autumn.') - span = doc[2:3] + span = doc[2:4] span.merge() assert len(doc) == 6 assert doc[2].text == 'New York' @@ -302,6 +302,25 @@ p Retokenize the document, such that the span is merged into a single token. +cell #[code Token] +cell The newly merged token. ++h(2, "as_doc") Span.as_doc + +p + | Create a #[code Doc] object view of the #[code Span]'s data. Mostly + | useful for C-typed interfaces. + ++aside-code("Example"). + doc = nlp(u'I like New York in Autumn.') + span = doc[2:4] + doc2 = span.as_doc() + assert doc2.text == 'New York' + ++table(["Name", "Type", "Description"]) + +row("foot") + +cell returns + +cell #[code Doc] + +cell A #[code Doc] object of the #[code Span]'s content. + + +h(2, "root") Span.root +tag property +tag-model("parse") diff --git a/website/api/token.jade b/website/api/token.jade index 465d44c66..4062594b4 100644 --- a/website/api/token.jade +++ b/website/api/token.jade @@ -586,6 +586,16 @@ p The L2 norm of the token's vector representation. +cell bool +cell Is the token punctuation? + +row + +cell #[code is_left_punct] + +cell bool + +cell Is the token a left punctuation mark, e.g. #[code (]? + + +row + +cell #[code is_right_punct] + +cell bool + +cell Is the token a right punctuation mark, e.g. #[code )]? + +row +cell #[code is_space] +cell bool @@ -593,6 +603,16 @@ p The L2 norm of the token's vector representation. | Does the token consist of whitespace characters? Equivalent to | #[code token.text.isspace()]. + +row + +cell #[code is_bracket] + +cell bool + +cell Is the token a bracket? + + +row + +cell #[code is_quote] + +cell bool + +cell Is the token a quotation mark? + +row +cell #[code like_url] +cell bool diff --git a/website/api/vectors.jade b/website/api/vectors.jade index a58736506..e08f34643 100644 --- a/website/api/vectors.jade +++ b/website/api/vectors.jade @@ -12,7 +12,7 @@ p p | Create a new vector store. To keep the vector table empty, pass - | #[code data_or_width=0]. 
You can also create the vector table and add
+    | #[code width=0]. You can also create the vector table and add
     | vectors one by one, or set the vector values directly on initialisation.
 
 +aside-code("Example").
@@ -21,11 +21,11 @@ p
     empty_vectors = Vectors(StringStore())
 
-    vectors = Vectors([u'cat'], 300)
+    vectors = Vectors([u'cat'], width=300)
     vectors[u'cat'] = numpy.random.uniform(-1, 1, (300,))
 
     vector_table = numpy.zeros((3, 300), dtype='f')
-    vectors = Vectors(StringStore(), vector_table)
+    vectors = Vectors(StringStore(), data=vector_table)
 
 +table(["Name", "Type", "Description"])
     +row
@@ -36,9 +36,12 @@ p
         | that maps strings to hash values, and vice versa.
 
     +row
-        +cell #[code data_or_width]
-        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] or int
-        +cell Vector data or number of dimensions.
+        +cell #[code data]
+        +cell #[code.u-break numpy.ndarray[ndim=2, dtype='float32']]
+        +cell The vector data.
+
+    +row
+        +cell #[code width]
+        +cell int
+        +cell Number of dimensions.
 
     +row("foot")
         +cell returns
diff --git a/website/assets/css/_components/_code.sass b/website/assets/css/_components/_code.sass
index f83e96d29..eaf0980e1 100644
--- a/website/assets/css/_components/_code.sass
+++ b/website/assets/css/_components/_code.sass
@@ -63,7 +63,6 @@ code
     padding: 0.2rem 0.4rem
     border-radius: 0.25rem
     font-family: $font-code
-    white-space: nowrap
     margin: 0
     box-decoration-break: clone
     white-space: nowrap
diff --git a/website/assets/css/_components/_navigation.sass b/website/assets/css/_components/_navigation.sass
index 0e4af8267..1543de5fb 100644
--- a/website/assets/css/_components/_navigation.sass
+++ b/website/assets/css/_components/_navigation.sass
@@ -14,9 +14,6 @@
     width: 100%
     box-shadow: $box-shadow
 
-    //@include breakpoint(min, md)
-    //    position: fixed
-
     &.is-fixed
         animation: slideInDown 0.5s ease-in-out
         position: fixed
diff --git a/website/usage/_training/_tagger-parser.jade b/website/usage/_training/_tagger-parser.jade
index 4011464c7..a62b9d43e 100644
--- a/website/usage/_training/_tagger-parser.jade
+++ b/website/usage/_training/_tagger-parser.jade
@@ -1,3 +1,7 @@
 //- 💫 DOCS > USAGE > TRAINING > TAGGER & PARSER
 
 +under-construction
+
++h(3, "training-json") JSON format for training
+
+include ../../api/_annotation/_training
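Finally, a short sketch of the reworked `Vectors` constructor documented above. This is not part of the patch; the vocabulary, width and shapes are illustrative, and the behaviour follows the `__init__` changes shown earlier in this diff.

```python
import numpy
from spacy.strings import StringStore
from spacy.vectors import Vectors

# Keep the table empty by passing a width only -- the old positional
# data_or_width argument is gone.
vectors = Vectors(StringStore(), width=300)
assert vectors.data.shape[1] == 300

# Or hand over a pre-built table via the new `data` keyword; rows are
# keyed to the strings passed in.
table = numpy.zeros((2, 300), dtype='f')
vectors = Vectors([u'cat', u'dog'], data=table)
assert vectors.data.shape == table.shape
```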