Mirror of https://github.com/explosion/spaCy.git
Update from develop (commit 075e8118ea)

.github/contributors/ramananbalakrishnan.md (new file, 106 lines, vendored)
@@ -0,0 +1,106 @@
# spaCy contributor agreement

This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.

If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.

Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.

## Contributor Agreement

1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.

2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:

    * you hereby assign to us joint ownership, and to the extent that such
    assignment is or becomes invalid, ineffective or unenforceable, you hereby
    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
    royalty-free, unrestricted license to exercise all rights under those
    copyrights. This includes, at our option, the right to sublicense these same
    rights to third parties through multiple levels of sublicensees or other
    licensing arrangements;

    * you agree that each of us can do all things in relation to your
    contribution as if each of us were the sole owners, and if one of us makes
    a derivative work of your contribution, the one who makes the derivative
    work (or has it made) will be the sole owner of that derivative work;

    * you agree that you will not assert any moral rights in your contribution
    against us, our licensees or transferees;

    * you agree that we may register a copyright in your contribution and
    exercise all ownership rights associated with it; and

    * you agree that neither of us has any duty to consult with, obtain the
    consent of, pay or render an accounting to the other for any use or
    distribution of your contribution.

3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:

    * make, have made, use, sell, offer to sell, import, and otherwise transfer
    your contribution in whole or in part, alone or in combination with or
    included in any product, work or materials arising out of the project to
    which your contribution was submitted, and

    * at our option, to sublicense these same rights to third parties through
    multiple levels of sublicensees or other licensing arrangements.

4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.

5. You covenant, represent, warrant and agree that:

    * each contribution that you submit is and shall be an original work of
    authorship and you can legally grant the rights set out in this SCA;

    * to the best of your knowledge, each contribution will not violate any
    third party's copyrights, trademarks, patents, or other intellectual
    property rights; and

    * each contribution shall be in compliance with U.S. export control laws and
    other applicable export and import laws. You agree to notify us if you
    become aware of any circumstance which would make any of the foregoing
    representations inaccurate in any respect. We may publicly disclose your
    participation in the project, including the fact that you have signed the SCA.

6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.

7. Please place an "x" on one of the applicable statements below. Please do NOT
mark both statements:

    * [x] I am signing on behalf of myself as an individual and no other person
    or entity, including my employer, has or will have rights with respect to my
    contributions.

    * [ ] I am signing on behalf of my employer or a legal entity and I have the
    actual authority to contractually bind that entity.

## Contributor Details

| Field                          | Entry                |
|------------------------------- | -------------------- |
| Name                           | Ramanan Balakrishnan |
| Company name (if applicable)   |                      |
| Title or role (if applicable)  |                      |
| Date                           | 2017-10-19           |
| GitHub username                | ramananbalakrishnan  |
| Website (optional)             |                      |
@@ -56,8 +56,7 @@ def train_ner(nlp, train_data, output_dir):
         losses = {}
         for batch in minibatch(get_gold_parses(nlp.make_doc, train_data), size=3):
             docs, golds = zip(*batch)
-            nlp.update(docs, golds, losses=losses, sgd=optimizer, update_shared=True,
-                       drop=0.35)
+            nlp.update(docs, golds, losses=losses, sgd=optimizer, drop=0.35)
         print(losses)
     if not output_dir:
         return

@@ -100,9 +99,10 @@ def main(model_name, output_directory=None):
         )
 
     ]
-    nlp.pipeline.append(TokenVectorEncoder(nlp.vocab))
-    nlp.pipeline.append(NeuralEntityRecognizer(nlp.vocab))
-    nlp.pipeline[-1].add_label('ANIMAL')
+    nlp.add_pipe(TokenVectorEncoder(nlp.vocab))
+    ner = NeuralEntityRecognizer(nlp.vocab)
+    ner.add_label('ANIMAL')
+    nlp.add_pipe(ner)
     train_ner(nlp, train_data, output_directory)
 
     # Test that the entity is recognized
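The two hunks above move the example off direct mutation of nlp.pipeline and onto the add_pipe() API, and drop the update_shared argument from nlp.update(). A rough sketch of the resulting setup, assuming the spacy-nightly 2.0.0a18 API this example targets; nlp, optimizer, train_data, minibatch, get_gold_parses, TokenVectorEncoder and NeuralEntityRecognizer are taken from the example script, and the epoch count is made up:

# Sketch only: mirrors the example after this commit, not a drop-in script.
nlp.add_pipe(TokenVectorEncoder(nlp.vocab))   # tensorizer component
ner = NeuralEntityRecognizer(nlp.vocab)
ner.add_label('ANIMAL')                       # register the new label before training
nlp.add_pipe(ner)

for itn in range(10):                         # hypothetical number of epochs
    losses = {}
    for batch in minibatch(get_gold_parses(nlp.make_doc, train_data), size=3):
        docs, golds = zip(*batch)
        # update_shared=True was removed from this call in the commit
        nlp.update(docs, golds, losses=losses, sgd=optimizer, drop=0.35)
    print(losses)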
							
								
								
									
examples/training/training-data.json (new file, 641 lines)
@@ -0,0 +1,641 @@
[
  {
    "id": "wsj_0200",
    "paragraphs": [
      {
        "raw": "In an Oct. 19 review of \"The Misanthrope\" at Chicago's Goodman Theatre (\"Revitalized Classics Take the Stage in Windy City,\" Leisure & Arts), the role of Celimene, played by Kim Cattrall, was mistakenly attributed to Christina Haag. Ms. Haag plays Elianti.",
        "sentences": [
          {
            "tokens": [
              {"head": 44, "dep": "prep", "tag": "IN", "orth": "In", "ner": "O", "id": 0},
              {"head": 3, "dep": "det", "tag": "DT", "orth": "an", "ner": "O", "id": 1},
              {"head": 2, "dep": "nmod", "tag": "NNP", "orth": "Oct.", "ner": "B-DATE", "id": 2},
              {"head": -1, "dep": "nummod", "tag": "CD", "orth": "19", "ner": "L-DATE", "id": 3},
              {"head": -4, "dep": "pobj", "tag": "NN", "orth": "review", "ner": "O", "id": 4},
              {"head": -1, "dep": "prep", "tag": "IN", "orth": "of", "ner": "O", "id": 5},
              {"head": 2, "dep": "punct", "tag": "``", "orth": "``", "ner": "O", "id": 6},
              {"head": 1, "dep": "det", "tag": "DT", "orth": "The", "ner": "B-WORK_OF_ART", "id": 7},
              {"head": -3, "dep": "pobj", "tag": "NN", "orth": "Misanthrope", "ner": "L-WORK_OF_ART", "id": 8},
              {"head": -1, "dep": "punct", "tag": "''", "orth": "''", "ner": "O", "id": 9},
              {"head": -2, "dep": "prep", "tag": "IN", "orth": "at", "ner": "O", "id": 10},
              {"head": 3, "dep": "poss", "tag": "NNP", "orth": "Chicago", "ner": "U-GPE", "id": 11},
              {"head": -1, "dep": "case", "tag": "POS", "orth": "'s", "ner": "O", "id": 12},
              {"head": 1, "dep": "compound", "tag": "NNP", "orth": "Goodman", "ner": "B-FAC", "id": 13},
              {"head": -4, "dep": "pobj", "tag": "NNP", "orth": "Theatre", "ner": "L-FAC", "id": 14},
              {"head": 4, "dep": "punct", "tag": "-LRB-", "orth": "(", "ner": "O", "id": 15},
              {"head": 3, "dep": "punct", "tag": "``", "orth": "``", "ner": "O", "id": 16},
              {"head": 1, "dep": "amod", "tag": "VBN", "orth": "Revitalized", "ner": "B-WORK_OF_ART", "id": 17},
              {"head": 1, "dep": "nsubj", "tag": "NNS", "orth": "Classics", "ner": "I-WORK_OF_ART", "id": 18},
              {"head": -15, "dep": "appos", "tag": "VBP", "orth": "Take", "ner": "I-WORK_OF_ART", "id": 19},
              {"head": 1, "dep": "det", "tag": "DT", "orth": "the", "ner": "I-WORK_OF_ART", "id": 20},
              {"head": -2, "dep": "dobj", "tag": "NN", "orth": "Stage", "ner": "I-WORK_OF_ART", "id": 21},
              {"head": -3, "dep": "prep", "tag": "IN", "orth": "in", "ner": "I-WORK_OF_ART", "id": 22},
              {"head": 1, "dep": "compound", "tag": "NNP", "orth": "Windy", "ner": "I-WORK_OF_ART", "id": 23},
              {"head": -2, "dep": "pobj", "tag": "NNP", "orth": "City", "ner": "L-WORK_OF_ART", "id": 24},
              {"head": -6, "dep": "punct", "tag": ",", "orth": ",", "ner": "O", "id": 25},
              {"head": -7, "dep": "punct", "tag": "''", "orth": "''", "ner": "O", "id": 26},
              {"head": -8, "dep": "npadvmod", "tag": "NN", "orth": "Leisure", "ner": "B-ORG", "id": 27},
              {"head": -1, "dep": "cc", "tag": "CC", "orth": "&", "ner": "I-ORG", "id": 28},
              {"head": -2, "dep": "conj", "tag": "NNS", "orth": "Arts", "ner": "L-ORG", "id": 29},
              {"head": -11, "dep": "punct", "tag": "-RRB-", "orth": ")", "ner": "O", "id": 30},
              {"head": 13, "dep": "punct", "tag": ",", "orth": ",", "ner": "O", "id": 31},
              {"head": 1, "dep": "det", "tag": "DT", "orth": "the", "ner": "O", "id": 32},
              {"head": 11, "dep": "nsubjpass", "tag": "NN", "orth": "role", "ner": "O", "id": 33},
              {"head": -1, "dep": "prep", "tag": "IN", "orth": "of", "ner": "O", "id": 34},
              {"head": -1, "dep": "pobj", "tag": "NNP", "orth": "Celimene", "ner": "U-PERSON", "id": 35},
              {"head": -3, "dep": "punct", "tag": ",", "orth": ",", "ner": "O", "id": 36},
              {"head": -4, "dep": "acl", "tag": "VBN", "orth": "played", "ner": "O", "id": 37},
              {"head": -1, "dep": "agent", "tag": "IN", "orth": "by", "ner": "O", "id": 38},
              {"head": 1, "dep": "compound", "tag": "NNP", "orth": "Kim", "ner": "B-PERSON", "id": 39},
              {"head": -2, "dep": "pobj", "tag": "NNP", "orth": "Cattrall", "ner": "L-PERSON", "id": 40},
              {"head": -8, "dep": "punct", "tag": ",", "orth": ",", "ner": "O", "id": 41},
              {"head": 2, "dep": "auxpass", "tag": "VBD", "orth": "was", "ner": "O", "id": 42},
              {"head": 1, "dep": "advmod", "tag": "RB", "orth": "mistakenly", "ner": "O", "id": 43},
              {"head": 0, "dep": "root", "tag": "VBN", "orth": "attributed", "ner": "O", "id": 44},
              {"head": -1, "dep": "prep", "tag": "IN", "orth": "to", "ner": "O", "id": 45},
              {"head": 1, "dep": "compound", "tag": "NNP", "orth": "Christina", "ner": "B-PERSON", "id": 46},
              {"head": -2, "dep": "pobj", "tag": "NNP", "orth": "Haag", "ner": "L-PERSON", "id": 47},
              {"head": -4, "dep": "punct", "tag": ".", "orth": ".", "ner": "O", "id": 48}
            ],
            "brackets": [
              {"first": 2, "last": 3, "label": "NML"},
              {"first": 1, "last": 4, "label": "NP"},
              {"first": 7, "last": 8, "label": "NP-TTL"},
              {"first": 11, "last": 12, "label": "NP"},
              {"first": 11, "last": 14, "label": "NP"},
              {"first": 10, "last": 14, "label": "PP-LOC"},
              {"first": 6, "last": 14, "label": "NP"},
              {"first": 5, "last": 14, "label": "PP"},
              {"first": 1, "last": 14, "label": "NP"},
              {"first": 17, "last": 18, "label": "NP-SBJ"},
              {"first": 20, "last": 21, "label": "NP"},
              {"first": 23, "last": 24, "label": "NP"},
              {"first": 22, "last": 24, "label": "PP-LOC"},
              {"first": 19, "last": 24, "label": "VP"},
              {"first": 17, "last": 24, "label": "S-HLN"},
              {"first": 27, "last": 29, "label": "NP-TMP"},
              {"first": 15, "last": 30, "label": "NP"},
              {"first": 1, "last": 30, "label": "NP"},
              {"first": 0, "last": 30, "label": "PP-LOC"},
              {"first": 32, "last": 33, "label": "NP"},
              {"first": 35, "last": 35, "label": "NP"},
              {"first": 34, "last": 35, "label": "PP"},
              {"first": 32, "last": 35, "label": "NP"},
              {"first": 39, "last": 40, "label": "NP-LGS"},
              {"first": 38, "last": 40, "label": "PP"},
              {"first": 37, "last": 40, "label": "VP"},
              {"first": 32, "last": 41, "label": "NP-SBJ-2"},
              {"first": 43, "last": 43, "label": "ADVP-MNR"},
              {"first": 46, "last": 47, "label": "NP"},
              {"first": 45, "last": 47, "label": "PP-CLR"},
              {"first": 44, "last": 47, "label": "VP"},
              {"first": 42, "last": 47, "label": "VP"},
              {"first": 0, "last": 48, "label": "S"}
            ]
          },
          {
            "tokens": [
              {"head": 1, "dep": "compound", "tag": "NNP", "orth": "Ms.", "ner": "O", "id": 0},
              {"head": 1, "dep": "nsubj", "tag": "NNP", "orth": "Haag", "ner": "U-PERSON", "id": 1},
              {"head": 0, "dep": "root", "tag": "VBZ", "orth": "plays", "ner": "O", "id": 2},
              {"head": -1, "dep": "dobj", "tag": "NNP", "orth": "Elianti", "ner": "U-PERSON", "id": 3},
              {"head": -2, "dep": "punct", "tag": ".", "orth": ".", "ner": "O", "id": 4}
            ],
            "brackets": [
              {"first": 0, "last": 1, "label": "NP-SBJ"},
              {"first": 3, "last": 3, "label": "NP"},
              {"first": 2, "last": 3, "label": "VP"},
              {"first": 0, "last": 4, "label": "S"}
            ]
          }
        ]
      }
    ]
  }
]
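The file stores one document per top-level entry, with token-level dependency, tag and NER annotations. The "head" values are offsets relative to the token's own "id" (the root points to itself with 0), and "ner" uses BILUO tags. A minimal sketch of reading the file, assuming it is loaded from the path added in this commit:

import json

with open("examples/training/training-data.json", encoding="utf8") as f:
    corpus = json.load(f)

for doc in corpus:
    for para in doc["paragraphs"]:
        for sent in para["sentences"]:
            tokens = sent["tokens"]
            for token in tokens:
                head = tokens[token["id"] + token["head"]]  # relative offset -> absolute index
                print(token["orth"], token["tag"], token["dep"],
                      "-> head:", head["orth"], "| ner:", token["ner"])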
							
								
								
									
spacy/_ml.py (19 lines changed)
@@ -112,9 +112,10 @@ def _preprocess_doc(docs, drop=0.):
     nO=Dimension("Output size"),
     nP=Dimension("Maxout pieces"),
     W=Synapses("Weights matrix",
-        lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI)),
+        lambda obj: (obj.nF, obj.nO, obj.nP, obj.nI) if obj.nP >= 2
+                    else (obj.nF, obj.nO, obj.nI)),
     b=Biases("Bias vector",
-        lambda obj: (obj.nO, obj.nP)),
+        lambda obj: (obj.nO, obj.nP) if obj.nP >= 2 else (obj.nO,)),
     d_W=Gradient("W"),
     d_b=Gradient("b")
 )

@@ -129,16 +130,23 @@ class PrecomputableAffine(Model):
     def begin_update(self, X, drop=0.):
         tensordot = self.ops.xp.tensordot
         ascontiguous = self.ops.xp.ascontiguousarray
-        Yf = tensordot(X, self.W, axes=[[1], [3]])
+        if self.nP == 1:
+            Yf = tensordot(X, self.W, axes=[[1], [2]])
+        else:
+            Yf = tensordot(X, self.W, axes=[[1], [3]])
 
         def backward(dY_ids, sgd=None):
             dY, ids = dY_ids
             Xf = X[ids]
-            dXf = tensordot(dY, self.W, axes=[[1,2], [1,2]])
+            if self.nP == 1:
+                dXf = tensordot(dY, self.W, axes=[[1], [1]])
+            else:
+                dXf = tensordot(dY, self.W, axes=[[1,2], [1,2]])
             dW = tensordot(dY, Xf, axes=[[0], [0]])
             # (o, p, f, i) --> (f, o, p, i)
-            self.d_W += dW.transpose((2, 0, 1, 3))
+            if self.nP == 1:
+                self.d_W += dW.transpose((1, 0, 2))
+            else:
+                self.d_W += dW.transpose((2, 0, 1, 3))
             self.d_b += dY.sum(axis=0)
 

@@ -169,6 +177,9 @@ class PrecomputableAffine(Model):
 
         def predict(ids, tokvecs):
             hiddens = model(tokvecs)
-            vector = model.ops.allocate((hiddens.shape[0], model.nO, model.nP))
+            if model.nP == 1:
+                vector = model.ops.allocate((hiddens.shape[0], model.nO))
+            else:
+                vector = model.ops.allocate((hiddens.shape[0], model.nO, model.nP))
             model.ops.scatter_add(vector, ids, hiddens)
             vector += model.b
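The hunks above let PrecomputableAffine run without maxout: when nP == 1 the pieces axis is dropped from W and b, so the tensordot contraction axis and the gradient transpose change accordingly. A small standalone numpy sketch (made-up sizes; numpy stands in for self.ops.xp) of why the input axis of W moves from 3 to 2:

import numpy as np

n, nF, nO, nP, nI = 4, 3, 8, 2, 5            # hypothetical batch/feature/output/piece/input sizes
X = np.zeros((n, nI))

W_maxout = np.zeros((nF, nO, nP, nI))        # nP >= 2: weights keep a pieces axis
Yf = np.tensordot(X, W_maxout, axes=[[1], [3]])
assert Yf.shape == (n, nF, nO, nP)

W_plain = np.zeros((nF, nO, nI))             # nP == 1: pieces axis dropped
Yf1 = np.tensordot(X, W_plain, axes=[[1], [2]])
assert Yf1.shape == (n, nF, nO)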
@@ -3,7 +3,7 @@
 # https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py
 
 __title__ = 'spacy-nightly'
-__version__ = '2.0.0a17'
+__version__ = '2.0.0a18'
 __summary__ = 'Industrial-strength Natural Language Processing (NLP) with Python and Cython'
 __uri__ = 'https://spacy.io'
 __author__ = 'Explosion AI'
@@ -101,7 +101,7 @@ def generate_meta():
 def generate_pipeline():
     prints("If set to 'True', the default pipeline is used. If set to 'False', "
            "the pipeline will be disabled. Components should be specified as a "
-           "comma-separated list of component names, e.g. tensorizer, tagger, "
+           "comma-separated list of component names, e.g. tagger, "
            "parser, ner. For more information, see the docs on processing pipelines.",
            title="Enter your model's pipeline components")
     pipeline = util.get_raw_input("Pipeline components", True)
@@ -62,5 +62,5 @@ TAG_MAP = {
     "VVIZU":    {POS: VERB, "VerbForm": "inf"},
     "VVPP":     {POS: VERB, "Aspect": "perf", "VerbForm": "part"},
     "XY":       {POS: X},
-    "SP":       {POS: SPACE}
+    "_SP":      {POS: SPACE}
 }
@@ -42,6 +42,7 @@ TAG_MAP = {
     "RBR":      {POS: ADV, "Degree": "comp"},
     "RBS":      {POS: ADV, "Degree": "sup"},
     "RP":       {POS: PART},
+    "SP":       {POS: SPACE},
     "SYM":      {POS: SYM},
     "TO":       {POS: PART, "PartType": "inf", "VerbForm": "inf"},
     "UH":       {POS: INTJ},

@@ -55,11 +56,11 @@ TAG_MAP = {
     "WP":       {POS: NOUN, "PronType": "int|rel"},
     "WP$":      {POS: ADJ, "Poss": "yes", "PronType": "int|rel"},
     "WRB":      {POS: ADV, "PronType": "int|rel"},
-    "SP":       {POS: SPACE},
     "ADD":      {POS: X},
     "NFP":      {POS: PUNCT},
     "GW":       {POS: X},
     "XX":       {POS: X},
     "BES":      {POS: VERB},
-    "HVS":      {POS: VERB}
+    "HVS":      {POS: VERB},
+    "_SP":       {POS: SPACE},
 }
@@ -303,5 +303,5 @@ TAG_MAP = {
     "VERB__VerbForm=Ger": {"morph": "VerbForm=Ger", "pos": "VERB"},
     "VERB__VerbForm=Inf": {"morph": "VerbForm=Inf", "pos": "VERB"},
     "X___": {"morph": "_", "pos": "X"},
-    "SP": {"morph": "_", "pos": "SPACE"},
+    "_SP": {"morph": "_", "pos": "SPACE"},
 }
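The tag-map hunks above rename the whitespace tag key from "SP" to "_SP" (the English map also gains a plain "SP" entry). A tag map is simply a dict from a fine-grained tag to a coarse part of speech plus optional morphological features; an illustrative entry, with symbols imported the way spaCy's own tag_map modules do (not taken verbatim from any one of the files above):

from spacy.symbols import POS, SPACE, VERB

# Hypothetical minimal tag map using the new "_SP" key for whitespace tokens.
TAG_MAP = {
    "VVPP": {POS: VERB, "Aspect": "perf", "VerbForm": "part"},
    "_SP":  {POS: SPACE},
}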
@@ -33,8 +33,7 @@ class Japanese(Language):
     Defaults = JapaneseDefaults
 
     def make_doc(self, text):
-        words = self.tokenizer(text)
-        return Doc(self.vocab, words=words, spaces=[False]*len(words))
+        return self.tokenizer(text)
 
 
 __all__ = ['Japanese']
							
								
								
									
 spacy/lang/ja/examples.py (new file, 18 lines)

@@ -0,0 +1,18 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.ja.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    'アップルがイギリスの新興企業を10億ドルで購入を検討',
+    '自動運転車の損害賠償責任、自動車メーカーに一定の負担を求める',
+    '歩道を走る自動配達ロボ、サンフランシスコ市が走行禁止を検討',
+    'ロンドンはイギリスの大都市です。'
+]
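The new examples module can be exercised with a plain Japanese pipeline. A minimal usage sketch (not part of this commit); it assumes the external Japanese tokenizer dependency (e.g. Janome or MeCab) is installed:

    from spacy.lang.ja import Japanese
    from spacy.lang.ja.examples import sentences

    nlp = Japanese()
    for doc in nlp.pipe(sentences):
        print([token.text for token in doc])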
@@ -77,5 +77,6 @@ TAG_MAP = {
     "NEG":      {POS: PART},
     # PUNCT
     "PUNCT":    {POS: PUNCT},
-    "PUNC":    {POS: PUNCT}
+    "PUNC":     {POS: PUNCT},
+    "_SP":      {POS: SPACE}
 }
 spacy/lang/zh/examples.py (new file, 18 lines)

@@ -0,0 +1,18 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.zh.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+
+sentences = [
+    "蘋果公司正考量用一億元買下英國的新創公司",
+    "自駕車將保險責任歸屬轉移至製造商",
+    "舊金山考慮禁止送貨機器人在人行道上行駛",
+    "倫敦是英國的大城市"
+]
@@ -7,8 +7,8 @@ from .symbols import VerbForm_inf, VerbForm_none, Number_sing, Degree_pos

 class Lemmatizer(object):
     @classmethod
-    def load(cls, path, index=None, exc=None, rules=None):
-        return cls(index or {}, exc or {}, rules or {})
+    def load(cls, path, index=None, exc=None, rules=None, lookup=None):
+        return cls(index or {}, exc or {}, rules or {}, lookup or {})

     def __init__(self, index=None, exceptions=None, rules=None, lookup=None):
         self.index = index if index is not None else {}
@@ -26,10 +26,10 @@ class Lemmatizer(object):
         elif univ_pos in (PUNCT, 'PUNCT', 'punct'):
             univ_pos = 'punct'
         else:
-            return set([string.lower()])
+            return list(set([string.lower()]))
         # See Issue #435 for example of where this logic is requied.
         if self.is_base_form(univ_pos, morphology):
-            return set([string.lower()])
+            return list(set([string.lower()]))
         lemmas = lemmatize(string, self.index.get(univ_pos, {}),
                            self.exc.get(univ_pos, {}),
                            self.rules.get(univ_pos, []))
@@ -108,4 +108,4 @@ def lemmatize(string, index, exceptions, rules):
         forms.extend(oov_forms)
     if not forms:
         forms.append(string)
-    return set(forms)
+    return list(set(forms))
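Returning list(set(...)) instead of a raw set keeps the de-duplication but gives callers an indexable, list-comparable result. A small illustration of the caller-side effect (hypothetical values, not from this commit):

    # With the old set return type, tests had to wrap expectations in set():
    assert set(['work']) == set(['work'])
    # With the new list return type, a plain list comparison works:
    assert list(set(['work'])) == ['work']
    # and the result can be indexed or sorted, e.g. sorted(lemmas)[0]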
@@ -69,6 +69,7 @@ cdef enum action_t:
     REPEAT
     ACCEPT
     ADVANCE_ZERO
+    ACCEPT_PREV
     PANIC

 # A "match expression" conists of one or more token patterns
@@ -120,24 +121,27 @@ cdef attr_t get_pattern_key(const TokenPatternC* pattern) except 0:


 cdef int get_action(const TokenPatternC* pattern, const TokenC* token) nogil:
+    lookahead = &pattern[1]
     for attr in pattern.attrs[:pattern.nr_attr]:
         if get_token_attr(token, attr.attr) != attr.value:
             if pattern.quantifier == ONE:
                 return REJECT
             elif pattern.quantifier == ZERO:
-                return ACCEPT if (pattern+1).nr_attr == 0 else ADVANCE
+                return ACCEPT if lookahead.nr_attr == 0 else ADVANCE
             elif pattern.quantifier in (ZERO_ONE, ZERO_PLUS):
-                return ACCEPT if (pattern+1).nr_attr == 0 else ADVANCE_ZERO
+                return ACCEPT_PREV if lookahead.nr_attr == 0 else ADVANCE_ZERO
             else:
                 return PANIC
     if pattern.quantifier == ZERO:
         return REJECT
+    elif lookahead.nr_attr == 0:
+        return ACCEPT
     elif pattern.quantifier in (ONE, ZERO_ONE):
-        return ACCEPT if (pattern+1).nr_attr == 0 else ADVANCE
+        return ADVANCE
     elif pattern.quantifier == ZERO_PLUS:
         # This is a bandaid over the 'shadowing' problem described here:
         # https://github.com/explosion/spaCy/issues/864
-        next_action = get_action(pattern+1, token)
+        next_action = get_action(lookahead, token)
         if next_action is REJECT:
             return REPEAT
         else:
@@ -345,6 +349,9 @@ cdef class Matcher:
                 while action == ADVANCE_ZERO:
                     state.second += 1
                     action = get_action(state.second, token)
+                if action == PANIC:
+                    raise Exception("Error selecting action in matcher")
+
                 if action == REPEAT:
                     # Leave the state in the queue, and advance to next slot
                     # (i.e. we don't overwrite -- we want to greedily match more
@@ -356,14 +363,15 @@ cdef class Matcher:
                     partials[q] = state
                     partials[q].second += 1
                     q += 1
-                elif action == ACCEPT:
+                elif action in (ACCEPT, ACCEPT_PREV):
                     # TODO: What to do about patterns starting with ZERO? Need to
                     # adjust the start position.
                     start = state.first
-                    end = token_i+1
+                    end = token_i+1 if action == ACCEPT else token_i
                     ent_id = state.second[1].attrs[0].value
                     label = state.second[1].attrs[1].value
                     matches.append((ent_id, start, end))
+
             partials.resize(q)
             # Check whether we open any new patterns on this token
             for pattern in self.patterns:
@@ -383,15 +391,15 @@ cdef class Matcher:
                     state.first = token_i
                     state.second = pattern + 1
                     partials.push_back(state)
-                elif action == ACCEPT:
+                elif action in (ACCEPT, ACCEPT_PREV):
                     start = token_i
-                    end = token_i+1
+                    end = token_i+1 if action == ACCEPT else token_i
                     ent_id = pattern[1].attrs[0].value
                     label = pattern[1].attrs[1].value
                     matches.append((ent_id, start, end))
         # Look for open patterns that are actually satisfied
         for state in partials:
-            while state.second.quantifier in (ZERO, ZERO_PLUS):
+            while state.second.quantifier in (ZERO, ZERO_ONE, ZERO_PLUS):
                 state.second += 1
                 if state.second.nr_attr == 0:
                     start = state.first
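The user-visible effect of ACCEPT_PREV is that a pattern whose final token carries the '*' (or '?') operator can still produce a match when the optional tail is absent. A minimal usage sketch under that assumption, mirroring the regression tests added later in this commit:

    from spacy.matcher import Matcher
    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    vocab = Vocab()
    matcher = Matcher(vocab)
    matcher.add('TSTEND', None, [{'ORTH': 'a'}, {'ORTH': 'b', 'OP': '*'}])

    # 'a c' has no trailing 'b' tokens at all, but the required 'a' still matches.
    doc = Doc(vocab, words=['a', 'c'])
    matches = matcher(doc)   # one match covering just the 'a' token
    assert len(matches) == 1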
@@ -44,7 +44,7 @@ cdef class Morphology:
     cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1


-cpdef enum univ_morph_t:
+cdef enum univ_morph_t:
     NIL = 0
     Animacy_anim = symbols.Animacy_anim
     Animacy_inam
@@ -4,7 +4,7 @@ from __future__ import unicode_literals

 from libc.string cimport memset

-from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT
+from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT, SPACE
 from .attrs cimport POS, IS_SPACE
 from .parts_of_speech import IDS as POS_IDS
 from .lexeme cimport Lexeme
@@ -36,14 +36,22 @@ cdef class Morphology:
     def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None):
         self.mem = Pool()
         self.strings = string_store
+        # Add special space symbol. We prefix with underscore, to make sure it
+        # always sorts to the end.
+        space_attrs = tag_map.get('SP', {POS: SPACE})
+        if '_SP' not in tag_map:
+            self.strings.add('_SP')
+            tag_map = dict(tag_map)
+            tag_map['_SP'] = space_attrs
+        self.tag_names = tuple(sorted(tag_map.keys()))
         self.tag_map = {}
         self.lemmatizer = lemmatizer
         self.n_tags = len(tag_map)
-        self.tag_names = tuple(sorted(tag_map.keys()))
         self.reverse_index = {}

         self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags+1, sizeof(RichTagC))
         for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
+            self.strings.add(tag_str)
             self.tag_map[tag_str] = dict(attrs)
             attrs = _normalize_props(attrs)
             attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
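A rough Python-level sketch (an illustration, not the Cython source) of what the block above does: the language's tag map is copied and given a '_SP' entry, falling back to a plain SPACE tag if no 'SP' entry exists, and the underscore makes the extra tag sort after the ordinary upper-case tag names:

    POS, SPACE = 'POS', 'SPACE'   # stand-ins for the real symbols

    def with_space_tag(tag_map):
        space_attrs = tag_map.get('SP', {POS: SPACE})
        if '_SP' not in tag_map:
            tag_map = dict(tag_map)      # copy, so the caller's map is untouched
            tag_map['_SP'] = space_attrs
        return tag_map

    tag_map = with_space_tag({'NN': {POS: 'NOUN'}})
    assert tag_map['_SP'] == {POS: SPACE}
    assert sorted(tag_map)[-1] == '_SP'   # '_' sorts after the upper-case letters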
@@ -93,7 +101,7 @@ cdef class Morphology:
         # the statistical model fails.
         # Related to Issue #220
         if Lexeme.c_check_flag(token.lex, IS_SPACE):
-            tag_id = self.reverse_index[self.strings.add('SP')]
+            tag_id = self.reverse_index[self.strings.add('_SP')]
         rich_tag = self.rich_tags[tag_id]
         analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
         if analysis is NULL:
 | 
				
			||||||
        cdef unicode py_string = self.strings[orth]
 | 
					        cdef unicode py_string = self.strings[orth]
 | 
				
			||||||
        if self.lemmatizer is None:
 | 
					        if self.lemmatizer is None:
 | 
				
			||||||
            return self.strings.add(py_string.lower())
 | 
					            return self.strings.add(py_string.lower())
 | 
				
			||||||
        cdef set lemma_strings
 | 
					        cdef list lemma_strings
 | 
				
			||||||
        cdef unicode lemma_string
 | 
					        cdef unicode lemma_string
 | 
				
			||||||
        lemma_strings = self.lemmatizer(py_string, univ_pos, morphology)
 | 
					        lemma_strings = self.lemmatizer(py_string, univ_pos, morphology)
 | 
				
			||||||
        lemma_string = sorted(lemma_strings)[0]
 | 
					        lemma_string = sorted(lemma_strings)[0]
 | 
				
			||||||
| 
						 | 
@@ -426,3 +434,7 @@ IDS = {


 NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
+# Unfortunate hack here, to work around problem with long cpdef enum
+# (which is generating an enormous amount of C++ in Cython 0.24+)
+# We keep the enum cdef, and just make sure the names are available to Python
+locals().update(IDS)
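The module keeps the enum values importable from Python even though the enum is now cdef-only: the IDS mapping is written straight into the module namespace. A tiny module-level illustration (hypothetical values, not the real symbol IDs):

    # At module scope, locals() is the module's namespace, so this creates
    # module-level names such as IS_QUOTE that other code can import.
    IDS = {'IS_BRACKET': 1, 'IS_QUOTE': 2}   # hypothetical IDs
    locals().update(IDS)
    assert IS_QUOTE == 2   # works at module scope; it would not inside a function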
@@ -13,12 +13,12 @@ cdef enum symbol_t:
     LIKE_EMAIL
     IS_STOP
     IS_OOV
+    IS_BRACKET
+    IS_QUOTE
+    IS_LEFT_PUNCT
+    IS_RIGHT_PUNCT

-    FLAG14 = 14
-    FLAG15
-    FLAG16
-    FLAG17
-    FLAG18
+    FLAG18 = 18
     FLAG19
     FLAG20
     FLAG21
@@ -455,15 +455,5 @@ cdef enum symbol_t:
     root
     xcomp

-# Move these up to FLAG14--FLAG18 once we finish the functionality
-# and are ready to regenerate the model.
-#IS_BRACKET
-#IS_QUOTE
-#IS_LEFT_PUNCT
-#IS_RIGHT_PUNCT
-
-# These symbols are currently missing. However, if we add them currently,
-# we'll throw off the integer index and the model will have to be retrained.
-# We therefore wait until the next data version to add them.
-# acl
-
+    acl
+    LAW
@@ -18,10 +18,11 @@ IDS = {
     "LIKE_EMAIL": LIKE_EMAIL,
     "IS_STOP": IS_STOP,
     "IS_OOV": IS_OOV,
-    "FLAG14": FLAG14,
-    "FLAG15": FLAG15,
-    "FLAG16": FLAG16,
-    "FLAG17": FLAG17,
+    "IS_BRACKET": IS_BRACKET,
+    "IS_QUOTE": IS_QUOTE,
+    "IS_LEFT_PUNCT": IS_LEFT_PUNCT,
+    "IS_RIGHT_PUNCT": IS_RIGHT_PUNCT,
+
     "FLAG18": FLAG18,
     "FLAG19": FLAG19,
     "FLAG20": FLAG20,
@@ -457,7 +458,10 @@ IDS = {
     "quantmod": quantmod,
     "rcmod": rcmod,
     "root": root,
-    "xcomp": xcomp
+    "xcomp": xcomp,
+
+    "acl": acl,
+    "LAW": LAW
 }

 def sort_nums(x):
@@ -2,6 +2,8 @@ from libc.string cimport memcpy, memset, memmove
 from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint32_t, uint64_t

+from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
+
 from murmurhash.mrmr cimport hash64

 from ..vocab cimport EMPTY_LEXEME
			||||||
        this.shifted = <bint*>calloc(length + (PADDING * 2), sizeof(bint))
 | 
					        this.shifted = <bint*>calloc(length + (PADDING * 2), sizeof(bint))
 | 
				
			||||||
        this._sent = <TokenC*>calloc(length + (PADDING * 2), sizeof(TokenC))
 | 
					        this._sent = <TokenC*>calloc(length + (PADDING * 2), sizeof(TokenC))
 | 
				
			||||||
        this._ents = <Entity*>calloc(length + (PADDING * 2), sizeof(Entity))
 | 
					        this._ents = <Entity*>calloc(length + (PADDING * 2), sizeof(Entity))
 | 
				
			||||||
 | 
					        if not (this._buffer and this._stack and this.shifted
 | 
				
			||||||
 | 
					                and this._sent and this._ents):
 | 
				
			||||||
 | 
					            with gil:
 | 
				
			||||||
 | 
					                PyErr_SetFromErrno(MemoryError)
 | 
				
			||||||
 | 
					                PyErr_CheckSignals()
 | 
				
			||||||
        memset(&this._hist, 0, sizeof(this._hist))
 | 
					        memset(&this._hist, 0, sizeof(this._hist))
 | 
				
			||||||
        this.offset = 0
 | 
					        this.offset = 0
 | 
				
			||||||
        cdef int i
 | 
					        cdef int i
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
@@ -212,7 +212,8 @@ cdef class LeftArc:
 cdef class RightArc:
     @staticmethod
     cdef bint is_valid(const StateC* st, attr_t label) nogil:
-        return st.B_(0).sent_start != 1
+        # If there's (perhaps partial) parse pre-set, don't allow cycle.
+        return st.B_(0).sent_start != 1 and st.H(st.S(0)) != st.B(0)

     @staticmethod
     cdef int transition(StateC* st, attr_t label) nogil:
@@ -446,14 +447,19 @@ cdef class ArcEager(TransitionSystem):

     cdef int initialize_state(self, StateC* st) nogil:
         for i in range(st.length):
+            if st._sent[i].dep == 0:
                 st._sent[i].l_edge = i
                 st._sent[i].r_edge = i
+                st._sent[i].head = 0
+                st._sent[i].dep = 0
+                st._sent[i].l_kids = 0
+                st._sent[i].r_kids = 0
         st.fast_forward()

     cdef int finalize_state(self, StateC* st) nogil:
         cdef int i
         for i in range(st.length):
-            if st._sent[i].head == 0 and st._sent[i].dep == 0:
+            if st._sent[i].head == 0:
                 st._sent[i].dep = self.root_label

     def finalize_doc(self, doc):
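The new initialisation only resets tokens that have no dependency label set, so a (possibly partial) parse already present on the input survives into the transition system; together with the relaxed finalize_state check and the RightArc cycle guard above, pre-set heads can constrain the parser. A Python-level sketch of the rule (an illustration, not the Cython source):

    def initialize_state(tokens):
        # tokens: list of dicts with 'head', 'dep', edge and child-count fields
        for i, tok in enumerate(tokens):
            if tok['dep'] == 0:   # nothing pre-annotated for this token
                tok.update(l_edge=i, r_edge=i, head=0, dep=0, l_kids=0, r_kids=0)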
@@ -1,5 +1,4 @@
 # cython: infer_types=True
-# cython: profile=True
 # cython: cdivision=True
 # cython: boundscheck=False
 # coding: utf-8
@@ -22,7 +21,7 @@ cimport numpy as np

 from libcpp.vector cimport vector
 from cpython.ref cimport PyObject, Py_INCREF, Py_XDECREF
-from cpython.exc cimport PyErr_CheckSignals
+from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
 from libc.stdint cimport uint32_t, uint64_t
 from libc.string cimport memset, memcpy
 from libc.stdlib cimport malloc, calloc, free
@@ -440,6 +439,7 @@ cdef class Parser:
                 self._parseC(states[i],
                     feat_weights, bias, hW, hb,
                     nr_class, nr_hidden, nr_feat, nr_piece)
+        PyErr_CheckSignals()
         return state_objs

     cdef void _parseC(self, StateC* state, 
@@ -450,6 +450,10 @@ cdef class Parser:
         is_valid = <int*>calloc(nr_class, sizeof(int))
         vectors = <float*>calloc(nr_hidden * nr_piece, sizeof(float))
         scores = <float*>calloc(nr_class, sizeof(float))
+        if not (token_ids and is_valid and vectors and scores):
+            with gil:
+                PyErr_SetFromErrno(MemoryError)
+                PyErr_CheckSignals()

         while not state.is_final():
             state.set_context_tokens(token_ids, nr_feat)
@@ -17,6 +17,26 @@ def test_doc_array_attr_of_token(en_tokenizer, en_vocab):
     assert feats_array[0][0] != feats_array[0][1]


+def test_doc_stringy_array_attr_of_token(en_tokenizer, en_vocab):
+    text = "An example sentence"
+    tokens = en_tokenizer(text)
+    example = tokens.vocab["example"]
+    assert example.orth != example.shape
+    feats_array = tokens.to_array((ORTH, SHAPE))
+    feats_array_stringy = tokens.to_array(("ORTH", "SHAPE"))
+    assert feats_array_stringy[0][0] == feats_array[0][0]
+    assert feats_array_stringy[0][1] == feats_array[0][1]
+
+
+def test_doc_scalar_attr_of_token(en_tokenizer, en_vocab):
+    text = "An example sentence"
+    tokens = en_tokenizer(text)
+    example = tokens.vocab["example"]
+    assert example.orth != example.shape
+    feats_array = tokens.to_array(ORTH)
+    assert feats_array.shape == (3,)
+
+
 def test_doc_array_tag(en_tokenizer):
     text = "A nice sentence."
     pos = ['DET', 'ADJ', 'NOUN', 'PUNCT']
@@ -2,6 +2,8 @@
 from __future__ import unicode_literals

 from ..util import get_doc
+from ...tokens import Doc
+from ...vocab import Vocab

 import pytest
 import numpy
@@ -204,19 +206,20 @@ def test_doc_api_right_edge(en_tokenizer):
     assert doc[6].right_edge.text == ','


-@pytest.mark.xfail
-@pytest.mark.parametrize('text,vectors', [
-    ("apple orange pear", ["apple -1 -1 -1", "orange -1 -1 0", "pear -1 0 -1"])
-])
-def test_doc_api_has_vector(en_tokenizer, text_file, text, vectors):
-    text_file.write('\n'.join(vectors))
-    text_file.seek(0)
-    vector_length = en_tokenizer.vocab.load_vectors(text_file)
-    assert vector_length == 3
-
-    doc = en_tokenizer(text)
+def test_doc_api_has_vector():
+    vocab = Vocab()
+    vocab.clear_vectors(2)
+    vocab.vectors.add('kitten', numpy.asarray([0., 2.], dtype='f'))
+    doc = Doc(vocab, words=['kitten'])
     assert doc.has_vector

+
+def test_lowest_common_ancestor(en_tokenizer):
+    tokens = en_tokenizer('the lazy dog slept')
+    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=[2, 1, 1, 0])
+    lca = doc.get_lca_matrix()
+    assert(lca[1, 1] == 1)
+    assert(lca[0, 1] == 2)
+    assert(lca[1, 2] == 2)
+
 def test_parse_tree(en_tokenizer):
     """Tests doc.print_tree() method."""
@@ -3,6 +3,8 @@ from __future__ import unicode_literals

 from ...attrs import IS_ALPHA, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_TITLE, IS_STOP
 from ..util import get_doc
+from ...vocab import Vocab
+from ...tokens import Doc

 import pytest
 import numpy
@@ -68,26 +70,21 @@ def test_doc_token_api_is_properties(en_vocab):
     assert doc[5].like_email


-@pytest.mark.xfail
-@pytest.mark.parametrize('text,vectors', [
-    ("apples oranges ldskbjls", ["apples -1 -1 -1", "oranges -1 -1 0"])
-])
-def test_doc_token_api_vectors(en_tokenizer, text_file, text, vectors):
-    text_file.write('\n'.join(vectors))
-    text_file.seek(0)
-    vector_length = en_tokenizer.vocab.load_vectors(text_file)
-    assert vector_length == 3
+def test_doc_token_api_vectors():
+    vocab = Vocab()
+    vocab.clear_vectors(2)
+    vocab.vectors.add('apples', numpy.asarray([0., 2.], dtype='f'))
+    vocab.vectors.add('oranges', numpy.asarray([0., 1.], dtype='f'))
+    doc = Doc(vocab, words=['apples', 'oranges', 'oov'])
+    assert doc.has_vector

-    tokens = en_tokenizer(text)
-    assert tokens[0].has_vector
-    assert tokens[1].has_vector
-    assert not tokens[2].has_vector
-    assert tokens[0].similarity(tokens[1]) > tokens[0].similarity(tokens[2])
-    assert tokens[0].similarity(tokens[1]) == tokens[1].similarity(tokens[0])
-    assert sum(tokens[0].vector) != sum(tokens[1].vector)
-    assert numpy.isclose(
-        tokens[0].vector_norm,
-        numpy.sqrt(numpy.dot(tokens[0].vector, tokens[0].vector)))
+    assert doc[0].has_vector
+    assert doc[1].has_vector
+    assert not doc[2].has_vector
+    apples_norm = (0*0 + 2*2) ** 0.5
+    oranges_norm = (0*0 + 1*1) ** 0.5
+    cosine = ((0*0) + (2*1)) / (apples_norm * oranges_norm)
+    assert doc[0].similarity(doc[1]) == cosine


 def test_doc_token_api_ancestors(en_tokenizer):
 spacy/tests/regression/test_issue1242.py (new file, 23 lines)

@@ -0,0 +1,23 @@
+from __future__ import unicode_literals
+import pytest
+from ...lang.en import English
+from ...util import load_model
+
+
+def test_issue1242_empty_strings():
+    nlp = English()
+    doc = nlp('')
+    assert len(doc) == 0
+    docs = list(nlp.pipe(['', 'hello']))
+    assert len(docs[0]) == 0
+    assert len(docs[1]) == 1
+
+
+@pytest.mark.models('en')
+def test_issue1242_empty_strings_en_core_web_sm():
+    nlp = load_model('en_core_web_sm')
+    doc = nlp('')
+    assert len(doc) == 0
+    docs = list(nlp.pipe(['', 'hello']))
+    assert len(docs[0]) == 0
+    assert len(docs[1]) == 1
 spacy/tests/regression/test_issue1250.py (new file, 13 lines)

@@ -0,0 +1,13 @@
+from __future__ import unicode_literals
+from ...tokenizer import Tokenizer
+from ...symbols import ORTH, LEMMA, POS
+from ...lang.en import English
+
+def test_issue1250_cached_special_cases():
+    nlp = English()
+    nlp.tokenizer.add_special_case(u'reimbur', [{ORTH: u'reimbur', LEMMA: u'reimburse', POS: u'VERB'}])
+
+    lemmas = [w.lemma_ for w in nlp(u'reimbur, reimbur...')]
+    assert lemmas == ['reimburse', ',', 'reimburse', '...']
+    lemmas = [w.lemma_ for w in nlp(u'reimbur, reimbur...')]
+    assert lemmas == ['reimburse', ',', 'reimburse', '...']
 spacy/tests/regression/test_issue1253.py (new file, 20 lines)

@@ -0,0 +1,20 @@
+from __future__ import unicode_literals
+import pytest
+import spacy
+
+
+def ss(tt):
+    for i in range(len(tt)-1):
+        for j in range(i+1, len(tt)):
+            tt[i:j].root
+
+
+@pytest.mark.models('en')
+def test_access_parse_for_merged():
+    nlp = spacy.load('en_core_web_sm')
+    t_t = nlp.tokenizer("Highly rated - I'll definitely")
+    nlp.tagger(t_t)
+    nlp.parser(t_t)
+    nlp.parser(t_t)
+    ss(t_t)
+
@@ -1,8 +1,11 @@
 import pytest
+import spacy

-@pytest.mark.models('en')
-def test_issue1305(EN):
+#@pytest.mark.models('en')
+def test_issue1305():
     '''Test lemmatization of English VBZ'''
-    assert EN.vocab.morphology.lemmatizer('works', 'verb') == set(['work'])
-    doc = EN(u'This app works well')
+    nlp = spacy.load('en_core_web_sm')
+    assert nlp.vocab.morphology.lemmatizer('works', 'verb') == ['work']
+    doc = nlp(u'This app works well')
+    print([(w.text, w.tag_) for w in doc])
     assert doc[2].lemma_ == 'work'
 spacy/tests/regression/test_issue1375.py (new file, 16 lines)

@@ -0,0 +1,16 @@
+from __future__ import unicode_literals
+import pytest
+from ...vocab import Vocab
+from ...tokens.doc import Doc
+
+
+def test_issue1375():
+    '''Test that token.nbor() raises IndexError for out-of-bounds access.'''
+    doc = Doc(Vocab(), words=['0', '1', '2'])
+    with pytest.raises(IndexError):
+        assert doc[0].nbor(-1)
+    assert doc[1].nbor(-1).text == '0'
+    with pytest.raises(IndexError):
+        assert doc[2].nbor(1)
+    assert doc[1].nbor(1).text == '2'
+
 spacy/tests/regression/test_issue1434.py (new file, 22 lines)

@@ -0,0 +1,22 @@
+from __future__ import unicode_literals
+
+from ...vocab import Vocab
+from ...lang.lex_attrs import LEX_ATTRS
+from ...tokens import Doc
+from ...matcher import Matcher
+
+
+def test_issue1434():
+    '''Test matches occur when optional element at end of short doc'''
+    vocab = Vocab(lex_attr_getters=LEX_ATTRS)
+    hello_world = Doc(vocab, words=['Hello', 'World'])
+    hello = Doc(vocab, words=['Hello'])
+
+    matcher = Matcher(vocab)
+    matcher.add('MyMatcher', None,
+        [ {'ORTH': 'Hello' }, {'IS_ALPHA': True, 'OP': '?'} ])
+
+    matches = matcher(hello_world)
+    assert matches
+    matches = matcher(hello)
+    assert matches
 spacy/tests/regression/test_issue1450.py (new file, 58 lines)

@@ -0,0 +1,58 @@
+from __future__ import unicode_literals
+import pytest
+
+from ...matcher import Matcher
+from ...tokens import Doc
+from ...vocab import Vocab
+
+
+@pytest.mark.parametrize(
+    'string,start,end',
+    [
+        ('a', 0, 1),
+        ('a b', 0, 2),
+        ('a c', 0, 1),
+        ('a b c', 0, 2),
+        ('a b b c', 0, 2),
+        ('a b b', 0, 2),
+    ]
+)
+def test_issue1450_matcher_end_zero_plus(string, start, end):
+    '''Test matcher works when patterns end with * operator.
+
+    Original example (rewritten to avoid model usage)
+
+    nlp = spacy.load('en_core_web_sm')
+    matcher = Matcher(nlp.vocab)
+    matcher.add(
+        "TSTEND",
+        on_match_1,
+        [
+            {TAG: "JJ", LOWER: "new"},
+            {TAG: "NN", 'OP': "*"}
+        ]
+    )
+    doc = nlp(u'Could you create a new ticket for me?')
+    print([(w.tag_, w.text, w.lower_) for w in doc])
+    matches = matcher(doc)
+    print(matches)
+    assert len(matches) == 1
+    assert matches[0][1] == 4
+    assert matches[0][2] == 5
+    '''
+    matcher = Matcher(Vocab())
+    matcher.add(
+        "TSTEND",
+        None,
+        [
+            {'ORTH': "a"},
+            {'ORTH': "b", 'OP': "*"}
+        ]
+    )
+    doc = Doc(Vocab(), words=string.split())
+    matches = matcher(doc)
+    if start is None or end is None:
+        assert matches == []
+
+    assert matches[0][1] == start
+    assert matches[0][2] == end
@@ -9,4 +9,4 @@ import pytest
 @pytest.mark.parametrize('word,lemmas', [("chromosomes", ["chromosome"]), ("endosomes", ["endosome"]), ("colocalizes", ["colocalize", "colocaliz"])])
 def test_issue781(EN, word, lemmas):
     lemmatizer = EN.Defaults.create_lemmatizer()
-    assert lemmatizer(word, 'noun', morphology={'number': 'plur'}) == set(lemmas)
+    assert lemmatizer(word, 'noun', morphology={'number': 'plur'}) == lemmas
@@ -55,6 +55,17 @@ def test_spans_span_sent(doc):
     assert doc[6:7].sent.root.left_edge.text == 'This'


+def test_spans_lca_matrix(en_tokenizer):
+    """Test span's lca matrix generation"""
+    tokens = en_tokenizer('the lazy dog slept')
+    doc = get_doc(tokens.vocab, [t.text for t in tokens], heads=[2, 1, 1, 0])
+    lca = doc[:2].get_lca_matrix()
+    assert(lca[0, 0] == 0)
+    assert(lca[0, 1] == -1)
+    assert(lca[1, 0] == -1)
+    assert(lca[1, 1] == 1)
+
+
 def test_spans_default_sentiment(en_tokenizer):
     """Test span.sentiment property's default averaging behaviour"""
     text = "good stuff bad stuff"
@@ -106,3 +117,9 @@ def test_span_to_array(doc):
     assert arr[0, 0] == span[0].orth
     assert arr[0, 1] == len(span[0])

+
+@pytest.mark.xfail
+def test_span_as_doc(doc):
+    span = doc[4:10]
+    span_doc = span.as_doc()
+    assert span.text == span_doc.text
@@ -3,6 +3,7 @@ from __future__ import unicode_literals

 from ..matcher import Matcher, PhraseMatcher
 from .util import get_doc
+from ..tokens import Doc

 import pytest

@@ -212,3 +213,24 @@ def test_operator_combos(matcher):
             assert matches, (string, pattern_str)
         else:
             assert not matches, (string, pattern_str)
+
+
+def test_matcher_end_zero_plus(matcher):
+    '''Test matcher works when patterns end with * operator. (issue 1450)'''
+    matcher = Matcher(matcher.vocab)
+    matcher.add(
+        "TSTEND",
+        None,
+        [
+            {'ORTH': "a"},
+            {'ORTH': "b", 'OP': "*"}
+        ]
+    )
+    nlp = lambda string: Doc(matcher.vocab, words=string.split())
+    assert len(matcher(nlp(u'a'))) == 1
+    assert len(matcher(nlp(u'a b'))) == 1
+    assert len(matcher(nlp(u'a b'))) == 1
+    assert len(matcher(nlp(u'a c'))) == 1
+    assert len(matcher(nlp(u'a b c'))) == 1
+    assert len(matcher(nlp(u'a b b c'))) == 1
+    assert len(matcher(nlp(u'a b b'))) == 1
@@ -35,18 +35,18 @@ def vocab(en_vocab, vectors):


 def test_init_vectors_with_data(strings, data):
-    v = Vectors(strings, data)
+    v = Vectors(strings, data=data)
     assert v.shape == data.shape

 def test_init_vectors_with_width(strings):
-    v = Vectors(strings, 3)
+    v = Vectors(strings, width=3)
     for string in strings:
         v.add(string)
     assert v.shape == (len(strings), 3)


 def test_get_vector(strings, data):
-    v = Vectors(strings, data)
+    v = Vectors(strings, data=data)
     for string in strings:
         v.add(string)
     assert list(v[strings[0]]) == list(data[0])
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_set_vector(strings, data):
 | 
					def test_set_vector(strings, data):
 | 
				
			||||||
    orig = data.copy()
 | 
					    orig = data.copy()
 | 
				
			||||||
    v = Vectors(strings, data)
 | 
					    v = Vectors(strings, data=data)
 | 
				
			||||||
    for string in strings:
 | 
					    for string in strings:
 | 
				
			||||||
        v.add(string)
 | 
					        v.add(string)
 | 
				
			||||||
    assert list(v[strings[0]]) == list(orig[0])
 | 
					    assert list(v[strings[0]]) == list(orig[0])
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
@@ -27,8 +27,9 @@ cdef class Tokenizer:
     cdef int _try_cache(self, hash_t key, Doc tokens) except -1
     cdef int _tokenize(self, Doc tokens, unicode span, hash_t key) except -1
     cdef unicode _split_affixes(self, Pool mem, unicode string, vector[LexemeC*] *prefixes,
-                             vector[LexemeC*] *suffixes)
+                             vector[LexemeC*] *suffixes, int* has_special)
     cdef int _attach_tokens(self, Doc tokens, unicode string,
                             vector[LexemeC*] *prefixes, vector[LexemeC*] *suffixes) except -1
 
-    cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1
+    cdef int _save_cached(self, const TokenC* tokens, hash_t key, int has_special,
+                          int n) except -1

@@ -20,7 +20,8 @@ cdef class Tokenizer:
     """Segment text, and create Doc objects with the discovered segment
     boundaries.
     """
-    def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer, token_match=None):
+    def __init__(self, Vocab vocab, rules=None, prefix_search=None,
+            suffix_search=None, infix_finditer=None, token_match=None):
         """Create a `Tokenizer`, to create `Doc` objects given unicode text.
 
         vocab (Vocab): A storage container for lexical types.
@@ -48,6 +49,7 @@ cdef class Tokenizer:
         self.infix_finditer = infix_finditer
         self.vocab = vocab
         self._rules = {}
+        if rules is not None:
             for chunk, substrings in sorted(rules.items()):
                 self.add_special_case(chunk, substrings)
 
@@ -148,14 +150,18 @@ cdef class Tokenizer:
         cdef vector[LexemeC*] prefixes
         cdef vector[LexemeC*] suffixes
         cdef int orig_size
+        cdef int has_special
         orig_size = tokens.length
-        span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes)
+        span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes,
+                                   &has_special)
         self._attach_tokens(tokens, span, &prefixes, &suffixes)
-        self._save_cached(&tokens.c[orig_size], orig_key, tokens.length - orig_size)
+        self._save_cached(&tokens.c[orig_size], orig_key, has_special,
+                          tokens.length - orig_size)
 
     cdef unicode _split_affixes(self, Pool mem, unicode string,
                                 vector[const LexemeC*] *prefixes,
-                                vector[const LexemeC*] *suffixes):
+                                vector[const LexemeC*] *suffixes,
+                                int* has_special):
         cdef size_t i
         cdef unicode prefix
         cdef unicode suffix
@@ -174,6 +180,7 @@ cdef class Tokenizer:
                 if minus_pre and self._specials.get(hash_string(minus_pre)) != NULL:
                     string = minus_pre
                     prefixes.push_back(self.vocab.get(mem, prefix))
+                    has_special[0] = 1
                     break
                 if self.token_match and self.token_match(string):
                     break
@@ -185,6 +192,7 @@ cdef class Tokenizer:
                 if minus_suf and (self._specials.get(hash_string(minus_suf)) != NULL):
                     string = minus_suf
                     suffixes.push_back(self.vocab.get(mem, suffix))
+                    has_special[0] = 1
                     break
             if pre_len and suf_len and (pre_len + suf_len) <= len(string):
                 string = string[pre_len:-suf_len]
@@ -197,6 +205,7 @@ cdef class Tokenizer:
                 string = minus_suf
                 suffixes.push_back(self.vocab.get(mem, suffix))
             if string and (self._specials.get(hash_string(string)) != NULL):
+                has_special[0] = 1
                 break
         return string
 
@@ -256,11 +265,15 @@ cdef class Tokenizer:
             preinc(it)
             tokens.push_back(lexeme, False)
 
-    cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1:
+    cdef int _save_cached(self, const TokenC* tokens, hash_t key,
+                          int has_special, int n) except -1:
         cdef int i
         for i in range(n):
             if tokens[i].lex.id == 0:
                 return 0
+        # See https://github.com/explosion/spaCy/issues/1250
+        if has_special:
+            return 0
         cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
         cached.length = n
         cached.is_lex = True

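Editor's note, not part of the diff: a minimal sketch of what the new keyword defaults in `Tokenizer.__init__` allow, namely constructing a bare tokenizer from just a `Vocab` and registering special cases afterwards. It assumes a spaCy install of roughly this version; `ORTH` comes from `spacy.attrs` and the example string is illustrative only.

    from spacy.tokenizer import Tokenizer
    from spacy.vocab import Vocab
    from spacy.attrs import ORTH

    # rules, prefix_search, suffix_search and infix_finditer now default to None
    tokenizer = Tokenizer(Vocab())
    # special cases can still be registered after construction
    tokenizer.add_special_case(u"don't", [{ORTH: u"do"}, {ORTH: u"n't"}])
    assert [t.text for t in tokenizer(u"don't")] == [u"do", u"n't"]
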
@@ -21,7 +21,7 @@ from .token cimport Token
 from .printers import parse_tree
 from ..lexeme cimport Lexeme, EMPTY_LEXEME
 from ..typedefs cimport attr_t, flags_t
-from ..attrs import intify_attrs
+from ..attrs import intify_attrs, IDS
 from ..attrs cimport attr_id_t
 from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 from ..attrs cimport LENGTH, POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
@@ -536,11 +536,15 @@ cdef class Doc:
 
     @cython.boundscheck(False)
     cpdef np.ndarray to_array(self, object py_attr_ids):
-        """Given a list of M attribute IDs, export the tokens to a numpy
-        `ndarray` of shape `(N, M)`, where `N` is the length of the document.
-        The values will be 32-bit integers.
+        """Export given token attributes to a numpy `ndarray`.
 
-        attr_ids (list[int]): A list of attribute ID ints.
+        If `attr_ids` is a sequence of M attributes, the output array will
+        be of shape `(N, M)`, where N is the length of the `Doc`
+        (in tokens). If `attr_ids` is a single attribute, the output shape will
+        be (N,). You can specify attributes by integer ID (e.g. spacy.attrs.LEMMA)
+        or string name (e.g. 'LEMMA' or 'lemma').
+
+        attr_ids (list[]): A list of attributes (int IDs or string names).
         RETURNS (numpy.ndarray[long, ndim=2]): A feature matrix, with one row
             per word, and one column per attribute indicated in the input
             `attr_ids`.
@@ -553,15 +557,25 @@ cdef class Doc:
         """
         cdef int i, j
         cdef attr_id_t feature
+        cdef np.ndarray[attr_t, ndim=1] attr_ids
         cdef np.ndarray[attr_t, ndim=2] output
+        # Handle scalar/list inputs of strings/ints for py_attr_ids
+        if not hasattr(py_attr_ids, '__iter__'):
+            py_attr_ids = [py_attr_ids]
+
+        # Allow strings, e.g. 'lemma' or 'LEMMA'
+        py_attr_ids = [(IDS[id_.upper()] if hasattr(id_, 'upper') else id_)
+                       for id_ in py_attr_ids]
         # Make an array from the attributes --- otherwise our inner loop is Python
         # dict iteration.
-        cdef np.ndarray[attr_t, ndim=1] attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64)
+        attr_ids = numpy.asarray(py_attr_ids, dtype=numpy.uint64)
         output = numpy.ndarray(shape=(self.length, len(attr_ids)), dtype=numpy.uint64)
         for i in range(self.length):
             for j, feature in enumerate(attr_ids):
                 output[i, j] = get_token_attr(&self.c[i], feature)
-        return output
+        # Handle 1d case
+        return output if len(attr_ids) >= 2 else output.reshape((self.length,))
+
 
     def count_by(self, attr_id_t attr_id, exclude=None, PreshCounter counts=None):
         """Count the frequencies of a given attribute. Produces a dict of
@@ -660,6 +674,54 @@ cdef class Doc:
         self.is_tagged = bool(TAG in attrs or POS in attrs)
         return self
 
+    def get_lca_matrix(self):
+        '''
+        Calculates the lowest common ancestor matrix
+        for a given Spacy doc.
+        Returns LCA matrix containing the integer index
+        of the ancestor, or -1 if no common ancestor is
+        found (ex if span excludes a necessary ancestor).
+        Apologies about the recursion, but the
+        impact on performance is negligible given
+        the natural limitations on the depth of a typical human sentence.
+        '''
+        # Efficiency notes:
+        #
+        # We can easily improve the performance here by iterating in Cython.
+        # To loop over the tokens in Cython, the easiest way is:
+        # for token in doc.c[:doc.c.length]:
+        #     head = token + token.head
+        # Both token and head will be TokenC* here. The token.head attribute
+        # is an integer offset.
+        def __pairwise_lca(token_j, token_k, lca_matrix):
+            if lca_matrix[token_j.i][token_k.i] != -2:
+                return lca_matrix[token_j.i][token_k.i]
+            elif token_j == token_k:
+                lca_index = token_j.i
+            elif token_k.head == token_j:
+                lca_index = token_j.i
+            elif token_j.head == token_k:
+                lca_index = token_k.i
+            elif (token_j.head == token_j) and (token_k.head == token_k):
+                lca_index = -1
+            else:
+                lca_index = __pairwise_lca(token_j.head, token_k.head, lca_matrix)
+            lca_matrix[token_j.i][token_k.i] = lca_index
+            lca_matrix[token_k.i][token_j.i] = lca_index
+
+            return lca_index
+
+        lca_matrix = numpy.empty((len(self), len(self)), dtype=numpy.int32)
+        lca_matrix.fill(-2)
+        for j in range(len(self)):
+            token_j = self[j]
+            for k in range(j, len(self)):
+                token_k = self[k]
+                lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix)
+                lca_matrix[k][j] = lca_matrix[j][k]
+
+        return lca_matrix
+
     def to_disk(self, path, **exclude):
         """Save the current state to a directory.
 

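Editor's note, not part of the diff: a short sketch of how the two user-facing `Doc` changes above behave, the more flexible `to_array` input and the new `get_lca_matrix`. The model name `en_core_web_sm` is an assumption; any parsed `Doc` would do.

    import spacy

    nlp = spacy.load('en_core_web_sm')
    doc = nlp(u'Bob ate apples')

    # to_array now accepts string names and a single attribute
    arr_2d = doc.to_array(['LOWER', 'POS'])
    arr_1d = doc.to_array('POS')
    assert arr_2d.shape == (len(doc), 2)
    assert arr_1d.shape == (len(doc),)

    # get_lca_matrix returns an (N, N) matrix of lowest-common-ancestor indices
    lca = doc.get_lca_matrix()
    assert lca.shape == (len(doc), len(doc))
    assert lca[0, 0] == 0    # every token is its own ancestor
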
@@ -129,6 +129,7 @@ cdef class Span:
     def _(self):
         return Underscore(Underscore.span_extensions, self,
                           start=self.start_char, end=self.end_char)
+
     def as_doc(self):
         '''Create a Doc object view of the Span's data.
 
@@ -177,6 +178,56 @@ cdef class Span:
             return 0.0
         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
 
+    def get_lca_matrix(self):
+        '''
+        Calculates the lowest common ancestor matrix
+        for a given Spacy span.
+        Returns LCA matrix containing the integer index
+        of the ancestor, or -1 if no common ancestor is
+        found (ex if span excludes a necessary ancestor).
+        Apologies about the recursion, but the
+        impact on performance is negligible given
+        the natural limitations on the depth of a typical human sentence.
+        '''
+
+        def __pairwise_lca(token_j, token_k, lca_matrix, margins):
+            offset = margins[0]
+            token_k_head = token_k.head if token_k.head.i in range(*margins) else token_k
+            token_j_head = token_j.head if token_j.head.i in range(*margins) else token_j
+            token_j_i = token_j.i - offset
+            token_k_i = token_k.i - offset
+
+            if lca_matrix[token_j_i][token_k_i] != -2:
+                return lca_matrix[token_j_i][token_k_i]
+            elif token_j == token_k:
+                lca_index = token_j_i
+            elif token_k_head == token_j:
+                lca_index = token_j_i
+            elif token_j_head == token_k:
+                lca_index = token_k_i
+            elif (token_j_head == token_j) and (token_k_head == token_k):
+                lca_index = -1
+            else:
+                lca_index = __pairwise_lca(token_j_head, token_k_head, lca_matrix, margins)
+
+            lca_matrix[token_j_i][token_k_i] = lca_index
+            lca_matrix[token_k_i][token_j_i] = lca_index
+
+            return lca_index
+
+        lca_matrix = numpy.empty((len(self), len(self)), dtype=numpy.int32)
+        lca_matrix.fill(-2)
+        margins = [self.start, self.end]
+
+        for j in range(len(self)):
+            token_j = self[j]
+            for k in range(len(self)):
+                token_k = self[k]
+                lca_matrix[j][k] = __pairwise_lca(token_j, token_k, lca_matrix, margins)
+                lca_matrix[k][j] = lca_matrix[j][k]
+
+        return lca_matrix
+
     cpdef np.ndarray to_array(self, object py_attr_ids):
         """Given a list of M attribute IDs, export the tokens to a numpy
         `ndarray` of shape `(N, M)`, where `N` is the length of the document.

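Editor's note, not part of the diff: the `Span.get_lca_matrix` counterpart returns indices relative to the span start, and marks pairs whose common ancestor lies outside the span with -1. A minimal sketch, again assuming `en_core_web_sm` as the model name:

    import spacy

    nlp = spacy.load('en_core_web_sm')
    doc = nlp(u'Bob ate the apples')
    span = doc[1:4]                      # "ate the apples"
    lca = span.get_lca_matrix()
    assert lca.shape == (len(span), len(span))
    assert lca[0, 0] == 0                # indices are relative to the span start
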
@@ -127,6 +127,9 @@ cdef class Token:
         i (int): The relative position of the token to get. Defaults to 1.
         RETURNS (Token): The token at position `self.doc[self.i+i]`.
         """
+        if self.i+i < 0 or (self.i+i >= len(self.doc)):
+            msg = "Error accessing doc[%d].nbor(%d), for doc of length %d"
+            raise IndexError(msg % (self.i, i, len(self.doc)))
         return self.doc[self.i+i]
 
     def similarity(self, other):

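Editor's note, not part of the diff: with the bounds check above, `Token.nbor` now raises `IndexError` for out-of-range neighbours instead of indexing past the document. A short sketch (model name assumed):

    import spacy

    nlp = spacy.load('en_core_web_sm')
    doc = nlp(u'Give it back')
    assert doc[0].nbor().text == u'it'   # defaults to the next token
    try:
        doc[2].nbor(1)                   # index 3 is past the end of the doc
    except IndexError:
        pass                             # raised as expected
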
@@ -32,22 +32,24 @@ cdef class Vectors:
     cdef public object keys
     cdef public int i
 
-    def __init__(self, strings, data_or_width=0):
+    def __init__(self, strings, width=0, data=None):
         if isinstance(strings, StringStore):
             self.strings = strings
         else:
             self.strings = StringStore()
             for string in strings:
                 self.strings.add(string)
-        if isinstance(data_or_width, int):
-            self.data = data = numpy.zeros((len(strings), data_or_width),
-                                           dtype='f')
+        if data is not None:
+            self.data = numpy.asarray(data, dtype='f')
         else:
-            data = data_or_width
+            self.data = numpy.zeros((len(self.strings), width), dtype='f')
         self.i = 0
-        self.data = data
         self.key2row = {}
-        self.keys = np.ndarray((self.data.shape[0],), dtype='uint64')
+        self.keys = numpy.zeros((self.data.shape[0],), dtype='uint64')
+        for i, string in enumerate(self.strings):
+            if i >= self.data.shape[0]:
+                break
+            self.add(self.strings[string], self.data[i])
 
     def __reduce__(self):
         return (Vectors, (self.strings, self.data))

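Editor's note, not part of the diff: a sketch of the reworked `Vectors` constructor, where the old `data_or_width` parameter is replaced by explicit `width=` and `data=` keyword arguments, mirroring the updated tests earlier in this diff.

    import numpy
    from spacy.vectors import Vectors

    strings = [u'apple', u'orange']
    # zero-initialised table of a given width
    v_empty = Vectors(strings, width=3)
    assert v_empty.shape == (2, 3)

    # wrap an existing array of row vectors
    data = numpy.asarray([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]], dtype='f')
    v_data = Vectors(strings, data=data)
    assert v_data.shape == data.shape
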
@@ -62,12 +62,9 @@ cdef class Vocab:
         if strings:
             for string in strings:
                 _ = self[string]
-        for name in tag_map.keys():
-            if name:
-                self.strings.add(name)
         self.lex_attr_getters = lex_attr_getters
         self.morphology = Morphology(self.strings, tag_map, lemmatizer)
-        self.vectors = Vectors(self.strings)
+        self.vectors = Vectors(self.strings, width=0)
 
     property lang:
         def __get__(self):
@@ -255,7 +252,7 @@ cdef class Vocab:
         """
         if new_dim is None:
             new_dim = self.vectors.data.shape[1]
-        self.vectors = Vectors(self.strings, new_dim)
+        self.vectors = Vectors(self.strings, width=new_dim)
 
     def get_vector(self, orth):
         """Retrieve a vector for a word in the vocabulary.
@@ -338,7 +335,7 @@ cdef class Vocab:
             if self.vectors is None:
                 return None
             else:
-                return self.vectors.to_bytes(exclude='strings.json')
+                return self.vectors.to_bytes()
 
         getters = OrderedDict((
             ('strings', lambda: self.strings.to_bytes()),
@@ -358,7 +355,7 @@ cdef class Vocab:
             if self.vectors is None:
                 return None
             else:
-                return self.vectors.from_bytes(b, exclude='strings')
+                return self.vectors.from_bytes(b)
         setters = OrderedDict((
             ('strings', lambda b: self.strings.from_bytes(b)),
             ('lexemes', lambda b: self.lexemes_from_bytes(b)),
@@ -400,6 +397,7 @@ cdef class Vocab:
         cdef int j = 0
         cdef SerializedLexemeC lex_data
         chunk_size = sizeof(lex_data.data)
+        cdef void* ptr
         cdef unsigned char* bytes_ptr = bytes_data
         for i in range(0, len(bytes_data), chunk_size):
             lexeme = <LexemeC*>self.mem.alloc(1, sizeof(LexemeC))
@@ -407,6 +405,9 @@ cdef class Vocab:
                 lex_data.data[j] = bytes_ptr[i+j]
             Lexeme.c_from_bytes(lexeme, lex_data)
 
+            ptr = self.strings._map.get(lexeme.orth)
+            if ptr == NULL:
+                continue
             py_str = self.strings[lexeme.orth]
             assert self.strings[py_str] == lexeme.orth, (py_str, lexeme.orth)
             key = hash_string(py_str)

@@ -181,7 +181,7 @@ mixin codepen(slug, height, default_tab)
     alt_file - [string] alternative file path used in footer and link button
     height   - [integer] height of code preview in px
 
-mixin github(repo, file, alt_file, height)
+mixin github(repo, file, alt_file, height, language)
     - var branch = ALPHA ? "develop" : "master"
     - var height = height || 250
 

@@ -37,6 +37,10 @@
         +cell #[code WORK_OF_ART]
         +cell Titles of books, songs, etc.
 
+    +row
+        +cell #[code LAW]
+        +cell Named documents made into laws.
+
     +row
         +cell #[code LANGUAGE]
         +cell Any named language.

website/api/_annotation/_training.jade (new file, 46 lines)
@@ -0,0 +1,46 @@
+//- 💫 DOCS > API > ANNOTATION > TRAINING
+
+p
+    |  spaCy takes training data in JSON format. The built-in
+    |  #[+api("cli#convert") #[code convert]] command helps you convert the
+    |  #[code .conllu] format used by the
+    |  #[+a("https://github.com/UniversalDependencies") Universal Dependencies corpora]
+    |  to spaCy's training format.
+
++aside("Annotating entities")
+    |  Named entities are provided in the #[+a("/api/annotation#biluo") BILUO]
+    |  notation. Tokens outside an entity are set to #[code "O"] and tokens
+    |  that are part of an entity are set to the entity label, prefixed by the
+    |  BILUO marker. For example #[code "B-ORG"] describes the first token of
+    |  a multi-token #[code ORG] entity and #[code "U-PERSON"] a single
+    |  token representing a #[code PERSON] entity
+
++code("Example structure").
+    [{
+        "id": int,                      # ID of the document within the corpus
+        "paragraphs": [{                # list of paragraphs in the corpus
+            "raw": string,              # raw text of the paragraph
+            "sentences": [{             # list of sentences in the paragraph
+                "tokens": [{            # list of tokens in the sentence
+                    "id": int,          # index of the token in the document
+                    "dep": string,      # dependency label
+                    "head": int,        # offset of token head relative to token index
+                    "tag": string,      # part-of-speech tag
+                    "orth": string,     # verbatim text of the token
+                    "ner": string       # BILUO label, e.g. "O" or "B-ORG"
+                }],
+                "brackets": [{          # phrase structure (NOT USED by current models)
+                    "first": int,       # index of first token
+                    "last": int,        # index of last token
+                    "label": string     # phrase label
+                }]
+            }]
+        }]
+    }]
+
+p
+    |  Here's an example of dependencies, part-of-speech tags and names
+    |  entities, taken from the English Wall Street Journal portion of the Penn
+    |  Treebank:
+
++github("spacy", "examples/training/training-data.json", false, false, "json")

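Editor's note, not part of the diff: to make the schema above concrete, a hypothetical two-token document in that format. All values (labels, tags, IDs) are illustrative only and not taken from any corpus.

    # one document with a single "Hello world" sentence in the training schema
    TRAIN_DATA = [{
        "id": 0,
        "paragraphs": [{
            "raw": "Hello world",
            "sentences": [{
                "tokens": [
                    {"id": 0, "dep": "ROOT", "head": 0, "tag": "UH", "orth": "Hello", "ner": "O"},
                    {"id": 1, "dep": "npadvmod", "head": -1, "tag": "NN", "orth": "world", "ner": "O"}
                ],
                "brackets": []
            }]
        }]
    }]
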
@@ -154,13 +154,16 @@
 
     "tokenizer": {
         "title": "Tokenizer",
+        "teaser": "Segment text into words, punctuations marks etc.",
         "tag": "class",
         "source": "spacy/tokenizer.pyx"
     },
 
     "lemmatizer": {
         "title": "Lemmatizer",
-        "tag": "class"
+        "teaser": "Assign the base forms of words.",
+        "tag": "class",
+        "source": "spacy/lemmatizer.py"
     },
 
     "tagger": {

@@ -101,31 +101,4 @@ p This document describes the target annotations spaCy is trained to predict.
 +section("training")
     +h(2, "json-input") JSON input format for training
 
-    +under-construction
-
-    p spaCy takes training data in the following format:
-
-    +code("Example structure").
-        doc: {
-            id: string,
-            paragraphs: [{
-                raw: string,
-                sents: [int],
-                tokens: [{
-                    start: int,
-                    tag: string,
-                    head: int,
-                    dep: string
-                }],
-                ner: [{
-                    start: int,
-                    end: int,
-                    label: string
-                }],
-                brackets: [{
-                    start: int,
-                    end: int,
-                    label: string
-                }]
-            }]
-        }
+    include _annotation/_training

@@ -336,28 +336,40 @@ p
     +tag method
 
 p
-    |  Export the document annotations to a numpy array of shape #[code N*M]
-    |  where #[code N] is the length of the document and #[code M] is the number
-    |  of attribute IDs to export. The values will be 32-bit integers.
+    |  Export given token attributes to a numpy #[code ndarray].
+    |  If #[code attr_ids] is a sequence of #[code M] attributes,
+    |  the output array will  be of shape #[code (N, M)], where #[code N]
+    |  is the length of the #[code Doc] (in tokens). If #[code attr_ids] is
+    |  a single attribute, the output shape will be #[code (N,)]. You can
+    |  specify attributes by integer ID (e.g. #[code spacy.attrs.LEMMA])
+    |  or string name (e.g. 'LEMMA' or 'lemma'). The values will be 64-bit
+    |  integers.
 
 +aside-code("Example").
     from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
     doc = nlp(text)
     # All strings mapped to integers, for easy export to numpy
     np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
+    np_array = doc.to_array("POS")
 
 +table(["Name", "Type", "Description"])
     +row
         +cell #[code attr_ids]
-        +cell list
-        +cell A list of attribute ID ints.
+        +cell list or int or string
+        +cell
+            | A list of attributes (int IDs or string names) or
+            | a single attribute (int ID or string name)
 
     +row("foot")
         +cell returns
-        +cell #[code.u-break numpy.ndarray[ndim=2, dtype='int32']]
+        +cell
+            | #[code.u-break numpy.ndarray[ndim=2, dtype='uint64']] or
+            | #[code.u-break numpy.ndarray[ndim=1, dtype='uint64']] or
         +cell
             |  The exported attributes as a 2D numpy array, with one row per
-            |  token and one column per attribute.
+            |  token and one column per attribute (when #[code attr_ids] is a
+            |  list), or as a 1D numpy array, with one item per attribute (when
+            |  #[code attr_ids] is a single value).
 
 +h(2, "from_array") Doc.from_array
     +tag method

@@ -2,4 +2,159 @@
 
 include ../_includes/_mixins
 
-+under-construction
+p
+    |  The #[code Lemmatizer] supports simple part-of-speech-sensitive suffix
+    |  rules and lookup tables.
+
++h(2, "init") Lemmatizer.__init__
+    +tag method
+
+p Create a #[code Lemmatizer].
+
++aside-code("Example").
+    from spacy.lemmatizer import Lemmatizer
+    lemmatizer = Lemmatizer()
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code index]
+        +cell dict / #[code None]
+        +cell Inventory of lemmas in the language.
+
+    +row
+        +cell #[code exceptions]
+        +cell dict / #[code None]
+        +cell Mapping of string forms to lemmas that bypass the #[code rules].
+
+    +row
+        +cell #[code rules]
+        +cell dict / #[code None]
+        +cell List of suffix rewrite rules.
+
+    +row
+        +cell #[code lookup]
+        +cell dict / #[code None]
+        +cell Lookup table mapping string to their lemmas.
+
+    +row("foot")
+        +cell returns
+        +cell #[code Lemmatizer]
+        +cell The newly created object.
+
++h(2, "call") Lemmatizer.__call__
+    +tag method
+
+p Lemmatize a string.
+
++aside-code("Example").
+    from spacy.lemmatizer import Lemmatizer
+    from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES
+    lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
+    lemmas = lemmatizer(u'ducks', u'NOUN')
+    assert lemmas == [u'duck']
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code string]
+        +cell unicode
+        +cell The string to lemmatize, e.g. the token text.
+
+    +row
+        +cell #[code univ_pos]
+        +cell unicode / int
+        +cell The token's universal part-of-speech tag.
+
+    +row
+        +cell #[code morphology]
+        +cell dict / #[code None]
+        +cell
+            |  Morphological features following the
+            |  #[+a("http://universaldependencies.org/") Universal Dependencies]
+            |  scheme.
+
+    +row("foot")
+        +cell returns
+        +cell list
+        +cell The available lemmas for the string.
+
++h(2, "lookup") Lemmatizer.lookup
+    +tag method
+    +tag-new(2)
+
+p
+    |  Look up a lemma in the lookup table, if available. If no lemma is found,
+    |  the original string is returned. Languages can provide a
+    |  #[+a("/usage/adding-languages#lemmatizer") lookup table] via the
+    |  #[code lemma_lookup] variable, set on the individual #[code Language]
+    |  class.
+
++aside-code("Example").
+    lookup = {u'going': u'go'}
+    lemmatizer = Lemmatizer(lookup=lookup)
+    assert lemmatizer.lookup(u'going') == u'go'
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code string]
+        +cell unicode
+        +cell The string to look up.
+
+    +row("foot")
+        +cell returns
+        +cell unicode
+        +cell The lemma if the string was found, otherwise the original string.
+
++h(2, "is_base_form") Lemmatizer.is_base_form
+    +tag method
+
+p
+    |  Check whether we're dealing with an uninflected paradigm, so we can
+    |  avoid lemmatization entirely.
+
++aside-code("Example").
+    pos = 'verb'
+    morph = {'VerbForm': 'inf'}
+    is_base_form = lemmatizer.is_base_form(pos, morph)
+    assert is_base_form == True
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code univ_pos]
+        +cell unicode / int
+        +cell The token's universal part-of-speech tag.
+
+    +row
+        +cell #[code morphology]
+        +cell dict
+        +cell The token's morphological features.
+
+    +row("foot")
+        +cell returns
+        +cell bool
+        +cell
+            |  Whether the token's part-of-speech tag and morphological features
+            |  describe a base form.
+
++h(2, "attributes") Attributes
+
++table(["Name", "Type", "Description"])
+    +row
+        +cell #[code index]
+        +cell dict / #[code None]
+        +cell Inventory of lemmas in the language.
+
+    +row
+        +cell #[code exc]
+        +cell dict / #[code None]
+        +cell Mapping of string forms to lemmas that bypass the #[code rules].
+
+    +row
+        +cell #[code rules]
+        +cell dict / #[code None]
+        +cell List of suffix rewrite rules.
+
+    +row
+        +cell #[code lookup_table]
+            +tag-new(2)
+        +cell dict / #[code None]
+        +cell The lemma lookup table, if available.

| 
						 | 
					@ -284,7 +284,7 @@ p Retokenize the document, such that the span is merged into a single token.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
+aside-code("Example").
 | 
					+aside-code("Example").
 | 
				
			||||||
    doc = nlp(u'I like New York in Autumn.')
 | 
					    doc = nlp(u'I like New York in Autumn.')
 | 
				
			||||||
    span = doc[2:3]
 | 
					    span = doc[2:4]
 | 
				
			||||||
    span.merge()
 | 
					    span.merge()
 | 
				
			||||||
    assert len(doc) == 6
 | 
					    assert len(doc) == 6
 | 
				
			||||||
    assert doc[2].text == 'New York'
 | 
					    assert doc[2].text == 'New York'
 | 
				
			||||||
@@ -302,6 +302,25 @@ p Retokenize the document, such that the span is merged into a single token.
        +cell #[code Token]
        +cell The newly merged token.

+h(2, "as_doc") Span.as_doc

p
    |  Create a #[code Doc] object view of the #[code Span]'s data. Mostly
    |  useful for C-typed interfaces.

+aside-code("Example").
    doc = nlp(u'I like New York in Autumn.')
    span = doc[2:4]
    doc2 = span.as_doc()
    assert doc2.text == 'New York'

+table(["Name", "Type", "Description"])
    +row("foot")
        +cell returns
        +cell #[code Doc]
        +cell A #[code Doc] object of the #[code Span]'s content.
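As a usage sketch (again assuming an English model loaded as #[code nlp]), the returned #[code Doc] behaves like any other #[code Doc], so it can be iterated or serialised independently of the original document:

    import spacy

    nlp = spacy.load('en')
    doc = nlp(u'I like New York in Autumn.')
    span_doc = doc[2:4].as_doc()
    assert [t.text for t in span_doc] == ['New', 'York']
    span_bytes = span_doc.to_bytes()    # serialise the copied view on its own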


+h(2, "root") Span.root
    +tag property
    +tag-model("parse")
@@ -586,6 +586,16 @@ p The L2 norm of the token's vector representation.
        +cell bool
        +cell Is the token punctuation?

    +row
        +cell #[code is_left_punct]
        +cell bool
        +cell Is the token a left punctuation mark, e.g. #[code (]?

    +row
        +cell #[code is_right_punct]
        +cell bool
        +cell Is the token a right punctuation mark, e.g. #[code )]?

    +row
        +cell #[code is_space]
        +cell bool
@@ -593,6 +603,16 @@ p The L2 norm of the token's vector representation.
            |  Does the token consist of whitespace characters? Equivalent to
            |  #[code token.text.isspace()].

    +row
        +cell #[code is_bracket]
        +cell bool
        +cell Is the token a bracket?

    +row
        +cell #[code is_quote]
        +cell bool
        +cell Is the token a quotation mark?

    +row
        +cell #[code like_url]
        +cell bool
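A quick illustration of the new punctuation flags (assuming an English model loaded as #[code nlp]); the exact token positions depend on the tokenizer, so the checks below only test membership:

    import spacy

    nlp = spacy.load('en')
    doc = nlp(u'"Hello," he said (quietly).')
    brackets = [t.text for t in doc if t.is_bracket]
    quotes = [t.text for t in doc if t.is_quote]
    left = [t.text for t in doc if t.is_left_punct]
    right = [t.text for t in doc if t.is_right_punct]
    assert '(' in brackets and ')' in brackets
    assert '"' in quotes
    assert '(' in left and ')' in right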
@@ -12,7 +12,7 @@ p

 p
     |  Create a new vector store. To keep the vector table empty, pass
-    |  #[code data_or_width=0]. You can also create the vector table and add
+    |  #[code width=0]. You can also create the vector table and add
     |  vectors one by one, or set the vector values directly on initialisation.

 +aside-code("Example").
@@ -21,11 +21,11 @@ p

     empty_vectors = Vectors(StringStore())

-    vectors = Vectors([u'cat'], 300)
+    vectors = Vectors([u'cat'], width=300)
     vectors[u'cat'] = numpy.random.uniform(-1, 1, (300,))

     vector_table = numpy.zeros((3, 300), dtype='f')
-    vectors = Vectors(StringStore(), vector_table)
+    vectors = Vectors(StringStore(), data=vector_table)

 +table(["Name", "Type", "Description"])
     +row
@@ -36,9 +36,12 @@ p
             |  that maps strings to hash values, and vice versa.

     +row
-        +cell #[code data_or_width]
-        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] or int
-        +cell Vector data or number of dimensions.
+        +cell #[code data]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+
+    +row
+        +cell #[code width]
+        +cell Number of dimensions.

     +row("foot")
         +cell returns
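Pulling the pieces of the example together, a small sketch of the two construction paths under the new keyword arguments; reading a row back by its string key is assumed to mirror the assignment shown in the example:

    import numpy
    from spacy.strings import StringStore
    from spacy.vectors import Vectors

    # Path 1: start from a list of keys and a width, then assign rows one by one.
    vectors = Vectors([u'cat'], width=300)
    vectors[u'cat'] = numpy.random.uniform(-1, 1, (300,))
    cat_vector = vectors[u'cat']            # read the (300,) row back

    # Path 2: hand over a pre-built table up front via the data keyword.
    vector_table = numpy.zeros((3, 300), dtype='f')
    vectors = Vectors(StringStore(), data=vector_table)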
@@ -63,7 +63,6 @@ code
     padding: 0.2rem 0.4rem
     border-radius: 0.25rem
     font-family: $font-code
-    white-space: nowrap
     margin: 0
     box-decoration-break: clone
     white-space: nowrap
@@ -14,9 +14,6 @@
     width: 100%
     box-shadow: $box-shadow

-    //@include breakpoint(min, md)
-    //    position: fixed
-
     &.is-fixed
         animation: slideInDown 0.5s ease-in-out
         position: fixed
@@ -1,3 +1,7 @@
//- 💫 DOCS > USAGE > TRAINING > TAGGER & PARSER

+under-construction

+h(3, "training-json") JSON format for training

include ../../api/_annotation/_training
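The included annotation reference describes spaCy's JSON training format. As a rough, abbreviated sketch of the expected nesting, written as a Python literal (field names follow the usual documents → paragraphs → sentences → tokens layout and should be checked against the included reference rather than taken as definitive):

    # Illustrative only: one document, one paragraph, one sentence.
    # "head" values are offsets relative to the token, "ner" uses BILUO tags.
    training_data = [{
        "id": 0,
        "paragraphs": [{
            "raw": "I like London.",
            "sentences": [{
                "tokens": [
                    {"id": 0, "orth": "I",      "tag": "PRP", "head": 1,  "dep": "nsubj", "ner": "O"},
                    {"id": 1, "orth": "like",   "tag": "VBP", "head": 0,  "dep": "ROOT",  "ner": "O"},
                    {"id": 2, "orth": "London", "tag": "NNP", "head": -1, "dep": "dobj",  "ner": "U-GPE"},
                    {"id": 3, "orth": ".",      "tag": ".",   "head": -2, "dep": "punct", "ner": "O"}
                ]
            }]
        }]
    }]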