mirror of https://github.com/explosion/spaCy.git
synced 2025-11-04 01:48:04 +03:00
Merge branch 'whatif/arrow' of https://github.com/explosion/spaCy into whatif/arrow

commit 1d4322eb04
.github/contributors/Arvindcheenu.md | 106 (vendored, new file)
@@ -0,0 +1,106 @@
# spaCy contributor agreement

This spaCy Contributor Agreement (**"SCA"**) is based on the
[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
The SCA applies to any contribution that you make to any product or project
managed by us (the **"project"**), and sets out the intellectual property rights
you grant to us in the contributed materials. The term **"us"** shall mean
[ExplosionAI GmbH](https://explosion.ai/legal). The term
**"you"** shall mean the person or entity identified below.

If you agree to be bound by these terms, fill in the information requested
below and include the filled-in version with your first pull request, under the
folder [`.github/contributors/`](/.github/contributors/). The name of the file
should be your GitHub username, with the extension `.md`. For example, the user
example_user would create the file `.github/contributors/example_user.md`.
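The naming rule above is mechanical, so it is easy to check before opening a pull request. A minimal sketch in Python (not part of the agreement itself); `example_user` is the placeholder username from the text above:

```python
# Minimal sketch of the contributor-file naming rule described above.
# Assumes you run it from the repository root; "example_user" is the
# placeholder from the agreement text -- substitute your own username.
from pathlib import Path

username = "example_user"
target = Path(".github/contributors") / f"{username}.md"
print(target)  # -> .github/contributors/example_user.md
```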
Read this agreement carefully before signing. These terms and conditions
constitute a binding legal agreement.

## Contributor Agreement

1. The term "contribution" or "contributed materials" means any source code,
object code, patch, tool, sample, graphic, specification, manual,
documentation, or any other material posted or submitted by you to the project.

2. With respect to any worldwide copyrights, or copyright applications and
registrations, in your contribution:

    * you hereby assign to us joint ownership, and to the extent that such
    assignment is or becomes invalid, ineffective or unenforceable, you hereby
    grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
    royalty-free, unrestricted license to exercise all rights under those
    copyrights. This includes, at our option, the right to sublicense these same
    rights to third parties through multiple levels of sublicensees or other
    licensing arrangements;

    * you agree that each of us can do all things in relation to your
    contribution as if each of us were the sole owners, and if one of us makes
    a derivative work of your contribution, the one who makes the derivative
    work (or has it made) will be the sole owner of that derivative work;

    * you agree that you will not assert any moral rights in your contribution
    against us, our licensees or transferees;

    * you agree that we may register a copyright in your contribution and
    exercise all ownership rights associated with it; and

    * you agree that neither of us has any duty to consult with, obtain the
    consent of, pay or render an accounting to the other for any use or
    distribution of your contribution.

3. With respect to any patents you own, or that you can license without payment
to any third party, you hereby grant to us a perpetual, irrevocable,
non-exclusive, worldwide, no-charge, royalty-free license to:

    * make, have made, use, sell, offer to sell, import, and otherwise transfer
    your contribution in whole or in part, alone or in combination with or
    included in any product, work or materials arising out of the project to
    which your contribution was submitted, and

    * at our option, to sublicense these same rights to third parties through
    multiple levels of sublicensees or other licensing arrangements.

4. Except as set out above, you keep all right, title, and interest in your
contribution. The rights that you grant to us under these terms are effective
on the date you first submitted a contribution to us, even if your submission
took place before the date you sign these terms.

5. You covenant, represent, warrant and agree that:

    * each contribution that you submit is and shall be an original work of
    authorship and you can legally grant the rights set out in this SCA;

    * to the best of your knowledge, each contribution will not violate any
    third party's copyrights, trademarks, patents, or other intellectual
    property rights; and

    * each contribution shall be in compliance with U.S. export control laws and
    other applicable export and import laws. You agree to notify us if you
    become aware of any circumstance which would make any of the foregoing
    representations inaccurate in any respect. We may publicly disclose your
    participation in the project, including the fact that you have signed the SCA.

6. This SCA is governed by the laws of the State of California and applicable
U.S. Federal law. Any choice of law rules will not apply.

7. Please place an “x” on one of the applicable statements below. Please do NOT
mark both statements:

    * [x] I am signing on behalf of myself as an individual and no other person
    or entity, including my employer, has or will have rights with respect to my
    contributions.

    * [ ] I am signing on behalf of my employer or a legal entity and I have the
    actual authority to contractually bind that entity.

## Contributor Details

| Field                          | Entry                |
|------------------------------- | -------------------- |
| Name                           | Arvind Srinivasan    |
| Company name (if applicable)   |                      |
| Title or role (if applicable)  |                      |
| Date                           | 2020-06-13           |
| GitHub username                | arvindcheenu         |
| Website (optional)             |                      |
.github/contributors/JannisTriesToCode.md | 106 (vendored, new file)
@@ -0,0 +1,106 @@
(The standard spaCy contributor agreement text, identical to the version above.)

## Contributor Details

| Field                          | Entry                         |
|------------------------------- | ----------------------------- |
| Name                           | Jannis Rauschke               |
| Company name (if applicable)   |                               |
| Title or role (if applicable)  |                               |
| Date                           | 22.05.2020                    |
| GitHub username                | JannisTriesToCode             |
| Website (optional)             | https://twitter.com/JRauschke |
.github/contributors/MartinoMensio.md | 4 (vendored)
@@ -99,8 +99,8 @@ mark both statements:
 | Field                          | Entry                              |
 |------------------------------- | --------------------               |
 | Name                           | Martino Mensio                     |
-| Company name (if applicable)   | Polytechnic University of Turin    |
-| Title or role (if applicable)  | Student                            |
+| Company name (if applicable)   | The Open University                |
+| Title or role (if applicable)  | PhD Student                        |
 | Date                           | 17 November 2017                   |
 | GitHub username                | MartinoMensio                      |
 | Website (optional)             | https://martinomensio.github.io/   |
.github/contributors/R1j1t.md | 106 (vendored, new file)
@@ -0,0 +1,106 @@
(The standard spaCy contributor agreement text, identical to the version above.)

## Contributor Details

| Field                          | Entry                |
|------------------------------- | -------------------- |
| Name                           | Rajat                |
| Company name (if applicable)   |                      |
| Title or role (if applicable)  |                      |
| Date                           | 24 May 2020          |
| GitHub username                | R1j1t                |
| Website (optional)             |                      |
.github/contributors/hiroshi-matsuda-rit.md | 106 (vendored, new file)
@@ -0,0 +1,106 @@
(The standard spaCy contributor agreement text, identical to the version above.)

## Contributor Details

| Field                          | Entry                |
|------------------------------- | -------------------- |
| Name                           | Hiroshi Matsuda      |
| Company name (if applicable)   | Megagon Labs, Tokyo  |
| Title or role (if applicable)  | Research Scientist   |
| Date                           | June 6, 2020         |
| GitHub username                | hiroshi-matsuda-rit  |
| Website (optional)             |                      |
.github/contributors/jonesmartins.md | 106 (vendored, new file)
@@ -0,0 +1,106 @@
(The standard spaCy contributor agreement text, identical to the version above.)

## Contributor Details

| Field                          | Entry                |
|------------------------------- | -------------------- |
| Name                           | Jones Martins        |
| Company name (if applicable)   |                      |
| Title or role (if applicable)  |                      |
| Date                           | 2020-06-10           |
| GitHub username                | jonesmartins         |
| Website (optional)             |                      |
.github/contributors/leomrocha.md | 106 (vendored, new file)
@@ -0,0 +1,106 @@
(The standard spaCy contributor agreement text, identical to the version above.)

## Contributor Details

| Field                          | Entry                |
|------------------------------- | -------------------- |
| Name                           | Leonardo M. Rocha    |
| Company name (if applicable)   |                      |
| Title or role (if applicable)  | Eng.                 |
| Date                           | 31/05/2020           |
| GitHub username                | leomrocha            |
| Website (optional)             |                      |
.github/contributors/lfiedler.md | 106 (vendored, new file)
@@ -0,0 +1,106 @@
(The standard spaCy contributor agreement text, identical to the version above.)
 | 
				
			||||||
 | 
					    your contribution in whole or in part, alone or in combination with or
 | 
				
			||||||
 | 
					    included in any product, work or materials arising out of the project to
 | 
				
			||||||
 | 
					    which your contribution was submitted, and
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    * at our option, to sublicense these same rights to third parties through
 | 
				
			||||||
 | 
					    multiple levels of sublicensees or other licensing arrangements.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					4. Except as set out above, you keep all right, title, and interest in your
 | 
				
			||||||
 | 
					contribution. The rights that you grant to us under these terms are effective
 | 
				
			||||||
 | 
					on the date you first submitted a contribution to us, even if your submission
 | 
				
			||||||
 | 
					took place before the date you sign these terms.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					5. You covenant, represent, warrant and agree that:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    * Each contribution that you submit is and shall be an original work of
 | 
				
			||||||
 | 
					    authorship and you can legally grant the rights set out in this SCA;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    * to the best of your knowledge, each contribution will not violate any
 | 
				
			||||||
 | 
					    third party's copyrights, trademarks, patents, or other intellectual
 | 
				
			||||||
 | 
					    property rights; and
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    * each contribution shall be in compliance with U.S. export control laws and
 | 
				
			||||||
 | 
					    other applicable export and import laws. You agree to notify us if you
 | 
				
			||||||
 | 
					    become aware of any circumstance which would make any of the foregoing
 | 
				
			||||||
 | 
					    representations inaccurate in any respect. We may publicly disclose your
 | 
				
			||||||
 | 
					    participation in the project, including the fact that you have signed the SCA.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					6. This SCA is governed by the laws of the State of California and applicable
 | 
				
			||||||
 | 
					U.S. Federal law. Any choice of law rules will not apply.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					7. Please place an “x” on one of the applicable statement below. Please do NOT
 | 
				
			||||||
 | 
					mark both statements:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    * [x] I am signing on behalf of myself as an individual and no other person
 | 
				
			||||||
 | 
					    or entity, including my employer, has or will have rights with respect to my
 | 
				
			||||||
 | 
					    contributions.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    * [ ] I am signing on behalf of my employer or a legal entity and I have the
 | 
				
			||||||
 | 
					    actual authority to contractually bind that entity.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## Contributor Details
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					| Field                          | Entry                |
 | 
				
			||||||
 | 
					|------------------------------- | -------------------- |
 | 
				
			||||||
 | 
					| Name                           | Leander Fiedler      |
 | 
				
			||||||
 | 
					| Company name (if applicable)   |                      |
 | 
				
			||||||
 | 
					| Title or role (if applicable)  |                      |
 | 
				
			||||||
 | 
					| Date                           | 06 April 2020        |
 | 
				
			||||||
 | 
					| GitHub username                | lfiedler             |
 | 
				
			||||||
 | 
					| Website (optional)             |                      |
 | 
				
			||||||
							
								
								
									
106  .github/contributors/mahnerak.md  (vendored, new file)
@@ -0,0 +1,106 @@
# spaCy contributor agreement

(The standard SCA text, identical to the copies earlier in this commit.)

## Contributor Details

| Field                          | Entry                |
|------------------------------- | -------------------- |
| Name                           | Karen Hambardzumyan  |
| Company name (if applicable)   | YerevaNN             |
| Title or role (if applicable)  | Researcher           |
| Date                           | 2020-06-19           |
| GitHub username                | mahnerak             |
| Website (optional)             | https://mahnerak.com/|
							
								
								
									
106  .github/contributors/myavrum.md  (vendored, new file)
@@ -0,0 +1,106 @@
# spaCy contributor agreement

(The standard SCA text, identical to the copies earlier in this commit.)

## Contributor Details

| Field                          | Entry                       |
|------------------------------- | --------------------------- |
| Name                           | Marat M. Yavrumyan          |
| Company name (if applicable)   | YSU, UD_Armenian Project    |
| Title or role (if applicable)  | Dr., Principal Investigator |
| Date                           | 2020-06-19                  |
| GitHub username                | myavrum                     |
| Website (optional)             | http://armtreebank.yerevann.com/ |
							
								
								
									
106  .github/contributors/theudas.md  (vendored, new file)
@@ -0,0 +1,106 @@
# spaCy contributor agreement

(The standard SCA text, except that this copy names
[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal) as the
contracting entity rather than ExplosionAI GmbH.)

## Contributor Details

| Field                          | Entry                    |
|------------------------------- | ------------------------ |
| Name                           | Philipp Sodmann          |
| Company name (if applicable)   | Empolis                  |
| Title or role (if applicable)  |                          |
| Date                           | 2017-05-06               |
| GitHub username                | theudas                  |
| Website (optional)             |                          |
							
								
								
									
29  .github/workflows/issue-manager.yml  (vendored, new file)
@@ -0,0 +1,29 @@
name: Issue Manager

on:
  schedule:
    - cron: "0 0 * * *"
  issue_comment:
    types:
      - created
      - edited
  issues:
    types:
      - labeled

jobs:
  issue-manager:
    runs-on: ubuntu-latest
    steps:
      - uses: tiangolo/issue-manager@0.2.1
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
          config: >
            {
              "resolved": {
                "delay": "P7D",
                "message": "This issue has been automatically closed because it was answered and there was no follow-up discussion.",
                "remove_label_on_comment": true,
                "remove_label_on_close": true
              }
            }
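The `config` value maps an issue label (here `resolved`) to the bot's behavior, and `"delay": "P7D"` appears to be an ISO-8601 duration of seven days. A minimal sketch of how such a value can be interpreted in Python, assuming only the day/hour/minute/second designators occur (the helper below is illustrative, not part of the workflow):

```python
# Hedged sketch: interpret an ISO-8601 duration such as the "P7D" delay above.
# Only the D/H/M/S designators are handled; a full parser would cover more.
import re
from datetime import timedelta

def parse_iso_duration(value: str) -> timedelta:
    match = re.fullmatch(r"P(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?", value)
    if match is None:
        raise ValueError(f"Unsupported duration: {value!r}")
    days, hours, minutes, seconds = (int(g) if g else 0 for g in match.groups())
    return timedelta(days=days, hours=hours, minutes=minutes, seconds=seconds)

print(parse_iso_duration("P7D"))  # 7 days, 0:00:00
```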
							
								
								
									
5  Makefile
@@ -5,8 +5,9 @@ VENV := ./env$(PYVER)
 version := $(shell "bin/get-version.sh")
 
 dist/spacy-$(version).pex : wheelhouse/spacy-$(version).stamp
-	$(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) spacy_lookups_data
+	$(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) spacy-lookups-data jieba pkuseg==0.0.22 sudachipy sudachidict_core
 	chmod a+rx $@
+	cp $@ dist/spacy.pex
 
 dist/pytest.pex : wheelhouse/pytest-*.whl
 	$(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m pytest -o $@ pytest pytest-timeout mock
@@ -14,7 +15,7 @@ dist/pytest.pex : wheelhouse/pytest-*.whl
 
 wheelhouse/spacy-$(version).stamp : $(VENV)/bin/pex setup.py spacy/*.py* spacy/*/*.py*
 	$(VENV)/bin/pip wheel . -w ./wheelhouse
-	$(VENV)/bin/pip wheel spacy_lookups_data -w ./wheelhouse
+	$(VENV)/bin/pip wheel spacy-lookups-data jieba pkuseg==0.0.22 sudachipy sudachidict_core -w ./wheelhouse
 	touch $@
 
 wheelhouse/pytest-%.whl : $(VENV)/bin/pex
							
								
								
									
17  README.md
@@ -6,12 +6,12 @@ spaCy is a library for advanced Natural Language Processing in Python and
 Cython. It's built on the very latest research, and was designed from day one to
 be used in real products. spaCy comes with
 [pretrained statistical models](https://spacy.io/models) and word vectors, and
-currently supports tokenization for **50+ languages**. It features
+currently supports tokenization for **60+ languages**. It features
 state-of-the-art speed, convolutional **neural network models** for tagging,
 parsing and **named entity recognition** and easy **deep learning** integration.
 It's commercial open-source software, released under the MIT license.
 
-💫 **Version 2.2 out now!**
+💫 **Version 2.3 out now!**
 [Check out the release notes here.](https://github.com/explosion/spaCy/releases)
 
 [Azure Pipelines build status](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
@@ -31,7 +31,7 @@ It's commercial open-source software, released under the MIT license.
 | --------------- | -------------------------------------------------------------- |
 | [spaCy 101]     | New to spaCy? Here's everything you need to know!              |
 | [Usage Guides]  | How to use spaCy and its features.                             |
-| [New in v2.2]   | New features, backwards incompatibilities and migration guide. |
+| [New in v2.3]   | New features, backwards incompatibilities and migration guide. |
 | [API Reference] | The detailed reference for spaCy's API.                        |
 | [Models]        | Download statistical language models for spaCy.                |
 | [Universe]      | Libraries, extensions, demos, books and courses.               |
@@ -39,7 +39,7 @@ It's commercial open-source software, released under the MIT license.
 | [Contribute]    | How to contribute to the spaCy project and code base.          |
 
 [spacy 101]: https://spacy.io/usage/spacy-101
-[new in v2.2]: https://spacy.io/usage/v2-2
+[new in v2.3]: https://spacy.io/usage/v2-3
 [usage guides]: https://spacy.io/usage/
 [api reference]: https://spacy.io/api/
 [models]: https://spacy.io/models
@@ -119,12 +119,13 @@ of `v2.0.13`).
 pip install spacy
 ```
 
-To install additional data tables for lemmatization in **spaCy v2.2+** you can
-run `pip install spacy[lookups]` or install
+To install additional data tables for lemmatization and normalization in
+**spaCy v2.2+** you can run `pip install spacy[lookups]` or install
 [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data)
 separately. The lookups package is needed to create blank models with
-lemmatization data, and to lemmatize in languages that don't yet come with
-pretrained models and aren't powered by third-party libraries.
+lemmatization data for v2.2+ plus normalization data for v2.3+, and to
+lemmatize in languages that don't yet come with pretrained models and aren't
+powered by third-party libraries.
 
 When using pip it is generally recommended to install packages in a virtual
 environment to avoid modifying system state:
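A minimal sketch of why the lookups package matters, assuming `spacy[lookups]` is installed: a blank model ships no tagger, so lemmatization depends entirely on the lookup tables from `spacy-lookups-data`. Without them, the lemmas may simply echo the token text.

```python
import spacy

# A blank pipeline has no statistical tagger, so lemmas come from the
# lookup tables provided by spacy-lookups-data, if installed.
nlp = spacy.blank("en")
doc = nlp("The cats were running")
print([token.lemma_ for token in doc])
```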
@@ -2,7 +2,7 @@
 # coding: utf-8
 """Using the parser to recognise your own semantics
 
-spaCy's parser component can be used to trained to predict any type of tree
+spaCy's parser component can be trained to predict any type of tree
 structure over your input text. You can also predict trees over whole documents
 or chat logs, with connections between the sentence-roots used to annotate
 discourse structure. In this example, we'll build a message parser for a common
@@ -61,7 +61,7 @@ install_requires =
 
 [options.extras_require]
 lookups =
-    spacy_lookups_data>=0.3.1,<0.4.0
+    spacy_lookups_data>=0.3.2,<0.4.0
 cuda =
     cupy>=5.0.0b4,<9.0.0
 cuda80 =
@@ -80,7 +80,8 @@ cuda102 =
     cupy-cuda102>=5.0.0b4,<9.0.0
 # Language tokenizers with external dependencies
 ja =
-    fugashi>=0.1.3
+    sudachipy>=0.4.5
+    sudachidict_core>=20200330
 ko =
     natto-py==0.9.0
 th =
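The `ja` extras now pull in SudachiPy and its core dictionary instead of fugashi. A hedged sketch of the intended effect, assuming the extras are installed via `pip install spacy[ja]`:

```python
import spacy

# With sudachipy + sudachidict_core installed, a blank Japanese pipeline
# should tokenize via SudachiPy rather than the previous MeCab-based backend.
nlp = spacy.blank("ja")
doc = nlp("これはテストです。")
print([token.text for token in doc])
```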
@@ -1,9 +1,9 @@
 from typing import Optional, Dict, List, Union, Sequence
 from timeit import default_timer as timer
 
+import plac
 import srsly
 from pydantic import BaseModel, FilePath
-import plac
 import tqdm
 from pathlib import Path
 from wasabi import msg
@@ -16,7 +16,9 @@ from ..gold import Corpus
 from ..lookups import Lookups
 from .. import util
 from ..errors import Errors
-from ..ml import models  # don't remove - required to load the built-in architectures
+
+# Don't remove - required to load the built-in architectures
+from ..ml import models  # noqa: F401
 
 registry = util.registry
 
@@ -120,6 +122,7 @@ class ConfigSchema(BaseModel):
     dev_path=("Location of JSON-formatted development data", "positional", None, Path),
     config_path=("Path to config file", "positional", None, Path),
     output_path=("Output directory to store model in", "option", "o", Path),
+    code_path=("Path to Python file with additional code (registered functions) to be imported", "option", "c", Path),
     init_tok2vec=(
     "Path to pretrained weights for the tok2vec components. See 'spacy pretrain'. Experimental.", "option", "t2v",
     Path),
@@ -135,6 +138,7 @@ def train_cli(
     dev_path,
     config_path,
     output_path=None,
+    code_path=None,
     init_tok2vec=None,
     raw_text=None,
     verbose=False,
@@ -541,6 +545,7 @@ def verify_cli_args(
     dev_path,
     config_path,
     output_path=None,
+    code_path=None,
     init_tok2vec=None,
     raw_text=None,
     verbose=False,
@@ -567,6 +572,13 @@ def verify_cli_args(
                 "the specified output path doesn't exist, the directory will be "
                 "created for you.",
             )
+    if code_path is not None:
+        if not code_path.exists():
+            msg.fail("Path to Python code not found", code_path, exits=1)
+        try:
+            util.import_file("python_code", code_path)
+        except Exception as e:
+            msg.fail(f"Couldn't load Python code: {code_path}", e, exits=1)
     if init_tok2vec is not None and not init_tok2vec.exists():
         msg.fail("Can't find pretrained tok2vec", init_tok2vec, exits=1)
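The new `-c/--code` option imports an arbitrary Python file via `util.import_file()` before training, so any functions it registers become visible to the config. A hedged sketch of such a file follows; the `architectures` registry table and the name `"custom_tok2vec.v1"` are illustrative assumptions, not part of this diff:

```python
from spacy import util

# Registering under a custom name lets the training config refer to this
# function. The namespace below assumes the registry exposes an
# "architectures" table, as the import comment in the diff suggests.
@util.registry.architectures("custom_tok2vec.v1")
def build_custom_tok2vec(width: int, depth: int):
    ...  # return a Thinc model here
```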
@@ -3,7 +3,7 @@ def add_codes(err_cls):
 
     class ErrorsWithCodes(err_cls):
         def __getattribute__(self, code):
-            msg = super().__getattribute__(code)
+            msg = super(ErrorsWithCodes, self).__getattribute__(code)
             if code.startswith("__"):  # python system attributes like __class__
                 return msg
             else:
@@ -111,6 +111,25 @@ class Warnings(object):
             "`spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)`"
             " to check the alignment. Misaligned entities ('-') will be "
             "ignored during training.")
+    W031 = ("Model '{model}' ({model_version}) requires spaCy {version} and "
+            "is incompatible with the current spaCy version ({current}). This "
+            "may lead to unexpected results or runtime errors. To resolve "
+            "this, download a newer compatible model or retrain your custom "
+            "model with the current spaCy version. For more details and "
+            "available updates, run: python -m spacy validate")
+    W032 = ("Unable to determine model compatibility for model '{model}' "
+            "({model_version}) with the current spaCy version ({current}). "
+            "This may lead to unexpected results or runtime errors. To resolve "
+            "this, download a newer compatible model or retrain your custom "
+            "model with the current spaCy version. For more details and "
+            "available updates, run: python -m spacy validate")
+    W033 = ("Training a new {model} using a model with no lexeme normalization "
+            "table. This may degrade the performance of the model to some "
+            "degree. If this is intentional or the language you're using "
+            "doesn't have a normalization table, please ignore this warning. "
+            "If this is surprising, make sure you have the spacy-lookups-data "
+            "package installed. The languages with lexeme normalization tables "
+            "are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.")
 
     # TODO: fix numbering after merging develop into master
     W093 = ("Could not find any data to train the {name} on. Is your "
@@ -577,6 +596,9 @@ class Errors(object):
     E197 = ("Row out of bounds, unable to add row {row} for key {key}.")
     E198 = ("Unable to return {n} most similar vectors for the current vectors "
             "table, which contains {n_rows} vectors.")
+    E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
+    E200 = ("Specifying a base model with a pretrained component '{component}' "
+            "can not be combined with adding a pretrained Tok2Vec layer.")
 
     # TODO: fix numbering after merging develop into master
     E978 = ("The {method} method of component {name} takes a list of Example objects, "
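For context, a self-contained illustration of the `add_codes` pattern touched above: attribute access on the wrapper prefixes each message with its code, while dunder lookups pass through untouched. The `"[{code}] {msg}"` format here is an assumption for illustration.

```python
# Hedged sketch of the ErrorsWithCodes pattern from the diff above.
class _Errors(object):
    E199 = "Unable to merge 0-length span at doc[{start}:{end}]."

class ErrorsWithCodes(_Errors):
    def __getattribute__(self, code):
        msg = super(ErrorsWithCodes, self).__getattribute__(code)
        if code.startswith("__"):  # python system attributes like __class__
            return msg
        return "[{code}] {msg}".format(code=code, msg=msg)

Errors = ErrorsWithCodes()
print(Errors.E199.format(start=2, end=2))
# [E199] Unable to merge 0-length span at doc[2:2].
```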
							
								
								
									
0  spacy/gold.pyx  (Normal file)
@@ -446,6 +446,8 @@ cdef class Writer:
             assert not path.isdir(loc), f"{loc} is directory"
         if isinstance(loc, Path):
             loc = bytes(loc)
+        if path.exists(loc):
+            assert not path.isdir(loc), "%s is directory." % loc
         cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
         self._fp = fopen(<char*>bytes_loc, 'wb')
         if not self._fp:
@@ -487,10 +489,10 @@ cdef class Writer:

 cdef class Reader:
     def __init__(self, object loc):
-        assert path.exists(loc)
-        assert not path.isdir(loc)
         if isinstance(loc, Path):
             loc = bytes(loc)
+        assert path.exists(loc)
+        assert not path.isdir(loc)
         cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
         self._fp = fopen(<char*>bytes_loc, 'rb')
         if not self._fp:
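The point of the reordering in Reader.__init__ is that the existence checks should run on the same value that is later handed to fopen; validating the Path object before converting it to bytes can behave differently. A minimal sketch of the fixed ordering in plain Python (illustrative helper, not part of the diff):

    from os import path
    from pathlib import Path

    def open_for_read(loc):
        # Convert first, then validate: the checks see exactly the value that
        # is handed to the low-level open call afterwards.
        if isinstance(loc, Path):
            loc = bytes(loc)
        assert path.exists(loc)
        assert not path.isdir(loc)
        return open(loc, "rb")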
@@ -20,29 +20,25 @@ def noun_chunks(doclike):
     conj = doc.vocab.strings.add("conj")
     nmod = doc.vocab.strings.add("nmod")
     np_label = doc.vocab.strings.add("NP")
-    seen = set()
+    prev_end = -1
     for i, word in enumerate(doclike):
         if word.pos not in (NOUN, PROPN, PRON):
             continue
         # Prevent nested chunks from being produced
-        if word.i in seen:
+        if word.left_edge.i <= prev_end:
             continue
         if word.dep in np_deps:
-            if any(w.i in seen for w in word.subtree):
-                continue
             flag = False
             if word.pos == NOUN:
                 # check for patterns such as γραμμή παραγωγής
                 for potential_nmod in word.rights:
                     if potential_nmod.dep == nmod:
-                        seen.update(
-                            j for j in range(word.left_edge.i, potential_nmod.i + 1)
-                        )
+                        prev_end = potential_nmod.i
                         yield word.left_edge.i, potential_nmod.i + 1, np_label
                         flag = True
                         break
             if flag is False:
-                seen.update(j for j in range(word.left_edge.i, word.i + 1))
+                prev_end = word.i
                 yield word.left_edge.i, word.i + 1, np_label
         elif word.dep == conj:
             # covers the case: έχει όμορφα και έξυπνα παιδιά
@@ -51,9 +47,7 @@ def noun_chunks(doclike):
                 head = head.head
             # If the head is an NP, and we're coordinated to it, we're an NP
             if head.dep in np_deps:
-                if any(w.i in seen for w in word.subtree):
-                    continue
-                seen.update(j for j in range(word.left_edge.i, word.i + 1))
+                prev_end = word.i
                 yield word.left_edge.i, word.i + 1, np_label
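This refactor replaces the per-token `seen` index set with a single `prev_end` offset: a candidate chunk is skipped only if its left edge falls inside the last chunk yielded, which is O(1) per token and needs no subtree scans. The same change is applied to the other `noun_chunks` iterators below. A minimal sketch of the idea, with hypothetical span indices:

    spans = [(0, 3), (2, 4), (5, 7)]  # candidate (start, end) chunks, in order
    prev_end = -1
    for start, end in spans:
        # A candidate starting inside the last accepted chunk is skipped,
        # so the yielded chunks can never nest or overlap.
        if start <= prev_end:
            continue
        prev_end = end - 1
        print((start, end))
    # (0, 3) and (5, 7) survive; (2, 4) is dropped as nested/overlapping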
@@ -25,17 +25,15 @@ def noun_chunks(doclike):
     np_deps = [doc.vocab.strings.add(label) for label in labels]
     conj = doc.vocab.strings.add("conj")
     np_label = doc.vocab.strings.add("NP")
-    seen = set()
+    prev_end = -1
     for i, word in enumerate(doclike):
         if word.pos not in (NOUN, PROPN, PRON):
             continue
         # Prevent nested chunks from being produced
-        if word.i in seen:
+        if word.left_edge.i <= prev_end:
             continue
         if word.dep in np_deps:
-            if any(w.i in seen for w in word.subtree):
-                continue
-            seen.update(j for j in range(word.left_edge.i, word.i + 1))
+            prev_end = word.i
             yield word.left_edge.i, word.i + 1, np_label
         elif word.dep == conj:
             head = word.head
@@ -43,9 +41,7 @@ def noun_chunks(doclike):
                 head = head.head
             # If the head is an NP, and we're coordinated to it, we're an NP
             if head.dep in np_deps:
-                if any(w.i in seen for w in word.subtree):
-                    continue
-                seen.update(j for j in range(word.left_edge.i, word.i + 1))
+                prev_end = word.i
                 yield word.left_edge.i, word.i + 1, np_label
@@ -136,7 +136,19 @@ for pron in ["he", "she", "it"]:

 # W-words, relative pronouns, prepositions etc.

-for word in ["who", "what", "when", "where", "why", "how", "there", "that"]:
+for word in [
+    "who",
+    "what",
+    "when",
+    "where",
+    "why",
+    "how",
+    "there",
+    "that",
+    "this",
+    "these",
+    "those",
+]:
     for orth in [word, word.title()]:
         _exc[orth + "'s"] = [
             {ORTH: orth, LEMMA: word, NORM: word},
@@ -396,6 +408,8 @@ _other_exc = {
         {ORTH: "Let", LEMMA: "let", NORM: "let"},
         {ORTH: "'s", LEMMA: PRON_LEMMA, NORM: "us"},
     ],
+    "c'mon": [{ORTH: "c'm", NORM: "come", LEMMA: "come"}, {ORTH: "on"}],
+    "C'mon": [{ORTH: "C'm", NORM: "come", LEMMA: "come"}, {ORTH: "on"}],
 }

 _exc.update(_other_exc)
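The effect of the new entries can be sketched with a blank English pipeline; the expected split is an assumption based on the exception tables above rather than something verified here:

    import spacy

    nlp = spacy.blank("en")
    print([t.text for t in nlp("C'mon, this's fine.")])
    # Expected with the entries above:
    # ["C'm", "on", ",", "this", "'s", "fine", "."]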
@@ -14,5 +14,9 @@ sentences = [
     "El gato come pescado.",
     "Veo al hombre con el telescopio.",
     "La araña come moscas.",
-    "El pingüino incuba en su nido.",
+    "El pingüino incuba en su nido sobre el hielo.",
+    "¿Dónde estáis?",
+    "¿Quién es el presidente francés?",
+    "¿Dónde se encuentra la capital de Argentina?",
+    "¿Cuándo nació José de San Martín?",
 ]
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES
 from ..char_classes import LIST_ICONS, CURRENCY, LIST_UNITS, PUNCT
 from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA
@@ -7,8 +7,12 @@ _exc = {


 for exc_data in [
+    {ORTH: "n°", LEMMA: "número"},
+    {ORTH: "°C", LEMMA: "grados Celsius"},
     {ORTH: "aprox.", LEMMA: "aproximadamente"},
     {ORTH: "dna.", LEMMA: "docena"},
+    {ORTH: "dpto.", LEMMA: "departamento"},
+    {ORTH: "ej.", LEMMA: "ejemplo"},
     {ORTH: "esq.", LEMMA: "esquina"},
     {ORTH: "pág.", LEMMA: "página"},
     {ORTH: "p.ej.", LEMMA: "por ejemplo"},
@@ -16,6 +20,7 @@ for exc_data in [
     {ORTH: "Vd.", LEMMA: PRON_LEMMA, NORM: "usted"},
     {ORTH: "Uds.", LEMMA: PRON_LEMMA, NORM: "ustedes"},
     {ORTH: "Vds.", LEMMA: PRON_LEMMA, NORM: "ustedes"},
+    {ORTH: "vol.", NORM: "volumen"},
 ]:
     _exc[exc_data[ORTH]] = [exc_data]
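Each entry expands to a one-token exception keyed by its ORTH, so the abbreviation stays a single token carrying the expanded lemma. A runnable sketch of what the loop produces, using plain-string stand-ins for the spacy.symbols attributes:

    ORTH, LEMMA = "orth", "lemma"  # stand-ins for the spacy.symbols attrs
    _exc = {}
    for exc_data in [{ORTH: "n°", LEMMA: "número"}, {ORTH: "ej.", LEMMA: "ejemplo"}]:
        _exc[exc_data[ORTH]] = [exc_data]
    print(_exc["ej."])  # [{'orth': 'ej.', 'lemma': 'ejemplo'}]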
@@ -35,10 +40,14 @@ for h in range(1, 12 + 1):

 for orth in [
     "a.C.",
     "a.J.C.",
+    "d.C.",
+    "d.J.C.",
     "apdo.",
     "Av.",
     "Avda.",
     "Cía.",
+    "Dr.",
+    "Dra.",
     "EE.UU.",
     "etc.",
     "fig.",
@@ -54,9 +63,9 @@ for orth in [
     "Prof.",
     "Profa.",
     "q.e.p.d.",
+    "Q.E.P.D.",
     "S.A.",
     "S.L.",
+    "S.R.L.",
     "s.s.s.",
     "Sr.",
     "Sra.",
     "Srta.",
@@ -25,17 +25,15 @@ def noun_chunks(doclike):
     np_deps = [doc.vocab.strings.add(label) for label in labels]
     conj = doc.vocab.strings.add("conj")
     np_label = doc.vocab.strings.add("NP")
-    seen = set()
+    prev_end = -1
     for i, word in enumerate(doclike):
         if word.pos not in (NOUN, PROPN, PRON):
             continue
         # Prevent nested chunks from being produced
-        if word.i in seen:
+        if word.left_edge.i <= prev_end:
             continue
         if word.dep in np_deps:
-            if any(w.i in seen for w in word.subtree):
-                continue
-            seen.update(j for j in range(word.left_edge.i, word.i + 1))
+            prev_end = word.i
             yield word.left_edge.i, word.i + 1, np_label
         elif word.dep == conj:
             head = word.head
@@ -43,9 +41,7 @@ def noun_chunks(doclike):
                 head = head.head
             # If the head is an NP, and we're coordinated to it, we're an NP
             if head.dep in np_deps:
-                if any(w.i in seen for w in word.subtree):
-                    continue
-                seen.update(j for j in range(word.left_edge.i, word.i + 1))
+                prev_end = word.i
                 yield word.left_edge.i, word.i + 1, np_label
@@ -531,7 +531,6 @@ FR_BASE_EXCEPTIONS = [
     "Beaumont-Hamel",
     "Beaumont-Louestault",
     "Beaumont-Monteux",
-    "Beaumont-Pied-de-Buf",
     "Beaumont-Pied-de-Bœuf",
     "Beaumont-Sardolles",
     "Beaumont-Village",
@@ -948,7 +947,7 @@ FR_BASE_EXCEPTIONS = [
     "Buxières-sous-les-Côtes",
     "Buzy-Darmont",
     "Byhleguhre-Byhlen",
-    "Burs-en-Othe",
+    "Bœurs-en-Othe",
     "Bâle-Campagne",
     "Bâle-Ville",
     "Béard-Géovreissiat",
@@ -1586,11 +1585,11 @@ FR_BASE_EXCEPTIONS = [
     "Cruci-Falgardiens",
     "Cruquius-Oost",
     "Cruviers-Lascours",
-    "Crèvecur-en-Auge",
-    "Crèvecur-en-Brie",
-    "Crèvecur-le-Grand",
-    "Crèvecur-le-Petit",
-    "Crèvecur-sur-l'Escaut",
+    "Crèvecœur-en-Auge",
+    "Crèvecœur-en-Brie",
+    "Crèvecœur-le-Grand",
+    "Crèvecœur-le-Petit",
+    "Crèvecœur-sur-l'Escaut",
     "Crécy-Couvé",
     "Créon-d'Armagnac",
     "Cubjac-Auvézère-Val-d'Ans",
@@ -1616,7 +1615,7 @@ FR_BASE_EXCEPTIONS = [
     "Cuxac-Cabardès",
     "Cuxac-d'Aude",
     "Cuyk-Sainte-Agathe",
-    "Cuvres-et-Valsery",
+    "Cœuvres-et-Valsery",
     "Céaux-d'Allègre",
     "Céleste-Empire",
     "Cénac-et-Saint-Julien",
@@ -1679,7 +1678,7 @@ FR_BASE_EXCEPTIONS = [
     "Devrai-Gondragnières",
     "Dhuys et Morin-en-Brie",
     "Diane-Capelle",
-    "Dieffenbach-lès-Wrth",
+    "Dieffenbach-lès-Wœrth",
     "Diekhusen-Fahrstedt",
     "Diennes-Aubigny",
     "Diensdorf-Radlow",
@@ -1752,7 +1751,7 @@ FR_BASE_EXCEPTIONS = [
     "Durdat-Larequille",
     "Durfort-Lacapelette",
     "Durfort-et-Saint-Martin-de-Sossenac",
-    "Duil-sur-le-Mignon",
+    "Dœuil-sur-le-Mignon",
     "Dão-Lafões",
     "Débats-Rivière-d'Orpra",
     "Décines-Charpieu",
@@ -2687,8 +2686,8 @@ FR_BASE_EXCEPTIONS = [
     "Kuhlen-Wendorf",
     "KwaZulu-Natal",
     "Kyzyl-Arvat",
-    "Kur-la-Grande",
-    "Kur-la-Petite",
+    "Kœur-la-Grande",
+    "Kœur-la-Petite",
     "Kölln-Reisiek",
     "Königsbach-Stein",
     "Königshain-Wiederau",
@@ -4024,7 +4023,7 @@ FR_BASE_EXCEPTIONS = [
     "Marcilly-d'Azergues",
     "Marcillé-Raoul",
     "Marcillé-Robert",
-    "Marcq-en-Barul",
+    "Marcq-en-Barœul",
     "Marcy-l'Etoile",
     "Marcy-l'Étoile",
     "Mareil-Marly",
@@ -4258,7 +4257,7 @@ FR_BASE_EXCEPTIONS = [
     "Monlezun-d'Armagnac",
     "Monléon-Magnoac",
     "Monnetier-Mornex",
-    "Mons-en-Barul",
+    "Mons-en-Barœul",
     "Monsempron-Libos",
     "Monsteroux-Milieu",
     "Montacher-Villegardin",
@@ -4348,7 +4347,7 @@ FR_BASE_EXCEPTIONS = [
     "Mornay-Berry",
     "Mortain-Bocage",
     "Morteaux-Couliboeuf",
-    "Morteaux-Coulibuf",
+    "Morteaux-Coulibœuf",
     "Morteaux-Coulibœuf",
     "Mortes-Frontières",
     "Mory-Montcrux",
@@ -4391,7 +4390,7 @@ FR_BASE_EXCEPTIONS = [
     "Muncq-Nieurlet",
     "Murtin-Bogny",
     "Murtin-et-le-Châtelet",
-    "Murs-Verdey",
+    "Mœurs-Verdey",
     "Ménestérol-Montignac",
     "Ménil'muche",
     "Ménil-Annelles",
@@ -4612,7 +4611,7 @@ FR_BASE_EXCEPTIONS = [
     "Neuves-Maisons",
     "Neuvic-Entier",
     "Neuvicq-Montguyon",
-    "Neuville-lès-Luilly",
+    "Neuville-lès-Lœuilly",
     "Neuvy-Bouin",
     "Neuvy-Deux-Clochers",
     "Neuvy-Grandchamp",
@@ -4773,8 +4772,8 @@ FR_BASE_EXCEPTIONS = [
     "Nuncq-Hautecôte",
     "Nurieux-Volognat",
     "Nuthe-Urstromtal",
-    "Nux-les-Mines",
-    "Nux-lès-Auxi",
+    "Nœux-les-Mines",
+    "Nœux-lès-Auxi",
     "Nâves-Parmelan",
     "Nézignan-l'Evêque",
     "Nézignan-l'Évêque",
@@ -5343,7 +5342,7 @@ FR_BASE_EXCEPTIONS = [
     "Quincy-Voisins",
     "Quincy-sous-le-Mont",
     "Quint-Fonsegrives",
-    "Quux-Haut-Maînil",
+    "Quœux-Haut-Maînil",
     "Quœux-Haut-Maînil",
     "Qwa-Qwa",
     "R.-V.",
@@ -5631,12 +5630,12 @@ FR_BASE_EXCEPTIONS = [
     "Saint Aulaye-Puymangou",
     "Saint Geniez d'Olt et d'Aubrac",
     "Saint Martin de l'If",
-    "Saint-Denux",
-    "Saint-Jean-de-Buf",
-    "Saint-Martin-le-Nud",
-    "Saint-Michel-Tubuf",
+    "Saint-Denœux",
+    "Saint-Jean-de-Bœuf",
+    "Saint-Martin-le-Nœud",
+    "Saint-Michel-Tubœuf",
     "Saint-Paul - Flaugnac",
-    "Saint-Pierre-de-Buf",
+    "Saint-Pierre-de-Bœuf",
     "Saint-Thegonnec Loc-Eguiner",
     "Sainte-Alvère-Saint-Laurent Les Bâtons",
     "Salignac-Eyvignes",
@@ -6208,7 +6207,7 @@ FR_BASE_EXCEPTIONS = [
     "Tite-Live",
     "Titisee-Neustadt",
     "Tobel-Tägerschen",
-    "Togny-aux-Bufs",
+    "Togny-aux-Bœufs",
     "Tongre-Notre-Dame",
     "Tonnay-Boutonne",
     "Tonnay-Charente",
@@ -6336,7 +6335,7 @@ FR_BASE_EXCEPTIONS = [
     "Vals-près-le-Puy",
     "Valverde-Enrique",
     "Valzin-en-Petite-Montagne",
-    "Vanduvre-lès-Nancy",
+    "Vandœuvre-lès-Nancy",
     "Varces-Allières-et-Risset",
     "Varenne-l'Arconce",
     "Varenne-sur-le-Doubs",
@@ -6457,9 +6456,9 @@ FR_BASE_EXCEPTIONS = [
     "Villenave-d'Ornon",
     "Villequier-Aumont",
     "Villerouge-Termenès",
-    "Villers-aux-Nuds",
+    "Villers-aux-Nœuds",
     "Villez-sur-le-Neubourg",
-    "Villiers-en-Désuvre",
+    "Villiers-en-Désœuvre",
     "Villieu-Loyes-Mollon",
     "Villingen-Schwenningen",
     "Villié-Morgon",
@@ -6467,7 +6466,7 @@ FR_BASE_EXCEPTIONS = [
     "Vilosnes-Haraumont",
     "Vilters-Wangs",
     "Vincent-Froideville",
-    "Vincy-Manuvre",
+    "Vincy-Manœuvre",
     "Vincy-Manœuvre",
     "Vincy-Reuil-et-Magny",
     "Vindrac-Alayrac",
@@ -6511,8 +6510,8 @@ FR_BASE_EXCEPTIONS = [
     "Vrigne-Meusiens",
     "Vrijhoeve-Capelle",
     "Vuisternens-devant-Romont",
-    "Vlfling-lès-Bouzonville",
-    "Vuil-et-Giget",
+    "Vœlfling-lès-Bouzonville",
+    "Vœuil-et-Giget",
     "Vélez-Blanco",
     "Vélez-Málaga",
     "Vélez-Rubio",
@@ -6615,7 +6614,7 @@ FR_BASE_EXCEPTIONS = [
     "Wust-Fischbeck",
     "Wutha-Farnroda",
     "Wy-dit-Joli-Village",
-    "Wlfling-lès-Sarreguemines",
+    "Wœlfling-lès-Sarreguemines",
     "Wünnewil-Flamatt",
     "X-SAMPA",
     "X-arbre",
@@ -24,17 +24,15 @@ def noun_chunks(doclike):
     np_deps = [doc.vocab.strings[label] for label in labels]
     conj = doc.vocab.strings.add("conj")
     np_label = doc.vocab.strings.add("NP")
-    seen = set()
+    prev_end = -1
    for i, word in enumerate(doclike):
         if word.pos not in (NOUN, PROPN, PRON):
             continue
         # Prevent nested chunks from being produced
-        if word.i in seen:
+        if word.left_edge.i <= prev_end:
             continue
         if word.dep in np_deps:
-            if any(w.i in seen for w in word.subtree):
-                continue
-            seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+            prev_end = word.right_edge.i
             yield word.left_edge.i, word.right_edge.i + 1, np_label
         elif word.dep == conj:
             head = word.head
@@ -42,9 +40,7 @@ def noun_chunks(doclike):
                 head = head.head
             # If the head is an NP, and we're coordinated to it, we're an NP
             if head.dep in np_deps:
-                if any(w.i in seen for w in word.subtree):
-                    continue
-                seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+                prev_end = word.right_edge.i
                 yield word.left_edge.i, word.right_edge.i + 1, np_label
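Unlike the iterators above, this variant extends each chunk to `word.right_edge`, so `prev_end` must track the right edge rather than the head index. A quick illustration with hypothetical (left_edge, right_edge) pairs:

    # Hypothetical (left_edge, right_edge) index pairs for NP heads, in order:
    heads = [(0, 2), (1, 2), (4, 6)]
    prev_end = -1
    for left, right in heads:
        if left <= prev_end:  # the second head sits inside the first chunk
            continue
        prev_end = right      # track the right edge, not just the head index
        print((left, right + 1))
    # (0, 3) and (4, 7) are yielded; (1, 2) is suppressed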
@@ -1,7 +1,6 @@
 import re

 from .punctuation import ELISION, HYPHENS
-from ..tokenizer_exceptions import URL_PATTERN
 from ..char_classes import ALPHA_LOWER, ALPHA
 from ...symbols import ORTH, LEMMA

@@ -452,9 +451,6 @@ _regular_exp += [
     for hc in _hyphen_combination
 ]

-# URLs
-_regular_exp.append(URL_PATTERN)
-

 TOKENIZER_EXCEPTIONS = _exc
 TOKEN_MATCH = re.compile(
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .stop_words import STOP_WORDS

 from ...language import Language
@@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 """
 Example sentences to test spaCy and its language models.

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 STOP_WORDS = set(
     """
 એમ
@@ -7,7 +7,6 @@ _concat_icons = CONCAT_ICONS.replace("\u00B0", "")

 _currency = r"\$¢£€¥฿"
 _quotes = CONCAT_QUOTES.replace("'", "")
-_units = UNITS.replace("%", "")

 _prefixes = (
     LIST_PUNCT
@@ -18,7 +17,8 @@ _prefixes = (
 )

 _suffixes = (
-    LIST_PUNCT
+    [r"\+"]
+    + LIST_PUNCT
     + LIST_ELLIPSES
     + LIST_QUOTES
     + [_concat_icons]
@@ -26,7 +26,7 @@ _suffixes = (
         r"(?<=[0-9])\+",
         r"(?<=°[FfCcKk])\.",
         r"(?<=[0-9])(?:[{c}])".format(c=_currency),
-        r"(?<=[0-9])(?:{u})".format(u=_units),
+        r"(?<=[0-9])(?:{u})".format(u=UNITS),
         r"(?<=[{al}{e}{q}(?:{c})])\.".format(
             al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, c=_currency
         ),
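With the stripped `_units` variable gone, the suffix rule now uses the full UNITS alternation, so a percent sign directly after a digit is split off again. A runnable sketch of the lookbehind behavior, with a short stand-in for the real UNITS string:

    import re

    UNITS = "%|km|kg"  # stand-in; the real UNITS alternation is much longer
    suffix_re = re.compile(r"(?<=[0-9])(?:{u})".format(u=UNITS))
    print(bool(suffix_re.search("100%")))   # True: "%" splits off after a digit
    print(bool(suffix_re.search("100 %")))  # False: lookbehind needs a digit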
@@ -1,7 +1,6 @@
 import re

 from ..punctuation import ALPHA_LOWER, CURRENCY
-from ..tokenizer_exceptions import URL_PATTERN
 from ...symbols import ORTH


@@ -646,4 +645,4 @@ _nums = r"(({ne})|({t})|({on})|({c}))({s})?".format(


 TOKENIZER_EXCEPTIONS = _exc
-TOKEN_MATCH = re.compile(r"^({u})|({n})$".format(u=URL_PATTERN, n=_nums)).match
+TOKEN_MATCH = re.compile(r"^{n}$".format(n=_nums)).match
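Dropping the URL branch here (and in the French module above) fixes a subtle anchoring problem as a side effect: in `^({u})|({n})$`, the `^` binds only to the URL branch and the `$` only to the number branch, so neither alternative was matched against the whole token. Presumably URL handling now lives in the shared tokenizer rather than each language's TOKEN_MATCH; that rationale is inferred from the removal, not stated in the diff. A small demonstration of the regex pitfall:

    import re

    old = re.compile(r"^(foo)|(bar)$")       # anchors bind to one branch each
    new = re.compile(r"^(?:(foo)|(bar))$")   # anchors apply to the whole match
    print(bool(old.match("foobaz")))  # True: only the "foo" prefix must match
    print(bool(new.match("foobaz")))  # False: the full string must match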
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from .tag_map import TAG_MAP
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 """
 Example sentences to test spaCy and its language models.
 >>> from spacy.lang.hy.examples import sentences
@@ -1,12 +1,9 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...attrs import LIKE_NUM


 _num_words = [
-    "զրօ",
-    "մէկ",
+    "զրո",
+    "մեկ",
     "երկու",
     "երեք",
     "չորս",
@@ -28,10 +25,10 @@ _num_words = [
     "քսան",
     "երեսուն",
     "քառասուն",
     "հիսուն",
-    "վաթցսուն",
+    "վաթսուն",
     "յոթանասուն",
     "ութսուն",
-    "ինիսուն",
+    "իննսուն",
     "հարյուր",
     "հազար",
     "միլիոն",
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 STOP_WORDS = set(
     """
 նա
@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...symbols import POS, ADJ, NUM, DET, ADV, ADP, X, VERB, NOUN
 from ...symbols import PROPN, PART, INTJ, PRON, SCONJ, AUX, CCONJ

@@ -24,17 +24,15 @@ def noun_chunks(doclike):
     np_deps = [doc.vocab.strings[label] for label in labels]
     conj = doc.vocab.strings.add("conj")
     np_label = doc.vocab.strings.add("NP")
-    seen = set()
+    prev_end = -1
     for i, word in enumerate(doclike):
         if word.pos not in (NOUN, PROPN, PRON):
             continue
         # Prevent nested chunks from being produced
-        if word.i in seen:
+        if word.left_edge.i <= prev_end:
             continue
         if word.dep in np_deps:
-            if any(w.i in seen for w in word.subtree):
-                continue
-            seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+            prev_end = word.right_edge.i
             yield word.left_edge.i, word.right_edge.i + 1, np_label
         elif word.dep == conj:
             head = word.head
@@ -42,9 +40,7 @@ def noun_chunks(doclike):
                 head = head.head
             # If the head is an NP, and we're coordinated to it, we're an NP
             if head.dep in np_deps:
-                if any(w.i in seen for w in word.subtree):
-                    continue
-                seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+                prev_end = word.right_edge.i
                 yield word.left_edge.i, word.right_edge.i + 1, np_label
@@ -1,111 +1,266 @@
-import re
-from collections import namedtuple
+import srsly
+from collections import namedtuple, OrderedDict

 from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
 from .tag_map import TAG_MAP
+from .tag_orth_map import TAG_ORTH_MAP
+from .tag_bigram_map import TAG_BIGRAM_MAP
 from ...attrs import LANG
-from ...language import Language
-from ...tokens import Doc
 from ...compat import copy_reg
+from ...errors import Errors
+from ...language import Language
+from ...symbols import POS
+from ...tokens import Doc
 from ...util import DummyTokenizer
+from ... import util


+# Hold the attributes we need with convenient names
+DetailedToken = namedtuple("DetailedToken", ["surface", "pos", "lemma"])
+
 # Handling for multiple spaces in a row is somewhat awkward, this simplifies
 # the flow by creating a dummy with the same interface.
-DummyNode = namedtuple("DummyNode", ["surface", "pos", "feature"])
-DummyNodeFeatures = namedtuple("DummyNodeFeatures", ["lemma"])
-DummySpace = DummyNode(" ", " ", DummyNodeFeatures(" "))
+DummyNode = namedtuple("DummyNode", ["surface", "pos", "lemma"])
+DummySpace = DummyNode(" ", " ", " ")


-def try_fugashi_import():
-    """Fugashi is required for Japanese support, so check for it.
-    If it's not available, blow up and explain how to fix it."""
-    try:
-        import fugashi
-
-        return fugashi
-    except ImportError:
-        raise ImportError(
-            "Japanese support requires Fugashi: https://github.com/polm/fugashi"
-        )
+def try_sudachi_import(split_mode="A"):
+    """SudachiPy is required for Japanese support, so check for it.
+    If it's not available, blow up and explain how to fix it.
+    split_mode should be one of these values: "A", "B", "C", None->"A"."""
+    try:
+        from sudachipy import dictionary, tokenizer
+
+        split_mode = {
+            None: tokenizer.Tokenizer.SplitMode.A,
+            "A": tokenizer.Tokenizer.SplitMode.A,
+            "B": tokenizer.Tokenizer.SplitMode.B,
+            "C": tokenizer.Tokenizer.SplitMode.C,
+        }[split_mode]
+        tok = dictionary.Dictionary().create(mode=split_mode)
+        return tok
+    except ImportError:
+        raise ImportError(
+            "Japanese support requires SudachiPy and SudachiDict-core "
+            "(https://github.com/WorksApplications/SudachiPy). "
+            "Install with `pip install sudachipy sudachidict_core` or "
+            "install spaCy with `pip install spacy[ja]`."
+        )
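For reference, a minimal usage sketch of the new import helper, assuming sudachipy and sudachidict_core are installed:

    tok = try_sudachi_import(split_mode="C")
    for m in tok.tokenize("国家公務員"):
        print(m.surface(), m.part_of_speech()[0], m.dictionary_form())
    # Mode A yields the shortest units and mode C the longest, so the same
    # text tokenizes differently under "A", "B" and "C".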
-def resolve_pos(token):
+def resolve_pos(orth, pos, next_pos):
     """If necessary, add a field to the POS tag for UD mapping.
     Under Universal Dependencies, sometimes the same Unidic POS tag can
     be mapped differently depending on the literal token or its context
-    in the sentence. This function adds information to the POS tag to
-    resolve ambiguous mappings.
+    in the sentence. This function returns the resolved POS for both the
+    token and the next token, as a tuple.
     """
-    # this is only used for consecutive ascii spaces
-    if token.surface == " ":
-        return "空白"
-
-    # TODO: This is a first take. The rules here are crude approximations.
-    # For many of these, full dependencies are needed to properly resolve
-    # PoS mappings.
-    if token.pos == "連体詞,*,*,*":
-        if re.match(r"[こそあど此其彼]の", token.surface):
-            return token.pos + ",DET"
-        if re.match(r"[こそあど此其彼]", token.surface):
-            return token.pos + ",PRON"
-        return token.pos + ",ADJ"
-    return token.pos
+    # Some tokens have their UD tag decided based on the POS of the following
+    # token.
+
+    # orth based rules
+    if pos[0] in TAG_ORTH_MAP:
+        orth_map = TAG_ORTH_MAP[pos[0]]
+        if orth in orth_map:
+            return orth_map[orth], None
+
+    # tag bi-gram mapping
+    if next_pos:
+        tag_bigram = pos[0], next_pos[0]
+        if tag_bigram in TAG_BIGRAM_MAP:
+            bipos = TAG_BIGRAM_MAP[tag_bigram]
+            if bipos[0] is None:
+                return TAG_MAP[pos[0]][POS], bipos[1]
+            else:
+                return bipos
+
+    return TAG_MAP[pos[0]][POS], None
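The new resolver tries three layers in order: a token-specific orth rule, a tag-bigram rule over the following token's tag, then the static tag map. A simplified, self-contained sketch with hypothetical map contents (the real TAG_ORTH_MAP and TAG_BIGRAM_MAP live in their own modules):

    # Hypothetical, simplified stand-ins for the real lookup tables:
    TAG_ORTH_MAP = {"連体詞": {"この": "DET"}}
    TAG_BIGRAM_MAP = {("形状詞", "助動詞"): ("ADJ", "AUX")}
    DEFAULT_TAG = {"連体詞": "ADJ", "形状詞": "ADJ", "助動詞": "AUX"}

    def resolve(orth, tag, next_tag):
        # 1. A token-specific spelling rule wins outright.
        if orth in TAG_ORTH_MAP.get(tag, {}):
            return TAG_ORTH_MAP[tag][orth], None
        # 2. Otherwise a (tag, next_tag) pair may fix both tags at once.
        if next_tag and (tag, next_tag) in TAG_BIGRAM_MAP:
            return TAG_BIGRAM_MAP[(tag, next_tag)]
        # 3. Fall back to the static one-to-one tag map.
        return DEFAULT_TAG[tag], None

    print(resolve("この", "連体詞", None))       # ('DET', None)
    print(resolve("静か", "形状詞", "助動詞"))   # ('ADJ', 'AUX')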
-def get_words_and_spaces(tokenizer, text):
-    """Get the individual tokens that make up the sentence and handle white space.
-    Japanese doesn't usually use white space, and MeCab's handling of it for
-    multiple spaces in a row is somewhat awkward.
-    """
-    tokens = tokenizer.parseToNodeList(text)
-    words = []
-    spaces = []
-    for token in tokens:
-        # If there's more than one space, spaces after the first become tokens
-        for ii in range(len(token.white_space) - 1):
-            words.append(DummySpace)
-            spaces.append(False)
-        words.append(token)
-        spaces.append(bool(token.white_space))
-    return words, spaces
+# Use a mapping of paired punctuation to avoid splitting quoted sentences.
+pairpunct = {"「": "」", "『": "』", "【": "】"}
+
+
+def separate_sentences(doc):
+    """Given a doc, mark tokens that start sentences based on Unidic tags."""
+    stack = []  # save paired punctuation
+    for i, token in enumerate(doc[:-2]):
+        # Set all tokens after the first to false by default. This is necessary
+        # for the doc code to be aware we've done sentencization, see
+        # `is_sentenced`.
+        token.sent_start = i == 0
+        if token.tag_:
+            if token.tag_ == "補助記号-括弧開":
+                ts = str(token)
+                if ts in pairpunct:
+                    stack.append(pairpunct[ts])
+                elif stack and ts == stack[-1]:
+                    stack.pop()
+            if token.tag_ == "補助記号-句点":
+                next_token = doc[i + 1]
+                if next_token.tag_ != token.tag_ and not stack:
+                    next_token.sent_start = True
+
+
+def get_dtokens(tokenizer, text):
+    tokens = tokenizer.tokenize(text)
+    words = []
+    for ti, token in enumerate(tokens):
+        tag = "-".join([xx for xx in token.part_of_speech()[:4] if xx != "*"])
+        inf = "-".join([xx for xx in token.part_of_speech()[4:] if xx != "*"])
+        dtoken = DetailedToken(token.surface(), (tag, inf), token.dictionary_form())
+        if ti > 0 and words[-1].pos[0] == "空白" and tag == "空白":
+            # don't add multiple space tokens in a row
+            continue
+        words.append(dtoken)
+
+    # remove empty tokens. These can be produced with characters like … that
+    # Sudachi normalizes internally.
+    words = [ww for ww in words if len(ww.surface) > 0]
+    return words
 | 
					def get_words_lemmas_tags_spaces(dtokens, text, gap_tag=("空白", "")):
 | 
				
			||||||
 | 
					    words = [x.surface for x in dtokens]
 | 
				
			||||||
 | 
					    if "".join("".join(words).split()) != "".join(text.split()):
 | 
				
			||||||
 | 
					        raise ValueError(Errors.E194.format(text=text, words=words))
 | 
				
			||||||
 | 
					    text_words = []
 | 
				
			||||||
 | 
					    text_lemmas = []
 | 
				
			||||||
 | 
					    text_tags = []
 | 
				
			||||||
 | 
					    text_spaces = []
 | 
				
			||||||
 | 
					    text_pos = 0
 | 
				
			||||||
 | 
					    # handle empty and whitespace-only texts
 | 
				
			||||||
 | 
					    if len(words) == 0:
 | 
				
			||||||
 | 
					        return text_words, text_lemmas, text_tags, text_spaces
 | 
				
			||||||
 | 
					    elif len([word for word in words if not word.isspace()]) == 0:
 | 
				
			||||||
 | 
					        assert text.isspace()
 | 
				
			||||||
 | 
					        text_words = [text]
 | 
				
			||||||
 | 
					        text_lemmas = [text]
 | 
				
			||||||
 | 
					        text_tags = [gap_tag]
 | 
				
			||||||
 | 
					        text_spaces = [False]
 | 
				
			||||||
 | 
					        return text_words, text_lemmas, text_tags, text_spaces
 | 
				
			||||||
 | 
					    # normalize words to remove all whitespace tokens
 | 
				
			||||||
 | 
					    norm_words, norm_dtokens = zip(
 | 
				
			||||||
 | 
					        *[
 | 
				
			||||||
 | 
					            (word, dtokens)
 | 
				
			||||||
 | 
					            for word, dtokens in zip(words, dtokens)
 | 
				
			||||||
 | 
					            if not word.isspace()
 | 
				
			||||||
 | 
					        ]
 | 
				
			||||||
 | 
					    )
 | 
				
			||||||
 | 
					    # align words with text
 | 
				
			||||||
 | 
					    for word, dtoken in zip(norm_words, norm_dtokens):
 | 
				
			||||||
 | 
					        try:
 | 
				
			||||||
 | 
					            word_start = text[text_pos:].index(word)
 | 
				
			||||||
 | 
					        except ValueError:
 | 
				
			||||||
 | 
					            raise ValueError(Errors.E194.format(text=text, words=words))
 | 
				
			||||||
 | 
					        if word_start > 0:
 | 
				
			||||||
 | 
					            w = text[text_pos : text_pos + word_start]
 | 
				
			||||||
 | 
					            text_words.append(w)
 | 
				
			||||||
 | 
					            text_lemmas.append(w)
 | 
				
			||||||
 | 
					            text_tags.append(gap_tag)
 | 
				
			||||||
 | 
					            text_spaces.append(False)
 | 
				
			||||||
 | 
					            text_pos += word_start
 | 
				
			||||||
 | 
					        text_words.append(word)
 | 
				
			||||||
 | 
					        text_lemmas.append(dtoken.lemma)
 | 
				
			||||||
 | 
					        text_tags.append(dtoken.pos)
 | 
				
			||||||
 | 
					        text_spaces.append(False)
 | 
				
			||||||
 | 
					        text_pos += len(word)
 | 
				
			||||||
 | 
					        if text_pos < len(text) and text[text_pos] == " ":
 | 
				
			||||||
 | 
					            text_spaces[-1] = True
 | 
				
			||||||
 | 
					            text_pos += 1
 | 
				
			||||||
 | 
					    if text_pos < len(text):
 | 
				
			||||||
 | 
					        w = text[text_pos:]
 | 
				
			||||||
 | 
					        text_words.append(w)
 | 
				
			||||||
 | 
					        text_lemmas.append(w)
 | 
				
			||||||
 | 
					        text_tags.append(gap_tag)
 | 
				
			||||||
 | 
					        text_spaces.append(False)
 | 
				
			||||||
 | 
					    return text_words, text_lemmas, text_tags, text_spaces
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
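The core of get_words_lemmas_tags_spaces is re-anchoring whitespace-free tokenizer output onto the raw text, emitting 空白 gap tokens for anything the tokenizer dropped. A toy, self-contained version of that alignment loop (illustration only, simplified from the diff above):

    # Illustration only: tokens that the tokenizer dropped whitespace around
    # are re-aligned to the text, with gap tokens for the unmatched spans.
    def align_sketch(words, text, gap_tag="空白"):
        out, pos = [], 0
        for w in words:
            start = text.index(w, pos)          # raises ValueError on mismatch,
            if start > pos:                     # mirroring Errors.E194 above
                out.append((text[pos:start], gap_tag))
            out.append((w, "TOKEN"))
            pos = start + len(w)
        if pos < len(text):
            out.append((text[pos:], gap_tag))
        return out

    print(align_sketch(["これ", "は", "ペン", "です"], "これは ペンです"))
    # [('これ', 'TOKEN'), ('は', 'TOKEN'), (' ', '空白'), ('ペン', 'TOKEN'), ('です', 'TOKEN')]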
 class JapaneseTokenizer(DummyTokenizer):
-    def __init__(self, cls, nlp=None):
+    def __init__(self, cls, nlp=None, config={}):
         self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
-        self.tokenizer = try_fugashi_import().Tagger()
-        self.tokenizer.parseToNodeList("")  # see #2901
+        self.split_mode = config.get("split_mode", None)
+        self.tokenizer = try_sudachi_import(self.split_mode)
 
     def __call__(self, text):
-        dtokens, spaces = get_words_and_spaces(self.tokenizer, text)
-        words = [x.surface for x in dtokens]
+        dtokens = get_dtokens(self.tokenizer, text)
+
+        words, lemmas, unidic_tags, spaces = get_words_lemmas_tags_spaces(dtokens, text)
         doc = Doc(self.vocab, words=words, spaces=spaces)
-        unidic_tags = []
-        for token, dtoken in zip(doc, dtokens):
-            unidic_tags.append(dtoken.pos)
-            token.tag_ = resolve_pos(dtoken)
+        next_pos = None
+        for idx, (token, lemma, unidic_tag) in enumerate(zip(doc, lemmas, unidic_tags)):
+            token.tag_ = unidic_tag[0]
+            if next_pos:
+                token.pos = next_pos
+                next_pos = None
+            else:
+                token.pos, next_pos = resolve_pos(
+                    token.orth_,
+                    unidic_tag,
+                    unidic_tags[idx + 1] if idx + 1 < len(unidic_tags) else None,
+                )
+
             # if there's no lemma info (it's an unk) just use the surface
-            token.lemma_ = dtoken.feature.lemma or dtoken.surface
+            token.lemma_ = lemma
         doc.user_data["unidic_tags"] = unidic_tags
 
         return doc
+
+    def _get_config(self):
+        config = OrderedDict((("split_mode", self.split_mode),))
+        return config
+
+    def _set_config(self, config={}):
+        self.split_mode = config.get("split_mode", None)
+
+    def to_bytes(self, **kwargs):
+        serializers = OrderedDict(
+            (("cfg", lambda: srsly.json_dumps(self._get_config())),)
+        )
+        return util.to_bytes(serializers, [])
+
+    def from_bytes(self, data, **kwargs):
+        deserializers = OrderedDict(
+            (("cfg", lambda b: self._set_config(srsly.json_loads(b))),)
+        )
+        util.from_bytes(data, deserializers, [])
+        self.tokenizer = try_sudachi_import(self.split_mode)
+        return self
+
+    def to_disk(self, path, **kwargs):
+        path = util.ensure_path(path)
+        serializers = OrderedDict(
+            (("cfg", lambda p: srsly.write_json(p, self._get_config())),)
+        )
+        return util.to_disk(path, serializers, [])
+
+    def from_disk(self, path, **kwargs):
+        path = util.ensure_path(path)
+        serializers = OrderedDict(
+            (("cfg", lambda p: self._set_config(srsly.read_json(p))),)
+        )
+        util.from_disk(path, serializers, [])
+        self.tokenizer = try_sudachi_import(self.split_mode)
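A usage sketch for the new serialization hooks (hedged: assumes SudachiPy and a Sudachi dictionary are installed, and uses the v2.3-style config argument):

    # Assumes SudachiPy + a dictionary are installed; config style as in v2.3
    from spacy.lang.ja import Japanese

    nlp = Japanese(meta={"tokenizer": {"config": {"split_mode": "B"}}})
    data = nlp.tokenizer.to_bytes()   # serializes only {"split_mode": "B"}

    nlp2 = Japanese()
    nlp2.tokenizer.from_bytes(data)   # rebuilds the Sudachi tokenizer
    assert nlp2.tokenizer.split_mode == "B"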
 class JapaneseDefaults(Language.Defaults):
     lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
     lex_attr_getters[LANG] = lambda _text: "ja"
     stop_words = STOP_WORDS
     tag_map = TAG_MAP
+    syntax_iterators = SYNTAX_ITERATORS
     writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
 
     @classmethod
-    def create_tokenizer(cls, nlp=None):
-        return JapaneseTokenizer(cls, nlp)
+    def create_tokenizer(cls, nlp=None, config={}):
+        return JapaneseTokenizer(cls, nlp, config)
 
 
 class Japanese(Language):
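For context on what split_mode controls (a SudachiPy option, not defined in this commit): modes A, B and C tokenize into progressively longer units. A hedged illustration straight against SudachiPy:

    # Requires: pip install sudachipy sudachidict_core
    from sudachipy import dictionary, tokenizer

    tok = dictionary.Dictionary().create()
    for mode in (tokenizer.Tokenizer.SplitMode.A,
                 tokenizer.Tokenizer.SplitMode.B,
                 tokenizer.Tokenizer.SplitMode.C):
        print([m.surface() for m in tok.tokenize("選挙管理委員会", mode)])
    # Expected with sudachidict_core:
    # ['選挙', '管理', '委員', '会']
    # ['選挙', '管理', '委員会']
    # ['選挙管理委員会']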
spacy/lang/ja/bunsetu.py (new file, 176 lines)
@ -0,0 +1,176 @@
POS_PHRASE_MAP = {
    "NOUN": "NP",
    "NUM": "NP",
    "PRON": "NP",
    "PROPN": "NP",
    "VERB": "VP",
    "ADJ": "ADJP",
    "ADV": "ADVP",
    "CCONJ": "CCONJP",
}


# return value: [(bunsetu_tokens, phrase_type={'NP', 'VP', 'ADJP', 'ADVP'}, phrase_tokens)]
def yield_bunsetu(doc, debug=False):
    bunsetu = []
    bunsetu_may_end = False
    phrase_type = None
    phrase = None
    prev = None
    prev_tag = None
    prev_dep = None
    prev_head = None
    for t in doc:
        pos = t.pos_
        pos_type = POS_PHRASE_MAP.get(pos, None)
        tag = t.tag_
        dep = t.dep_
        head = t.head.i
        if debug:
            print(
                t.i,
                t.orth_,
                pos,
                pos_type,
                dep,
                head,
                bunsetu_may_end,
                phrase_type,
                phrase,
                bunsetu,
            )

        # DET is always an individual bunsetu
        if pos == "DET":
            if bunsetu:
                yield bunsetu, phrase_type, phrase
            yield [t], None, None
            bunsetu = []
            bunsetu_may_end = False
            phrase_type = None
            phrase = None

        # PRON or Open PUNCT always splits bunsetu
        elif tag == "補助記号-括弧開":
            if bunsetu:
                yield bunsetu, phrase_type, phrase
            bunsetu = [t]
            bunsetu_may_end = True
            phrase_type = None
            phrase = None

        # bunsetu head not appeared
        elif phrase_type is None:
            if bunsetu and prev_tag == "補助記号-読点":
                yield bunsetu, phrase_type, phrase
                bunsetu = []
                bunsetu_may_end = False
                phrase_type = None
                phrase = None
            bunsetu.append(t)
            if pos_type:  # begin phrase
                phrase = [t]
                phrase_type = pos_type
                if pos_type in {"ADVP", "CCONJP"}:
                    bunsetu_may_end = True

        # entering new bunsetu
        elif pos_type and (
            pos_type != phrase_type  # different phrase type arises
            or bunsetu_may_end  # same phrase type but bunsetu already ended
        ):
            # exceptional case: NOUN to VERB
            if (
                phrase_type == "NP"
                and pos_type == "VP"
                and prev_dep == "compound"
                and prev_head == t.i
            ):
                bunsetu.append(t)
                phrase_type = "VP"
                phrase.append(t)
            # exceptional case: VERB to NOUN
            elif (
                phrase_type == "VP"
                and pos_type == "NP"
                and (
                    prev_dep == "compound"
                    and prev_head == t.i
                    or dep == "compound"
                    and prev == head
                    or prev_dep == "nmod"
                    and prev_head == t.i
                )
            ):
                bunsetu.append(t)
                phrase_type = "NP"
                phrase.append(t)
            else:
                yield bunsetu, phrase_type, phrase
                bunsetu = [t]
                bunsetu_may_end = False
                phrase_type = pos_type
                phrase = [t]

        # NOUN bunsetu
        elif phrase_type == "NP":
            bunsetu.append(t)
            if not bunsetu_may_end and (
                (
                    (pos_type == "NP" or pos == "SYM")
                    and (prev_head == t.i or prev_head == head)
                    and prev_dep in {"compound", "nummod"}
                )
                or (
                    pos == "PART"
                    and (prev == head or prev_head == head)
                    and dep == "mark"
                )
            ):
                phrase.append(t)
            else:
                bunsetu_may_end = True

        # VERB bunsetu
        elif phrase_type == "VP":
            bunsetu.append(t)
            if (
                not bunsetu_may_end
                and pos == "VERB"
                and prev_head == t.i
                and prev_dep == "compound"
            ):
                phrase.append(t)
            else:
                bunsetu_may_end = True

        # ADJ bunsetu
        elif phrase_type == "ADJP" and tag != "連体詞":
            bunsetu.append(t)
            if not bunsetu_may_end and (
                (
                    pos == "NOUN"
                    and (prev_head == t.i or prev_head == head)
                    and prev_dep in {"amod", "compound"}
                )
                or (
                    pos == "PART"
                    and (prev == head or prev_head == head)
                    and dep == "mark"
                )
            ):
                phrase.append(t)
            else:
                bunsetu_may_end = True

        # other bunsetu
        else:
            bunsetu.append(t)

        prev = t.i
        prev_tag = t.tag_
        prev_dep = t.dep_
        prev_head = head

    if bunsetu:
        yield bunsetu, phrase_type, phrase
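A possible way to drive the new helper (hypothetical usage; yield_bunsetu reads pos_, tag_, dep_ and head, so the doc must come from a pipeline with a tagger and dependency parser):

    from spacy.lang.ja.bunsetu import yield_bunsetu

    # nlp: assumed to be a Japanese pipeline that produces parses
    doc = nlp("選挙管理委員会が開かれた")
    for tokens, phrase_type, phrase in yield_bunsetu(doc):
        print(phrase_type, [t.orth_ for t in tokens])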
							
								
								
									
spacy/lang/ja/syntax_iterators.py (new file, 54 lines)
@ -0,0 +1,54 @@
from ...symbols import NOUN, PROPN, PRON, VERB

# XXX this can probably be pruned a bit
labels = [
    "nsubj",
    "nmod",
    "dobj",
    "nsubjpass",
    "pcomp",
    "pobj",
    "obj",
    "obl",
    "dative",
    "appos",
    "attr",
    "ROOT",
]


def noun_chunks(obj):
    """
    Detect base noun phrases from a dependency parse. Works on both Doc and Span.
    """
    doc = obj.doc  # Ensure works on both Doc and Span.
    np_deps = [doc.vocab.strings.add(label) for label in labels]
    doc.vocab.strings.add("conj")
    np_label = doc.vocab.strings.add("NP")
    seen = set()
    for i, word in enumerate(obj):
        if word.pos not in (NOUN, PROPN, PRON):
            continue
        # Prevent nested chunks from being produced
        if word.i in seen:
            continue
        if word.dep in np_deps:
            unseen = [w.i for w in word.subtree if w.i not in seen]
            if not unseen:
                continue

            # this takes care of particles etc.
            seen.update(j.i for j in word.subtree)
            # This avoids duplicating embedded clauses
            seen.update(range(word.i + 1))

            # if the head of this is a verb, mark that and rights seen
            # Don't do the subtree as that can hide other phrases
            if word.head.pos == VERB:
                seen.add(word.head.i)
                seen.update(w.i for w in word.head.rights)
            yield unseen[0], word.i + 1, np_label


SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
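Since JapaneseDefaults above now sets syntax_iterators = SYNTAX_ITERATORS, this iterator is what backs doc.noun_chunks for Japanese. A hedged usage example, again assuming a parsed doc:

    # nlp: assumed Japanese pipeline with a dependency parser
    doc = nlp("自動運転車は保険責任を製造者に転嫁する")
    for chunk in doc.noun_chunks:
        print(chunk.text, chunk.root.dep_)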
							
								
								
									
spacy/lang/ja/tag_bigram_map.py (new file, 28 lines)
@ -0,0 +1,28 @@
from ...symbols import ADJ, AUX, NOUN, PART, VERB

# mapping from tag bi-gram to pos of previous token
TAG_BIGRAM_MAP = {
    # This covers only small part of AUX.
    ("形容詞-非自立可能", "助詞-終助詞"): (AUX, None),
    ("名詞-普通名詞-形状詞可能", "助動詞"): (ADJ, None),
    # ("副詞", "名詞-普通名詞-形状詞可能"): (None, ADJ),
    # This covers acl, advcl, obl and root, but has side effect for compound.
    ("名詞-普通名詞-サ変可能", "動詞-非自立可能"): (VERB, AUX),
    # This covers almost all of the deps
    ("名詞-普通名詞-サ変形状詞可能", "動詞-非自立可能"): (VERB, AUX),
    ("名詞-普通名詞-副詞可能", "動詞-非自立可能"): (None, VERB),
    ("副詞", "動詞-非自立可能"): (None, VERB),
    ("形容詞-一般", "動詞-非自立可能"): (None, VERB),
    ("形容詞-非自立可能", "動詞-非自立可能"): (None, VERB),
    ("接頭辞", "動詞-非自立可能"): (None, VERB),
    ("助詞-係助詞", "動詞-非自立可能"): (None, VERB),
    ("助詞-副助詞", "動詞-非自立可能"): (None, VERB),
    ("助詞-格助詞", "動詞-非自立可能"): (None, VERB),
    ("補助記号-読点", "動詞-非自立可能"): (None, VERB),
    ("形容詞-一般", "接尾辞-名詞的-一般"): (None, PART),
    ("助詞-格助詞", "形状詞-助動詞語幹"): (None, NOUN),
    ("連体詞", "形状詞-助動詞語幹"): (None, NOUN),
    ("動詞-一般", "助詞-副助詞"): (None, PART),
    ("動詞-非自立可能", "助詞-副助詞"): (None, PART),
    ("助動詞", "助詞-副助詞"): (None, PART),
}
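Reading an entry: the first slot of the value overrides the POS of the earlier token in the bigram, the second pre-assigns the later token, and None leaves that side to the plain TAG_MAP. A small check against the shipped table (assumes spaCy is importable):

    # Assumes spaCy with this module installed
    from spacy.symbols import VERB, AUX
    from spacy.lang.ja.tag_bigram_map import TAG_BIGRAM_MAP

    # suru-verb noun + 非自立 verb, e.g. 勉強 + して -> VERB + AUX
    assert TAG_BIGRAM_MAP[("名詞-普通名詞-サ変可能", "動詞-非自立可能")] == (VERB, AUX)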
@ -1,79 +1,68 @@
-from ...symbols import POS, PUNCT, INTJ, X, ADJ, AUX, ADP, PART, SCONJ, NOUN
-from ...symbols import SYM, PRON, VERB, ADV, PROPN, NUM, DET, SPACE
+from ...symbols import POS, PUNCT, INTJ, ADJ, AUX, ADP, PART, SCONJ, NOUN
+from ...symbols import SYM, PRON, VERB, ADV, PROPN, NUM, DET, SPACE, CCONJ
 
 
 TAG_MAP = {
     # Explanation of Unidic tags:
     # https://www.gavo.t.u-tokyo.ac.jp/~mine/japanese/nlp+slp/UNIDIC_manual.pdf
-    # Universal Dependencies Mapping:
+    # Universal Dependencies Mapping: (Some of the entries in this mapping are updated to v2.6 in the list below)
     # http://universaldependencies.org/ja/overview/morphology.html
     # http://universaldependencies.org/ja/pos/all.html
-    "記号,一般,*,*": {
-        POS: PUNCT
-    },  # this includes characters used to represent sounds like ドレミ
-    "記号,文字,*,*": {
-        POS: PUNCT
-    },  # this is for Greek and Latin characters used as sumbols, as in math
-    "感動詞,フィラー,*,*": {POS: INTJ},
-    "感動詞,一般,*,*": {POS: INTJ},
-    # this is specifically for unicode full-width space
-    "空白,*,*,*": {POS: X},
-    # This is used when sequential half-width spaces are present
+    "記号-一般": {POS: NOUN},  # this includes characters used to represent sounds like ドレミ
+    "記号-文字": {
+        POS: NOUN
+    },  # this is for Greek and Latin characters having some meanings, or used as symbols, as in math
+    "感動詞-フィラー": {POS: INTJ},
+    "感動詞-一般": {POS: INTJ},
     "空白": {POS: SPACE},
-    "形状詞,一般,*,*": {POS: ADJ},
+    "形状詞-一般": {POS: ADJ},
-    "形状詞,タリ,*,*": {POS: ADJ},
+    "形状詞-タリ": {POS: ADJ},
-    "形状詞,助動詞語幹,*,*": {POS: ADJ},
+    "形状詞-助動詞語幹": {POS: AUX},
-    "形容詞,一般,*,*": {POS: ADJ},
+    "形容詞-一般": {POS: ADJ},
-    "形容詞,非自立可能,*,*": {POS: AUX},  # XXX ADJ if alone, AUX otherwise
+    "形容詞-非自立可能": {POS: ADJ},  # XXX ADJ if alone, AUX otherwise
-    "助詞,格助詞,*,*": {POS: ADP},
+    "助詞-格助詞": {POS: ADP},
-    "助詞,係助詞,*,*": {POS: ADP},
+    "助詞-係助詞": {POS: ADP},
-    "助詞,終助詞,*,*": {POS: PART},
+    "助詞-終助詞": {POS: PART},
-    "助詞,準体助詞,*,*": {POS: SCONJ},  # の as in 走るのが速い
+    "助詞-準体助詞": {POS: SCONJ},  # の as in 走るのが速い
-    "助詞,接続助詞,*,*": {POS: SCONJ},  # verb ending て
+    "助詞-接続助詞": {POS: SCONJ},  # verb ending て0
-    "助詞,副助詞,*,*": {POS: PART},  # ばかり, つつ after a verb
+    "助詞-副助詞": {POS: ADP},  # ばかり, つつ after a verb
-    "助動詞,*,*,*": {POS: AUX},
+    "助動詞": {POS: AUX},
-    "接続詞,*,*,*": {POS: SCONJ},  # XXX: might need refinement
+    "接続詞": {POS: CCONJ},  # XXX: might need refinement
-    "接頭辞,*,*,*": {POS: NOUN},
+    "接頭辞": {POS: NOUN},
-    "接尾辞,形状詞的,*,*": {POS: ADJ},  # がち, チック
+    "接尾辞-形状詞的": {POS: PART},  # がち, チック
-    "接尾辞,形容詞的,*,*": {POS: ADJ},  # -らしい
+    "接尾辞-形容詞的": {POS: AUX},  # -らしい
-    "接尾辞,動詞的,*,*": {POS: NOUN},  # -じみ
+    "接尾辞-動詞的": {POS: PART},  # -じみ
-    "接尾辞,名詞的,サ変可能,*": {POS: NOUN},  # XXX see 名詞,普通名詞,サ変可能,*
+    "接尾辞-名詞的-サ変可能": {POS: NOUN},  # XXX see 名詞,普通名詞,サ変可能,*
-    "接尾辞,名詞的,一般,*": {POS: NOUN},
+    "接尾辞-名詞的-一般": {POS: NOUN},
-    "接尾辞,名詞的,助数詞,*": {POS: NOUN},
+    "接尾辞-名詞的-助数詞": {POS: NOUN},
-    "接尾辞,名詞的,副詞可能,*": {POS: NOUN},  # -後, -過ぎ
+    "接尾辞-名詞的-副詞可能": {POS: NOUN},  # -後, -過ぎ
-    "代名詞,*,*,*": {POS: PRON},
+    "代名詞": {POS: PRON},
-    "動詞,一般,*,*": {POS: VERB},
+    "動詞-一般": {POS: VERB},
-    "動詞,非自立可能,*,*": {POS: VERB},  # XXX VERB if alone, AUX otherwise
-    "動詞,非自立可能,*,*,AUX": {POS: AUX},
-    "動詞,非自立可能,*,*,VERB": {POS: VERB},
+    "動詞-非自立可能": {POS: AUX},  # XXX VERB if alone, AUX otherwise
-    "副詞,*,*,*": {POS: ADV},
+    "副詞": {POS: ADV},
-    "補助記号,AA,一般,*": {POS: SYM},  # text art
+    "補助記号-AA-一般": {POS: SYM},  # text art
-    "補助記号,AA,顔文字,*": {POS: SYM},  # kaomoji
+    "補助記号-AA-顔文字": {POS: PUNCT},  # kaomoji
-    "補助記号,一般,*,*": {POS: SYM},
+    "補助記号-一般": {POS: SYM},
-    "補助記号,括弧開,*,*": {POS: PUNCT},  # open bracket
+    "補助記号-括弧開": {POS: PUNCT},  # open bracket
-    "補助記号,括弧閉,*,*": {POS: PUNCT},  # close bracket
+    "補助記号-括弧閉": {POS: PUNCT},  # close bracket
-    "補助記号,句点,*,*": {POS: PUNCT},  # period or other EOS marker
+    "補助記号-句点": {POS: PUNCT},  # period or other EOS marker
-    "補助記号,読点,*,*": {POS: PUNCT},  # comma
+    "補助記号-読点": {POS: PUNCT},  # comma
-    "名詞,固有名詞,一般,*": {POS: PROPN},  # general proper noun
+    "名詞-固有名詞-一般": {POS: PROPN},  # general proper noun
-    "名詞,固有名詞,人名,一般": {POS: PROPN},  # person's name
+    "名詞-固有名詞-人名-一般": {POS: PROPN},  # person's name
-    "名詞,固有名詞,人名,姓": {POS: PROPN},  # surname
+    "名詞-固有名詞-人名-姓": {POS: PROPN},  # surname
-    "名詞,固有名詞,人名,名": {POS: PROPN},  # first name
+    "名詞-固有名詞-人名-名": {POS: PROPN},  # first name
-    "名詞,固有名詞,地名,一般": {POS: PROPN},  # place name
+    "名詞-固有名詞-地名-一般": {POS: PROPN},  # place name
-    "名詞,固有名詞,地名,国": {POS: PROPN},  # country name
+    "名詞-固有名詞-地名-国": {POS: PROPN},  # country name
-    "名詞,助動詞語幹,*,*": {POS: AUX},
+    "名詞-助動詞語幹": {POS: AUX},
-    "名詞,数詞,*,*": {POS: NUM},  # includes Chinese numerals
+    "名詞-数詞": {POS: NUM},  # includes Chinese numerals
-    "名詞,普通名詞,サ変可能,*": {POS: NOUN},  # XXX: sometimes VERB in UDv2; suru-verb noun
-    "名詞,普通名詞,サ変可能,*,NOUN": {POS: NOUN},
-    "名詞,普通名詞,サ変可能,*,VERB": {POS: VERB},
+    "名詞-普通名詞-サ変可能": {POS: NOUN},  # XXX: sometimes VERB in UDv2; suru-verb noun
-    "名詞,普通名詞,サ変形状詞可能,*": {POS: NOUN},  # ex: 下手
+    "名詞-普通名詞-サ変形状詞可能": {POS: NOUN},
-    "名詞,普通名詞,一般,*": {POS: NOUN},
+    "名詞-普通名詞-一般": {POS: NOUN},
-    "名詞,普通名詞,形状詞可能,*": {POS: NOUN},  # XXX: sometimes ADJ in UDv2
-    "名詞,普通名詞,形状詞可能,*,NOUN": {POS: NOUN},
-    "名詞,普通名詞,形状詞可能,*,ADJ": {POS: ADJ},
+    "名詞-普通名詞-形状詞可能": {POS: NOUN},  # XXX: sometimes ADJ in UDv2
-    "名詞,普通名詞,助数詞可能,*": {POS: NOUN},  # counter / unit
+    "名詞-普通名詞-助数詞可能": {POS: NOUN},  # counter / unit
-    "名詞,普通名詞,副詞可能,*": {POS: NOUN},
+    "名詞-普通名詞-副詞可能": {POS: NOUN},
-    "連体詞,*,*,*": {POS: ADJ},  # XXX this has exceptions based on literal token
-    "連体詞,*,*,*,ADJ": {POS: ADJ},
-    "連体詞,*,*,*,PRON": {POS: PRON},
-    "連体詞,*,*,*,DET": {POS: DET},
+    "連体詞": {POS: DET},  # XXX this has exceptions based on literal token
+    # GSD tags. These aren't in Unidic, but we need them for the GSD data.
+    "外国語": {POS: PROPN},  # Foreign words
+    "絵文字・記号等": {POS: SYM},  # emoji / kaomoji ^^;
 }
spacy/lang/ja/tag_orth_map.py (new file, 22 lines)
@ -0,0 +1,22 @@
from ...symbols import DET, PART, PRON, SPACE, X

# mapping from tag and token orth to POS
TAG_ORTH_MAP = {
    "空白": {" ": SPACE, "　": X},
    "助詞-副助詞": {"たり": PART},
    "連体詞": {
        "あの": DET,
        "かの": DET,
        "この": DET,
        "その": DET,
        "どの": DET,
        "彼の": DET,
        "此の": DET,
        "其の": DET,
        "ある": PRON,
        "こんな": PRON,
        "そんな": PRON,
        "どんな": PRON,
        "あらゆる": PRON,
    },
}
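This table appears to be consulted before the bigram logic in resolve_pos: when a tag has an entry for the token's exact orth, that POS wins outright. A small hedged check:

    # Assumes spaCy with this module installed
    from spacy.symbols import DET, PRON
    from spacy.lang.ja.tag_orth_map import TAG_ORTH_MAP

    # 連体詞 splits by surface form: demonstratives like この are DET,
    # while forms like あらゆる map to PRON
    assert TAG_ORTH_MAP["連体詞"]["この"] == DET
    assert TAG_ORTH_MAP["連体詞"]["あらゆる"] == PRON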
@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 """
 Example sentences to test spaCy and its language models.
@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from .stop_words import STOP_WORDS
 
 from ...language import Language
@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 """
 Example sentences to test spaCy and its language models.
@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...attrs import LIKE_NUM
@ -1,7 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-
 STOP_WORDS = set(
     """
 അത്
@ -24,17 +24,15 @@ def noun_chunks(doclike):
     np_deps = [doc.vocab.strings[label] for label in labels]
     conj = doc.vocab.strings.add("conj")
     np_label = doc.vocab.strings.add("NP")
-    seen = set()
+    prev_end = -1
     for i, word in enumerate(doclike):
         if word.pos not in (NOUN, PROPN, PRON):
             continue
         # Prevent nested chunks from being produced
-        if word.i in seen:
+        if word.left_edge.i <= prev_end:
             continue
         if word.dep in np_deps:
-            if any(w.i in seen for w in word.subtree):
-                continue
-            seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+            prev_end = word.right_edge.i
             yield word.left_edge.i, word.right_edge.i + 1, np_label
         elif word.dep == conj:
             head = word.head
@ -42,9 +40,7 @@ def noun_chunks(doclike):
                 head = head.head
             # If the head is an NP, and we're coordinated to it, we're an NP
             if head.dep in np_deps:
-                if any(w.i in seen for w in word.subtree):
-                    continue
-                seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+                prev_end = word.right_edge.i
                 yield word.left_edge.i, word.right_edge.i + 1, np_label
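The prev_end rewrite leans on the fact that candidates arrive in document order: remembering just the right edge of the last emitted chunk is enough to reject any overlapping or nested candidate, without maintaining a set of every covered index. A toy model of the check, with (left_edge, right_edge) intervals standing in for subtrees:

    # Toy model: chunks arrive sorted by position; a candidate is kept only
    # if its left edge lies strictly after the previous chunk's right edge.
    def filter_chunks(candidates):
        prev_end = -1
        for left, right in candidates:   # sorted by position, like doc order
            if left <= prev_end:
                continue                 # overlaps the last chunk -> skip
            prev_end = right
            yield left, right

    assert list(filter_chunks([(0, 3), (2, 2), (4, 6)])) == [(0, 3), (4, 6)]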
@ -1,103 +1,75 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 from ...lemmatizer import Lemmatizer
 from ...parts_of_speech import NAMES
 
 
 class PolishLemmatizer(Lemmatizer):
-    # This lemmatizer implements lookup lemmatization based on
-    # the Morfeusz dictionary (morfeusz.sgjp.pl/en) by Institute of Computer Science PAS
-    # It utilizes some prefix based improvements for
-    # verb and adjectives lemmatization, as well as case-sensitive
-    # lemmatization for nouns
-    def __init__(self, lookups, *args, **kwargs):
-        # this lemmatizer is lookup based, so it does not require an index, exceptionlist, or rules
-        super().__init__(lookups)
-        self.lemma_lookups = {}
-        for tag in [
-            "ADJ",
-            "ADP",
-            "ADV",
-            "AUX",
-            "NOUN",
-            "NUM",
-            "PART",
-            "PRON",
-            "VERB",
-            "X",
-        ]:
-            self.lemma_lookups[tag] = self.lookups.get_table(
-                "lemma_lookup_" + tag.lower(), {}
-            )
-        self.lemma_lookups["DET"] = self.lemma_lookups["X"]
-        self.lemma_lookups["PROPN"] = self.lemma_lookups["NOUN"]
+    # This lemmatizer implements lookup lemmatization based on the Morfeusz
+    # dictionary (morfeusz.sgjp.pl/en) by Institute of Computer Science PAS.
+    # It utilizes some prefix based improvements for verb and adjectives
+    # lemmatization, as well as case-sensitive lemmatization for nouns.
 
     def __call__(self, string, univ_pos, morphology=None):
         if isinstance(univ_pos, int):
             univ_pos = NAMES.get(univ_pos, "X")
         univ_pos = univ_pos.upper()
 
+        lookup_pos = univ_pos.lower()
+        if univ_pos == "PROPN":
+            lookup_pos = "noun"
+        lookup_table = self.lookups.get_table("lemma_lookup_" + lookup_pos, {})
+
         if univ_pos == "NOUN":
-            return self.lemmatize_noun(string, morphology)
+            return self.lemmatize_noun(string, morphology, lookup_table)
 
         if univ_pos != "PROPN":
             string = string.lower()
 
         if univ_pos == "ADJ":
-            return self.lemmatize_adj(string, morphology)
+            return self.lemmatize_adj(string, morphology, lookup_table)
         elif univ_pos == "VERB":
-            return self.lemmatize_verb(string, morphology)
+            return self.lemmatize_verb(string, morphology, lookup_table)
 
-        lemma_dict = self.lemma_lookups.get(univ_pos, {})
-        return [lemma_dict.get(string, string.lower())]
+        return [lookup_table.get(string, string.lower())]
 
-    def lemmatize_adj(self, string, morphology):
+    def lemmatize_adj(self, string, morphology, lookup_table):
         # this method utilizes different procedures for adjectives
         # with 'nie' and 'naj' prefixes
-        lemma_dict = self.lemma_lookups["ADJ"]
-
         if string[:3] == "nie":
             search_string = string[3:]
             if search_string[:3] == "naj":
                 naj_search_string = search_string[3:]
-                if naj_search_string in lemma_dict:
-                    return [lemma_dict[naj_search_string]]
+                if naj_search_string in lookup_table:
+                    return [lookup_table[naj_search_string]]
-            if search_string in lemma_dict:
-                return [lemma_dict[search_string]]
+            if search_string in lookup_table:
+                return [lookup_table[search_string]]
 
         if string[:3] == "naj":
             naj_search_string = string[3:]
-            if naj_search_string in lemma_dict:
-                return [lemma_dict[naj_search_string]]
+            if naj_search_string in lookup_table:
+                return [lookup_table[naj_search_string]]
 
-        return [lemma_dict.get(string, string)]
+        return [lookup_table.get(string, string)]
 
-    def lemmatize_verb(self, string, morphology):
+    def lemmatize_verb(self, string, morphology, lookup_table):
         # this method utilizes a different procedure for verbs
         # with 'nie' prefix
-        lemma_dict = self.lemma_lookups["VERB"]
-
         if string[:3] == "nie":
             search_string = string[3:]
-            if search_string in lemma_dict:
-                return [lemma_dict[search_string]]
+            if search_string in lookup_table:
+                return [lookup_table[search_string]]
 
-        return [lemma_dict.get(string, string)]
+        return [lookup_table.get(string, string)]
 
-    def lemmatize_noun(self, string, morphology):
+    def lemmatize_noun(self, string, morphology, lookup_table):
         # this method is case-sensitive, in order to work
         # for incorrectly tagged proper names
-        lemma_dict = self.lemma_lookups["NOUN"]
-
         if string != string.lower():
-            if string.lower() in lemma_dict:
-                return [lemma_dict[string.lower()]]
-            elif string in lemma_dict:
-                return [lemma_dict[string]]
+            if string.lower() in lookup_table:
+                return [lookup_table[string.lower()]]
+            elif string in lookup_table:
+                return [lookup_table[string]]
             return [string.lower()]
 
-        return [lemma_dict.get(string, string)]
+        return [lookup_table.get(string, string)]
 
     def lookup(self, string, orth=None):
         return string.lower()
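The refactor swaps the per-tag table cache for a single lookup_table fetched per call, but the prefix logic is unchanged: strip negating nie-, then superlative naj-, and retry the lookup. A toy demonstration with a made-up one-entry table (the real tables are spaCy lookups named lemma_lookup_adj and so on):

    # Made-up table for illustration; real data comes from spaCy's lookups
    toy_table = {"lepszy": "dobry"}

    def lemmatize_adj_sketch(string, lookup_table):
        if string[:3] == "nie":
            search = string[3:]
            if search[:3] == "naj" and search[3:] in lookup_table:
                return [lookup_table[search[3:]]]
            if search in lookup_table:
                return [lookup_table[search]]
        if string[:3] == "naj" and string[3:] in lookup_table:
            return [lookup_table[string[3:]]]
        return [lookup_table.get(string, string)]

    # nie + naj + lepszy -> look up "lepszy" after stripping both prefixes
    assert lemmatize_adj_sketch("nienajlepszy", toy_table) == ["dobry"]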
@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 from ...attrs import LIKE_NUM
@ -25,17 +25,15 @@ def noun_chunks(doclike):
     np_deps = [doc.vocab.strings[label] for label in labels]
     conj = doc.vocab.strings.add("conj")
     np_label = doc.vocab.strings.add("NP")
-    seen = set()
+    prev_end = -1
     for i, word in enumerate(doclike):
         if word.pos not in (NOUN, PROPN, PRON):
             continue
         # Prevent nested chunks from being produced
-        if word.i in seen:
+        if word.left_edge.i <= prev_end:
             continue
         if word.dep in np_deps:
-            if any(w.i in seen for w in word.subtree):
-                continue
-            seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+            prev_end = word.right_edge.i
             yield word.left_edge.i, word.right_edge.i + 1, np_label
         elif word.dep == conj:
             head = word.head
@ -43,9 +41,7 @@ def noun_chunks(doclike):
                 head = head.head
             # If the head is an NP, and we're coordinated to it, we're an NP
             if head.dep in np_deps:
-                if any(w.i in seen for w in word.subtree):
-                    continue
-                seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+                prev_end = word.right_edge.i
                 yield word.left_edge.i, word.right_edge.i + 1, np_label
@ -14,4 +14,9 @@ sentences = [
     "இந்த ஃபோனுடன் சுமார் ரூ.2,990 மதிப்புள்ள போட் ராக்கர்ஸ் நிறுவனத்தின் ஸ்போர்ட் புளூடூத் ஹெட்போன்ஸ்  இலவசமாக வழங்கப்படவுள்ளது.",
     "மட்டக்களப்பில் பல இடங்களில் வீட்டுத் திட்டங்களுக்கு இன்று அடிக்கல் நாட்டல்",
     "ஐ போன்க்கு முகத்தை வைத்து அன்லாக் செய்யும் முறை மற்றும்  விரலால் தொட்டு அன்லாக் செய்யும் முறையை வாட்ஸ் ஆப் நிறுவனம் இதற்கு முன் கண்டுபிடித்தது",
+    "இது ஒரு வாக்கியம்.",
+    "ஆப்பிள் நிறுவனம் யு.கே. தொடக்க நிறுவனத்தை ஒரு லட்சம் கோடிக்கு வாங்கப் பார்க்கிறது",
+    "தன்னாட்சி கார்கள் காப்பீட்டு பொறுப்பை உற்பத்தியாளரிடம் மாற்றுகின்றன",
+    "நடைபாதை விநியோக ரோபோக்களை தடை செய்வதை சான் பிரான்சிஸ்கோ கருதுகிறது",
+    "லண்டன் ஐக்கிய இராச்சியத்தில் ஒரு பெரிய நகரம்.",
 ]
@ -55,7 +55,8 @@ URL_PATTERN = (
     # fmt: on
 ).strip()
 
-TOKEN_MATCH = re.compile("(?u)" + URL_PATTERN).match
+TOKEN_MATCH = None
+URL_MATCH = re.compile("(?u)" + URL_PATTERN).match
 
 
 BASE_EXCEPTIONS = {}
| 
						 | 
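
Note the split above: `TOKEN_MATCH` now defaults to `None` and the compiled URL regex moves to a new `URL_MATCH`. A small sketch of the shape of that object (`URL_PATTERN` here is a short stand-in; the real pattern is much longer):

import re

URL_PATTERN = r"https?://\S+"                     # stand-in pattern
URL_MATCH = re.compile("(?u)" + URL_PATTERN).match

assert URL_MATCH("https://spacy.io") is not None  # usable as a predicate
assert URL_MATCH("hello") is None
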
@@ -1,5 +1,5 @@
 from ...symbols import POS, PUNCT, ADJ, SCONJ, CCONJ, NUM, DET, ADV, ADP, X
-from ...symbols import NOUN, PART, INTJ, PRON, VERB, SPACE
+from ...symbols import NOUN, PART, INTJ, PRON, VERB, SPACE, PROPN
 
 # The Chinese part-of-speech tagger uses the OntoNotes 5 version of the Penn
 # Treebank tag set. We also map the tags to the simpler Universal Dependencies
@@ -25,7 +25,7 @@ TAG_MAP = {
     "URL": {POS: X},
     "INF": {POS: X},
     "NN": {POS: NOUN},
-    "NR": {POS: NOUN},
+    "NR": {POS: PROPN},
     "NT": {POS: NOUN},
     "VA": {POS: VERB},
     "VC": {POS: VERB},
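
The `NR` remapping is the substantive change here: OntoNotes `NR` marks proper nouns, which correspond to Universal Dependencies `PROPN` rather than `NOUN`. A simplified stand-in for how such a tag map entry is consulted (string keys instead of spaCy's symbol constants):

TAG_MAP = {"NN": {"POS": "NOUN"}, "NR": {"POS": "PROPN"}}

def coarse_pos(fine_tag):
    # unknown tags fall back to X, mirroring the table's catch-all entries
    return TAG_MAP.get(fine_tag, {}).get("POS", "X")

assert coarse_pos("NR") == "PROPN"
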
@@ -25,7 +25,7 @@ from .util import link_vectors_to_models, create_default_optimizer, registry
 from .attrs import IS_STOP, LANG, NORM
 from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
 from .lang.punctuation import TOKENIZER_INFIXES
-from .lang.tokenizer_exceptions import TOKEN_MATCH
+from .lang.tokenizer_exceptions import TOKEN_MATCH, URL_MATCH
 from .lang.norm_exceptions import BASE_NORMS
 from .lang.tag_map import TAG_MAP
 from .tokens import Doc
@@ -86,6 +86,7 @@ class BaseDefaults(object):
     def create_tokenizer(cls, nlp=None):
         rules = cls.tokenizer_exceptions
         token_match = cls.token_match
+        url_match = cls.url_match
         prefix_search = (
             util.compile_prefix_regex(cls.prefixes).search if cls.prefixes else None
         )
@@ -103,10 +104,12 @@ class BaseDefaults(object):
             suffix_search=suffix_search,
             infix_finditer=infix_finditer,
             token_match=token_match,
+            url_match=url_match,
         )
 
     pipe_names = ["tagger", "parser", "ner"]
     token_match = TOKEN_MATCH
+    url_match = URL_MATCH
     prefixes = tuple(TOKENIZER_PREFIXES)
     suffixes = tuple(TOKENIZER_SUFFIXES)
     infixes = tuple(TOKENIZER_INFIXES)
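
The intent, as suggested by the v2.3-era tokenizer docs, is that `token_match` is checked on raw candidates while `url_match` is retried after prefixes and suffixes are stripped, so a URL followed by punctuation still comes out clean. A crude, hypothetical analogy of that control flow (not the real tokenizer):

import re

URL_RE = re.compile(r"(?u)^https?://\S+$|^\w+(\.\w{2,})+$")  # toy url_match

def toy_tokenize(text):
    for chunk in text.split():
        suffixes = []
        while chunk and chunk[-1] in ",.!?":
            if URL_RE.match(chunk):       # url_match retried after each strip
                break
            suffixes.insert(0, chunk[-1])
            chunk = chunk[:-1]
        yield chunk
        yield from suffixes

assert list(toy_tokenize("see spacy.io, thanks")) == ["see", "spacy.io", ",", "thanks"]
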
@@ -954,9 +957,7 @@ class Language(object):
         serializers["tokenizer"] = lambda p: self.tokenizer.to_disk(
             p, exclude=["vocab"]
         )
-        serializers["meta.json"] = lambda p: p.open("w").write(
-            srsly.json_dumps(self.meta)
-        )
+        serializers["meta.json"] = lambda p: srsly.write_json(p, self.meta)
         serializers["config.cfg"] = lambda p: self.config.to_disk(p)
         for name, proc in self.pipeline:
             if not hasattr(proc, "name"):
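
For reference, the two sides of that one-line change do the same thing; `srsly.write_json` simply owns the file handling. A quick sketch (the temp path is illustrative):

import srsly

meta = {"lang": "en", "pipeline": ["tagger", "parser", "ner"]}
srsly.write_json("/tmp/meta.json", meta)  # replaces p.open("w").write(srsly.json_dumps(meta))
assert srsly.read_json("/tmp/meta.json") == meta
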
@@ -980,17 +981,30 @@ class Language(object):
 
         DOCS: https://spacy.io/api/language#from_disk
         """
+
+        def deserialize_meta(path):
+            if path.exists():
+                data = srsly.read_json(path)
+                self.meta.update(data)
+                # self.meta always overrides meta["vectors"] with the metadata
+                # from self.vocab.vectors, so set the name directly
+                self.vocab.vectors.name = data.get("vectors", {}).get("name")
+
+        def deserialize_vocab(path):
+            if path.exists():
+                self.vocab.from_disk(path)
+            _fix_pretrained_vectors_name(self)
+
         if disable is not None:
             warnings.warn(Warnings.W014, DeprecationWarning)
             exclude = disable
         path = util.ensure_path(path)
 
         deserializers = {}
         if Path(path / "config.cfg").exists():
             deserializers["config.cfg"] = lambda p: self.config.from_disk(p)
-        deserializers["meta.json"] = lambda p: self.meta.update(srsly.read_json(p))
-        deserializers["vocab"] = lambda p: self.vocab.from_disk(
-            p
-        ) and _fix_pretrained_vectors_name(self)
+        deserializers["meta.json"] = deserialize_meta
+        deserializers["vocab"] = deserialize_vocab
         deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(
             p, exclude=["vocab"]
         )
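
One reason to prefer the named helpers over the old `x(p) and y()` one-liners, beyond the new `path.exists()` guard and the vectors-name handling: chaining with `and` silently skips the second call whenever the first returns a falsy value. A minimal illustration of that pitfall:

calls = []

def load():              # stands in for a from_disk-style loader
    calls.append("load")
    return None          # loaders that return None are falsy

def fix_up():            # stands in for _fix_pretrained_vectors_name
    calls.append("fix")

load() and fix_up()      # fix_up() never runs
assert calls == ["load"]
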
@@ -1044,15 +1058,25 @@ class Language(object):
 
         DOCS: https://spacy.io/api/language#from_bytes
         """
+
+        def deserialize_meta(b):
+            data = srsly.json_loads(b)
+            self.meta.update(data)
+            # self.meta always overrides meta["vectors"] with the metadata
+            # from self.vocab.vectors, so set the name directly
+            self.vocab.vectors.name = data.get("vectors", {}).get("name")
+
+        def deserialize_vocab(b):
+            self.vocab.from_bytes(b)
+            _fix_pretrained_vectors_name(self)
+
         if disable is not None:
             warnings.warn(Warnings.W014, DeprecationWarning)
             exclude = disable
         deserializers = {}
         deserializers["config.cfg"] = lambda b: self.config.from_bytes(b)
-        deserializers["meta.json"] = lambda b: self.meta.update(srsly.json_loads(b))
-        deserializers["vocab"] = lambda b: self.vocab.from_bytes(
-            b
-        ) and _fix_pretrained_vectors_name(self)
+        deserializers["meta.json"] = deserialize_meta
+        deserializers["vocab"] = deserialize_vocab
         deserializers["tokenizer"] = lambda b: self.tokenizer.from_bytes(
             b, exclude=["vocab"]
         )
@@ -1135,7 +1159,7 @@ class component(object):
 def _fix_pretrained_vectors_name(nlp):
     # TODO: Replace this once we handle vectors consistently as static
     # data
-    if "vectors" in nlp.meta and nlp.meta["vectors"].get("name"):
+    if "vectors" in nlp.meta and "name" in nlp.meta["vectors"]:
         nlp.vocab.vectors.name = nlp.meta["vectors"]["name"]
     elif not nlp.vocab.vectors.size:
         nlp.vocab.vectors.name = None
@@ -1145,7 +1169,7 @@ def _fix_pretrained_vectors_name(nlp):
     else:
         raise ValueError(Errors.E092)
     if nlp.vocab.vectors.size != 0:
-        link_vectors_to_models(nlp.vocab, skip_rank=True)
+        link_vectors_to_models(nlp.vocab)
     for name, proc in nlp.pipeline:
         if not hasattr(proc, "cfg"):
             continue
@@ -1,4 +1,3 @@
-from .symbols import NOUN, VERB, ADJ, PUNCT, PROPN
 from .errors import Errors
 from .lookups import Lookups
 from .parts_of_speech import NAMES as UPOS_NAMES
@@ -9,7 +9,6 @@ import numpy
 from thinc.api import get_array_module
 import warnings
 
-from libc.stdint cimport UINT64_MAX
 from .typedefs cimport attr_t, flags_t
 from .attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
 from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
@@ -20,7 +19,7 @@ from .attrs import intify_attrs
 from .errors import Errors, Warnings
 
 
-OOV_RANK = UINT64_MAX
+OOV_RANK = 0xffffffffffffffff  # UINT64_MAX
 memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
 EMPTY_LEXEME.id = OOV_RANK
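
Replacing the cimport with a literal is safe because the value is fixed; a quick check that the hex literal really is UINT64_MAX:

# 16 hex digits x 4 bits = 64 bits, all set
assert 0xffffffffffffffff == 2**64 - 1 == 18446744073709551615
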
@@ -328,7 +328,7 @@ def unpickle_matcher(vocab, docs, callbacks, attr):
     matcher = PhraseMatcher(vocab, attr=attr)
     for key, specs in docs.items():
         callback = callbacks.get(key, None)
-        matcher.add(key, callback, *specs)
+        matcher.add(key, specs, on_match=callback)
     return matcher
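
This is the keyword-style `PhraseMatcher.add` signature documented for spaCy v2.3+: patterns are passed as a list, the callback by keyword. A usage sketch:

from spacy.lang.en import English
from spacy.matcher import PhraseMatcher

nlp = English()
matcher = PhraseMatcher(nlp.vocab)
patterns = [nlp.make_doc("machine learning")]
matcher.add("ML", patterns, on_match=None)   # old order was add("ML", None, *patterns)
matches = matcher(nlp.make_doc("I like machine learning"))
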
@@ -1 +1 @@
-from .models import *
+from .models import *  # noqa: F401, F403
@@ -1,11 +1,8 @@
 """Thinc layer to do simpler transition-based parsing, NER, etc."""
-from typing import List, Tuple, Dict, Optional
+from typing import Dict, Optional
 import numpy
-from thinc.api import Ops, Model, with_array, softmax_activation, padded2list
-from thinc.api import to_numpy
-from thinc.types import Padded, Ints1d, Ints3d, Floats2d, Floats3d
-
-from ..tokens import Doc
+from thinc.api import Model
+from thinc.types import Padded, Floats3d
 
 
 def BILUO() -> Model[Padded, Padded]:
@@ -1,9 +1,7 @@
 """Thinc layer to do simpler transition-based parsing, NER, etc."""
-from typing import List, Tuple, Dict, Optional
-from thinc.api import Ops, Model, with_array, softmax_activation, padded2list
-from thinc.types import Padded, Ints1d, Ints3d, Floats2d, Floats3d
-
-from ..tokens import Doc
+from typing import Dict, Optional
+from thinc.api import Ops, Model
+from thinc.types import Padded, Floats3d
 
 
 def IOB() -> Model[Padded, Padded]:
@@ -1,6 +1,6 @@
 from .entity_linker import *  # noqa
 from .parser import *  # noqa
-from .simple_ner import *
+from .simple_ner import *  # noqa
 from .tagger import *  # noqa
 from .textcat import *  # noqa
 from .tok2vec import *  # noqa
@@ -1,17 +1,8 @@
-import functools
-from typing import List, Tuple, Dict, Optional
-from thinc.api import (
-    Ops,
-    Model,
-    Linear,
-    Softmax,
-    with_array,
-    softmax_activation,
-    padded2list,
-)
+from typing import List
+from thinc.api import Model, Linear, with_array, softmax_activation, padded2list
 from thinc.api import chain, list2padded, configure_normal_init
 from thinc.api import Dropout
-from thinc.types import Padded, Ints1d, Ints3d, Floats2d, Floats3d
+from thinc.types import Floats2d
 
 from ...tokens import Doc
 from .._biluo import BILUO
@@ -1,5 +1,4 @@
-from thinc.api import zero_init, with_array, Softmax, chain, Model, Dropout
-from thinc.api import glorot_uniform_init
+from thinc.api import zero_init, with_array, Softmax, chain, Model
 
 from ...util import registry
@@ -28,7 +28,7 @@ from thinc.api import (
 
 from ..spacy_vectors import SpacyVectors
 from ... import util
-from ...attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE, LOWER
+from ...attrs import ID, ORTH, PREFIX, SUFFIX, SHAPE, LOWER
 from ...util import registry
 from ..extract_ngrams import extract_ngrams
@@ -66,6 +66,9 @@ cdef class Morphology:
         self.tags = PreshMap()
         # Add special space symbol. We prefix with underscore, to make sure it
         # always sorts to the end.
-        space_attrs = tag_map.get('SP', {POS: SPACE})
+        if '_SP' in tag_map:
+            space_attrs = tag_map.get('_SP')
+        else:
+            space_attrs = tag_map.get('SP', {POS: SPACE})
         if '_SP' not in tag_map:
             self.strings.add('_SP')
@@ -188,7 +188,7 @@ class Pipe(object):
         serialize = {}
         serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
         serialize["vocab"] = lambda p: self.vocab.to_disk(p)
-        serialize["model"] = lambda p: p.open("wb").write(self.model.to_bytes())
+        serialize["model"] = lambda p: self.model.to_disk(p)
         exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
         util.to_disk(path, serialize, exclude)
@@ -350,6 +350,8 @@ class Tagger(Pipe):
         lemma_tables = ["lemma_rules", "lemma_index", "lemma_exc", "lemma_lookup"]
         if not any(table in self.vocab.lookups for table in lemma_tables):
             warnings.warn(Warnings.W022)
+        if len(self.vocab.lookups.get_table("lexeme_norm", {})) == 0:
+            warnings.warn(Warnings.W033.format(model="part-of-speech tagger"))
         orig_tag_map = dict(self.vocab.morphology.tag_map)
         new_tag_map = {}
         for example in get_examples():
@@ -366,6 +368,8 @@ class Tagger(Pipe):
 
         cdef Vocab vocab = self.vocab
         if new_tag_map:
+            if "_SP" in orig_tag_map:
+                new_tag_map["_SP"] = orig_tag_map["_SP"]
             vocab.morphology = Morphology(vocab.strings, new_tag_map,
                                           vocab.morphology.lemmatizer,
                                           exc=vocab.morphology.exc)
@@ -456,7 +460,7 @@ class Tagger(Pipe):
         serialize = {
             "vocab": lambda p: self.vocab.to_disk(p),
             "tag_map": lambda p: srsly.write_msgpack(p, tag_map),
-            "model": lambda p: p.open("wb").write(self.model.to_bytes()),
+            "model": lambda p: self.model.to_disk(p),
             "cfg": lambda p: srsly.write_json(p, self.cfg),
         }
         exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
@@ -1073,6 +1077,8 @@ class EntityLinker(Pipe):
             raise ValueError(Errors.E990.format(type=type(self.kb)))
         self.cfg = dict(cfg)
         self.distance = CosineDistance(normalize=False)
+        # how many neighbour sentences to take into account
+        self.n_sents = cfg.get("n_sents", 0)
 
     def require_kb(self):
         # Raise an error if the knowledge base is not initialized.
@@ -1106,15 +1112,30 @@ class EntityLinker(Pipe):
             predictions = self.model.predict(docs)
 
         for eg in examples:
+            sentences = [s for s in eg.predicted.sents]
             kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
             for ent in eg.predicted.ents:
                 kb_id = kb_ids[ent.start]  # KB ID of the first token is the same as the whole span
                 if kb_id:
                     try:
-                        sentence_docs.append(ent.sent.as_doc())
+                        # find the sentence in the list of sentences.
+                        sent_index = sentences.index(ent.sent)
                     except AttributeError:
                         # Catch the exception when ent.sent is None and provide a user-friendly warning
                         raise RuntimeError(Errors.E030)
+                    # get n previous sentences, if there are any
+                    start_sentence = max(0, sent_index - self.n_sents)
+
+                    # get n following sentences, or as many as are left
+                    end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
+
+                    # get token positions
+                    start_token = sentences[start_sentence].start
+                    end_token = sentences[end_sentence].end
+
+                    # append that span as a doc to training
+                    sent_doc = eg.predicted[start_token:end_token].as_doc()
+                    sentence_docs.append(sent_doc)
         set_dropout_rate(self.model, drop)
         if not sentence_docs:
             warnings.warn(Warnings.W093.format(name="Entity Linker"))
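
The clamping logic above appears twice (in `update` and, below, in `predict`); a small sketch of the window arithmetic with hypothetical values:

def sentence_window(sent_index, n_sents, n_sentences):
    start = max(0, sent_index - n_sents)               # n previous sentences, if any
    end = min(n_sentences - 1, sent_index + n_sents)   # n following, clipped at the end
    return start, end

assert sentence_window(0, 2, 5) == (0, 2)   # clipped at the start of the doc
assert sentence_window(4, 2, 5) == (2, 4)   # clipped at the end of the doc
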
@@ -1197,18 +1218,28 @@ class EntityLinker(Pipe):
             docs = [docs]
 
         for i, doc in enumerate(docs):
+            sentences = [s for s in doc.sents]
+
             if len(doc) > 0:
                 # Looping through each sentence and each entity
                 # This may go wrong if there are entities across sentences - which shouldn't happen normally.
-                for sent in doc.sents:
-                    sent_doc = sent.as_doc()
+                for sent_index, sent in enumerate(sentences):
+                    if sent.ents:
+                        # get n neighbour sentences, clipped to the length of the document
+                        start_sentence = max(0, sent_index - self.n_sents)
+                        end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
+
+                        start_token = sentences[start_sentence].start
+                        end_token = sentences[end_sentence].end
+
+                        sent_doc = doc[start_token:end_token].as_doc()
                         # currently, the context is the same for each entity in a sentence (should be refined)
                         sentence_encoding = self.model.predict([sent_doc])[0]
                         xp = get_array_module(sentence_encoding)
                         sentence_encoding_t = sentence_encoding.T
                         sentence_norm = xp.linalg.norm(sentence_encoding_t)
 
-                    for ent in sent_doc.ents:
+                        for ent in sent.ents:
                             entity_count += 1
 
                             to_discard = self.cfg.get("labels_discard", [])
@@ -1284,7 +1315,7 @@ class EntityLinker(Pipe):
         serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
         serialize["vocab"] = lambda p: self.vocab.to_disk(p)
         serialize["kb"] = lambda p: self.kb.dump(p)
-        serialize["model"] = lambda p: p.open("wb").write(self.model.to_bytes())
+        serialize["model"] = lambda p: self.model.to_disk(p)
         exclude = util.get_serialization_exclude(serialize, exclude, kwargs)
         util.to_disk(path, serialize, exclude)
@@ -130,7 +130,7 @@ class SimpleNER(Pipe):
 
 def _has_ner(eg):
     for ner_tag in eg.gold.ner:
-        if ner_tag != "-" and ner_tag != None:
+        if ner_tag != "-" and ner_tag is not None:
             return True
     else:
         return False
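
Only the identity comparison changed, but the surrounding `for`/`else` is worth a gloss: the `else` belongs to the loop and runs only when the loop finishes without returning or breaking. A minimal sketch:

def has_any(tags):
    for tag in tags:
        if tag != "-" and tag is not None:
            return True
    else:                       # reached only if no tag triggered the return
        return False

assert has_any(["-", None, "B-ORG"]) is True
assert has_any(["-", None]) is False
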
@@ -1,6 +1,7 @@
 # cython: infer_types=True, cdivision=True, boundscheck=False
 cimport cython.parallel
 cimport numpy as np
+from itertools import islice
 from cpython.ref cimport PyObject, Py_XDECREF
 from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
 from libc.math cimport exp
@@ -394,6 +395,8 @@ cdef class Parser:
 
     def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs):
         self.cfg.update(kwargs)
+        if len(self.vocab.lookups.get_table("lexeme_norm", {})) == 0:
+            warnings.warn(Warnings.W033.format(model="parser or NER"))
         if not hasattr(get_examples, '__call__'):
             gold_tuples = get_examples
             get_examples = lambda: gold_tuples
@@ -137,7 +137,7 @@ def it_tokenizer():
 
 @pytest.fixture(scope="session")
 def ja_tokenizer():
-    pytest.importorskip("fugashi")
+    pytest.importorskip("sudachipy")
     return get_lang_class("ja").Defaults.create_tokenizer()
@@ -429,3 +429,10 @@ def test_retokenize_skip_duplicates(en_vocab):
         retokenizer.merge(doc[0:2])
     assert len(doc) == 2
     assert doc[0].text == "hello world"
+
+
+def test_retokenize_disallow_zero_length(en_vocab):
+    doc = Doc(en_vocab, words=["hello", "world", "!"])
+    with pytest.raises(ValueError):
+        with doc.retokenize() as retokenizer:
+            retokenizer.merge(doc[1:1])
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest

@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest
@@ -43,7 +43,7 @@ def test_en_tokenizer_doesnt_split_apos_exc(en_tokenizer, text):
     assert tokens[0].text == text
 
 
-@pytest.mark.parametrize("text", ["we'll", "You'll", "there'll"])
+@pytest.mark.parametrize("text", ["we'll", "You'll", "there'll", "this'll", "those'll"])
 def test_en_tokenizer_handles_ll_contraction(en_tokenizer, text):
     tokens = en_tokenizer(text)
     assert len(tokens) == 2
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest

@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest

@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest

@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import pytest
 from spacy.lang.hy.lex_attrs import like_num

@@ -1,6 +1,3 @@
-# coding: utf8
-from __future__ import unicode_literals
-
 import pytest

@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest
@@ -3,7 +3,7 @@ import pytest
 
 @pytest.mark.parametrize(
     "word,lemma",
-    [("新しく", "新しい"), ("赤く", "赤い"), ("すごく", "凄い"), ("いただきました", "頂く"), ("なった", "成る")],
+    [("新しく", "新しい"), ("赤く", "赤い"), ("すごく", "すごい"), ("いただきました", "いただく"), ("なった", "なる")],
 )
 def test_ja_lemmatizer_assigns(ja_tokenizer, word, lemma):
     test_lemma = ja_tokenizer(word)[0].lemma_

new file: spacy/tests/lang/ja/test_serialize.py (+33 lines)
@@ -0,0 +1,33 @@
+from spacy.lang.ja import Japanese
+from ...util import make_tempdir
+
+
+def test_ja_tokenizer_serialize(ja_tokenizer):
+    tokenizer_bytes = ja_tokenizer.to_bytes()
+    nlp = Japanese()
+    nlp.tokenizer.from_bytes(tokenizer_bytes)
+    assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+    assert nlp.tokenizer.split_mode is None
+
+    with make_tempdir() as d:
+        file_path = d / "tokenizer"
+        ja_tokenizer.to_disk(file_path)
+        nlp = Japanese()
+        nlp.tokenizer.from_disk(file_path)
+        assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+        assert nlp.tokenizer.split_mode is None
+
+    # split mode is (de)serialized correctly
+    nlp = Japanese(meta={"tokenizer": {"config": {"split_mode": "B"}}})
+    nlp_r = Japanese()
+    nlp_bytes = nlp.to_bytes()
+    nlp_r.from_bytes(nlp_bytes)
+    assert nlp_bytes == nlp_r.to_bytes()
+    assert nlp_r.tokenizer.split_mode == "B"
+
+    with make_tempdir() as d:
+        nlp.to_disk(d)
+        nlp_r = Japanese()
+        nlp_r.from_disk(d)
+        assert nlp_bytes == nlp_r.to_bytes()
+        assert nlp_r.tokenizer.split_mode == "B"
@@ -1,5 +1,7 @@
 import pytest
 
+from ...tokenizer.test_naughty_strings import NAUGHTY_STRINGS
+from spacy.lang.ja import Japanese
+
 # fmt: off
 TOKENIZER_TESTS = [
@@ -11,20 +13,25 @@ TOKENIZER_TESTS = [
 ]
 
 TAG_TESTS = [
-    ("日本語だよ", ['名詞,固有名詞,地名,国', '名詞,普通名詞,一般,*', '助動詞,*,*,*', '助詞,終助詞,*,*']),
-    ("東京タワーの近くに住んでいます。", ['名詞,固有名詞,地名,一般', '名詞,普通名詞,一般,*', '助詞,格助詞,*,*', '名詞,普通名詞,副詞可能,*', '助詞,格助詞,*,*', '動詞,一般,*,*', '助詞,接続助詞,*,*', '動詞,非自立可能,*,*', '助動詞,*,*,*', '補助記号,句点,*,*']),
-    ("吾輩は猫である。", ['代名詞,*,*,*', '助詞,係助詞,*,*', '名詞,普通名詞,一般,*', '助動詞,*,*,*', '動詞,非自立可能,*,*', '補助記号,句点,*,*']),
-    ("月に代わって、お仕置きよ!", ['名詞,普通名詞,助数詞可能,*', '助詞,格助詞,*,*', '動詞,一般,*,*', '助詞,接続助詞,*,*', '補助記号,読点,*,*', '接頭辞,*,*,*', '名詞,普通名詞,一般,*', '助詞,終助詞,*,*', '補助記号,句点,*,*']),
-    ("すもももももももものうち", ['名詞,普通名詞,一般,*', '助詞,係助詞,*,*', '名詞,普通名詞,一般,*', '助詞,係助詞,*,*', '名詞,普通名詞,一般,*', '助詞,格助詞,*,*', '名詞,普通名詞,副詞可能,*'])
+    ("日本語だよ", ['名詞-固有名詞-地名-国', '名詞-普通名詞-一般', '助動詞', '助詞-終助詞']),
+    ("東京タワーの近くに住んでいます。", ['名詞-固有名詞-地名-一般', '名詞-普通名詞-一般', '助詞-格助詞', '名詞-普通名詞-副詞可能', '助詞-格助詞', '動詞-一般', '助詞-接続助詞', '動詞-非自立可能', '助動詞', '補助記号-句点']),
+    ("吾輩は猫である。", ['代名詞', '助詞-係助詞', '名詞-普通名詞-一般', '助動詞', '動詞-非自立可能', '補助記号-句点']),
+    ("月に代わって、お仕置きよ!", ['名詞-普通名詞-助数詞可能', '助詞-格助詞', '動詞-一般', '助詞-接続助詞', '補助記号-読点', '接頭辞', '名詞-普通名詞-一般', '助詞-終助詞', '補助記号-句点']),
+    ("すもももももももものうち", ['名詞-普通名詞-一般', '助詞-係助詞', '名詞-普通名詞-一般', '助詞-係助詞', '名詞-普通名詞-一般', '助詞-格助詞', '名詞-普通名詞-副詞可能'])
 ]
 
 POS_TESTS = [
-    ('日本語だよ', ['PROPN', 'NOUN', 'AUX', 'PART']),
+    ('日本語だよ', ['fish', 'NOUN', 'AUX', 'PART']),
     ('東京タワーの近くに住んでいます。', ['PROPN', 'NOUN', 'ADP', 'NOUN', 'ADP', 'VERB', 'SCONJ', 'VERB', 'AUX', 'PUNCT']),
     ('吾輩は猫である。', ['PRON', 'ADP', 'NOUN', 'AUX', 'VERB', 'PUNCT']),
     ('月に代わって、お仕置きよ!', ['NOUN', 'ADP', 'VERB', 'SCONJ', 'PUNCT', 'NOUN', 'NOUN', 'PART', 'PUNCT']),
     ('すもももももももものうち', ['NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN', 'ADP', 'NOUN'])
 ]
+
+SENTENCE_TESTS = [
+    ("あれ。これ。", ["あれ。", "これ。"]),
+    ("「伝染るんです。」という漫画があります。", ["「伝染るんです。」という漫画があります。"]),
+]
 # fmt: on
@@ -40,14 +47,56 @@ def test_ja_tokenizer_tags(ja_tokenizer, text, expected_tags):
     assert tags == expected_tags
 
 
+# XXX This isn't working? Always passes
 @pytest.mark.parametrize("text,expected_pos", POS_TESTS)
 def test_ja_tokenizer_pos(ja_tokenizer, text, expected_pos):
     pos = [token.pos_ for token in ja_tokenizer(text)]
     assert pos == expected_pos
 
 
-def test_extra_spaces(ja_tokenizer):
+@pytest.mark.skip(reason="sentence segmentation in tokenizer is buggy")
+@pytest.mark.parametrize("text,expected_sents", SENTENCE_TESTS)
+def test_ja_tokenizer_sents(ja_tokenizer, text, expected_sents):
+    sents = [str(sent) for sent in ja_tokenizer(text).sents]
+    assert sents == expected_sents
+
+
+def test_ja_tokenizer_extra_spaces(ja_tokenizer):
     # note: three spaces after "I"
     tokens = ja_tokenizer("I   like cheese.")
     assert tokens[1].orth_ == "  "
-    assert tokens[2].orth_ == " "
+
+
+@pytest.mark.parametrize("text", NAUGHTY_STRINGS)
+def test_ja_tokenizer_naughty_strings(ja_tokenizer, text):
+    tokens = ja_tokenizer(text)
+    assert tokens.text_with_ws == text
+
+
+@pytest.mark.parametrize(
+    "text,len_a,len_b,len_c",
+    [
+        ("選挙管理委員会", 4, 3, 1),
+        ("客室乗務員", 3, 2, 1),
+        ("労働者協同組合", 4, 3, 1),
+        ("機能性食品", 3, 2, 1),
+    ],
+)
+def test_ja_tokenizer_split_modes(ja_tokenizer, text, len_a, len_b, len_c):
+    nlp_a = Japanese(meta={"tokenizer": {"config": {"split_mode": "A"}}})
+    nlp_b = Japanese(meta={"tokenizer": {"config": {"split_mode": "B"}}})
+    nlp_c = Japanese(meta={"tokenizer": {"config": {"split_mode": "C"}}})
+
+    assert len(ja_tokenizer(text)) == len_a
+    assert len(nlp_a(text)) == len_a
+    assert len(nlp_b(text)) == len_b
+    assert len(nlp_c(text)) == len_c
+
+
+def test_ja_tokenizer_emptyish_texts(ja_tokenizer):
+    doc = ja_tokenizer("")
+    assert len(doc) == 0
+    doc = ja_tokenizer(" ")
+    assert len(doc) == 1
+    doc = ja_tokenizer("\n\n\n \t\t \n\n\n")
+    assert len(doc) == 1
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest

@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest

@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest
 from spacy.lang.sv.lex_attrs import like_num

@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import pytest
 from spacy.lang.zh import Chinese
 from ...util import make_tempdir
@@ -1,4 +1,5 @@
 import pytest
+import srsly
 from mock import Mock
 from spacy.matcher import PhraseMatcher
 from spacy.tokens import Doc
@@ -263,3 +264,26 @@ def test_phrase_matcher_basic_check(en_vocab):
     pattern = Doc(en_vocab, words=["hello", "world"])
     with pytest.raises(ValueError):
         matcher.add("TEST", pattern)
+
+
+def test_phrase_matcher_pickle(en_vocab):
+    matcher = PhraseMatcher(en_vocab)
+    mock = Mock()
+    matcher.add("TEST", [Doc(en_vocab, words=["test"])])
+    matcher.add("TEST2", [Doc(en_vocab, words=["test2"])], on_match=mock)
+    doc = Doc(en_vocab, words=["these", "are", "tests", ":", "test", "test2"])
+    assert len(matcher) == 2
+
+    b = srsly.pickle_dumps(matcher)
+    matcher_unpickled = srsly.pickle_loads(b)
+
+    # call after pickling to avoid recursion error related to mock
+    matches = matcher(doc)
+    matches_unpickled = matcher_unpickled(doc)
+
+    assert len(matcher) == len(matcher_unpickled)
+    assert matches == matches_unpickled
+
+    # clunky way to vaguely check that callback is unpickled
+    (vocab, docs, callbacks, attr) = matcher_unpickled.__reduce__()[1]
+    assert isinstance(callbacks.get("TEST2"), Mock)
					@ -10,7 +10,13 @@ def test_build_dependencies():
 | 
				
			||||||
        "mock",
 | 
					        "mock",
 | 
				
			||||||
        "flake8",
 | 
					        "flake8",
 | 
				
			||||||
    ]
 | 
					    ]
 | 
				
			||||||
    libs_ignore_setup = ["fugashi", "natto-py", "pythainlp"]
 | 
					    libs_ignore_setup = [
 | 
				
			||||||
 | 
					        "fugashi",
 | 
				
			||||||
 | 
					        "natto-py",
 | 
				
			||||||
 | 
					        "pythainlp",
 | 
				
			||||||
 | 
					        "sudachipy",
 | 
				
			||||||
 | 
					        "sudachidict_core",
 | 
				
			||||||
 | 
					    ]
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # check requirements.txt
 | 
					    # check requirements.txt
 | 
				
			||||||
    req_dict = {}
 | 
					    req_dict = {}
 | 
				
			||||||
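test_build_dependencies cross-checks the version pins in requirements.txt against setup.cfg, skipping libraries (now also sudachipy and sudachidict_core) that intentionally appear in only one place. A stripped-down sketch of that kind of check; the helper and the example pins are illustrative, not the test's actual code:

    # Hedged sketch: every package pinned in requirements.txt should carry
    # the same pin in setup.cfg, unless it is on the ignore list.
    libs_ignore_setup = ["fugashi", "natto-py", "pythainlp", "sudachipy", "sudachidict_core"]

    def check_consistent(req_pins, setup_pins):
        # req_pins / setup_pins map package name -> version specifier string
        for lib, pin in req_pins.items():
            if lib in libs_ignore_setup:
                continue  # tokenizer extras are installed separately, not via setup.cfg
            assert setup_pins.get(lib) == pin, "inconsistent pin for %s" % lib

    check_consistent({"sudachipy": ">=0.4.5", "srsly": ">=2.1.0"}, {"srsly": ">=2.1.0"})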
@@ -1,5 +1,5 @@
 import pytest
-from thinc.api import Adam, NumpyOps
+from thinc.api import Adam
 from spacy.attrs import NORM
 from spacy.vocab import Vocab
 
@@ -3,6 +3,9 @@ from spacy.attrs import ENT_IOB
 
 from spacy import util
 from spacy.lang.en import English
+
+from spacy.language import Language
+from spacy.lookups import Lookups
 from spacy.pipeline.defaults import default_ner
 from spacy.pipeline import EntityRecognizer, EntityRuler
 from spacy.vocab import Vocab

@@ -353,6 +356,21 @@ def test_overfitting_IO():
         assert ents2[0].label_ == "LOC"
 
 
+def test_ner_warns_no_lookups():
+    nlp = Language()
+    nlp.vocab.lookups = Lookups()
+    assert not len(nlp.vocab.lookups)
+    ner = nlp.create_pipe("ner")
+    nlp.add_pipe(ner)
+    with pytest.warns(UserWarning):
+        nlp.begin_training()
+    nlp.vocab.lookups.add_table("lexeme_norm")
+    nlp.vocab.lookups.get_table("lexeme_norm")["a"] = "A"
+    with pytest.warns(None) as record:
+        nlp.begin_training()
+        assert not record.list
+
+
 class BlockerComponent1(object):
     name = "my_blocker"
 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
							
								
								
									
										0
									
								
								spacy/tests/parser/test_nn_beam.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										0
									
								
								spacy/tests/parser/test_nn_beam.py
									
									
									
									
									
										Normal file
									
								
							| 
@@ -1,10 +1,7 @@
 import pytest
 from collections import namedtuple
-
 from thinc.api import NumpyOps
 from spacy.ml._biluo import BILUO, _get_transition_table
-from spacy.pipeline.simple_ner import SimpleNER
-import spacy
 
 
 @pytest.fixture(
@@ -1,4 +1,3 @@
-import pytest
 from spacy.language import Language
 
 
Some files were not shown because too many files have changed in this diff.