diff --git a/.github/contributors/GiorgioPorgio.md b/.github/contributors/GiorgioPorgio.md
new file mode 100644
index 000000000..ffa1f693e
--- /dev/null
+++ b/.github/contributors/GiorgioPorgio.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+ * you hereby assign to us joint ownership, and to the extent that such
+ assignment is or becomes invalid, ineffective or unenforceable, you hereby
+ grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+ royalty-free, unrestricted license to exercise all rights under those
+ copyrights. This includes, at our option, the right to sublicense these same
+ rights to third parties through multiple levels of sublicensees or other
+ licensing arrangements;
+
+ * you agree that each of us can do all things in relation to your
+ contribution as if each of us were the sole owners, and if one of us makes
+ a derivative work of your contribution, the one who makes the derivative
+  work (or has it made) will be the sole owner of that derivative work;
+
+ * you agree that you will not assert any moral rights in your contribution
+ against us, our licensees or transferees;
+
+ * you agree that we may register a copyright in your contribution and
+ exercise all ownership rights associated with it; and
+
+ * you agree that neither of us has any duty to consult with, obtain the
+ consent of, pay or render an accounting to the other for any use or
+ distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+ * make, have made, use, sell, offer to sell, import, and otherwise transfer
+ your contribution in whole or in part, alone or in combination with or
+ included in any product, work or materials arising out of the project to
+ which your contribution was submitted, and
+
+ * at our option, to sublicense these same rights to third parties through
+ multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+ * Each contribution that you submit is and shall be an original work of
+ authorship and you can legally grant the rights set out in this SCA;
+
+ * to the best of your knowledge, each contribution will not violate any
+ third party's copyrights, trademarks, patents, or other intellectual
+ property rights; and
+
+ * each contribution shall be in compliance with U.S. export control laws and
+ other applicable export and import laws. You agree to notify us if you
+ become aware of any circumstance which would make any of the foregoing
+ representations inaccurate in any respect. We may publicly disclose your
+ participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” next to one of the applicable statements below. Please do NOT
+mark both statements:
+
+ * [x] I am signing on behalf of myself as an individual and no other person
+ or entity, including my employer, has or will have rights with respect to my
+ contributions.
+
+ * [ ] I am signing on behalf of my employer or a legal entity and I have the
+ actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | -------------------- |
+| Name | George Ketsopoulos |
+| Company name (if applicable) | |
+| Title or role (if applicable) | |
+| Date | 23 October 2019 |
+| GitHub username | GiorgioPorgio |
+| Website (optional) | |
diff --git a/.github/contributors/PeterGilles.md b/.github/contributors/PeterGilles.md
new file mode 100644
index 000000000..17fe3e2b1
--- /dev/null
+++ b/.github/contributors/PeterGilles.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+ * you hereby assign to us joint ownership, and to the extent that such
+ assignment is or becomes invalid, ineffective or unenforceable, you hereby
+ grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+ royalty-free, unrestricted license to exercise all rights under those
+ copyrights. This includes, at our option, the right to sublicense these same
+ rights to third parties through multiple levels of sublicensees or other
+ licensing arrangements;
+
+ * you agree that each of us can do all things in relation to your
+ contribution as if each of us were the sole owners, and if one of us makes
+ a derivative work of your contribution, the one who makes the derivative
+  work (or has it made) will be the sole owner of that derivative work;
+
+ * you agree that you will not assert any moral rights in your contribution
+ against us, our licensees or transferees;
+
+ * you agree that we may register a copyright in your contribution and
+ exercise all ownership rights associated with it; and
+
+ * you agree that neither of us has any duty to consult with, obtain the
+ consent of, pay or render an accounting to the other for any use or
+ distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+ * make, have made, use, sell, offer to sell, import, and otherwise transfer
+ your contribution in whole or in part, alone or in combination with or
+ included in any product, work or materials arising out of the project to
+ which your contribution was submitted, and
+
+ * at our option, to sublicense these same rights to third parties through
+ multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+ * Each contribution that you submit is and shall be an original work of
+ authorship and you can legally grant the rights set out in this SCA;
+
+ * to the best of your knowledge, each contribution will not violate any
+ third party's copyrights, trademarks, patents, or other intellectual
+ property rights; and
+
+ * each contribution shall be in compliance with U.S. export control laws and
+ other applicable export and import laws. You agree to notify us if you
+ become aware of any circumstance which would make any of the foregoing
+ representations inaccurate in any respect. We may publicly disclose your
+ participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” next to one of the applicable statements below. Please do NOT
+mark both statements:
+
+ * [X] I am signing on behalf of myself as an individual and no other person
+ or entity, including my employer, has or will have rights with respect to my
+ contributions.
+
+ * [ ] I am signing on behalf of my employer or a legal entity and I have the
+ actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | -------------------- |
+| Name | Peter Gilles |
+| Company name (if applicable) | |
+| Title or role (if applicable) | |
+| Date                          | 10.10.               |
+| GitHub username               | PeterGilles          |
+| Website (optional) | |
diff --git a/.github/contributors/gustavengstrom.md b/.github/contributors/gustavengstrom.md
new file mode 100644
index 000000000..d3e0da5a4
--- /dev/null
+++ b/.github/contributors/gustavengstrom.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+ * you hereby assign to us joint ownership, and to the extent that such
+ assignment is or becomes invalid, ineffective or unenforceable, you hereby
+ grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+ royalty-free, unrestricted license to exercise all rights under those
+ copyrights. This includes, at our option, the right to sublicense these same
+ rights to third parties through multiple levels of sublicensees or other
+ licensing arrangements;
+
+ * you agree that each of us can do all things in relation to your
+ contribution as if each of us were the sole owners, and if one of us makes
+ a derivative work of your contribution, the one who makes the derivative
+  work (or has it made) will be the sole owner of that derivative work;
+
+ * you agree that you will not assert any moral rights in your contribution
+ against us, our licensees or transferees;
+
+ * you agree that we may register a copyright in your contribution and
+ exercise all ownership rights associated with it; and
+
+ * you agree that neither of us has any duty to consult with, obtain the
+ consent of, pay or render an accounting to the other for any use or
+ distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+ * make, have made, use, sell, offer to sell, import, and otherwise transfer
+ your contribution in whole or in part, alone or in combination with or
+ included in any product, work or materials arising out of the project to
+ which your contribution was submitted, and
+
+ * at our option, to sublicense these same rights to third parties through
+ multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+ * Each contribution that you submit is and shall be an original work of
+ authorship and you can legally grant the rights set out in this SCA;
+
+ * to the best of your knowledge, each contribution will not violate any
+ third party's copyrights, trademarks, patents, or other intellectual
+ property rights; and
+
+ * each contribution shall be in compliance with U.S. export control laws and
+ other applicable export and import laws. You agree to notify us if you
+ become aware of any circumstance which would make any of the foregoing
+ representations inaccurate in any respect. We may publicly disclose your
+ participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” next to one of the applicable statements below. Please do NOT
+mark both statements:
+
+ * [x] I am signing on behalf of myself as an individual and no other person
+ or entity, including my employer, has or will have rights with respect to my
+ contributions.
+
+ * [ ] I am signing on behalf of my employer or a legal entity and I have the
+ actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | -------------------- |
+| Name | Gustav Engström |
+| Company name (if applicable) | Davcon |
+| Title or role (if applicable) | Data scientist |
+| Date | 2019-10-10 |
+| GitHub username | gustavengstrom |
+| Website (optional) | |
diff --git a/.github/contributors/neelkamath.md b/.github/contributors/neelkamath.md
new file mode 100644
index 000000000..76502e7c0
--- /dev/null
+++ b/.github/contributors/neelkamath.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+ * you hereby assign to us joint ownership, and to the extent that such
+ assignment is or becomes invalid, ineffective or unenforceable, you hereby
+ grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+ royalty-free, unrestricted license to exercise all rights under those
+ copyrights. This includes, at our option, the right to sublicense these same
+ rights to third parties through multiple levels of sublicensees or other
+ licensing arrangements;
+
+ * you agree that each of us can do all things in relation to your
+ contribution as if each of us were the sole owners, and if one of us makes
+ a derivative work of your contribution, the one who makes the derivative
+  work (or has it made) will be the sole owner of that derivative work;
+
+ * you agree that you will not assert any moral rights in your contribution
+ against us, our licensees or transferees;
+
+ * you agree that we may register a copyright in your contribution and
+ exercise all ownership rights associated with it; and
+
+ * you agree that neither of us has any duty to consult with, obtain the
+ consent of, pay or render an accounting to the other for any use or
+ distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+ * make, have made, use, sell, offer to sell, import, and otherwise transfer
+ your contribution in whole or in part, alone or in combination with or
+ included in any product, work or materials arising out of the project to
+ which your contribution was submitted, and
+
+ * at our option, to sublicense these same rights to third parties through
+ multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+ * Each contribution that you submit is and shall be an original work of
+ authorship and you can legally grant the rights set out in this SCA;
+
+ * to the best of your knowledge, each contribution will not violate any
+ third party's copyrights, trademarks, patents, or other intellectual
+ property rights; and
+
+ * each contribution shall be in compliance with U.S. export control laws and
+ other applicable export and import laws. You agree to notify us if you
+ become aware of any circumstance which would make any of the foregoing
+ representations inaccurate in any respect. We may publicly disclose your
+ participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” next to one of the applicable statements below. Please do NOT
+mark both statements:
+
+ * [x] I am signing on behalf of myself as an individual and no other person
+ or entity, including my employer, has or will have rights with respect to my
+ contributions.
+
+ * [ ] I am signing on behalf of my employer or a legal entity and I have the
+ actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | ---------------------- |
+| Name | Neel Kamath |
+| Company name (if applicable) | |
+| Title or role (if applicable) | |
+| Date | October 30, 2019 |
+| GitHub username | neelkamath |
+| Website (optional) | https://neelkamath.com |
diff --git a/.github/contributors/pberba.md b/.github/contributors/pberba.md
new file mode 100644
index 000000000..34feb3eea
--- /dev/null
+++ b/.github/contributors/pberba.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+ * you hereby assign to us joint ownership, and to the extent that such
+ assignment is or becomes invalid, ineffective or unenforceable, you hereby
+ grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+ royalty-free, unrestricted license to exercise all rights under those
+ copyrights. This includes, at our option, the right to sublicense these same
+ rights to third parties through multiple levels of sublicensees or other
+ licensing arrangements;
+
+ * you agree that each of us can do all things in relation to your
+ contribution as if each of us were the sole owners, and if one of us makes
+ a derivative work of your contribution, the one who makes the derivative
+  work (or has it made) will be the sole owner of that derivative work;
+
+ * you agree that you will not assert any moral rights in your contribution
+ against us, our licensees or transferees;
+
+ * you agree that we may register a copyright in your contribution and
+ exercise all ownership rights associated with it; and
+
+ * you agree that neither of us has any duty to consult with, obtain the
+ consent of, pay or render an accounting to the other for any use or
+ distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+ * make, have made, use, sell, offer to sell, import, and otherwise transfer
+ your contribution in whole or in part, alone or in combination with or
+ included in any product, work or materials arising out of the project to
+ which your contribution was submitted, and
+
+ * at our option, to sublicense these same rights to third parties through
+ multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+ * Each contribution that you submit is and shall be an original work of
+ authorship and you can legally grant the rights set out in this SCA;
+
+ * to the best of your knowledge, each contribution will not violate any
+ third party's copyrights, trademarks, patents, or other intellectual
+ property rights; and
+
+ * each contribution shall be in compliance with U.S. export control laws and
+ other applicable export and import laws. You agree to notify us if you
+ become aware of any circumstance which would make any of the foregoing
+ representations inaccurate in any respect. We may publicly disclose your
+ participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” next to one of the applicable statements below. Please do NOT
+mark both statements:
+
+ * [X] I am signing on behalf of myself as an individual and no other person
+ or entity, including my employer, has or will have rights with respect to my
+ contributions.
+
+ * [ ] I am signing on behalf of my employer or a legal entity and I have the
+ actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | -------------------- |
+| Name | Pepe Berba |
+| Company name (if applicable) | |
+| Title or role (if applicable) | |
+| Date | 2019-10-18 |
+| GitHub username | pberba |
+| Website (optional) | |
\ No newline at end of file
diff --git a/.github/contributors/prilopes.md b/.github/contributors/prilopes.md
new file mode 100644
index 000000000..ad111d4de
--- /dev/null
+++ b/.github/contributors/prilopes.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+ * you hereby assign to us joint ownership, and to the extent that such
+ assignment is or becomes invalid, ineffective or unenforceable, you hereby
+ grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+ royalty-free, unrestricted license to exercise all rights under those
+ copyrights. This includes, at our option, the right to sublicense these same
+ rights to third parties through multiple levels of sublicensees or other
+ licensing arrangements;
+
+ * you agree that each of us can do all things in relation to your
+ contribution as if each of us were the sole owners, and if one of us makes
+ a derivative work of your contribution, the one who makes the derivative
+  work (or has it made) will be the sole owner of that derivative work;
+
+ * you agree that you will not assert any moral rights in your contribution
+ against us, our licensees or transferees;
+
+ * you agree that we may register a copyright in your contribution and
+ exercise all ownership rights associated with it; and
+
+ * you agree that neither of us has any duty to consult with, obtain the
+ consent of, pay or render an accounting to the other for any use or
+ distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+ * make, have made, use, sell, offer to sell, import, and otherwise transfer
+ your contribution in whole or in part, alone or in combination with or
+ included in any product, work or materials arising out of the project to
+ which your contribution was submitted, and
+
+ * at our option, to sublicense these same rights to third parties through
+ multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+ * Each contribution that you submit is and shall be an original work of
+ authorship and you can legally grant the rights set out in this SCA;
+
+ * to the best of your knowledge, each contribution will not violate any
+ third party's copyrights, trademarks, patents, or other intellectual
+ property rights; and
+
+ * each contribution shall be in compliance with U.S. export control laws and
+ other applicable export and import laws. You agree to notify us if you
+ become aware of any circumstance which would make any of the foregoing
+ representations inaccurate in any respect. We may publicly disclose your
+ participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” next to one of the applicable statements below. Please do NOT
+mark both statements:
+
+ * [x] I am signing on behalf of myself as an individual and no other person
+ or entity, including my employer, has or will have rights with respect to my
+ contributions.
+
+ * [ ] I am signing on behalf of my employer or a legal entity and I have the
+ actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | -------------------- |
+| Name | Priscilla Lopes |
+| Company name (if applicable) | |
+| Title or role (if applicable) | |
+| Date | 2019-11-06 |
+| GitHub username | prilopes |
+| Website (optional) | |
diff --git a/.github/contributors/questoph.md b/.github/contributors/questoph.md
new file mode 100644
index 000000000..24559c098
--- /dev/null
+++ b/.github/contributors/questoph.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI GmbH](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+ * you hereby assign to us joint ownership, and to the extent that such
+ assignment is or becomes invalid, ineffective or unenforceable, you hereby
+ grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+ royalty-free, unrestricted license to exercise all rights under those
+ copyrights. This includes, at our option, the right to sublicense these same
+ rights to third parties through multiple levels of sublicensees or other
+ licensing arrangements;
+
+ * you agree that each of us can do all things in relation to your
+ contribution as if each of us were the sole owners, and if one of us makes
+ a derivative work of your contribution, the one who makes the derivative
+  work (or has it made) will be the sole owner of that derivative work;
+
+ * you agree that you will not assert any moral rights in your contribution
+ against us, our licensees or transferees;
+
+ * you agree that we may register a copyright in your contribution and
+ exercise all ownership rights associated with it; and
+
+ * you agree that neither of us has any duty to consult with, obtain the
+ consent of, pay or render an accounting to the other for any use or
+ distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+ * make, have made, use, sell, offer to sell, import, and otherwise transfer
+ your contribution in whole or in part, alone or in combination with or
+ included in any product, work or materials arising out of the project to
+ which your contribution was submitted, and
+
+ * at our option, to sublicense these same rights to third parties through
+ multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+ * Each contribution that you submit is and shall be an original work of
+ authorship and you can legally grant the rights set out in this SCA;
+
+ * to the best of your knowledge, each contribution will not violate any
+ third party's copyrights, trademarks, patents, or other intellectual
+ property rights; and
+
+ * each contribution shall be in compliance with U.S. export control laws and
+ other applicable export and import laws. You agree to notify us if you
+ become aware of any circumstance which would make any of the foregoing
+ representations inaccurate in any respect. We may publicly disclose your
+ participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” next to one of the applicable statements below. Please do NOT
+mark both statements:
+
+ * [x] I am signing on behalf of myself as an individual and no other person
+ or entity, including my employer, has or will have rights with respect to my
+ contributions.
+
+ * [ ] I am signing on behalf of my employer or a legal entity and I have the
+ actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | -------------------- |
+| Name | Christoph Purschke |
+| Company name (if applicable) | University of Luxembourg |
+| Title or role (if applicable) | |
+| Date | 14/11/2019 |
+| GitHub username | questoph |
+| Website (optional) | https://purschke.info |
diff --git a/.github/contributors/zhuorulin.md b/.github/contributors/zhuorulin.md
new file mode 100644
index 000000000..8fef7577a
--- /dev/null
+++ b/.github/contributors/zhuorulin.md
@@ -0,0 +1,106 @@
+# spaCy contributor agreement
+
+This spaCy Contributor Agreement (**"SCA"**) is based on the
+[Oracle Contributor Agreement](http://www.oracle.com/technetwork/oca-405177.pdf).
+The SCA applies to any contribution that you make to any product or project
+managed by us (the **"project"**), and sets out the intellectual property rights
+you grant to us in the contributed materials. The term **"us"** shall mean
+[ExplosionAI UG (haftungsbeschränkt)](https://explosion.ai/legal). The term
+**"you"** shall mean the person or entity identified below.
+
+If you agree to be bound by these terms, fill in the information requested
+below and include the filled-in version with your first pull request, under the
+folder [`.github/contributors/`](/.github/contributors/). The name of the file
+should be your GitHub username, with the extension `.md`. For example, the user
+example_user would create the file `.github/contributors/example_user.md`.
+
+Read this agreement carefully before signing. These terms and conditions
+constitute a binding legal agreement.
+
+## Contributor Agreement
+
+1. The term "contribution" or "contributed materials" means any source code,
+object code, patch, tool, sample, graphic, specification, manual,
+documentation, or any other material posted or submitted by you to the project.
+
+2. With respect to any worldwide copyrights, or copyright applications and
+registrations, in your contribution:
+
+ * you hereby assign to us joint ownership, and to the extent that such
+ assignment is or becomes invalid, ineffective or unenforceable, you hereby
+ grant to us a perpetual, irrevocable, non-exclusive, worldwide, no-charge,
+ royalty-free, unrestricted license to exercise all rights under those
+ copyrights. This includes, at our option, the right to sublicense these same
+ rights to third parties through multiple levels of sublicensees or other
+ licensing arrangements;
+
+ * you agree that each of us can do all things in relation to your
+ contribution as if each of us were the sole owners, and if one of us makes
+ a derivative work of your contribution, the one who makes the derivative
+  work (or has it made) will be the sole owner of that derivative work;
+
+ * you agree that you will not assert any moral rights in your contribution
+ against us, our licensees or transferees;
+
+ * you agree that we may register a copyright in your contribution and
+ exercise all ownership rights associated with it; and
+
+ * you agree that neither of us has any duty to consult with, obtain the
+ consent of, pay or render an accounting to the other for any use or
+ distribution of your contribution.
+
+3. With respect to any patents you own, or that you can license without payment
+to any third party, you hereby grant to us a perpetual, irrevocable,
+non-exclusive, worldwide, no-charge, royalty-free license to:
+
+ * make, have made, use, sell, offer to sell, import, and otherwise transfer
+ your contribution in whole or in part, alone or in combination with or
+ included in any product, work or materials arising out of the project to
+ which your contribution was submitted, and
+
+ * at our option, to sublicense these same rights to third parties through
+ multiple levels of sublicensees or other licensing arrangements.
+
+4. Except as set out above, you keep all right, title, and interest in your
+contribution. The rights that you grant to us under these terms are effective
+on the date you first submitted a contribution to us, even if your submission
+took place before the date you sign these terms.
+
+5. You covenant, represent, warrant and agree that:
+
+ * Each contribution that you submit is and shall be an original work of
+ authorship and you can legally grant the rights set out in this SCA;
+
+ * to the best of your knowledge, each contribution will not violate any
+ third party's copyrights, trademarks, patents, or other intellectual
+ property rights; and
+
+ * each contribution shall be in compliance with U.S. export control laws and
+ other applicable export and import laws. You agree to notify us if you
+ become aware of any circumstance which would make any of the foregoing
+ representations inaccurate in any respect. We may publicly disclose your
+ participation in the project, including the fact that you have signed the SCA.
+
+6. This SCA is governed by the laws of the State of California and applicable
+U.S. Federal law. Any choice of law rules will not apply.
+
+7. Please place an “x” next to one of the applicable statements below. Please do NOT
+mark both statements:
+
+ * [x] I am signing on behalf of myself as an individual and no other person
+ or entity, including my employer, has or will have rights with respect to my
+ contributions.
+
+ * [ ] I am signing on behalf of my employer or a legal entity and I have the
+ actual authority to contractually bind that entity.
+
+## Contributor Details
+
+| Field | Entry |
+|------------------------------- | ------------------------ |
+| Name | Zhuoru Lin |
+| Company name (if applicable) | Bombora Inc. |
+| Title or role (if applicable) | Data Scientist |
+| Date | 2017-11-13 |
+| GitHub username | ZhuoruLin |
+| Website (optional) | |
diff --git a/Makefile b/Makefile
index 0f5c31ca6..5d15bccec 100644
--- a/Makefile
+++ b/Makefile
@@ -9,7 +9,7 @@ dist/spacy.pex : dist/spacy-$(sha).pex
dist/spacy-$(sha).pex : dist/$(wheel)
env3.6/bin/python -m pip install pex==1.5.3
- env3.6/bin/pex pytest dist/$(wheel) -e spacy -o dist/spacy-$(sha).pex
+ env3.6/bin/pex pytest dist/$(wheel) spacy_lookups_data -e spacy -o dist/spacy-$(sha).pex
dist/$(wheel) : setup.py spacy/*.py* spacy/*/*.py*
python3.6 -m venv env3.6
diff --git a/README.md b/README.md
index 18ec75b62..31dc78d63 100644
--- a/README.md
+++ b/README.md
@@ -72,21 +72,21 @@ it.
## Features
-- Non-destructive **tokenization**
-- **Named entity** recognition
-- Support for **50+ languages**
-- pretrained [statistical models](https://spacy.io/models) and word vectors
-- State-of-the-art speed
-- Easy **deep learning** integration
-- Part-of-speech tagging
-- Labelled dependency parsing
-- Syntax-driven sentence segmentation
-- Built in **visualizers** for syntax and NER
-- Convenient string-to-hash mapping
-- Export to numpy data arrays
-- Efficient binary serialization
-- Easy **model packaging** and deployment
-- Robust, rigorously evaluated accuracy
+- Non-destructive **tokenization**
+- **Named entity** recognition
+- Support for **50+ languages**
+- pretrained [statistical models](https://spacy.io/models) and word vectors
+- State-of-the-art speed
+- Easy **deep learning** integration
+- Part-of-speech tagging
+- Labelled dependency parsing
+- Syntax-driven sentence segmentation
+- Built in **visualizers** for syntax and NER
+- Convenient string-to-hash mapping
+- Export to numpy data arrays
+- Efficient binary serialization
+- Easy **model packaging** and deployment
+- Robust, rigorously evaluated accuracy
📖 **For more details, see the
[facts, figures and benchmarks](https://spacy.io/usage/facts-figures).**
@@ -96,10 +96,10 @@ it.
For detailed installation instructions, see the
[documentation](https://spacy.io/usage).
-- **Operating system**: macOS / OS X · Linux · Windows (Cygwin, MinGW, Visual
- Studio)
-- **Python version**: Python 2.7, 3.5+ (only 64 bit)
-- **Package managers**: [pip] · [conda] (via `conda-forge`)
+- **Operating system**: macOS / OS X · Linux · Windows (Cygwin, MinGW, Visual
+ Studio)
+- **Python version**: Python 2.7, 3.5+ (only 64 bit)
+- **Package managers**: [pip] · [conda] (via `conda-forge`)
[pip]: https://pypi.org/project/spacy/
[conda]: https://anaconda.org/conda-forge/spacy
@@ -135,8 +135,7 @@ Thanks to our great community, we've finally re-added conda support. You can now
install spaCy via `conda-forge`:
```bash
-conda config --add channels conda-forge
-conda install spacy
+conda install -c conda-forge spacy
```
For the feedstock including the build recipe and configuration, check out
@@ -181,9 +180,6 @@ pointing pip to a path or URL.
# download best-matching version of specific model for your spaCy installation
python -m spacy download en_core_web_sm
-# out-of-the-box: download best-matching default model
-python -m spacy download en
-
# pip install .tar.gz archive from path or URL
pip install /Users/you/en_core_web_sm-2.2.0.tar.gz
pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz
@@ -197,7 +193,7 @@ path to the model data directory.
```python
import spacy
nlp = spacy.load("en_core_web_sm")
-doc = nlp(u"This is a sentence.")
+doc = nlp("This is a sentence.")
```
You can also `import` a model directly via its full name and then call its
@@ -208,22 +204,12 @@ import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()
-doc = nlp(u"This is a sentence.")
+doc = nlp("This is a sentence.")
```
📖 **For more info and examples, check out the
[models documentation](https://spacy.io/docs/usage/models).**
-### Support for older versions
-
-If you're using an older version (`v1.6.0` or below), you can still download and
-install the old models from within spaCy using `python -m spacy.en.download all`
-or `python -m spacy.de.download all`. The `.tar.gz` archives are also
-[attached to the v1.6.0 release](https://github.com/explosion/spaCy/tree/v1.6.0).
-To download and install the models manually, unpack the archive, drop the
-contained directory into `spacy/data` and load the model via `spacy.load('en')`
-or `spacy.load('de')`.
-
## Compile from source
The other way to install spaCy is to clone its
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index c23995de6..029cc9dd0 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -9,6 +9,11 @@ trigger:
exclude:
- 'website/*'
- '*.md'
+pr:
+ paths:
+ exclude:
+ - 'website/*'
+ - '*.md'
jobs:
@@ -30,24 +35,12 @@ jobs:
dependsOn: 'Validate'
strategy:
matrix:
- # Python 2.7 currently doesn't work because it seems to be a narrow
- # unicode build, which causes problems with the regular expressions
-
- # Python27Linux:
- # imageName: 'ubuntu-16.04'
- # python.version: '2.7'
- # Python27Mac:
- # imageName: 'macos-10.13'
- # python.version: '2.7'
Python35Linux:
imageName: 'ubuntu-16.04'
python.version: '3.5'
Python35Windows:
imageName: 'vs2017-win2016'
python.version: '3.5'
- Python35Mac:
- imageName: 'macos-10.13'
- python.version: '3.5'
Python36Linux:
imageName: 'ubuntu-16.04'
python.version: '3.6'
@@ -66,6 +59,15 @@ jobs:
Python37Mac:
imageName: 'macos-10.13'
python.version: '3.7'
+ Python38Linux:
+ imageName: 'ubuntu-16.04'
+ python.version: '3.8'
+ Python38Windows:
+ imageName: 'vs2017-win2016'
+ python.version: '3.8'
+ Python38Mac:
+ imageName: 'macos-10.13'
+ python.version: '3.8'
maxParallel: 4
pool:
vmImage: $(imageName)
@@ -76,10 +78,8 @@ jobs:
versionSpec: '$(python.version)'
architecture: 'x64'
- # Downgrading pip is necessary to prevent a wheel version incompatiblity.
- # Might be fixed in the future or some other way, so investigate again.
- script: |
- python -m pip install -U pip==18.1 setuptools
+ python -m pip install -U setuptools
pip install -r requirements.txt
displayName: 'Install dependencies'
diff --git a/bin/ud/ud_run_test.py b/bin/ud/ud_run_test.py
index de01cf350..7cb270d84 100644
--- a/bin/ud/ud_run_test.py
+++ b/bin/ud/ud_run_test.py
@@ -84,7 +84,7 @@ def read_conllu(file_):
def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
if text_loc.parts[-1].endswith(".conllu"):
docs = []
- with text_loc.open() as file_:
+ with text_loc.open(encoding="utf8") as file_:
for conllu_doc in read_conllu(file_):
for conllu_sent in conllu_doc:
words = [line[1] for line in conllu_sent]
diff --git a/bin/ud/ud_train.py b/bin/ud/ud_train.py
index 5d4f20d6e..ddd87a31c 100644
--- a/bin/ud/ud_train.py
+++ b/bin/ud/ud_train.py
@@ -7,7 +7,6 @@ from __future__ import unicode_literals
import plac
from pathlib import Path
import re
-import sys
import json
import spacy
@@ -19,12 +18,9 @@ from spacy.util import compounding, minibatch, minibatch_by_words
from spacy.syntax.nonproj import projectivize
from spacy.matcher import Matcher
from spacy import displacy
-from collections import defaultdict, Counter
-from timeit import default_timer as timer
+from collections import defaultdict
-import itertools
import random
-import numpy.random
from spacy import lang
from spacy.lang import zh
@@ -203,7 +199,7 @@ def golds_to_gold_tuples(docs, golds):
def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
if text_loc.parts[-1].endswith(".conllu"):
docs = []
- with text_loc.open() as file_:
+ with text_loc.open(encoding="utf8") as file_:
for conllu_doc in read_conllu(file_):
for conllu_sent in conllu_doc:
words = [line[1] for line in conllu_sent]
@@ -225,6 +221,13 @@ def evaluate(nlp, text_loc, gold_loc, sys_loc, limit=None):
def write_conllu(docs, file_):
+ if not Token.has_extension("get_conllu_lines"):
+ Token.set_extension("get_conllu_lines", method=get_token_conllu)
+ if not Token.has_extension("begins_fused"):
+ Token.set_extension("begins_fused", default=False)
+ if not Token.has_extension("inside_fused"):
+ Token.set_extension("inside_fused", default=False)
+
merger = Matcher(docs[0].vocab)
merger.add("SUBTOK", None, [{"DEP": "subtok", "op": "+"}])
for i, doc in enumerate(docs):
@@ -323,10 +326,6 @@ def get_token_conllu(token, i):
return "\n".join(lines)
-Token.set_extension("get_conllu_lines", method=get_token_conllu, force=True)
-Token.set_extension("begins_fused", default=False, force=True)
-Token.set_extension("inside_fused", default=False, force=True)
-
##################
# Initialization #
@@ -378,7 +377,7 @@ def _load_pretrained_tok2vec(nlp, loc):
"""Load pretrained weights for the 'token-to-vector' part of the component
models, which is typically a CNN. See 'spacy pretrain'. Experimental.
"""
- with Path(loc).open("rb") as file_:
+ with Path(loc).open("rb", encoding="utf8") as file_:
weights_data = file_.read()
loaded = []
for name, component in nlp.pipeline:
@@ -459,13 +458,13 @@ class TreebankPaths(object):
@plac.annotations(
ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
+ parses_dir=("Directory to write the development parses", "positional", None, Path),
corpus=(
- "UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
+ "UD corpus to train and evaluate on, e.g. UD_Spanish-AnCora",
"positional",
None,
str,
),
- parses_dir=("Directory to write the development parses", "positional", None, Path),
config=("Path to json formatted config file", "option", "C", Path),
limit=("Size limit", "option", "n", int),
gpu_device=("Use GPU", "option", "g", int),
@@ -490,6 +489,10 @@ def main(
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
import tqdm
+ Token.set_extension("get_conllu_lines", method=get_token_conllu)
+ Token.set_extension("begins_fused", default=False)
+ Token.set_extension("inside_fused", default=False)
+
spacy.util.fix_random_seed()
lang.zh.Chinese.Defaults.use_jieba = False
lang.ja.Japanese.Defaults.use_janome = False
@@ -506,8 +509,8 @@ def main(
docs, golds = read_data(
nlp,
- paths.train.conllu.open(),
- paths.train.text.open(),
+ paths.train.conllu.open(encoding="utf8"),
+ paths.train.text.open(encoding="utf8"),
max_doc_length=config.max_doc_length,
limit=limit,
)
@@ -519,8 +522,8 @@ def main(
for i in range(config.nr_epoch):
docs, golds = read_data(
nlp,
- paths.train.conllu.open(),
- paths.train.text.open(),
+ paths.train.conllu.open(encoding="utf8"),
+ paths.train.text.open(encoding="utf8"),
max_doc_length=config.max_doc_length,
limit=limit,
oracle_segments=use_oracle_segments,
@@ -560,7 +563,7 @@ def main(
def _render_parses(i, to_render):
to_render[0].user_data["title"] = "Batch %d" % i
- with Path("/tmp/parses.html").open("w") as file_:
+ with Path("/tmp/parses.html").open("w", encoding="utf8") as file_:
html = displacy.render(to_render[:5], style="dep", page=True)
file_.write(html)
diff --git a/bin/wiki_entity_linking/README.md b/bin/wiki_entity_linking/README.md
new file mode 100644
index 000000000..540878592
--- /dev/null
+++ b/bin/wiki_entity_linking/README.md
@@ -0,0 +1,34 @@
+## Entity Linking with Wikipedia and Wikidata
+
+### Step 1: Create a Knowledge Base (KB) and training data
+
+Run `wikipedia_pretrain_kb.py`
+* This takes as input the locations of a **Wikipedia and a Wikidata dump**, and produces a **KB directory** + **training file**
+ * Wikidata: get `latest-all.json.bz2` from https://dumps.wikimedia.org/wikidatawiki/entities/
+ * Wikipedia: get `enwiki-latest-pages-articles-multistream.xml.bz2` from https://dumps.wikimedia.org/enwiki/latest/ (or for any other language)
+* You can set the filtering parameters for KB construction:
+ * `max_per_alias`: maximum number of candidate entities kept in the KB per alias/synonym
+ * `min_freq`: minimum number of times an entity must occur in the corpus to be included in the KB
+ * `min_pair`: minimum number of times an entity+alias combination must occur in the corpus to be included in the KB
+* Further parameters to set:
+ * `descriptions_from_wikipedia`: whether to parse descriptions from Wikipedia (`True`) or Wikidata (`False`)
+ * `entity_vector_length`: length of the pre-trained entity description vectors
+ * `lang`: language for which to fetch Wikidata information (as the dump contains all languages)
+
+Quick testing and rerunning:
+* When trying out the pipeline for a quick test, set `limit_prior`, `limit_train` and/or `limit_wd` to read only parts of the dumps instead of everything.
+* If you only want to (re)run certain parts of the pipeline, just remove the corresponding files and they will be recalculated or reparsed.
+
+
+### Step 2: Train an Entity Linking model
+
+Run `wikidata_train_entity_linker.py`
+* This takes the **KB directory** produced by Step 1, and trains an **Entity Linking model**
+* You can set the learning parameters for the EL training:
+ * `epochs`: number of training iterations
+ * `dropout`: dropout rate
+ * `lr`: learning rate
+ * `l2`: L2 regularization
+* Specify the number of training and dev testing entities with `train_inst` and `dev_inst` respectively
+* Further parameters to set:
+ * `labels_discard`: NER label types to discard during training
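+
+As a rough sketch, the two steps might be chained as shown below. The dump filenames,
+the `output_dir` path and the exact command-line flags are illustrative assumptions
+only; check the `@plac.annotations` block in each script for the positional arguments
+and options it actually accepts.
+
+```bash
+# Step 1: parse the dumps and write the KB directory + training file
+python wikipedia_pretrain_kb.py latest-all.json.bz2 \
+    enwiki-latest-pages-articles-multistream.xml.bz2 output_dir/ \
+    --lang en --entity_vector_length 64
+
+# Step 2: train an entity linker on the KB directory produced in step 1
+python wikidata_train_entity_linker.py output_dir/ \
+    --epochs 10 --dropout 0.5 --lr 0.005 --l2 1e-6 \
+    --train_inst 200000 --dev_inst 5000
+```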
diff --git a/bin/wiki_entity_linking/__init__.py b/bin/wiki_entity_linking/__init__.py
index a604bcc2f..de486bbcf 100644
--- a/bin/wiki_entity_linking/__init__.py
+++ b/bin/wiki_entity_linking/__init__.py
@@ -6,6 +6,7 @@ OUTPUT_MODEL_DIR = "nlp"
PRIOR_PROB_PATH = "prior_prob.csv"
ENTITY_DEFS_PATH = "entity_defs.csv"
ENTITY_FREQ_PATH = "entity_freq.csv"
+ENTITY_ALIAS_PATH = "entity_alias.csv"
ENTITY_DESCR_PATH = "entity_descriptions.csv"
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(name)s - %(message)s'
diff --git a/bin/wiki_entity_linking/entity_linker_evaluation.py b/bin/wiki_entity_linking/entity_linker_evaluation.py
index 1b1200564..94bafbf30 100644
--- a/bin/wiki_entity_linking/entity_linker_evaluation.py
+++ b/bin/wiki_entity_linking/entity_linker_evaluation.py
@@ -15,10 +15,11 @@ class Metrics(object):
candidate_is_correct = true_entity == candidate
# Assume that we have no labeled negatives in the data (i.e. cases where true_entity is "NIL")
- # Therefore, if candidate_is_correct then we have a true positive and never a true negative
+ # Therefore, if candidate_is_correct then we have a true positive and never a true negative.
self.true_pos += candidate_is_correct
self.false_neg += not candidate_is_correct
- if candidate not in {"", "NIL"}:
+ if candidate and candidate not in {"", "NIL"}:
+ # A wrong prediction (e.g. Q42 != Q3) counts both as a FP as well as a FN.
self.false_pos += not candidate_is_correct
def calculate_precision(self):
@@ -33,6 +34,14 @@ class Metrics(object):
else:
return self.true_pos / (self.true_pos + self.false_neg)
+ def calculate_fscore(self):
+ p = self.calculate_precision()
+ r = self.calculate_recall()
+ if p + r == 0:
+ return 0.0
+ else:
+ return 2 * p * r / (p + r)
+
class EvaluationResults(object):
def __init__(self):
@@ -43,18 +52,20 @@ class EvaluationResults(object):
self.metrics.update_results(true_entity, candidate)
self.metrics_by_label[ent_label].update_results(true_entity, candidate)
- def increment_false_negatives(self):
- self.metrics.false_neg += 1
-
def report_metrics(self, model_name):
model_str = model_name.title()
recall = self.metrics.calculate_recall()
precision = self.metrics.calculate_precision()
- return ("{}: ".format(model_str) +
- "Recall = {} | ".format(round(recall, 3)) +
- "Precision = {} | ".format(round(precision, 3)) +
- "Precision by label = {}".format({k: v.calculate_precision()
- for k, v in self.metrics_by_label.items()}))
+ fscore = self.metrics.calculate_fscore()
+ return (
+ "{}: ".format(model_str)
+ + "F-score = {} | ".format(round(fscore, 3))
+ + "Recall = {} | ".format(round(recall, 3))
+ + "Precision = {} | ".format(round(precision, 3))
+ + "F-score by label = {}".format(
+ {k: v.calculate_fscore() for k, v in sorted(self.metrics_by_label.items())}
+ )
+ )
class BaselineResults(object):
@@ -63,40 +74,51 @@ class BaselineResults(object):
self.prior = EvaluationResults()
self.oracle = EvaluationResults()
- def report_accuracy(self, model):
+ def report_performance(self, model):
results = getattr(self, model)
return results.report_metrics(model)
- def update_baselines(self, true_entity, ent_label, random_candidate, prior_candidate, oracle_candidate):
+ def update_baselines(
+ self,
+ true_entity,
+ ent_label,
+ random_candidate,
+ prior_candidate,
+ oracle_candidate,
+ ):
self.oracle.update_metrics(ent_label, true_entity, oracle_candidate)
self.prior.update_metrics(ent_label, true_entity, prior_candidate)
self.random.update_metrics(ent_label, true_entity, random_candidate)
-def measure_performance(dev_data, kb, el_pipe):
- baseline_accuracies = measure_baselines(
- dev_data, kb
- )
+def measure_performance(dev_data, kb, el_pipe, baseline=True, context=True):
+ if baseline:
+ baseline_accuracies, counts = measure_baselines(dev_data, kb)
+ logger.info("Counts: {}".format({k: v for k, v in sorted(counts.items())}))
+ logger.info(baseline_accuracies.report_performance("random"))
+ logger.info(baseline_accuracies.report_performance("prior"))
+ logger.info(baseline_accuracies.report_performance("oracle"))
- logger.info(baseline_accuracies.report_accuracy("random"))
- logger.info(baseline_accuracies.report_accuracy("prior"))
- logger.info(baseline_accuracies.report_accuracy("oracle"))
+ if context:
+ # using only context
+ el_pipe.cfg["incl_context"] = True
+ el_pipe.cfg["incl_prior"] = False
+ results = get_eval_results(dev_data, el_pipe)
+ logger.info(results.report_metrics("context only"))
- # using only context
- el_pipe.cfg["incl_context"] = True
- el_pipe.cfg["incl_prior"] = False
- results = get_eval_results(dev_data, el_pipe)
- logger.info(results.report_metrics("context only"))
-
- # measuring combined accuracy (prior + context)
- el_pipe.cfg["incl_context"] = True
- el_pipe.cfg["incl_prior"] = True
- results = get_eval_results(dev_data, el_pipe)
- logger.info(results.report_metrics("context and prior"))
+ # measuring combined accuracy (prior + context)
+ el_pipe.cfg["incl_context"] = True
+ el_pipe.cfg["incl_prior"] = True
+ results = get_eval_results(dev_data, el_pipe)
+ logger.info(results.report_metrics("context and prior"))
def get_eval_results(data, el_pipe=None):
- # If the docs in the data require further processing with an entity linker, set el_pipe
+ """
+ Evaluate the ent.kb_id_ annotations against the gold standard.
+ Only evaluate entities that overlap between gold and NER, to isolate the performance of the NEL.
+ If the docs in the data require further processing with an entity linker, set el_pipe.
+ """
from tqdm import tqdm
docs = []
@@ -111,18 +133,15 @@ def get_eval_results(data, el_pipe=None):
results = EvaluationResults()
for doc, gold in zip(docs, golds):
- tagged_entries_per_article = {_offset(ent.start_char, ent.end_char): ent for ent in doc.ents}
try:
correct_entries_per_article = dict()
for entity, kb_dict in gold.links.items():
start, end = entity
- # only evaluating on positive examples
for gold_kb, value in kb_dict.items():
if value:
+ # only evaluating on positive examples
offset = _offset(start, end)
correct_entries_per_article[offset] = gold_kb
- if offset not in tagged_entries_per_article:
- results.increment_false_negatives()
for ent in doc.ents:
ent_label = ent.label_
@@ -142,7 +161,11 @@ def get_eval_results(data, el_pipe=None):
def measure_baselines(data, kb):
- # Measure 3 performance baselines: random selection, prior probabilities, and 'oracle' prediction for upper bound
+ """
+ Measure 3 performance baselines: random selection, prior probabilities, and 'oracle' prediction for upper bound.
+ Only evaluate entities that overlap between gold and NER, to isolate the performance of the NEL.
+ Also return a dictionary of counts by entity label.
+ """
counts_d = dict()
baseline_results = BaselineResults()
@@ -152,7 +175,6 @@ def measure_baselines(data, kb):
for doc, gold in zip(docs, golds):
correct_entries_per_article = dict()
- tagged_entries_per_article = {_offset(ent.start_char, ent.end_char): ent for ent in doc.ents}
for entity, kb_dict in gold.links.items():
start, end = entity
for gold_kb, value in kb_dict.items():
@@ -160,10 +182,6 @@ def measure_baselines(data, kb):
if value:
offset = _offset(start, end)
correct_entries_per_article[offset] = gold_kb
- if offset not in tagged_entries_per_article:
- baseline_results.random.increment_false_negatives()
- baseline_results.oracle.increment_false_negatives()
- baseline_results.prior.increment_false_negatives()
for ent in doc.ents:
ent_label = ent.label_
@@ -176,7 +194,7 @@ def measure_baselines(data, kb):
if gold_entity is not None:
candidates = kb.get_candidates(ent.text)
oracle_candidate = ""
- best_candidate = ""
+ prior_candidate = ""
random_candidate = ""
if candidates:
scores = []
@@ -187,13 +205,21 @@ def measure_baselines(data, kb):
oracle_candidate = c.entity_
best_index = scores.index(max(scores))
- best_candidate = candidates[best_index].entity_
+ prior_candidate = candidates[best_index].entity_
random_candidate = random.choice(candidates).entity_
- baseline_results.update_baselines(gold_entity, ent_label,
- random_candidate, best_candidate, oracle_candidate)
+ current_count = counts_d.get(ent_label, 0)
+ counts_d[ent_label] = current_count+1
- return baseline_results
+ baseline_results.update_baselines(
+ gold_entity,
+ ent_label,
+ random_candidate,
+ prior_candidate,
+ oracle_candidate,
+ )
+
+ return baseline_results, counts_d
def _offset(start, end):
diff --git a/bin/wiki_entity_linking/kb_creator.py b/bin/wiki_entity_linking/kb_creator.py
index 0eeb63803..7778fc701 100644
--- a/bin/wiki_entity_linking/kb_creator.py
+++ b/bin/wiki_entity_linking/kb_creator.py
@@ -1,17 +1,12 @@
# coding: utf-8
from __future__ import unicode_literals
-import csv
import logging
-import spacy
-import sys
from spacy.kb import KnowledgeBase
-from bin.wiki_entity_linking import wikipedia_processor as wp
from bin.wiki_entity_linking.train_descriptions import EntityEncoder
-
-csv.field_size_limit(sys.maxsize)
+from bin.wiki_entity_linking import wiki_io as io
logger = logging.getLogger(__name__)
@@ -22,18 +17,24 @@ def create_kb(
max_entities_per_alias,
min_entity_freq,
min_occ,
- entity_def_input,
+ entity_def_path,
entity_descr_path,
- count_input,
- prior_prob_input,
+ entity_alias_path,
+ entity_freq_path,
+ prior_prob_path,
entity_vector_length,
):
# Create the knowledge base from Wikidata entries
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=entity_vector_length)
+ entity_list, filtered_title_to_id = _define_entities(nlp, kb, entity_def_path, entity_descr_path, min_entity_freq, entity_freq_path, entity_vector_length)
+ _define_aliases(kb, entity_alias_path, entity_list, filtered_title_to_id, max_entities_per_alias, min_occ, prior_prob_path)
+ return kb
+
+def _define_entities(nlp, kb, entity_def_path, entity_descr_path, min_entity_freq, entity_freq_path, entity_vector_length):
# read the mappings from file
- title_to_id = get_entity_to_id(entity_def_input)
- id_to_descr = get_id_to_description(entity_descr_path)
+ title_to_id = io.read_title_to_id(entity_def_path)
+ id_to_descr = io.read_id_to_descr(entity_descr_path)
# check the length of the nlp vectors
if "vectors" in nlp.meta and nlp.vocab.vectors.size:
@@ -45,10 +46,8 @@ def create_kb(
" cf. https://spacy.io/usage/models#languages."
)
- logger.info("Get entity frequencies")
- entity_frequencies = wp.get_all_frequencies(count_input=count_input)
-
logger.info("Filtering entities with fewer than {} mentions".format(min_entity_freq))
+ entity_frequencies = io.read_entity_to_count(entity_freq_path)
# filter the entities for in the KB by frequency, because there's just too much data (8M entities) otherwise
filtered_title_to_id, entity_list, description_list, frequency_list = get_filtered_entities(
title_to_id,
@@ -56,36 +55,33 @@ def create_kb(
entity_frequencies,
min_entity_freq
)
- logger.info("Left with {} entities".format(len(description_list)))
+ logger.info("Kept {} entities from the set of {}".format(len(description_list), len(title_to_id.keys())))
- logger.info("Train entity encoder")
+ logger.info("Training entity encoder")
encoder = EntityEncoder(nlp, input_dim, entity_vector_length)
encoder.train(description_list=description_list, to_print=True)
- logger.info("Get entity embeddings:")
+ logger.info("Getting entity embeddings")
embeddings = encoder.apply_encoder(description_list)
logger.info("Adding {} entities".format(len(entity_list)))
kb.set_entities(
entity_list=entity_list, freq_list=frequency_list, vector_list=embeddings
)
+ return entity_list, filtered_title_to_id
- logger.info("Adding aliases")
+
+def _define_aliases(kb, entity_alias_path, entity_list, filtered_title_to_id, max_entities_per_alias, min_occ, prior_prob_path):
+ logger.info("Adding aliases from Wikipedia and Wikidata")
_add_aliases(
kb,
+ entity_list=entity_list,
title_to_id=filtered_title_to_id,
max_entities_per_alias=max_entities_per_alias,
min_occ=min_occ,
- prior_prob_input=prior_prob_input,
+ prior_prob_path=prior_prob_path,
)
- logger.info("KB size: {} entities, {} aliases".format(
- kb.get_size_entities(),
- kb.get_size_aliases()))
-
- logger.info("Done with kb")
- return kb
-
def get_filtered_entities(title_to_id, id_to_descr, entity_frequencies,
min_entity_freq: int = 10):
@@ -104,34 +100,13 @@ def get_filtered_entities(title_to_id, id_to_descr, entity_frequencies,
return filtered_title_to_id, entity_list, description_list, frequency_list
-def get_entity_to_id(entity_def_output):
- entity_to_id = dict()
- with entity_def_output.open("r", encoding="utf8") as csvfile:
- csvreader = csv.reader(csvfile, delimiter="|")
- # skip header
- next(csvreader)
- for row in csvreader:
- entity_to_id[row[0]] = row[1]
- return entity_to_id
-
-
-def get_id_to_description(entity_descr_path):
- id_to_desc = dict()
- with entity_descr_path.open("r", encoding="utf8") as csvfile:
- csvreader = csv.reader(csvfile, delimiter="|")
- # skip header
- next(csvreader)
- for row in csvreader:
- id_to_desc[row[0]] = row[1]
- return id_to_desc
-
-
-def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_input):
+def _add_aliases(kb, entity_list, title_to_id, max_entities_per_alias, min_occ, prior_prob_path):
wp_titles = title_to_id.keys()
# adding aliases with prior probabilities
# we can read this file sequentially, it's sorted by alias, and then by count
- with prior_prob_input.open("r", encoding="utf8") as prior_file:
+ logger.info("Adding WP aliases")
+ with prior_prob_path.open("r", encoding="utf8") as prior_file:
# skip header
prior_file.readline()
line = prior_file.readline()
@@ -180,10 +155,7 @@ def _add_aliases(kb, title_to_id, max_entities_per_alias, min_occ, prior_prob_in
line = prior_file.readline()
-def read_nlp_kb(model_dir, kb_file):
- nlp = spacy.load(model_dir)
+def read_kb(nlp, kb_file):
kb = KnowledgeBase(vocab=nlp.vocab)
kb.load_bulk(kb_file)
- logger.info("kb entities: {}".format(kb.get_size_entities()))
- logger.info("kb aliases: {}".format(kb.get_size_aliases()))
- return nlp, kb
+ return kb
diff --git a/bin/wiki_entity_linking/train_descriptions.py b/bin/wiki_entity_linking/train_descriptions.py
index 2cb66909f..af08d6b8f 100644
--- a/bin/wiki_entity_linking/train_descriptions.py
+++ b/bin/wiki_entity_linking/train_descriptions.py
@@ -53,7 +53,7 @@ class EntityEncoder:
start = start + batch_size
stop = min(stop + batch_size, len(description_list))
- logger.info("encoded: {} entities".format(stop))
+ logger.info("Encoded: {} entities".format(stop))
return encodings
@@ -62,7 +62,7 @@ class EntityEncoder:
if to_print:
logger.info(
"Trained entity descriptions on {} ".format(processed) +
- "(non-unique) entities across {} ".format(self.epochs) +
+ "(non-unique) descriptions across {} ".format(self.epochs) +
"epochs"
)
logger.info("Final loss: {}".format(loss))
diff --git a/bin/wiki_entity_linking/training_set_creator.py b/bin/wiki_entity_linking/training_set_creator.py
deleted file mode 100644
index 3f42f8bdd..000000000
--- a/bin/wiki_entity_linking/training_set_creator.py
+++ /dev/null
@@ -1,395 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import logging
-import random
-import re
-import bz2
-import json
-
-from functools import partial
-
-from spacy.gold import GoldParse
-from bin.wiki_entity_linking import kb_creator
-
-"""
-Process Wikipedia interlinks to generate a training dataset for the EL algorithm.
-Gold-standard entities are stored in one file in standoff format (by character offset).
-"""
-
-ENTITY_FILE = "gold_entities.csv"
-logger = logging.getLogger(__name__)
-
-
-def create_training_examples_and_descriptions(wikipedia_input,
- entity_def_input,
- description_output,
- training_output,
- parse_descriptions,
- limit=None):
- wp_to_id = kb_creator.get_entity_to_id(entity_def_input)
- _process_wikipedia_texts(wikipedia_input,
- wp_to_id,
- description_output,
- training_output,
- parse_descriptions,
- limit)
-
-
-def _process_wikipedia_texts(wikipedia_input,
- wp_to_id,
- output,
- training_output,
- parse_descriptions,
- limit=None):
- """
- Read the XML wikipedia data to parse out training data:
- raw text data + positive instances
- """
- title_regex = re.compile(r"(?<=
).*(?=)")
- id_regex = re.compile(r"(?<=)\d*(?=)")
-
- read_ids = set()
-
- with output.open("a", encoding="utf8") as descr_file, training_output.open("w", encoding="utf8") as entity_file:
- if parse_descriptions:
- _write_training_description(descr_file, "WD_id", "description")
- with bz2.open(wikipedia_input, mode="rb") as file:
- article_count = 0
- article_text = ""
- article_title = None
- article_id = None
- reading_text = False
- reading_revision = False
-
- logger.info("Processed {} articles".format(article_count))
-
- for line in file:
- clean_line = line.strip().decode("utf-8")
-
- if clean_line == "":
- reading_revision = True
- elif clean_line == "":
- reading_revision = False
-
- # Start reading new page
- if clean_line == "":
- article_text = ""
- article_title = None
- article_id = None
- # finished reading this page
- elif clean_line == "":
- if article_id:
- clean_text, entities = _process_wp_text(
- article_title,
- article_text,
- wp_to_id
- )
- if clean_text is not None and entities is not None:
- _write_training_entities(entity_file,
- article_id,
- clean_text,
- entities)
-
- if article_title in wp_to_id and parse_descriptions:
- description = " ".join(clean_text[:1000].split(" ")[:-1])
- _write_training_description(
- descr_file,
- wp_to_id[article_title],
- description
- )
- article_count += 1
- if article_count % 10000 == 0:
- logger.info("Processed {} articles".format(article_count))
- if limit and article_count >= limit:
- break
- article_text = ""
- article_title = None
- article_id = None
- reading_text = False
- reading_revision = False
-
- # start reading text within a page
- if ").*(?=")
- clean_text = clean_text.replace(r""", '"')
- clean_text = clean_text.replace(r" ", " ")
- clean_text = clean_text.replace(r"&", "&")
-
- # remove multiple spaces
- while " " in clean_text:
- clean_text = clean_text.replace(" ", " ")
-
- return clean_text.strip()
-
-
-def _remove_links(clean_text, wp_to_id):
- # read the text char by char to get the right offsets for the interwiki links
- entities = []
- final_text = ""
- open_read = 0
- reading_text = True
- reading_entity = False
- reading_mention = False
- reading_special_case = False
- entity_buffer = ""
- mention_buffer = ""
- for index, letter in enumerate(clean_text):
- if letter == "[":
- open_read += 1
- elif letter == "]":
- open_read -= 1
- elif letter == "|":
- if reading_text:
- final_text += letter
- # switch from reading entity to mention in the [[entity|mention]] pattern
- elif reading_entity:
- reading_text = False
- reading_entity = False
- reading_mention = True
- else:
- reading_special_case = True
- else:
- if reading_entity:
- entity_buffer += letter
- elif reading_mention:
- mention_buffer += letter
- elif reading_text:
- final_text += letter
- else:
- raise ValueError("Not sure at point", clean_text[index - 2: index + 2])
-
- if open_read > 2:
- reading_special_case = True
-
- if open_read == 2 and reading_text:
- reading_text = False
- reading_entity = True
- reading_mention = False
-
- # we just finished reading an entity
- if open_read == 0 and not reading_text:
- if "#" in entity_buffer or entity_buffer.startswith(":"):
- reading_special_case = True
- # Ignore cases with nested structures like File: handles etc
- if not reading_special_case:
- if not mention_buffer:
- mention_buffer = entity_buffer
- start = len(final_text)
- end = start + len(mention_buffer)
- qid = wp_to_id.get(entity_buffer, None)
- if qid:
- entities.append((mention_buffer, qid, start, end))
- final_text += mention_buffer
-
- entity_buffer = ""
- mention_buffer = ""
-
- reading_text = True
- reading_entity = False
- reading_mention = False
- reading_special_case = False
- return final_text, entities
-
-
-def _write_training_description(outputfile, qid, description):
- if description is not None:
- line = str(qid) + "|" + description + "\n"
- outputfile.write(line)
-
-
-def _write_training_entities(outputfile, article_id, clean_text, entities):
- entities_data = [{"alias": ent[0], "entity": ent[1], "start": ent[2], "end": ent[3]} for ent in entities]
- line = json.dumps(
- {
- "article_id": article_id,
- "clean_text": clean_text,
- "entities": entities_data
- },
- ensure_ascii=False) + "\n"
- outputfile.write(line)
-
-
-def read_training(nlp, entity_file_path, dev, limit, kb):
- """ This method provides training examples that correspond to the entity annotations found by the nlp object.
- For training, it will include negative training examples by using the candidate generator,
- and it will only keep positive training examples that can be found by using the candidate generator.
- For testing, it will include all positive examples only."""
-
- from tqdm import tqdm
- data = []
- num_entities = 0
- get_gold_parse = partial(_get_gold_parse, dev=dev, kb=kb)
-
- logger.info("Reading {} data with limit {}".format('dev' if dev else 'train', limit))
- with entity_file_path.open("r", encoding="utf8") as file:
- with tqdm(total=limit, leave=False) as pbar:
- for i, line in enumerate(file):
- example = json.loads(line)
- article_id = example["article_id"]
- clean_text = example["clean_text"]
- entities = example["entities"]
-
- if dev != is_dev(article_id) or len(clean_text) >= 30000:
- continue
-
- doc = nlp(clean_text)
- gold = get_gold_parse(doc, entities)
- if gold and len(gold.links) > 0:
- data.append((doc, gold))
- num_entities += len(gold.links)
- pbar.update(len(gold.links))
- if limit and num_entities >= limit:
- break
- logger.info("Read {} entities in {} articles".format(num_entities, len(data)))
- return data
-
-
-def _get_gold_parse(doc, entities, dev, kb):
- gold_entities = {}
- tagged_ent_positions = set(
- [(ent.start_char, ent.end_char) for ent in doc.ents]
- )
-
- for entity in entities:
- entity_id = entity["entity"]
- alias = entity["alias"]
- start = entity["start"]
- end = entity["end"]
-
- candidates = kb.get_candidates(alias)
- candidate_ids = [
- c.entity_ for c in candidates
- ]
-
- should_add_ent = (
- dev or
- (
- (start, end) in tagged_ent_positions and
- entity_id in candidate_ids and
- len(candidates) > 1
- )
- )
-
- if should_add_ent:
- value_by_id = {entity_id: 1.0}
- if not dev:
- random.shuffle(candidate_ids)
- value_by_id.update({
- kb_id: 0.0
- for kb_id in candidate_ids
- if kb_id != entity_id
- })
- gold_entities[(start, end)] = value_by_id
-
- return GoldParse(doc, links=gold_entities)
-
-
-def is_dev(article_id):
- return article_id.endswith("3")
diff --git a/bin/wiki_entity_linking/wiki_io.py b/bin/wiki_entity_linking/wiki_io.py
new file mode 100644
index 000000000..43ae87f0f
--- /dev/null
+++ b/bin/wiki_entity_linking/wiki_io.py
@@ -0,0 +1,127 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import sys
+import csv
+
+# min() needed to prevent error on windows, cf https://stackoverflow.com/questions/52404416/
+csv.field_size_limit(min(sys.maxsize, 2147483646))
+
+""" This class provides reading/writing methods for temp files """
+
+
+# Entity definition: WP title -> WD ID #
+def write_title_to_id(entity_def_output, title_to_id):
+ with entity_def_output.open("w", encoding="utf8") as id_file:
+ id_file.write("WP_title" + "|" + "WD_id" + "\n")
+ for title, qid in title_to_id.items():
+ id_file.write(title + "|" + str(qid) + "\n")
+
+
+def read_title_to_id(entity_def_output):
+ title_to_id = dict()
+ with entity_def_output.open("r", encoding="utf8") as id_file:
+ csvreader = csv.reader(id_file, delimiter="|")
+ # skip header
+ next(csvreader)
+ for row in csvreader:
+ title_to_id[row[0]] = row[1]
+ return title_to_id
+
+
+# Entity aliases from WD: WD ID -> WD alias #
+def write_id_to_alias(entity_alias_path, id_to_alias):
+ with entity_alias_path.open("w", encoding="utf8") as alias_file:
+ alias_file.write("WD_id" + "|" + "alias" + "\n")
+ for qid, alias_list in id_to_alias.items():
+ for alias in alias_list:
+ alias_file.write(str(qid) + "|" + alias + "\n")
+
+
+def read_id_to_alias(entity_alias_path):
+ id_to_alias = dict()
+ with entity_alias_path.open("r", encoding="utf8") as alias_file:
+ csvreader = csv.reader(alias_file, delimiter="|")
+ # skip header
+ next(csvreader)
+ for row in csvreader:
+ qid = row[0]
+ alias = row[1]
+ alias_list = id_to_alias.get(qid, [])
+ alias_list.append(alias)
+ id_to_alias[qid] = alias_list
+ return id_to_alias
+
+
+def read_alias_to_id_generator(entity_alias_path):
+ """ Read (aliases, qid) tuples """
+
+ with entity_alias_path.open("r", encoding="utf8") as alias_file:
+ csvreader = csv.reader(alias_file, delimiter="|")
+ # skip header
+ next(csvreader)
+ for row in csvreader:
+ qid = row[0]
+ alias = row[1]
+ yield alias, qid
+
+
+# Entity descriptions from WD: WD ID -> WD description #
+def write_id_to_descr(entity_descr_output, id_to_descr):
+ with entity_descr_output.open("w", encoding="utf8") as descr_file:
+ descr_file.write("WD_id" + "|" + "description" + "\n")
+ for qid, descr in id_to_descr.items():
+ descr_file.write(str(qid) + "|" + descr + "\n")
+
+
+def read_id_to_descr(entity_desc_path):
+ id_to_desc = dict()
+ with entity_desc_path.open("r", encoding="utf8") as descr_file:
+ csvreader = csv.reader(descr_file, delimiter="|")
+ # skip header
+ next(csvreader)
+ for row in csvreader:
+ id_to_desc[row[0]] = row[1]
+ return id_to_desc
+
+
+# Entity counts from WP: WP title -> count #
+def write_entity_to_count(prior_prob_input, count_output):
+ # Write entity counts for quick access later
+ entity_to_count = dict()
+ total_count = 0
+
+ with prior_prob_input.open("r", encoding="utf8") as prior_file:
+ # skip header
+ prior_file.readline()
+ line = prior_file.readline()
+
+ while line:
+ splits = line.replace("\n", "").split(sep="|")
+ # alias = splits[0]
+ count = int(splits[1])
+ entity = splits[2]
+
+ current_count = entity_to_count.get(entity, 0)
+ entity_to_count[entity] = current_count + count
+
+ total_count += count
+
+ line = prior_file.readline()
+
+ with count_output.open("w", encoding="utf8") as entity_file:
+ entity_file.write("entity" + "|" + "count" + "\n")
+ for entity, count in entity_to_count.items():
+ entity_file.write(entity + "|" + str(count) + "\n")
+
+
+def read_entity_to_count(count_input):
+ entity_to_count = dict()
+ with count_input.open("r", encoding="utf8") as csvfile:
+ csvreader = csv.reader(csvfile, delimiter="|")
+ # skip header
+ next(csvreader)
+ for row in csvreader:
+ entity_to_count[row[0]] = int(row[1])
+
+ return entity_to_count
diff --git a/bin/wiki_entity_linking/wiki_namespaces.py b/bin/wiki_entity_linking/wiki_namespaces.py
new file mode 100644
index 000000000..e8f099ccd
--- /dev/null
+++ b/bin/wiki_entity_linking/wiki_namespaces.py
@@ -0,0 +1,128 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+# List of meta pages in Wikidata that should be kept out of the knowledge base
+WD_META_ITEMS = [
+ "Q163875",
+ "Q191780",
+ "Q224414",
+ "Q4167836",
+ "Q4167410",
+ "Q4663903",
+ "Q11266439",
+ "Q13406463",
+ "Q15407973",
+ "Q18616576",
+ "Q19887878",
+ "Q22808320",
+ "Q23894233",
+ "Q33120876",
+ "Q42104522",
+ "Q47460393",
+ "Q64875536",
+ "Q66480449",
+]
+
+
+# TODO: add more cases from non-English WP's
+
+# List of prefixes that refer to Wikipedia "file" pages
+WP_FILE_NAMESPACE = ["Bestand", "File"]
+
+# List of prefixes that refer to Wikipedia "category" pages
+WP_CATEGORY_NAMESPACE = ["Kategori", "Category", "Categorie"]
+
+# List of prefixes that refer to Wikipedia "meta" pages
+# these will/should be matched ignoring case
+WP_META_NAMESPACE = (
+ WP_FILE_NAMESPACE
+ + WP_CATEGORY_NAMESPACE
+ + [
+ "b",
+ "betawikiversity",
+ "Book",
+ "c",
+ "Commons",
+ "d",
+ "dbdump",
+ "download",
+ "Draft",
+ "Education",
+ "Foundation",
+ "Gadget",
+ "Gadget definition",
+ "Gebruiker",
+ "gerrit",
+ "Help",
+ "Image",
+ "Incubator",
+ "m",
+ "mail",
+ "mailarchive",
+ "media",
+ "MediaWiki",
+ "MediaWiki talk",
+ "Mediawikiwiki",
+ "MediaZilla",
+ "Meta",
+ "Metawikipedia",
+ "Module",
+ "mw",
+ "n",
+ "nost",
+ "oldwikisource",
+ "otrs",
+ "OTRSwiki",
+ "Overleg gebruiker",
+ "outreach",
+ "outreachwiki",
+ "Portal",
+ "phab",
+ "Phabricator",
+ "Project",
+ "q",
+ "quality",
+ "rev",
+ "s",
+ "spcom",
+ "Special",
+ "species",
+ "Strategy",
+ "sulutil",
+ "svn",
+ "Talk",
+ "Template",
+ "Template talk",
+ "Testwiki",
+ "ticket",
+ "TimedText",
+ "Toollabs",
+ "tools",
+ "tswiki",
+ "User",
+ "User talk",
+ "v",
+ "voy",
+ "w",
+ "Wikibooks",
+ "Wikidata",
+ "wikiHow",
+ "Wikinvest",
+ "wikilivres",
+ "Wikimedia",
+ "Wikinews",
+ "Wikipedia",
+ "Wikipedia talk",
+ "Wikiquote",
+ "Wikisource",
+ "Wikispecies",
+ "Wikitech",
+ "Wikiversity",
+ "Wikivoyage",
+ "wikt",
+ "wiktionary",
+ "wmf",
+ "wmania",
+ "WP",
+ ]
+)
diff --git a/bin/wiki_entity_linking/wikidata_pretrain_kb.py b/bin/wiki_entity_linking/wikidata_pretrain_kb.py
index 28650f039..940607b72 100644
--- a/bin/wiki_entity_linking/wikidata_pretrain_kb.py
+++ b/bin/wiki_entity_linking/wikidata_pretrain_kb.py
@@ -18,11 +18,12 @@ from pathlib import Path
import plac
from bin.wiki_entity_linking import wikipedia_processor as wp, wikidata_processor as wd
+from bin.wiki_entity_linking import wiki_io as io
from bin.wiki_entity_linking import kb_creator
-from bin.wiki_entity_linking import training_set_creator
from bin.wiki_entity_linking import TRAINING_DATA_FILE, KB_FILE, ENTITY_DESCR_PATH, KB_MODEL_DIR, LOG_FORMAT
-from bin.wiki_entity_linking import ENTITY_FREQ_PATH, PRIOR_PROB_PATH, ENTITY_DEFS_PATH
+from bin.wiki_entity_linking import ENTITY_FREQ_PATH, PRIOR_PROB_PATH, ENTITY_DEFS_PATH, ENTITY_ALIAS_PATH
import spacy
+from bin.wiki_entity_linking.kb_creator import read_kb
logger = logging.getLogger(__name__)
@@ -39,9 +40,11 @@ logger = logging.getLogger(__name__)
loc_prior_prob=("Location to file with prior probabilities", "option", "p", Path),
loc_entity_defs=("Location to file with entity definitions", "option", "d", Path),
loc_entity_desc=("Location to file with entity descriptions", "option", "s", Path),
- descriptions_from_wikipedia=("Flag for using wp descriptions not wd", "flag", "wp"),
- limit=("Optional threshold to limit lines read from dumps", "option", "l", int),
- lang=("Optional language for which to get wikidata titles. Defaults to 'en'", "option", "la", str),
+ descr_from_wp=("Flag for using wp descriptions not wd", "flag", "wp"),
+ limit_prior=("Threshold to limit lines read from WP for prior probabilities", "option", "lp", int),
+ limit_train=("Threshold to limit lines read from WP for training set", "option", "lt", int),
+ limit_wd=("Threshold to limit lines read from WD", "option", "lw", int),
+ lang=("Optional language for which to get Wikidata titles. Defaults to 'en'", "option", "la", str),
)
def main(
wd_json,
@@ -54,13 +57,16 @@ def main(
entity_vector_length=64,
loc_prior_prob=None,
loc_entity_defs=None,
+ loc_entity_alias=None,
loc_entity_desc=None,
- descriptions_from_wikipedia=False,
- limit=None,
+ descr_from_wp=False,
+ limit_prior=None,
+ limit_train=None,
+ limit_wd=None,
lang="en",
):
-
entity_defs_path = loc_entity_defs if loc_entity_defs else output_dir / ENTITY_DEFS_PATH
+ entity_alias_path = loc_entity_alias if loc_entity_alias else output_dir / ENTITY_ALIAS_PATH
entity_descr_path = loc_entity_desc if loc_entity_desc else output_dir / ENTITY_DESCR_PATH
entity_freq_path = output_dir / ENTITY_FREQ_PATH
prior_prob_path = loc_prior_prob if loc_prior_prob else output_dir / PRIOR_PROB_PATH
@@ -69,15 +75,12 @@ def main(
logger.info("Creating KB with Wikipedia and WikiData")
- if limit is not None:
- logger.warning("Warning: reading only {} lines of Wikipedia/Wikidata dumps.".format(limit))
-
# STEP 0: set up IO
if not output_dir.exists():
output_dir.mkdir(parents=True)
- # STEP 1: create the NLP object
- logger.info("STEP 1: Loading model {}".format(model))
+ # STEP 1: Load the NLP object
+ logger.info("STEP 1: Loading NLP model {}".format(model))
nlp = spacy.load(model)
# check the length of the nlp vectors
@@ -90,62 +93,83 @@ def main(
# STEP 2: create prior probabilities from WP
if not prior_prob_path.exists():
# It takes about 2h to process 1000M lines of Wikipedia XML dump
- logger.info("STEP 2: writing prior probabilities to {}".format(prior_prob_path))
- wp.read_prior_probs(wp_xml, prior_prob_path, limit=limit)
- logger.info("STEP 2: reading prior probabilities from {}".format(prior_prob_path))
+ logger.info("STEP 2: Writing prior probabilities to {}".format(prior_prob_path))
+ if limit_prior is not None:
+ logger.warning("Warning: reading only {} lines of Wikipedia dump".format(limit_prior))
+ wp.read_prior_probs(wp_xml, prior_prob_path, limit=limit_prior)
+ else:
+ logger.info("STEP 2: Reading prior probabilities from {}".format(prior_prob_path))
- # STEP 3: deduce entity frequencies from WP (takes only a few minutes)
- logger.info("STEP 3: calculating entity frequencies")
- wp.write_entity_counts(prior_prob_path, entity_freq_path, to_print=False)
+ # STEP 3: calculate entity frequencies
+ if not entity_freq_path.exists():
+ logger.info("STEP 3: Calculating and writing entity frequencies to {}".format(entity_freq_path))
+ io.write_entity_to_count(prior_prob_path, entity_freq_path)
+ else:
+ logger.info("STEP 3: Reading entity frequencies from {}".format(entity_freq_path))
# STEP 4: reading definitions and (possibly) descriptions from WikiData or from file
- message = " and descriptions" if not descriptions_from_wikipedia else ""
- if (not entity_defs_path.exists()) or (not descriptions_from_wikipedia and not entity_descr_path.exists()):
+ if (not entity_defs_path.exists()) or (not descr_from_wp and not entity_descr_path.exists()):
# It takes about 10h to process 55M lines of Wikidata JSON dump
- logger.info("STEP 4: parsing wikidata for entity definitions" + message)
- title_to_id, id_to_descr = wd.read_wikidata_entities_json(
+ logger.info("STEP 4: Parsing and writing Wikidata entity definitions to {}".format(entity_defs_path))
+ if limit_wd is not None:
+ logger.warning("Warning: reading only {} lines of Wikidata dump".format(limit_wd))
+ title_to_id, id_to_descr, id_to_alias = wd.read_wikidata_entities_json(
wd_json,
- limit,
+ limit_wd,
to_print=False,
lang=lang,
- parse_descriptions=(not descriptions_from_wikipedia),
+ parse_descr=(not descr_from_wp),
)
- wd.write_entity_files(entity_defs_path, title_to_id)
- if not descriptions_from_wikipedia:
- wd.write_entity_description_files(entity_descr_path, id_to_descr)
- logger.info("STEP 4: read entity definitions" + message)
+ io.write_title_to_id(entity_defs_path, title_to_id)
- # STEP 5: Getting gold entities from wikipedia
- message = " and descriptions" if descriptions_from_wikipedia else ""
- if (not training_entities_path.exists()) or (descriptions_from_wikipedia and not entity_descr_path.exists()):
- logger.info("STEP 5: parsing wikipedia for gold entities" + message)
- training_set_creator.create_training_examples_and_descriptions(
- wp_xml,
- entity_defs_path,
- entity_descr_path,
- training_entities_path,
- parse_descriptions=descriptions_from_wikipedia,
- limit=limit,
- )
- logger.info("STEP 5: read gold entities" + message)
+ logger.info("STEP 4b: Writing Wikidata entity aliases to {}".format(entity_alias_path))
+ io.write_id_to_alias(entity_alias_path, id_to_alias)
+
+ if not descr_from_wp:
+ logger.info("STEP 4c: Writing Wikidata entity descriptions to {}".format(entity_descr_path))
+ io.write_id_to_descr(entity_descr_path, id_to_descr)
+ else:
+ logger.info("STEP 4: Reading entity definitions from {}".format(entity_defs_path))
+ logger.info("STEP 4b: Reading entity aliases from {}".format(entity_alias_path))
+ if not descr_from_wp:
+ logger.info("STEP 4c: Reading entity descriptions from {}".format(entity_descr_path))
+
+ # STEP 5: Getting gold entities from Wikipedia
+ if (not training_entities_path.exists()) or (descr_from_wp and not entity_descr_path.exists()):
+ logger.info("STEP 5: Parsing and writing Wikipedia gold entities to {}".format(training_entities_path))
+ if limit_train is not None:
+ logger.warning("Warning: reading only {} lines of Wikipedia dump".format(limit_train))
+ wp.create_training_and_desc(wp_xml, entity_defs_path, entity_descr_path,
+ training_entities_path, descr_from_wp, limit_train)
+ if descr_from_wp:
+ logger.info("STEP 5b: Parsing and writing Wikipedia descriptions to {}".format(entity_descr_path))
+ else:
+ logger.info("STEP 5: Reading gold entities from {}".format(training_entities_path))
+ if descr_from_wp:
+ logger.info("STEP 5b: Reading entity descriptions from {}".format(entity_descr_path))
# STEP 6: creating the actual KB
# It takes ca. 30 minutes to pretrain the entity embeddings
- logger.info("STEP 6: creating the KB at {}".format(kb_path))
- kb = kb_creator.create_kb(
- nlp=nlp,
- max_entities_per_alias=max_per_alias,
- min_entity_freq=min_freq,
- min_occ=min_pair,
- entity_def_input=entity_defs_path,
- entity_descr_path=entity_descr_path,
- count_input=entity_freq_path,
- prior_prob_input=prior_prob_path,
- entity_vector_length=entity_vector_length,
- )
-
- kb.dump(kb_path)
- nlp.to_disk(output_dir / KB_MODEL_DIR)
+ if not kb_path.exists():
+ logger.info("STEP 6: Creating the KB at {}".format(kb_path))
+ kb = kb_creator.create_kb(
+ nlp=nlp,
+ max_entities_per_alias=max_per_alias,
+ min_entity_freq=min_freq,
+ min_occ=min_pair,
+ entity_def_path=entity_defs_path,
+ entity_descr_path=entity_descr_path,
+ entity_alias_path=entity_alias_path,
+ entity_freq_path=entity_freq_path,
+ prior_prob_path=prior_prob_path,
+ entity_vector_length=entity_vector_length,
+ )
+ kb.dump(kb_path)
+ logger.info("kb entities: {}".format(kb.get_size_entities()))
+ logger.info("kb aliases: {}".format(kb.get_size_aliases()))
+ nlp.to_disk(output_dir / KB_MODEL_DIR)
+ else:
+ logger.info("STEP 6: KB already exists at {}".format(kb_path))
logger.info("Done!")
diff --git a/bin/wiki_entity_linking/wikidata_processor.py b/bin/wiki_entity_linking/wikidata_processor.py
index b4034cb1a..8a070f567 100644
--- a/bin/wiki_entity_linking/wikidata_processor.py
+++ b/bin/wiki_entity_linking/wikidata_processor.py
@@ -1,40 +1,52 @@
# coding: utf-8
from __future__ import unicode_literals
-import gzip
+import bz2
import json
import logging
-import datetime
+
+from bin.wiki_entity_linking.wiki_namespaces import WD_META_ITEMS
logger = logging.getLogger(__name__)
-def read_wikidata_entities_json(wikidata_file, limit=None, to_print=False, lang="en", parse_descriptions=True):
- # Read the JSON wiki data and parse out the entities. Takes about 7u30 to parse 55M lines.
+def read_wikidata_entities_json(wikidata_file, limit=None, to_print=False, lang="en", parse_descr=True):
+ # Read the JSON wiki data and parse out the entities. Takes about 7-10h to parse 55M lines.
# get latest-all.json.bz2 from https://dumps.wikimedia.org/wikidatawiki/entities/
site_filter = '{}wiki'.format(lang)
- # properties filter (currently disabled to get ALL data)
- prop_filter = dict()
- # prop_filter = {'P31': {'Q5', 'Q15632617'}} # currently defined as OR: one property suffices to be selected
+ # negative filter, defined as OR: one matching property value suffices for an item to be removed from further processing
+ exclude_list = WD_META_ITEMS
+
+ # punctuation
+ exclude_list.extend(["Q1383557", "Q10617810"])
+
+ # letters etc
+ exclude_list.extend(["Q188725", "Q19776628", "Q3841820", "Q17907810", "Q9788", "Q9398093"])
+
+ neg_prop_filter = {
+ 'P31': exclude_list, # instance of
+ 'P279': exclude_list # subclass
+ }
title_to_id = dict()
id_to_descr = dict()
+ id_to_alias = dict()
# parse appropriate fields - depending on what we need in the KB
parse_properties = False
parse_sitelinks = True
parse_labels = False
- parse_aliases = False
- parse_claims = False
+ parse_aliases = True
+ parse_claims = True
- with gzip.open(wikidata_file, mode='rb') as file:
+ with bz2.open(wikidata_file, mode='rb') as file:
for cnt, line in enumerate(file):
if limit and cnt >= limit:
break
- if cnt % 500000 == 0:
- logger.info("processed {} lines of WikiData dump".format(cnt))
+ if cnt % 500000 == 0 and cnt > 0:
+ logger.info("processed {} lines of WikiData JSON dump".format(cnt))
clean_line = line.strip()
if clean_line.endswith(b","):
clean_line = clean_line[:-1]
@@ -43,13 +55,11 @@ def read_wikidata_entities_json(wikidata_file, limit=None, to_print=False, lang=
entry_type = obj["type"]
if entry_type == "item":
- # filtering records on their properties (currently disabled to get ALL data)
- # keep = False
keep = True
claims = obj["claims"]
if parse_claims:
- for prop, value_set in prop_filter.items():
+ for prop, value_set in neg_prop_filter.items():
claim_property = claims.get(prop, None)
if claim_property:
for cp in claim_property:
@@ -61,7 +71,7 @@ def read_wikidata_entities_json(wikidata_file, limit=None, to_print=False, lang=
)
cp_rank = cp["rank"]
if cp_rank != "deprecated" and cp_id in value_set:
- keep = True
+ keep = False
if keep:
unique_id = obj["id"]
@@ -108,7 +118,7 @@ def read_wikidata_entities_json(wikidata_file, limit=None, to_print=False, lang=
"label (" + lang + "):", lang_label["value"]
)
- if found_link and parse_descriptions:
+ if found_link and parse_descr:
descriptions = obj["descriptions"]
if descriptions:
lang_descr = descriptions.get(lang, None)
@@ -130,22 +140,15 @@ def read_wikidata_entities_json(wikidata_file, limit=None, to_print=False, lang=
print(
"alias (" + lang + "):", item["value"]
)
+ alias_list = id_to_alias.get(unique_id, [])
+ alias_list.append(item["value"])
+ id_to_alias[unique_id] = alias_list
if to_print:
print()
- return title_to_id, id_to_descr
+ # log final number of lines processed
+ logger.info("Finished. Processed {} lines of WikiData JSON dump".format(cnt))
+ return title_to_id, id_to_descr, id_to_alias
-def write_entity_files(entity_def_output, title_to_id):
- with entity_def_output.open("w", encoding="utf8") as id_file:
- id_file.write("WP_title" + "|" + "WD_id" + "\n")
- for title, qid in title_to_id.items():
- id_file.write(title + "|" + str(qid) + "\n")
-
-
-def write_entity_description_files(entity_descr_output, id_to_descr):
- with entity_descr_output.open("w", encoding="utf8") as descr_file:
- descr_file.write("WD_id" + "|" + "description" + "\n")
- for qid, descr in id_to_descr.items():
- descr_file.write(str(qid) + "|" + descr + "\n")
diff --git a/bin/wiki_entity_linking/wikidata_train_entity_linker.py b/bin/wiki_entity_linking/wikidata_train_entity_linker.py
index ac131e0ef..8635ae547 100644
--- a/bin/wiki_entity_linking/wikidata_train_entity_linker.py
+++ b/bin/wiki_entity_linking/wikidata_train_entity_linker.py
@@ -6,19 +6,19 @@ as created by the script `wikidata_create_kb`.
For the Wikipedia dump: get enwiki-latest-pages-articles-multistream.xml.bz2
from https://dumps.wikimedia.org/enwiki/latest/
-
"""
from __future__ import unicode_literals
import random
import logging
+import spacy
from pathlib import Path
import plac
-from bin.wiki_entity_linking import training_set_creator
+from bin.wiki_entity_linking import wikipedia_processor
from bin.wiki_entity_linking import TRAINING_DATA_FILE, KB_MODEL_DIR, KB_FILE, LOG_FORMAT, OUTPUT_MODEL_DIR
-from bin.wiki_entity_linking.entity_linker_evaluation import measure_performance, measure_baselines
-from bin.wiki_entity_linking.kb_creator import read_nlp_kb
+from bin.wiki_entity_linking.entity_linker_evaluation import measure_performance
+from bin.wiki_entity_linking.kb_creator import read_kb
from spacy.util import minibatch, compounding
@@ -35,6 +35,7 @@ logger = logging.getLogger(__name__)
l2=("L2 regularization", "option", "r", float),
train_inst=("# training instances (default 90% of all)", "option", "t", int),
dev_inst=("# test instances (default 10% of all)", "option", "d", int),
+ labels_discard=("NER labels to discard (default None)", "option", "l", str),
)
def main(
dir_kb,
@@ -46,13 +47,14 @@ def main(
l2=1e-6,
train_inst=None,
dev_inst=None,
+ labels_discard=None
):
logger.info("Creating Entity Linker with Wikipedia and WikiData")
output_dir = Path(output_dir) if output_dir else dir_kb
- training_path = loc_training if loc_training else output_dir / TRAINING_DATA_FILE
+ training_path = loc_training if loc_training else dir_kb / TRAINING_DATA_FILE
nlp_dir = dir_kb / KB_MODEL_DIR
- kb_path = output_dir / KB_FILE
+ kb_path = dir_kb / KB_FILE
nlp_output_dir = output_dir / OUTPUT_MODEL_DIR
# STEP 0: set up IO
@@ -60,38 +62,49 @@ def main(
output_dir.mkdir()
# STEP 1 : load the NLP object
- logger.info("STEP 1: loading model from {}".format(nlp_dir))
- nlp, kb = read_nlp_kb(nlp_dir, kb_path)
+ logger.info("STEP 1a: Loading model from {}".format(nlp_dir))
+ nlp = spacy.load(nlp_dir)
+ logger.info("STEP 1b: Loading KB from {}".format(kb_path))
+ kb = read_kb(nlp, kb_path)
# check that there is a NER component in the pipeline
if "ner" not in nlp.pipe_names:
raise ValueError("The `nlp` object should have a pretrained `ner` component.")
- # STEP 2: create a training dataset from WP
- logger.info("STEP 2: reading training dataset from {}".format(training_path))
+ # STEP 2: read the training dataset previously created from WP
+ logger.info("STEP 2: Reading training dataset from {}".format(training_path))
- train_data = training_set_creator.read_training(
+ if labels_discard:
+ labels_discard = [x.strip() for x in labels_discard.split(",")]
+ logger.info("Discarding {} NER types: {}".format(len(labels_discard), labels_discard))
+ else:
+ labels_discard = []
+
+ train_data = wikipedia_processor.read_training(
nlp=nlp,
entity_file_path=training_path,
dev=False,
limit=train_inst,
kb=kb,
+ labels_discard=labels_discard
)
- # for testing, get all pos instances, whether or not they are in the kb
- dev_data = training_set_creator.read_training(
+ # for testing, get all pos instances (independently of KB)
+ dev_data = wikipedia_processor.read_training(
nlp=nlp,
entity_file_path=training_path,
dev=True,
limit=dev_inst,
- kb=kb,
+ kb=None,
+ labels_discard=labels_discard
)
- # STEP 3: create and train the entity linking pipe
- logger.info("STEP 3: training Entity Linking pipe")
+ # STEP 3: create and train an entity linking pipe
+ logger.info("STEP 3: Creating and training an Entity Linking pipe")
el_pipe = nlp.create_pipe(
- name="entity_linker", config={"pretrained_vectors": nlp.vocab.vectors.name}
+ name="entity_linker", config={"pretrained_vectors": nlp.vocab.vectors.name,
+ "labels_discard": labels_discard}
)
el_pipe.set_kb(kb)
nlp.add_pipe(el_pipe, last=True)
@@ -105,14 +118,9 @@ def main(
logger.info("Training on {} articles".format(len(train_data)))
logger.info("Dev testing on {} articles".format(len(dev_data)))
- dev_baseline_accuracies = measure_baselines(
- dev_data, kb
- )
-
+ # baseline performance on dev data
logger.info("Dev Baseline Accuracies:")
- logger.info(dev_baseline_accuracies.report_accuracy("random"))
- logger.info(dev_baseline_accuracies.report_accuracy("prior"))
- logger.info(dev_baseline_accuracies.report_accuracy("oracle"))
+ measure_performance(dev_data, kb, el_pipe, baseline=True, context=False)
for itn in range(epochs):
random.shuffle(train_data)
@@ -136,18 +144,18 @@ def main(
logger.error("Error updating batch:" + str(e))
if batchnr > 0:
logging.info("Epoch {}, train loss {}".format(itn, round(losses["entity_linker"] / batchnr, 2)))
- measure_performance(dev_data, kb, el_pipe)
+ measure_performance(dev_data, kb, el_pipe, baseline=False, context=True)
# STEP 4: measure the performance of our trained pipe on an independent dev set
- logger.info("STEP 4: performance measurement of Entity Linking pipe")
+ logger.info("STEP 4: Final performance measurement of Entity Linking pipe")
measure_performance(dev_data, kb, el_pipe)
# STEP 5: apply the EL pipe on a toy example
- logger.info("STEP 5: applying Entity Linking to toy example")
+ logger.info("STEP 5: Applying Entity Linking to toy example")
run_el_toy_example(nlp=nlp)
if output_dir:
- # STEP 6: write the NLP pipeline (including entity linker) to file
+ # STEP 6: write the NLP pipeline (now including an EL model) to file
logger.info("STEP 6: Writing trained NLP to {}".format(nlp_output_dir))
nlp.to_disk(nlp_output_dir)
diff --git a/bin/wiki_entity_linking/wikipedia_processor.py b/bin/wiki_entity_linking/wikipedia_processor.py
index 8f928723e..25e914b32 100644
--- a/bin/wiki_entity_linking/wikipedia_processor.py
+++ b/bin/wiki_entity_linking/wikipedia_processor.py
@@ -3,147 +3,104 @@ from __future__ import unicode_literals
import re
import bz2
-import csv
-import datetime
import logging
+import random
+import json
-from bin.wiki_entity_linking import LOG_FORMAT
+from functools import partial
+
+from spacy.gold import GoldParse
+from bin.wiki_entity_linking import wiki_io as io
+from bin.wiki_entity_linking.wiki_namespaces import (
+ WP_META_NAMESPACE,
+ WP_FILE_NAMESPACE,
+ WP_CATEGORY_NAMESPACE,
+)
"""
Process a Wikipedia dump to calculate entity frequencies and prior probabilities in combination with certain mentions.
Write these results to file for downstream KB and training data generation.
+
+Process Wikipedia interlinks to generate a training dataset for the EL algorithm.
"""
+ENTITY_FILE = "gold_entities.csv"
+
map_alias_to_link = dict()
logger = logging.getLogger(__name__)
-
-# these will/should be matched ignoring case
-wiki_namespaces = [
- "b",
- "betawikiversity",
- "Book",
- "c",
- "Category",
- "Commons",
- "d",
- "dbdump",
- "download",
- "Draft",
- "Education",
- "Foundation",
- "Gadget",
- "Gadget definition",
- "gerrit",
- "File",
- "Help",
- "Image",
- "Incubator",
- "m",
- "mail",
- "mailarchive",
- "media",
- "MediaWiki",
- "MediaWiki talk",
- "Mediawikiwiki",
- "MediaZilla",
- "Meta",
- "Metawikipedia",
- "Module",
- "mw",
- "n",
- "nost",
- "oldwikisource",
- "outreach",
- "outreachwiki",
- "otrs",
- "OTRSwiki",
- "Portal",
- "phab",
- "Phabricator",
- "Project",
- "q",
- "quality",
- "rev",
- "s",
- "spcom",
- "Special",
- "species",
- "Strategy",
- "sulutil",
- "svn",
- "Talk",
- "Template",
- "Template talk",
- "Testwiki",
- "ticket",
- "TimedText",
- "Toollabs",
- "tools",
- "tswiki",
- "User",
- "User talk",
- "v",
- "voy",
- "w",
- "Wikibooks",
- "Wikidata",
- "wikiHow",
- "Wikinvest",
- "wikilivres",
- "Wikimedia",
- "Wikinews",
- "Wikipedia",
- "Wikipedia talk",
- "Wikiquote",
- "Wikisource",
- "Wikispecies",
- "Wikitech",
- "Wikiversity",
- "Wikivoyage",
- "wikt",
- "wiktionary",
- "wmf",
- "wmania",
- "WP",
-]
+title_regex = re.compile(r"(?<=<title>).*(?=</title>)")
+id_regex = re.compile(r"(?<=<id>)\d*(?=</id>)")
+text_regex = re.compile(r"(?<=).*(?= 0:
logger.info("processed {} lines of Wikipedia XML dump".format(cnt))
clean_line = line.strip().decode("utf-8")
- aliases, entities, normalizations = get_wp_links(clean_line)
- for alias, entity, norm in zip(aliases, entities, normalizations):
- _store_alias(alias, entity, normalize_alias=norm, normalize_entity=True)
- _store_alias(alias, entity, normalize_alias=norm, normalize_entity=True)
+ # we attempt at reading the article's ID (but not the revision or contributor ID)
+ if "" in clean_line or "" in clean_line:
+ read_id = False
+ if "" in clean_line:
+ read_id = True
+
+ if read_id:
+ ids = id_regex.search(clean_line)
+ if ids:
+ current_article_id = ids[0]
+
+ # only processing prior probabilities from true training (non-dev) articles
+ if not is_dev(current_article_id):
+ aliases, entities, normalizations = get_wp_links(clean_line)
+ for alias, entity, norm in zip(aliases, entities, normalizations):
+ _store_alias(
+ alias, entity, normalize_alias=norm, normalize_entity=True
+ )
line = file.readline()
cnt += 1
logger.info("processed {} lines of Wikipedia XML dump".format(cnt))
+ logger.info("Finished. processed {} lines of Wikipedia XML dump".format(cnt))
# write all aliases and their entities and count occurrences to file
with prior_prob_output.open("w", encoding="utf8") as outputfile:
@@ -182,7 +139,7 @@ def get_wp_links(text):
match = match[2:][:-2].replace("_", " ").strip()
if ns_regex.match(match):
- pass # ignore namespaces at the beginning of the string
+ pass # ignore the entity if it points to a "meta" page
# this is a simple [[link]], with the alias the same as the mention
elif "|" not in match:
@@ -218,47 +175,382 @@ def _capitalize_first(text):
return result
-def write_entity_counts(prior_prob_input, count_output, to_print=False):
- # Write entity counts for quick access later
- entity_to_count = dict()
- total_count = 0
-
- with prior_prob_input.open("r", encoding="utf8") as prior_file:
- # skip header
- prior_file.readline()
- line = prior_file.readline()
-
- while line:
- splits = line.replace("\n", "").split(sep="|")
- # alias = splits[0]
- count = int(splits[1])
- entity = splits[2]
-
- current_count = entity_to_count.get(entity, 0)
- entity_to_count[entity] = current_count + count
-
- total_count += count
-
- line = prior_file.readline()
-
- with count_output.open("w", encoding="utf8") as entity_file:
- entity_file.write("entity" + "|" + "count" + "\n")
- for entity, count in entity_to_count.items():
- entity_file.write(entity + "|" + str(count) + "\n")
-
- if to_print:
- for entity, count in entity_to_count.items():
- print("Entity count:", entity, count)
- print("Total count:", total_count)
+def create_training_and_desc(
+ wp_input, def_input, desc_output, training_output, parse_desc, limit=None
+):
+ wp_to_id = io.read_title_to_id(def_input)
+ _process_wikipedia_texts(
+ wp_input, wp_to_id, desc_output, training_output, parse_desc, limit
+ )
-def get_all_frequencies(count_input):
- entity_to_count = dict()
- with count_input.open("r", encoding="utf8") as csvfile:
- csvreader = csv.reader(csvfile, delimiter="|")
- # skip header
- next(csvreader)
- for row in csvreader:
- entity_to_count[row[0]] = int(row[1])
+def _process_wikipedia_texts(
+ wikipedia_input, wp_to_id, output, training_output, parse_descriptions, limit=None
+):
+ """
+ Read the XML wikipedia data to parse out training data:
+ raw text data + positive instances
+ """
- return entity_to_count
+ read_ids = set()
+
+ with output.open("a", encoding="utf8") as descr_file, training_output.open(
+ "w", encoding="utf8"
+ ) as entity_file:
+ if parse_descriptions:
+ _write_training_description(descr_file, "WD_id", "description")
+ with bz2.open(wikipedia_input, mode="rb") as file:
+ article_count = 0
+ article_text = ""
+ article_title = None
+ article_id = None
+ reading_text = False
+ reading_revision = False
+
+ for line in file:
+ clean_line = line.strip().decode("utf-8")
+
+ if clean_line == "":
+ reading_revision = True
+ elif clean_line == "":
+ reading_revision = False
+
+ # Start reading new page
+ if clean_line == "":
+ article_text = ""
+ article_title = None
+ article_id = None
+ # finished reading this page
+ elif clean_line == "":
+ if article_id:
+ clean_text, entities = _process_wp_text(
+ article_title, article_text, wp_to_id
+ )
+ if clean_text is not None and entities is not None:
+ _write_training_entities(
+ entity_file, article_id, clean_text, entities
+ )
+
+ if article_title in wp_to_id and parse_descriptions:
+ description = " ".join(
+ clean_text[:1000].split(" ")[:-1]
+ )
+ _write_training_description(
+ descr_file, wp_to_id[article_title], description
+ )
+ article_count += 1
+ if article_count % 10000 == 0 and article_count > 0:
+ logger.info(
+ "Processed {} articles".format(article_count)
+ )
+ if limit and article_count >= limit:
+ break
+ article_text = ""
+ article_title = None
+ article_id = None
+ reading_text = False
+ reading_revision = False
+
+ # start reading text within a page
+ if "")
+ clean_text = clean_text.replace(r""", '"')
+ clean_text = clean_text.replace(r" ", " ")
+ clean_text = clean_text.replace(r"&", "&")
+
+ # remove multiple spaces
+ while " " in clean_text:
+ clean_text = clean_text.replace(" ", " ")
+
+ return clean_text.strip()
+
+
+def _remove_links(clean_text, wp_to_id):
+ # read the text char by char to get the right offsets for the interwiki links
+ entities = []
+ final_text = ""
+ open_read = 0
+ reading_text = True
+ reading_entity = False
+ reading_mention = False
+ reading_special_case = False
+ entity_buffer = ""
+ mention_buffer = ""
+ for index, letter in enumerate(clean_text):
+ if letter == "[":
+ open_read += 1
+ elif letter == "]":
+ open_read -= 1
+ elif letter == "|":
+ if reading_text:
+ final_text += letter
+ # switch from reading entity to mention in the [[entity|mention]] pattern
+ elif reading_entity:
+ reading_text = False
+ reading_entity = False
+ reading_mention = True
+ else:
+ reading_special_case = True
+ else:
+ if reading_entity:
+ entity_buffer += letter
+ elif reading_mention:
+ mention_buffer += letter
+ elif reading_text:
+ final_text += letter
+ else:
+ raise ValueError("Not sure at point", clean_text[index - 2 : index + 2])
+
+ if open_read > 2:
+ reading_special_case = True
+
+ if open_read == 2 and reading_text:
+ reading_text = False
+ reading_entity = True
+ reading_mention = False
+
+ # we just finished reading an entity
+ if open_read == 0 and not reading_text:
+ if "#" in entity_buffer or entity_buffer.startswith(":"):
+ reading_special_case = True
+ # Ignore cases with nested structures like File: handles etc
+ if not reading_special_case:
+ if not mention_buffer:
+ mention_buffer = entity_buffer
+ start = len(final_text)
+ end = start + len(mention_buffer)
+ qid = wp_to_id.get(entity_buffer, None)
+ if qid:
+ entities.append((mention_buffer, qid, start, end))
+ final_text += mention_buffer
+
+ entity_buffer = ""
+ mention_buffer = ""
+
+ reading_text = True
+ reading_entity = False
+ reading_mention = False
+ reading_special_case = False
+ return final_text, entities
+
+
+def _write_training_description(outputfile, qid, description):
+ if description is not None:
+ line = str(qid) + "|" + description + "\n"
+ outputfile.write(line)
+
+
+def _write_training_entities(outputfile, article_id, clean_text, entities):
+ entities_data = [
+ {"alias": ent[0], "entity": ent[1], "start": ent[2], "end": ent[3]}
+ for ent in entities
+ ]
+ line = (
+ json.dumps(
+ {
+ "article_id": article_id,
+ "clean_text": clean_text,
+ "entities": entities_data,
+ },
+ ensure_ascii=False,
+ )
+ + "\n"
+ )
+ outputfile.write(line)
+
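A quick illustration of the JSONL record this helper emits per article; the values below are made up and only show the shape of one output line:

import io

buf = io.StringIO()
_write_training_entities(
    buf, "12", "Douglas Adams was an English author.",
    [("Douglas Adams", "Q42", 0, 13)],
)
print(buf.getvalue())
# {"article_id": "12", "clean_text": "Douglas Adams was an English author.",
#  "entities": [{"alias": "Douglas Adams", "entity": "Q42", "start": 0, "end": 13}]}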
+
+def read_training(nlp, entity_file_path, dev, limit, kb, labels_discard=None):
+ """ This method provides training examples that correspond to the entity annotations found by the nlp object.
+ For training, it will include both positive and negative examples by using the candidate generator from the kb.
+ For testing (kb=None), it will include all positive examples only."""
+
+ from tqdm import tqdm
+
+ if not labels_discard:
+ labels_discard = []
+
+ data = []
+ num_entities = 0
+ get_gold_parse = partial(
+ _get_gold_parse, dev=dev, kb=kb, labels_discard=labels_discard
+ )
+
+ logger.info(
+ "Reading {} data with limit {}".format("dev" if dev else "train", limit)
+ )
+ with entity_file_path.open("r", encoding="utf8") as file:
+ with tqdm(total=limit, leave=False) as pbar:
+ for i, line in enumerate(file):
+ example = json.loads(line)
+ article_id = example["article_id"]
+ clean_text = example["clean_text"]
+ entities = example["entities"]
+
+ if dev != is_dev(article_id) or not is_valid_article(clean_text):
+ continue
+
+ doc = nlp(clean_text)
+ gold = get_gold_parse(doc, entities)
+ if gold and len(gold.links) > 0:
+ data.append((doc, gold))
+ num_entities += len(gold.links)
+ pbar.update(len(gold.links))
+ if limit and num_entities >= limit:
+ break
+ logger.info("Read {} entities in {} articles".format(num_entities, len(data)))
+ return data
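A hedged usage sketch: the file path, limit and discarded labels below are illustrative, and `nlp`/`kb` are assumed to be a loaded pipeline and a populated KnowledgeBase from the earlier KB-creation step.

from pathlib import Path

train_data = read_training(
    nlp,
    entity_file_path=Path("output/gold_entities.jsonl"),  # illustrative path
    dev=False,
    limit=10000,
    kb=kb,
    labels_discard=["ORDINAL", "CARDINAL"],
)
# Each item is a (Doc, GoldParse) pair; gold.links maps (start, end) offsets
# to a {kb_id: 0.0 or 1.0} dict of negative and positive candidates.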
+
+
+def _get_gold_parse(doc, entities, dev, kb, labels_discard):
+ gold_entities = {}
+ tagged_ent_positions = {
+ (ent.start_char, ent.end_char): ent
+ for ent in doc.ents
+ if ent.label_ not in labels_discard
+ }
+
+ for entity in entities:
+ entity_id = entity["entity"]
+ alias = entity["alias"]
+ start = entity["start"]
+ end = entity["end"]
+
+ candidate_ids = []
+ if kb and not dev:
+ candidates = kb.get_candidates(alias)
+ candidate_ids = [cand.entity_ for cand in candidates]
+
+ tagged_ent = tagged_ent_positions.get((start, end), None)
+ if tagged_ent:
+ # TODO: check that alias == doc.text[start:end]
+ should_add_ent = (dev or entity_id in candidate_ids) and is_valid_sentence(
+ tagged_ent.sent.text
+ )
+
+ if should_add_ent:
+ value_by_id = {entity_id: 1.0}
+ if not dev:
+ random.shuffle(candidate_ids)
+ value_by_id.update(
+ {kb_id: 0.0 for kb_id in candidate_ids if kb_id != entity_id}
+ )
+ gold_entities[(start, end)] = value_by_id
+
+ return GoldParse(doc, links=gold_entities)
+
+
+def is_dev(article_id):
+ if not article_id:
+ return False
+ return article_id.endswith("3")
+
+
+def is_valid_article(doc_text):
+ # custom length cut-off
+ return 10 < len(doc_text) < 30000
+
+
+def is_valid_sentence(sent_text):
+ if not 10 < len(sent_text) < 3000:
+ # custom length cut-off
+ return False
+
+ if sent_text.strip().startswith("*") or sent_text.strip().startswith("#"):
+ # remove 'enumeration' sentences (occurs often on Wikipedia)
+ return False
+
+ return True
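The train/dev split is a simple heuristic on the Wikipedia article ID: IDs ending in "3" are held out, which yields roughly a 10% dev set. For example:

assert is_dev("143") is True
assert is_dev("144") is False
assert is_dev(None) is False  # articles without an ID always go to train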
diff --git a/examples/information_extraction/entity_relations.py b/examples/information_extraction/entity_relations.py
index 1b3ba1d27..c40a3c10d 100644
--- a/examples/information_extraction/entity_relations.py
+++ b/examples/information_extraction/entity_relations.py
@@ -7,7 +7,7 @@ dependency tree to find the noun phrase they are referring to – for example:
$9.4 million --> Net income.
Compatible with: spaCy v2.0.0+
-Last tested with: v2.1.0
+Last tested with: v2.2.1
"""
from __future__ import unicode_literals, print_function
@@ -38,14 +38,17 @@ def main(model="en_core_web_sm"):
def filter_spans(spans):
# Filter a sequence of spans so they don't contain overlaps
- get_sort_key = lambda span: (span.end - span.start, span.start)
+ # For spaCy 2.1.4+: this function is available as spacy.util.filter_spans()
+ get_sort_key = lambda span: (span.end - span.start, -span.start)
sorted_spans = sorted(spans, key=get_sort_key, reverse=True)
result = []
seen_tokens = set()
for span in sorted_spans:
+ # Check for end - 1 here because boundaries are inclusive
if span.start not in seen_tokens and span.end - 1 not in seen_tokens:
result.append(span)
- seen_tokens.update(range(span.start, span.end))
+ seen_tokens.update(range(span.start, span.end))
+ result = sorted(result, key=lambda span: span.start)
return result
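A small usage sketch of the updated filter_spans behaviour: overlapping candidates are reduced to the longest non-overlapping spans and returned in document order. No model is needed, so a blank pipeline suffices; `spacy.util.filter_spans` (v2.1.4+) behaves the same way as the local helper above.

import spacy
from spacy.util import filter_spans

nlp = spacy.blank("en")
doc = nlp("The New York City subway")
spans = [doc[1:4], doc[1:5], doc[2:3]]  # overlapping candidate spans
print([span.text for span in filter_spans(spans)])
# ['New York City subway']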
diff --git a/examples/keras_parikh_entailment/__main__.py b/examples/keras_parikh_entailment/__main__.py
index b5849f0ca..ad398dae3 100644
--- a/examples/keras_parikh_entailment/__main__.py
+++ b/examples/keras_parikh_entailment/__main__.py
@@ -91,8 +91,8 @@ def demo(shape):
nlp = spacy.load("en_vectors_web_lg")
nlp.add_pipe(KerasSimilarityShim.load(nlp.path / "similarity", nlp, shape[0]))
- doc1 = nlp(u"The king of France is bald.")
- doc2 = nlp(u"France has no king.")
+ doc1 = nlp("The king of France is bald.")
+ doc2 = nlp("France has no king.")
print("Sentence 1:", doc1)
print("Sentence 2:", doc2)
diff --git a/examples/load_from_docbin.py b/examples/load_from_docbin.py
new file mode 100644
index 000000000..f26e7fc49
--- /dev/null
+++ b/examples/load_from_docbin.py
@@ -0,0 +1,45 @@
+# coding: utf-8
+"""
+Example of loading previously parsed text using spaCy's DocBin class. The example
+performs an entity count to show that the annotations are available.
+For more details, see https://spacy.io/usage/saving-loading#docs
+Installation:
+python -m spacy download en_core_web_lg
+Usage:
+python examples/load_from_docbin.py en_core_web_lg RC_2015-03-9.spacy
+"""
+from __future__ import unicode_literals
+
+import spacy
+from spacy.tokens import DocBin
+from timeit import default_timer as timer
+from collections import Counter
+
+EXAMPLE_PARSES_PATH = "RC_2015-03-9.spacy"
+
+
+def main(model="en_core_web_lg", docbin_path=EXAMPLE_PARSES_PATH):
+ nlp = spacy.load(model)
+ print("Reading data from {}".format(docbin_path))
+ with open(docbin_path, "rb") as file_:
+ bytes_data = file_.read()
+ nr_word = 0
+ start_time = timer()
+ entities = Counter()
+ docbin = DocBin().from_bytes(bytes_data)
+ for doc in docbin.get_docs(nlp.vocab):
+ nr_word += len(doc)
+ entities.update((e.label_, e.text) for e in doc.ents)
+ end_time = timer()
+ msg = "Loaded {nr_word} words in {seconds} seconds ({wps} words per second)"
+ wps = nr_word / (end_time - start_time)
+ print(msg.format(nr_word=nr_word, seconds=end_time - start_time, wps=wps))
+ print("Most common entities:")
+ for (label, entity), freq in entities.most_common(30):
+ print(freq, entity, label)
+
+
+if __name__ == "__main__":
+ import plac
+
+ plac.call(main)
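A counterpart sketch showing how a `.spacy` file readable by this script could be produced; the attribute list, texts and file name are illustrative:

import spacy
from spacy.tokens import DocBin

nlp = spacy.load("en_core_web_lg")
doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=False)
for doc in nlp.pipe(["Apple is looking at buying a U.K. startup.", "Another short text."]):
    doc_bin.add(doc)
with open("RC_2015-03-9.spacy", "wb") as file_:
    file_.write(doc_bin.to_bytes())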
diff --git a/examples/training/conllu-config.json b/examples/training/conllu-config.json
new file mode 100644
index 000000000..9a11dd96b
--- /dev/null
+++ b/examples/training/conllu-config.json
@@ -0,0 +1 @@
+{"nr_epoch": 3, "batch_size": 24, "dropout": 0.001, "vectors": 0, "multitask_tag": 0, "multitask_sent": 0}
diff --git a/examples/training/conllu.py b/examples/training/conllu.py
index dfc790456..d9ee721ec 100644
--- a/examples/training/conllu.py
+++ b/examples/training/conllu.py
@@ -13,8 +13,7 @@ import spacy.util
from spacy.tokens import Token, Doc
from spacy.gold import GoldParse
from spacy.syntax.nonproj import projectivize
-from collections import defaultdict, Counter
-from timeit import default_timer as timer
+from collections import defaultdict
from spacy.matcher import Matcher
import itertools
@@ -290,11 +289,6 @@ def get_token_conllu(token, i):
return "\n".join(lines)
-Token.set_extension("get_conllu_lines", method=get_token_conllu)
-Token.set_extension("begins_fused", default=False)
-Token.set_extension("inside_fused", default=False)
-
-
##################
# Initialization #
##################
@@ -381,20 +375,24 @@ class TreebankPaths(object):
@plac.annotations(
ud_dir=("Path to Universal Dependencies corpus", "positional", None, Path),
+ parses_dir=("Directory to write the development parses", "positional", None, Path),
+ config=("Path to json formatted config file", "positional", None, Config.load),
corpus=(
- "UD corpus to train and evaluate on, e.g. en, es_ancora, etc",
+ "UD corpus to train and evaluate on, e.g. UD_Spanish-AnCora",
"positional",
None,
str,
),
- parses_dir=("Directory to write the development parses", "positional", None, Path),
- config=("Path to json formatted config file", "positional", None, Config.load),
limit=("Size limit", "option", "n", int),
)
def main(ud_dir, parses_dir, config, corpus, limit=0):
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
import tqdm
+ Token.set_extension("get_conllu_lines", method=get_token_conllu)
+ Token.set_extension("begins_fused", default=False)
+ Token.set_extension("inside_fused", default=False)
+
paths = TreebankPaths(ud_dir, corpus)
if not (parses_dir / corpus).exists():
(parses_dir / corpus).mkdir()
@@ -403,8 +401,8 @@ def main(ud_dir, parses_dir, config, corpus, limit=0):
docs, golds = read_data(
nlp,
- paths.train.conllu.open(),
- paths.train.text.open(),
+ paths.train.conllu.open(encoding="utf8"),
+ paths.train.text.open(encoding="utf8"),
max_doc_length=config.max_doc_length,
limit=limit,
)
diff --git a/examples/training/ner_multitask_objective.py b/examples/training/ner_multitask_objective.py
index 5d44ed649..4bf7a008f 100644
--- a/examples/training/ner_multitask_objective.py
+++ b/examples/training/ner_multitask_objective.py
@@ -18,19 +18,21 @@ during training. We discard the auxiliary model before run-time.
The specific example here is not necessarily a good idea --- but it shows
how an arbitrary objective function for some word can be used.
-Developed and tested for spaCy 2.0.6
+Developed and tested for spaCy 2.0.6. Updated for v2.2.2
"""
import random
import plac
import spacy
import os.path
+from spacy.tokens import Doc
from spacy.gold import read_json_file, GoldParse
random.seed(0)
PWD = os.path.dirname(__file__)
-TRAIN_DATA = list(read_json_file(os.path.join(PWD, "training-data.json")))
+TRAIN_DATA = list(read_json_file(
+ os.path.join(PWD, "ner_example_data", "ner-sent-per-line.json")))
def get_position_label(i, words, tags, heads, labels, ents):
@@ -55,6 +57,7 @@ def main(n_iter=10):
ner = nlp.create_pipe("ner")
ner.add_multitask_objective(get_position_label)
nlp.add_pipe(ner)
+ print(nlp.pipeline)
print("Create data", len(TRAIN_DATA))
optimizer = nlp.begin_training(get_gold_tuples=lambda: TRAIN_DATA)
@@ -62,23 +65,24 @@ def main(n_iter=10):
random.shuffle(TRAIN_DATA)
losses = {}
for text, annot_brackets in TRAIN_DATA:
- annotations, _ = annot_brackets
- doc = nlp.make_doc(text)
- gold = GoldParse.from_annot_tuples(doc, annotations[0])
- nlp.update(
- [doc], # batch of texts
- [gold], # batch of annotations
- drop=0.2, # dropout - make it harder to memorise data
- sgd=optimizer, # callable to update weights
- losses=losses,
- )
+ for annotations, _ in annot_brackets:
+ doc = Doc(nlp.vocab, words=annotations[1])
+ gold = GoldParse.from_annot_tuples(doc, annotations)
+ nlp.update(
+ [doc], # batch of texts
+ [gold], # batch of annotations
+ drop=0.2, # dropout - make it harder to memorise data
+ sgd=optimizer, # callable to update weights
+ losses=losses,
+ )
print(losses.get("nn_labeller", 0.0), losses["ner"])
# test the trained model
for text, _ in TRAIN_DATA:
- doc = nlp(text)
- print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
- print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
+ if text is not None:
+ doc = nlp(text)
+ print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
+ print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
if __name__ == "__main__":
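To illustrate the idea behind the auxiliary objective, here is an alternative, hypothetical objective with the same signature that labels tokens by coarse orthographic shape; it is not part of the example file, but could be plugged in the same way via `ner.add_multitask_objective(get_shape_label)`:

def get_shape_label(i, words, tags, heads, labels, ents):
    # Return a coarse shape label for token i, used as an auxiliary target.
    word = words[i]
    if word.istitle():
        return "title"
    elif word.isupper():
        return "upper"
    elif word.isdigit():
        return "digit"
    else:
        return "lower"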
diff --git a/examples/training/training-data.json b/examples/training/training-data.json
index 1f57e1fd9..2565ce149 100644
--- a/examples/training/training-data.json
+++ b/examples/training/training-data.json
@@ -8,7 +8,7 @@
{
"tokens": [
{
- "head": 4,
+ "head": 44,
"dep": "prep",
"tag": "IN",
"orth": "In",
diff --git a/netlify.toml b/netlify.toml
index c116eb49b..45bd2c3b6 100644
--- a/netlify.toml
+++ b/netlify.toml
@@ -48,4 +48,6 @@ redirects = [
{from = "/api/sentencesegmenter", to="/api/sentencizer"},
{from = "/universe", to = "/universe/project/:id", query = {id = ":id"}, force = true},
{from = "/universe", to = "/universe/category/:category", query = {category = ":category"}, force = true},
+ # Renamed universe projects
+ {from = "/universe/project/spacy-pytorch-transformers", to = "/universe/project/spacy-transformers", force = true}
]
diff --git a/requirements.txt b/requirements.txt
index 601b73559..12f19bb88 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,15 +1,16 @@
# Our libraries
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
-thinc>=7.1.1,<7.2.0
+thinc>=7.3.0,<7.4.0
blis>=0.4.0,<0.5.0
murmurhash>=0.28.0,<1.1.0
-wasabi>=0.2.0,<1.1.0
+wasabi>=0.4.0,<1.1.0
srsly>=0.1.0,<1.1.0
+catalogue>=0.0.7,<1.1.0
# Third party dependencies
numpy>=1.15.0
requests>=2.13.0,<3.0.0
-plac<1.0.0,>=0.9.6
+plac>=0.9.6,<1.2.0
pathlib==1.0.1; python_version < "3.4"
# Optional dependencies
jsonschema>=2.6.0,<3.1.0
diff --git a/setup.cfg b/setup.cfg
index bcb85eef3..940066a9e 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -22,6 +22,7 @@ classifiers =
Programming Language :: Python :: 3.5
Programming Language :: Python :: 3.6
Programming Language :: Python :: 3.7
+ Programming Language :: Python :: 3.8
Topic :: Scientific/Engineering
[options]
@@ -37,40 +38,38 @@ setup_requires =
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0
- thinc>=7.1.1,<7.2.0
+ thinc>=7.3.0,<7.4.0
install_requires =
- numpy>=1.15.0
+ # Our libraries
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
- thinc>=7.1.1,<7.2.0
+ thinc>=7.3.0,<7.4.0
blis>=0.4.0,<0.5.0
- plac<1.0.0,>=0.9.6
- requests>=2.13.0,<3.0.0
- wasabi>=0.2.0,<1.1.0
+ wasabi>=0.4.0,<1.1.0
srsly>=0.1.0,<1.1.0
+ catalogue>=0.0.7,<1.1.0
+ # Third-party dependencies
+ setuptools
+ numpy>=1.15.0
+ plac>=0.9.6,<1.2.0
+ requests>=2.13.0,<3.0.0
pathlib==1.0.1; python_version < "3.4"
[options.extras_require]
lookups =
    spacy_lookups_data>=0.0.5,<0.2.0
cuda =
- thinc_gpu_ops>=0.0.1,<0.1.0
cupy>=5.0.0b4
cuda80 =
- thinc_gpu_ops>=0.0.1,<0.1.0
cupy-cuda80>=5.0.0b4
cuda90 =
- thinc_gpu_ops>=0.0.1,<0.1.0
cupy-cuda90>=5.0.0b4
cuda91 =
- thinc_gpu_ops>=0.0.1,<0.1.0
cupy-cuda91>=5.0.0b4
cuda92 =
- thinc_gpu_ops>=0.0.1,<0.1.0
cupy-cuda92>=5.0.0b4
cuda100 =
- thinc_gpu_ops>=0.0.1,<0.1.0
cupy-cuda100>=5.0.0b4
# Language tokenizers with external dependencies
ja =
diff --git a/spacy/__init__.py b/spacy/__init__.py
index 9edbab198..4a0d16a49 100644
--- a/spacy/__init__.py
+++ b/spacy/__init__.py
@@ -9,11 +9,14 @@ warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
# These are imported as part of the API
from thinc.neural.util import prefer_gpu, require_gpu
+from . import pipeline
from .cli.info import info as cli_info
from .glossary import explain
from .about import __version__
from .errors import Errors, Warnings, deprecation_warning
from . import util
+from .util import registry
+from .language import component
if sys.maxunicode == 65535:
diff --git a/spacy/__main__.py b/spacy/__main__.py
index 716561566..2c285095e 100644
--- a/spacy/__main__.py
+++ b/spacy/__main__.py
@@ -7,12 +7,10 @@ from __future__ import print_function
if __name__ == "__main__":
import plac
import sys
- from wasabi import Printer
+ from wasabi import msg
from spacy.cli import download, link, info, package, train, pretrain, convert
from spacy.cli import init_model, profile, evaluate, validate, debug_data
- msg = Printer()
-
commands = {
"download": download,
"link": link,
diff --git a/spacy/_ml.py b/spacy/_ml.py
index eea490e68..dff1869d7 100644
--- a/spacy/_ml.py
+++ b/spacy/_ml.py
@@ -7,12 +7,15 @@ from thinc.i2v import HashEmbed, StaticVectors
from thinc.t2t import ExtractWindow, ParametricAttention
from thinc.t2v import Pooling, sum_pool, mean_pool, max_pool
from thinc.misc import Residual
+from thinc.v2v import Model, Maxout, Softmax, Affine, ReLu
+from thinc.t2t import ExtractWindow, ParametricAttention
+from thinc.t2v import Pooling, sum_pool, mean_pool
+from thinc.i2v import HashEmbed
+from thinc.misc import Residual, FeatureExtracter
from thinc.misc import LayerNorm as LN
-from thinc.misc import FeatureExtracter
from thinc.api import add, layerize, chain, clone, concatenate, with_flatten
from thinc.api import with_getitem, flatten_add_lengths
from thinc.api import uniqued, wrap, noop
-from thinc.api import with_square_sequences
from thinc.linear.linear import LinearModel
from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural.util import get_array_module, copy_array, to_categorical
@@ -29,14 +32,13 @@ from .strings import get_string_id
from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE
from .errors import Errors, user_warning, Warnings
from . import util
+from . import ml as new_ml
+from .ml import _legacy_tok2vec
-try:
- import torch.nn
- from thinc.extra.wrappers import PyTorchWrapperRNN
-except ImportError:
- torch = None
VECTORS_KEY = "spacy_pretrained_vectors"
+# Backwards compatibility with <2.2.2
+USE_MODEL_REGISTRY_TOK2VEC = False
def cosine(vec1, vec2):
@@ -314,6 +316,10 @@ def link_vectors_to_models(vocab):
def PyTorchBiLSTM(nO, nI, depth, dropout=0.2):
+ import torch.nn
+ from thinc.api import with_square_sequences
+ from thinc.extra.wrappers import PyTorchWrapperRNN
+
if depth == 0:
return layerize(noop())
model = torch.nn.LSTM(nI, nO // 2, depth, bidirectional=True, dropout=dropout)
@@ -336,161 +342,91 @@ def Tok2Vec_chars_cnn(width, embed_size, **kwargs):
tok2vec.embed = embed
return tok2vec
-def Tok2Vec_chars_selfattention(width, embed_size, **kwargs):
- cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 3)
- sa_depth = kwargs.get("self_attn_depth", 4)
- with Model.define_operators(
- {">>": chain, "|": concatenate, "**": clone, "+": add, "*": reapply}
- ):
- embed = (
- CharacterEmbed(nM=64, nC=8)
- >> with_flatten(LN(Maxout(width, 64*8, pieces=cnn_maxout_pieces))))
- tok2vec = (
- embed
- >> PositionEncode(10000, width)
- >> SelfAttention(width, 1, 4) ** sa_depth
- )
-
- # Work around thinc API limitations :(. TODO: Revise in Thinc 7
- tok2vec.nO = width
- tok2vec.embed = embed
- return tok2vec
-
-
-def Tok2Vec_chars_bilstm(width, embed_size, **kwargs):
- cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 3)
- depth = kwargs.get("bilstm_depth", 2)
- with Model.define_operators(
- {">>": chain, "|": concatenate, "**": clone, "+": add, "*": reapply}
- ):
- embed = (
- CharacterEmbed(nM=64, nC=8)
- >> with_flatten(LN(Maxout(width, 64*8, pieces=cnn_maxout_pieces))))
- tok2vec = (
- embed
- >> Residual(PyTorchBiLSTM(width, width, depth))
- >> with_flatten(LN(nO=width))
- )
- # Work around thinc API limitations :(. TODO: Revise in Thinc 7
- tok2vec.nO = width
- tok2vec.embed = embed
- return tok2vec
-
-
-
-def CNN(width, depth, pieces, nW=1):
- if pieces == 1:
- layer = chain(
- ExtractWindow(nW=nW),
- Mish(width, width*(nW*2+1)),
- LN(nO=width)
- )
- return clone(Residual(layer), depth)
- else:
- layer = chain(
- ExtractWindow(nW=nW),
- LN(Maxout(width, width * (nW*2+1), pieces=pieces)))
- return clone(Residual(layer), depth)
-
-
-def SelfAttention(width, depth, pieces):
- layer = chain(
- prepare_self_attention(Affine(width * 3, width), nM=width, nH=pieces, window=None),
- MultiHeadedAttention(),
- with_flatten(Maxout(width, width)))
- return clone(Residual(chain(layer, with_flatten(LN(nO=width)))), depth)
-
-
-def PositionEncode(L, D):
- positions = NumpyOps().position_encode(L, D)
- positions = Model.ops.asarray(positions)
- def position_encode_forward(Xs, drop=0.):
- output = []
- for x in Xs:
- output.append(x + positions[:x.shape[0]])
- def position_encode_backward(dYs, sgd=None):
- return dYs
- return output, position_encode_backward
- return layerize(position_encode_forward)
-
def Tok2Vec(width, embed_size, **kwargs):
- pretrained_vectors = kwargs.setdefault("pretrained_vectors", None)
- cnn_maxout_pieces = kwargs.setdefault("cnn_maxout_pieces", 3)
- subword_features = kwargs.setdefault("subword_features", True)
- char_embed = kwargs.setdefault("char_embed", False)
- conv_depth = kwargs.setdefault("conv_depth", 4)
- bilstm_depth = kwargs.setdefault("bilstm_depth", 0)
- self_attn_depth = kwargs.setdefault("self_attn_depth", 0)
- conv_window = kwargs.setdefault("conv_window", 1)
- if char_embed and self_attn_depth:
- return Tok2Vec_chars_selfattention(width, embed_size, **kwargs)
- elif char_embed and bilstm_depth:
- return Tok2Vec_chars_bilstm(width, embed_size, **kwargs)
- elif char_embed and conv_depth:
- return Tok2Vec_chars_cnn(width, embed_size, **kwargs)
- cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
- with Model.define_operators(
- {">>": chain, "|": concatenate, "**": clone, "+": add, "*": reapply}
- ):
- norm = HashEmbed(width, embed_size, column=cols.index(NORM), name="embed_norm")
- if subword_features:
- prefix = HashEmbed(
- width, embed_size // 2, column=cols.index(PREFIX), name="embed_prefix"
- )
- suffix = HashEmbed(
- width, embed_size // 2, column=cols.index(SUFFIX), name="embed_suffix"
- )
- shape = HashEmbed(
- width, embed_size // 2, column=cols.index(SHAPE), name="embed_shape"
- )
- else:
- prefix, suffix, shape = (None, None, None)
- if pretrained_vectors is not None:
- glove = StaticVectors(pretrained_vectors, width, column=cols.index(ID))
+ if not USE_MODEL_REGISTRY_TOK2VEC:
+ # Preserve prior tok2vec for backwards compat, in v2.2.2
+ return _legacy_tok2vec.Tok2Vec(width, embed_size, **kwargs)
+ pretrained_vectors = kwargs.get("pretrained_vectors", None)
+ cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 3)
+ subword_features = kwargs.get("subword_features", True)
+ char_embed = kwargs.get("char_embed", False)
+ conv_depth = kwargs.get("conv_depth", 4)
+ bilstm_depth = kwargs.get("bilstm_depth", 0)
+ conv_window = kwargs.get("conv_window", 1)
- if subword_features:
- embed = uniqued(
- (glove | norm | prefix | suffix | shape)
- >> LN(Maxout(width, width * 5, pieces=3)),
- column=cols.index(ORTH),
- )
- else:
- embed = uniqued(
- (glove | norm) >> LN(Maxout(width, width * 2, pieces=3)),
- column=cols.index(ORTH),
- )
- elif subword_features:
- embed = uniqued(
- (norm | prefix | suffix | shape)
- >> LN(Maxout(width, width * 4, pieces=3)),
- column=cols.index(ORTH),
- )
- else:
- embed = norm
+ cols = ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]
- tok2vec = (
- FeatureExtracter(cols)
- >> with_bos_eos(
- with_flatten(
- embed
- >> CNN(width, conv_depth, cnn_maxout_pieces, nW=conv_window),
- pad=conv_depth * conv_window)
- )
- )
-
- if bilstm_depth >= 1:
- tok2vec = (
- tok2vec
- >> Residual(
- PyTorchBiLSTM(width, width, 1)
- >> LN(nO=width)
- ) ** bilstm_depth
- )
- # Work around thinc API limitations :(. TODO: Revise in Thinc 7
- tok2vec.nO = width
- tok2vec.embed = embed
- return tok2vec
+ doc2feats_cfg = {"arch": "spacy.Doc2Feats.v1", "config": {"columns": cols}}
+ if char_embed:
+ embed_cfg = {
+ "arch": "spacy.CharacterEmbed.v1",
+ "config": {
+ "width": 64,
+ "chars": 6,
+ "@mix": {
+ "arch": "spacy.LayerNormalizedMaxout.v1",
+ "config": {"width": width, "pieces": 3},
+ },
+ "@embed_features": None,
+ },
+ }
+ else:
+ embed_cfg = {
+ "arch": "spacy.MultiHashEmbed.v1",
+ "config": {
+ "width": width,
+ "rows": embed_size,
+ "columns": cols,
+ "use_subwords": subword_features,
+ "@pretrained_vectors": None,
+ "@mix": {
+ "arch": "spacy.LayerNormalizedMaxout.v1",
+ "config": {"width": width, "pieces": 3},
+ },
+ },
+ }
+ if pretrained_vectors:
+ embed_cfg["config"]["@pretrained_vectors"] = {
+ "arch": "spacy.PretrainedVectors.v1",
+ "config": {
+ "vectors_name": pretrained_vectors,
+ "width": width,
+ "column": cols.index("ID"),
+ },
+ }
+ if cnn_maxout_pieces >= 2:
+ cnn_cfg = {
+ "arch": "spacy.MaxoutWindowEncoder.v1",
+ "config": {
+ "width": width,
+ "window_size": conv_window,
+ "pieces": cnn_maxout_pieces,
+ "depth": conv_depth,
+ },
+ }
+ else:
+ cnn_cfg = {
+ "arch": "spacy.MishWindowEncoder.v1",
+ "config": {"width": width, "window_size": conv_window, "depth": conv_depth},
+ }
+ bilstm_cfg = {
+ "arch": "spacy.TorchBiLSTMEncoder.v1",
+ "config": {"width": width, "depth": bilstm_depth},
+ }
+ if conv_depth == 0 and bilstm_depth == 0:
+ encode_cfg = {}
+ elif conv_depth >= 1 and bilstm_depth >= 1:
+ encode_cfg = {
+ "arch": "thinc.FeedForward.v1",
+ "config": {"children": [cnn_cfg, bilstm_cfg]},
+ }
+ elif conv_depth >= 1:
+ encode_cfg = cnn_cfg
+ else:
+ encode_cfg = bilstm_cfg
+ config = {"@doc2feats": doc2feats_cfg, "@embed": embed_cfg, "@encode": encode_cfg}
+ return new_ml.Tok2Vec(config)
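The public entry point is unchanged; a sketch of the kwargs that map onto the config dicts above (with the default `USE_MODEL_REGISTRY_TOK2VEC = False` the call still goes through the legacy implementation):

from spacy._ml import Tok2Vec

tok2vec = Tok2Vec(
    width=96,
    embed_size=2000,
    pretrained_vectors=None,  # or the name of a spaCy vectors package
    conv_depth=4,
    bilstm_depth=0,
    cnn_maxout_pieces=3,
)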
def with_bos_eos(layer):
@@ -1101,6 +1037,7 @@ class CharacterEmbed(Model):
return output, backprop_character_embed
def get_characters_loss(ops, docs, prediction, nr_char=10):
target_ids = numpy.vstack([doc.to_utf8_array(nr_char=nr_char) for doc in docs])
target_ids = target_ids.reshape((-1,))
@@ -1112,6 +1049,8 @@ def get_characters_loss(ops, docs, prediction, nr_char=10):
return loss, d_target
def get_cossim_loss(yh, y, ignore_zeros=False):
xp = get_array_module(yh)
# Find the zero vectors
diff --git a/spacy/about.py b/spacy/about.py
index 7834bfd12..c6db9700f 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy"
-__version__ = "2.2.1"
+__version__ = "2.2.2"
__release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
diff --git a/spacy/analysis.py b/spacy/analysis.py
new file mode 100644
index 000000000..761be3de9
--- /dev/null
+++ b/spacy/analysis.py
@@ -0,0 +1,179 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from collections import OrderedDict
+from wasabi import Printer
+
+from .tokens import Doc, Token, Span
+from .errors import Errors, Warnings, user_warning
+
+
+def analyze_pipes(pipeline, name, pipe, index, warn=True):
+ """Analyze a pipeline component with respect to its position in the current
+ pipeline and the other components. Will check whether requirements are
+ fulfilled (e.g. if previous components assign the attributes).
+
+ pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
+ name (unicode): The name of the pipeline component to analyze.
+ pipe (callable): The pipeline component function to analyze.
+ index (int): The index of the component in the pipeline.
+ warn (bool): Show user warning if problem is found.
+ RETURNS (list): The problems found for the given pipeline component.
+ """
+ assert pipeline[index][0] == name
+ prev_pipes = pipeline[:index]
+ pipe_requires = getattr(pipe, "requires", [])
+ requires = OrderedDict([(annot, False) for annot in pipe_requires])
+ if requires:
+ for prev_name, prev_pipe in prev_pipes:
+ prev_assigns = getattr(prev_pipe, "assigns", [])
+ for annot in prev_assigns:
+ requires[annot] = True
+ problems = []
+ for annot, fulfilled in requires.items():
+ if not fulfilled:
+ problems.append(annot)
+ if warn:
+ user_warning(Warnings.W025.format(name=name, attr=annot))
+ return problems
+
+
+def analyze_all_pipes(pipeline, warn=True):
+ """Analyze all pipes in the pipeline in order.
+
+ pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
+ warn (bool): Show user warning if problem is found.
+ RETURNS (dict): The problems found, keyed by component name.
+ """
+ problems = {}
+ for i, (name, pipe) in enumerate(pipeline):
+ problems[name] = analyze_pipes(pipeline, name, pipe, i, warn=warn)
+ return problems
+
+
+def dot_to_dict(values):
+ """Convert dot notation to a dict. For example: ["token.pos", "token._.xyz"]
+ become {"token": {"pos": True, "_": {"xyz": True }}}.
+
+ values (iterable): The values to convert.
+ RETURNS (dict): The converted values.
+ """
+ result = {}
+ for value in values:
+ path = result
+ parts = value.lower().split(".")
+ for i, item in enumerate(parts):
+ is_last = i == len(parts) - 1
+ path = path.setdefault(item, True if is_last else {})
+ return result
+
+
+def validate_attrs(values):
+ """Validate component attributes provided to "assigns", "requires" etc.
+ Raises error for invalid attributes and formatting. Doesn't check if
+ custom extension attributes are registered, since this is something the
+ user might want to do themselves later in the component.
+
+ values (iterable): The string attributes to check, e.g. `["token.pos"]`.
+ RETURNS (iterable): The checked attributes.
+ """
+ data = dot_to_dict(values)
+ objs = {"doc": Doc, "token": Token, "span": Span}
+ for obj_key, attrs in data.items():
+ if obj_key == "span":
+ # Support Span only for custom extension attributes
+ span_attrs = [attr for attr in values if attr.startswith("span.")]
+ span_attrs = [attr for attr in span_attrs if not attr.startswith("span._.")]
+ if span_attrs:
+ raise ValueError(Errors.E180.format(attrs=", ".join(span_attrs)))
+ if obj_key not in objs: # first element is not doc/token/span
+ invalid_attrs = ", ".join(a for a in values if a.startswith(obj_key))
+ raise ValueError(Errors.E181.format(obj=obj_key, attrs=invalid_attrs))
+ if not isinstance(attrs, dict): # attr is something like "doc"
+ raise ValueError(Errors.E182.format(attr=obj_key))
+ for attr, value in attrs.items():
+ if attr == "_":
+ if value is True: # attr is something like "doc._"
+ raise ValueError(Errors.E182.format(attr="{}._".format(obj_key)))
+ for ext_attr, ext_value in value.items():
+ # We don't check whether the attribute actually exists
+ if ext_value is not True: # attr is something like doc._.x.y
+ good = "{}._.{}".format(obj_key, ext_attr)
+ bad = "{}.{}".format(good, ".".join(ext_value))
+ raise ValueError(Errors.E183.format(attr=bad, solution=good))
+ continue # we can't validate those further
+ if attr.endswith("_"): # attr is something like "token.pos_"
+ raise ValueError(Errors.E184.format(attr=attr, solution=attr[:-1]))
+ if value is not True: # attr is something like doc.x.y
+ good = "{}.{}".format(obj_key, attr)
+ bad = "{}.{}".format(good, ".".join(value))
+ raise ValueError(Errors.E183.format(attr=bad, solution=good))
+ obj = objs[obj_key]
+ if not hasattr(obj, attr):
+ raise ValueError(Errors.E185.format(obj=obj_key, attr=attr))
+ return values
+
+
+def _get_feature_for_attr(pipeline, attr, feature):
+ assert feature in ["assigns", "requires"]
+ result = []
+ for pipe_name, pipe in pipeline:
+ pipe_assigns = getattr(pipe, feature, [])
+ if attr in pipe_assigns:
+ result.append((pipe_name, pipe))
+ return result
+
+
+def get_assigns_for_attr(pipeline, attr):
+ """Get all pipeline components that assign an attr, e.g. "doc.tensor".
+
+ pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
+ attr (unicode): The attribute to check.
+ RETURNS (list): (name, pipeline) tuples of components that assign the attr.
+ """
+ return _get_feature_for_attr(pipeline, attr, "assigns")
+
+
+def get_requires_for_attr(pipeline, attr):
+ """Get all pipeline components that require an attr, e.g. "doc.tensor".
+
+ pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
+ attr (unicode): The attribute to check.
+ RETURNS (list): (name, pipeline) tuples of components that require the attr.
+ """
+ return _get_feature_for_attr(pipeline, attr, "requires")
+
+
+def print_summary(nlp, pretty=True, no_print=False):
+ """Print a formatted summary for the current nlp object's pipeline. Shows
+ a table with the pipeline components and what they assign and require, as
+ well as any problems if available.
+
+ nlp (Language): The nlp object.
+ pretty (bool): Pretty-print the results (color etc).
+ no_print (bool): Don't print anything, just return the data.
+ RETURNS (dict): A dict with "overview" and "problems".
+ """
+ msg = Printer(pretty=pretty, no_print=no_print)
+ overview = []
+ problems = {}
+ for i, (name, pipe) in enumerate(nlp.pipeline):
+ requires = getattr(pipe, "requires", [])
+ assigns = getattr(pipe, "assigns", [])
+ retok = getattr(pipe, "retokenizes", False)
+ overview.append((i, name, requires, assigns, retok))
+ problems[name] = analyze_pipes(nlp.pipeline, name, pipe, i, warn=False)
+ msg.divider("Pipeline Overview")
+ header = ("#", "Component", "Requires", "Assigns", "Retokenizes")
+ msg.table(overview, header=header, divider=True, multiline=True)
+ n_problems = sum(len(p) for p in problems.values())
+ if any(p for p in problems.values()):
+ msg.divider("Problems ({})".format(n_problems))
+ for name, problem in problems.items():
+ if problem:
+ problem = ", ".join(problem)
+ msg.warn("'{}' requirements not met: {}".format(name, problem))
+ else:
+ msg.good("No problems found.")
+ if no_print:
+ return {"overview": overview, "problems": problems}
diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index 67f97f632..fa867fa04 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -57,7 +57,8 @@ def convert(
is written to stdout, so you can pipe them forward to a JSON file:
$ spacy convert some_file.conllu > some_file.json
"""
- msg = Printer()
+ no_print = output_dir == "-"
+ msg = Printer(no_print=no_print)
input_path = Path(input_file)
if file_type not in FILE_TYPES:
msg.fail(
@@ -102,6 +103,7 @@ def convert(
use_morphology=morphology,
lang=lang,
model=model,
+ no_print=no_print,
)
if output_dir != "-":
# Export data to a file
diff --git a/spacy/cli/converters/conll_ner2json.py b/spacy/cli/converters/conll_ner2json.py
index a3a37d6c9..46489ad7c 100644
--- a/spacy/cli/converters/conll_ner2json.py
+++ b/spacy/cli/converters/conll_ner2json.py
@@ -9,7 +9,9 @@ from ...tokens.doc import Doc
from ...util import load_model
-def conll_ner2json(input_data, n_sents=10, seg_sents=False, model=None, **kwargs):
+def conll_ner2json(
+ input_data, n_sents=10, seg_sents=False, model=None, no_print=False, **kwargs
+):
"""
Convert files in the CoNLL-2003 NER format and similar
whitespace-separated columns into JSON format for use with train cli.
@@ -34,7 +36,7 @@ def conll_ner2json(input_data, n_sents=10, seg_sents=False, model=None, **kwargs
. O
"""
- msg = Printer()
+ msg = Printer(no_print=no_print)
doc_delimiter = "-DOCSTART- -X- O O"
# check for existing delimiters, which should be preserved
if "\n\n" in input_data and seg_sents:
diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py
index 8f2900a9b..e66a8c50e 100644
--- a/spacy/cli/converters/conllu2json.py
+++ b/spacy/cli/converters/conllu2json.py
@@ -34,6 +34,9 @@ def conllu2json(input_data, n_sents=10, use_morphology=False, lang=None, **_):
doc = create_doc(sentences, i)
docs.append(doc)
sentences = []
+ if sentences:
+ doc = create_doc(sentences, i)
+ docs.append(doc)
return docs
diff --git a/spacy/cli/converters/iob2json.py b/spacy/cli/converters/iob2json.py
index 740f29001..61c398f8d 100644
--- a/spacy/cli/converters/iob2json.py
+++ b/spacy/cli/converters/iob2json.py
@@ -8,7 +8,7 @@ from ...util import minibatch
from .conll_ner2json import n_sents_info
-def iob2json(input_data, n_sents=10, *args, **kwargs):
+def iob2json(input_data, n_sents=10, no_print=False, *args, **kwargs):
"""
Convert IOB files with one sentence per line and tags separated with '|'
into JSON format for use with train cli. IOB and IOB2 are accepted.
@@ -20,7 +20,7 @@ def iob2json(input_data, n_sents=10, *args, **kwargs):
I|PRP|O like|VBP|O London|NNP|I-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O
I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O
"""
- msg = Printer()
+ msg = Printer(no_print=no_print)
docs = read_iob(input_data.split("\n"))
if n_sents > 0:
n_sents_info(msg, n_sents)
diff --git a/spacy/cli/converters/jsonl2json.py b/spacy/cli/converters/jsonl2json.py
index 91dd42982..1c1bc45c7 100644
--- a/spacy/cli/converters/jsonl2json.py
+++ b/spacy/cli/converters/jsonl2json.py
@@ -7,7 +7,7 @@ from ...gold import docs_to_json
from ...util import get_lang_class, minibatch
-def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False):
+def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False, **_):
if lang is None:
raise ValueError("No --lang specified, but tokenization required")
json_docs = []
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index b649e6666..5d044e617 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -121,6 +121,8 @@ def debug_data(
msg.text("{} training docs".format(len(train_docs)))
msg.text("{} evaluation docs".format(len(dev_docs)))
+ if not len(dev_docs):
+ msg.fail("No evaluation docs")
overlap = len(train_texts.intersection(dev_texts))
if overlap:
msg.warn("{} training examples also in evaluation data".format(overlap))
@@ -206,6 +208,9 @@ def debug_data(
missing_values, "value" if missing_values == 1 else "values"
)
)
+ for label in new_labels:
+ if len(label) == 0:
+ msg.fail("Empty label found in new labels")
if new_labels:
labels_with_counts = [
(label, count)
@@ -360,6 +365,16 @@ def debug_data(
)
)
+ # check for documents with multiple sentences
+ sents_per_doc = gold_train_data["n_sents"] / len(gold_train_data["texts"])
+ if sents_per_doc < 1.1:
+ msg.warn(
+ "The training data contains {:.2f} sentences per "
+ "document. When there are very few documents containing more "
+ "than one sentence, the parser will not learn how to segment "
+ "longer texts into sentences.".format(sents_per_doc)
+ )
+
# profile labels
labels_train = [label for label in gold_train_data["deps"]]
labels_train_unpreprocessed = [
diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index 64ab03a75..19f3e7860 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -6,17 +6,13 @@ import requests
import os
import subprocess
import sys
-import pkg_resources
-from wasabi import Printer
+from wasabi import msg
from .link import link
from ..util import get_package_path
from .. import about
-msg = Printer()
-
-
@plac.annotations(
model=("Model to download (shortcut or name)", "positional", None, str),
direct=("Force direct download of name + version", "flag", "d", bool),
@@ -87,6 +83,8 @@ def download(model, direct=False, *pip_args):
def require_package(name):
try:
+ import pkg_resources
+
pkg_resources.working_set.require(name)
return True
except: # noqa: E722
diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py
index 1114ada08..c24e37038 100644
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@@ -3,7 +3,7 @@ from __future__ import unicode_literals, division, print_function
import plac
from timeit import default_timer as timer
-from wasabi import Printer
+from wasabi import msg
from ..gold import GoldCorpus
from .. import util
@@ -32,7 +32,6 @@ def evaluate(
Evaluate a model. To render a sample of parses in a HTML file, set an
output directory as the displacy_path argument.
"""
- msg = Printer()
util.fix_random_seed()
if gpu_id >= 0:
util.use_gpu(gpu_id)
diff --git a/spacy/cli/info.py b/spacy/cli/info.py
index 3655327ef..080d0dc77 100644
--- a/spacy/cli/info.py
+++ b/spacy/cli/info.py
@@ -4,7 +4,7 @@ from __future__ import unicode_literals
import plac
import platform
from pathlib import Path
-from wasabi import Printer
+from wasabi import msg
import srsly
from ..compat import path2str, basestring_, unicode_
@@ -23,7 +23,6 @@ def info(model=None, markdown=False, silent=False):
specified as an argument, print model information. Flag --markdown
prints details in Markdown for easy copy-pasting to GitHub issues.
"""
- msg = Printer()
if model:
if util.is_package(model):
model_path = util.get_package_path(model)
diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py
index c285a12a6..cda21cbcc 100644
--- a/spacy/cli/init_model.py
+++ b/spacy/cli/init_model.py
@@ -11,7 +11,7 @@ import tarfile
import gzip
import zipfile
import srsly
-from wasabi import Printer
+from wasabi import msg
from ..vectors import Vectors
from ..errors import Errors, Warnings, user_warning
@@ -24,7 +24,6 @@ except ImportError:
DEFAULT_OOV_PROB = -20
-msg = Printer()
@plac.annotations(
diff --git a/spacy/cli/link.py b/spacy/cli/link.py
index 6b719ffe6..8117829b5 100644
--- a/spacy/cli/link.py
+++ b/spacy/cli/link.py
@@ -3,7 +3,7 @@ from __future__ import unicode_literals
import plac
from pathlib import Path
-from wasabi import Printer
+from wasabi import msg
from ..compat import symlink_to, path2str
from .. import util
@@ -20,7 +20,6 @@ def link(origin, link_name, force=False, model_path=None):
either the name of a pip package, or the local path to the model data
directory. Linking models allows loading them via spacy.load(link_name).
"""
- msg = Printer()
if util.is_package(origin):
model_path = util.get_package_path(origin)
else:
diff --git a/spacy/cli/package.py b/spacy/cli/package.py
index e99a6d5ff..8ed92259c 100644
--- a/spacy/cli/package.py
+++ b/spacy/cli/package.py
@@ -4,7 +4,7 @@ from __future__ import unicode_literals
import plac
import shutil
from pathlib import Path
-from wasabi import Printer, get_raw_input
+from wasabi import msg, get_raw_input
import srsly
from ..compat import path2str
@@ -27,7 +27,6 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, force=Fals
set and a meta.json already exists in the output directory, the existing
values will be used as the defaults in the command-line prompt.
"""
- msg = Printer()
input_path = util.ensure_path(input_dir)
output_path = util.ensure_path(output_dir)
meta_path = util.ensure_path(meta_path)
diff --git a/spacy/cli/pretrain.py b/spacy/cli/pretrain.py
index ba924f5e8..87910959e 100644
--- a/spacy/cli/pretrain.py
+++ b/spacy/cli/pretrain.py
@@ -133,7 +133,6 @@ def pretrain(
for key in config:
if isinstance(config[key], Path):
config[key] = str(config[key])
- msg = Printer()
util.fix_random_seed(seed)
if gpu_id != -1:
has_gpu = require_gpu(gpu_id=gpu_id)
@@ -272,7 +271,7 @@ def make_update(model, docs, optimizer, drop=0.0, objective="L2"):
"""Perform an update over a single batch of documents.
docs (iterable): A batch of `Doc` objects.
- drop (float): The droput rate.
+ drop (float): The dropout rate.
optimizer (callable): An optimizer.
RETURNS loss: A float for the loss.
"""
diff --git a/spacy/cli/profile.py b/spacy/cli/profile.py
index 201ab13d5..4995224f3 100644
--- a/spacy/cli/profile.py
+++ b/spacy/cli/profile.py
@@ -9,7 +9,7 @@ import pstats
import sys
import itertools
import thinc.extra.datasets
-from wasabi import Printer
+from wasabi import msg
from ..util import load_model
@@ -26,7 +26,6 @@ def profile(model, inputs=None, n_texts=10000):
It can either be provided as a JSONL file, or be read from sys.stdin.
If no input file is specified, the IMDB dataset is loaded via Thinc.
"""
- msg = Printer()
if inputs is not None:
inputs = _read_inputs(inputs, msg)
if inputs is None:
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index 8f93e3d7e..bf31c1d26 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -8,7 +8,7 @@ from thinc.neural._classes.model import Model
from timeit import default_timer as timer
import shutil
import srsly
-from wasabi import Printer
+from wasabi import msg
import contextlib
import random
from thinc.neural.util import require_gpu
@@ -92,7 +92,6 @@ def train(
# temp fix to avoid import issues cf https://github.com/explosion/spaCy/issues/4200
import tqdm
- msg = Printer()
util.fix_random_seed()
util.set_env_log(verbose)
@@ -159,8 +158,7 @@ def train(
"`lang` argument ('{}') ".format(nlp.lang, lang),
exits=1,
)
- other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipeline]
- nlp.disable_pipes(*other_pipes)
+ nlp.disable_pipes([p for p in nlp.pipe_names if p not in pipeline])
for pipe in pipeline:
if pipe not in nlp.pipe_names:
if pipe == "parser":
@@ -266,7 +264,11 @@ def train(
exits=1,
)
train_docs = corpus.train_docs(
- nlp, noise_level=noise_level, gold_preproc=gold_preproc, max_length=0
+ nlp,
+ noise_level=noise_level,
+ gold_preproc=gold_preproc,
+ max_length=0,
+ ignore_misaligned=True,
)
train_labels = set()
if textcat_multilabel:
@@ -347,6 +349,7 @@ def train(
orth_variant_level=orth_variant_level,
gold_preproc=gold_preproc,
max_length=0,
+ ignore_misaligned=True,
)
if raw_text:
random.shuffle(raw_text)
@@ -385,7 +388,11 @@ def train(
if hasattr(component, "cfg"):
component.cfg["beam_width"] = beam_width
dev_docs = list(
- corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc)
+ corpus.dev_docs(
+ nlp_loaded,
+ gold_preproc=gold_preproc,
+ ignore_misaligned=True,
+ )
)
nwords = sum(len(doc_gold[0]) for doc_gold in dev_docs)
start_time = timer()
@@ -402,7 +409,11 @@ def train(
if hasattr(component, "cfg"):
component.cfg["beam_width"] = beam_width
dev_docs = list(
- corpus.dev_docs(nlp_loaded, gold_preproc=gold_preproc)
+ corpus.dev_docs(
+ nlp_loaded,
+ gold_preproc=gold_preproc,
+ ignore_misaligned=True,
+ )
)
start_time = timer()
scorer = nlp_loaded.evaluate(dev_docs, verbose=verbose)
diff --git a/spacy/cli/validate.py b/spacy/cli/validate.py
index f608ccd7f..93abad6f6 100644
--- a/spacy/cli/validate.py
+++ b/spacy/cli/validate.py
@@ -1,12 +1,11 @@
# coding: utf8
from __future__ import unicode_literals, print_function
-import pkg_resources
from pathlib import Path
import sys
import requests
import srsly
-from wasabi import Printer
+from wasabi import msg
from ..compat import path2str
from ..util import get_data_path
@@ -18,7 +17,6 @@ def validate():
Validate that the currently installed version of spaCy is compatible
with the installed models. Should be run after `pip install -U spacy`.
"""
- msg = Printer()
with msg.loading("Loading compatibility table..."):
r = requests.get(about.__compatibility__)
if r.status_code != 200:
@@ -109,6 +107,8 @@ def get_model_links(compat):
def get_model_pkgs(compat, all_models):
+ import pkg_resources
+
pkgs = {}
for pkg_name, pkg_data in pkg_resources.working_set.by_key.items():
package = pkg_name.replace("-", "_")
diff --git a/spacy/compat.py b/spacy/compat.py
index 16b400ad7..0ea31c6b3 100644
--- a/spacy/compat.py
+++ b/spacy/compat.py
@@ -12,6 +12,7 @@ import os
import sys
import itertools
import ast
+import types
from thinc.neural.util import copy_array
@@ -62,6 +63,7 @@ if is_python2:
basestring_ = basestring # noqa: F821
input_ = raw_input # noqa: F821
path2str = lambda path: str(path).decode("utf8")
+ class_types = (type, types.ClassType)
elif is_python3:
bytes_ = bytes
@@ -69,6 +71,7 @@ elif is_python3:
basestring_ = str
input_ = input
path2str = lambda path: str(path)
+ class_types = (type, types.ClassType) if is_python_pre_3_5 else type
def b_to_str(b_str):
diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py
index 17b67940a..d6e33437b 100644
--- a/spacy/displacy/render.py
+++ b/spacy/displacy/render.py
@@ -5,7 +5,7 @@ import uuid
from .templates import TPL_DEP_SVG, TPL_DEP_WORDS, TPL_DEP_ARCS, TPL_ENTS
from .templates import TPL_ENT, TPL_ENT_RTL, TPL_FIGURE, TPL_TITLE, TPL_PAGE
-from ..util import minify_html, escape_html, get_entry_points, ENTRY_POINTS
+from ..util import minify_html, escape_html, registry
from ..errors import Errors
@@ -242,7 +242,7 @@ class EntityRenderer(object):
"CARDINAL": "#e4e7d2",
"PERCENT": "#e4e7d2",
}
- user_colors = get_entry_points(ENTRY_POINTS.displacy_colors)
+ user_colors = registry.displacy_colors.get_all()
for user_color in user_colors.values():
colors.update(user_color)
colors.update(options.get("colors", {}))
diff --git a/spacy/displacy/templates.py b/spacy/displacy/templates.py
index 4a7c596d8..ade75d1d6 100644
--- a/spacy/displacy/templates.py
+++ b/spacy/displacy/templates.py
@@ -44,14 +44,14 @@ TPL_ENTS = """
TPL_ENT = """
-
+
{text}
{label}
"""
TPL_ENT_RTL = """
-
+
{text}
{label}
diff --git a/spacy/errors.py b/spacy/errors.py
index ecebc8345..c708f0a5b 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -80,8 +80,8 @@ class Warnings(object):
"the v2.x models cannot release the global interpreter lock. "
"Future versions may introduce a `n_process` argument for "
"parallel inference via multiprocessing.")
- W017 = ("Alias '{alias}' already exists in the Knowledge base.")
- W018 = ("Entity '{entity}' already exists in the Knowledge base.")
+ W017 = ("Alias '{alias}' already exists in the Knowledge Base.")
+ W018 = ("Entity '{entity}' already exists in the Knowledge Base.")
W019 = ("Changing vectors name from {old} to {new}, to avoid clash with "
"previously loaded vectors. See Issue #3853.")
W020 = ("Unnamed vectors. This won't allow multiple vectors models to be "
@@ -95,6 +95,12 @@ class Warnings(object):
"you can ignore this warning by setting SPACY_WARNING_IGNORE=W022. "
"If this is surprising, make sure you have the spacy-lookups-data "
"package installed.")
+ W023 = ("Multiprocessing of Language.pipe is not supported in Python 2. "
+ "'n_process' will be set to 1.")
+ W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
+ "the Knowledge Base.")
+ W025 = ("'{name}' requires '{attr}' to be assigned, but none of the "
+ "previous components in the pipeline declare that they assign it.")
@add_codes
@@ -407,7 +413,7 @@ class Errors(object):
"{probabilities_length} respectively.")
E133 = ("The sum of prior probabilities for alias '{alias}' should not "
"exceed 1, but found {sum}.")
- E134 = ("Alias '{alias}' defined for unknown entity '{entity}'.")
+ E134 = ("Entity '{entity}' is not defined in the Knowledge Base.")
E135 = ("If you meant to replace a built-in component, use `create_pipe`: "
"`nlp.replace_pipe('{name}', nlp.create_pipe('{name}'))`")
E136 = ("This additional feature requires the jsonschema library to be "
@@ -419,7 +425,7 @@ class Errors(object):
E138 = ("Invalid JSONL format for raw text '{text}'. Make sure the input "
"includes either the `text` or `tokens` key. For more info, see "
"the docs:\nhttps://spacy.io/api/cli#pretrain-jsonl")
- E139 = ("Knowledge base for component '{name}' not initialized. Did you "
+ E139 = ("Knowledge Base for component '{name}' not initialized. Did you "
"forget to call set_kb()?")
E140 = ("The list of entities, prior probabilities and entity vectors "
"should be of equal length.")
@@ -495,6 +501,34 @@ class Errors(object):
E173 = ("As of v2.2, the Lemmatizer is initialized with an instance of "
"Lookups containing the lemmatization tables. See the docs for "
"details: https://spacy.io/api/lemmatizer#init")
+ E174 = ("Architecture '{name}' not found in registry. Available "
+ "names: {names}")
+ E175 = ("Can't remove rule for unknown match pattern ID: {key}")
+ E176 = ("Alias '{alias}' is not defined in the Knowledge Base.")
+ E177 = ("Ill-formed IOB input detected: {tag}")
+ E178 = ("Invalid pattern. Expected list of dicts but got: {pat}. Maybe you "
+ "accidentally passed a single pattern to Matcher.add instead of a "
+ "list of patterns? If you only want to add one pattern, make sure "
+ "to wrap it in a list. For example: matcher.add('{key}', [pattern])")
+ E179 = ("Invalid pattern. Expected a list of Doc objects but got a single "
+ "Doc. If you only want to add one pattern, make sure to wrap it "
+ "in a list. For example: matcher.add('{key}', [doc])")
+ E180 = ("Span attributes can't be declared as required or assigned by "
+ "components, since spans are only views of the Doc. Use Doc and "
+ "Token attributes (or custom extension attributes) only and remove "
+ "the following: {attrs}")
+ E181 = ("Received invalid attributes for unknown object {obj}: {attrs}. "
+ "Only Doc and Token attributes are supported.")
+ E182 = ("Received invalid attribute declaration: {attr}\nDid you forget "
+ "to define the attribute? For example: {attr}.???")
+ E183 = ("Received invalid attribute declaration: {attr}\nOnly top-level "
+ "attributes are supported, for example: {solution}")
+ E184 = ("Only attributes without underscores are supported in component "
+ "attribute declarations (because underscore and non-underscore "
+ "attributes are connected anyways): {attr} -> {solution}")
+ E185 = ("Received invalid attribute in component attribute declaration: "
+ "{obj}.{attr}\nAttribute '{attr}' does not exist on {obj}.")
+ E186 = ("'{tok_a}' and '{tok_b}' are different texts.")
@add_codes
@@ -527,6 +561,10 @@ class MatchPatternError(ValueError):
ValueError.__init__(self, msg)
+class AlignmentError(ValueError):
+ pass
+
+
class ModelsWarning(UserWarning):
pass
diff --git a/spacy/glossary.py b/spacy/glossary.py
index 52abc7bb5..44a8277da 100644
--- a/spacy/glossary.py
+++ b/spacy/glossary.py
@@ -80,7 +80,7 @@ GLOSSARY = {
"RBR": "adverb, comparative",
"RBS": "adverb, superlative",
"RP": "adverb, particle",
- "TO": "infinitival to",
+ "TO": 'infinitival "to"',
"UH": "interjection",
"VB": "verb, base form",
"VBD": "verb, past tense",
@@ -279,6 +279,12 @@ GLOSSARY = {
"re": "repeated element",
"rs": "reported speech",
"sb": "subject",
+ "sb": "subject",
+ "sbp": "passivized subject (PP)",
+ "sp": "subject or predicate",
+ "svp": "separable verb prefix",
+ "uc": "unit component",
+ "vo": "vocative",
# Named Entity Recognition
# OntoNotes 5
# https://catalog.ldc.upenn.edu/docs/LDC2013T19/OntoNotes-Release-5.0.pdf
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 2fa789006..5593263bb 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -11,10 +11,9 @@ import itertools
from pathlib import Path
import srsly
-from . import _align
from .syntax import nonproj
from .tokens import Doc, Span
-from .errors import Errors
+from .errors import Errors, AlignmentError
from .compat import path2str
from . import util
from .util import minibatch, itershuffle
@@ -22,6 +21,7 @@ from .util import minibatch, itershuffle
from libc.stdio cimport FILE, fopen, fclose, fread, fwrite, feof, fseek
+USE_NEW_ALIGN = False
punct_re = re.compile(r"\W")
@@ -73,7 +73,21 @@ def merge_sents(sents):
return [(m_deps, (m_cats, m_brackets))]
-def align(tokens_a, tokens_b):
+_ALIGNMENT_NORM_MAP = [("``", "'"), ("''", "'"), ('"', "'"), ("`", "'")]
+
+
+def _normalize_for_alignment(tokens):
+ output = []
+ for token in tokens:
+ token = token.replace(" ", "").lower()
+ for before, after in _ALIGNMENT_NORM_MAP:
+ token = token.replace(before, after)
+ output.append(token)
+ return output
+
+
+def _align_before_v2_2_2(tokens_a, tokens_b):
"""Calculate alignment tables between two tokenizations, using the Levenshtein
algorithm. The alignment is case-insensitive.
@@ -92,6 +106,7 @@ def align(tokens_a, tokens_b):
* b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other
direction.
"""
+ from . import _align
if tokens_a == tokens_b:
alignment = numpy.arange(len(tokens_a))
return 0, alignment, alignment, {}, {}
@@ -111,6 +126,82 @@ def align(tokens_a, tokens_b):
return cost, i2j, j2i, i2j_multi, j2i_multi
+def align(tokens_a, tokens_b):
+ """Calculate alignment tables between two tokenizations.
+
+ tokens_a (List[str]): The candidate tokenization.
+ tokens_b (List[str]): The reference tokenization.
+ RETURNS: (tuple): A 5-tuple consisting of the following information:
+ * cost (int): The number of misaligned tokens.
+ * a2b (List[int]): Mapping of indices in `tokens_a` to indices in `tokens_b`.
+ For instance, if `a2b[4] == 6`, that means that `tokens_a[4]` aligns
+ to `tokens_b[6]`. If there's no one-to-one alignment for a token,
+ it has the value -1.
+ * b2a (List[int]): The same as `a2b`, but mapping the other direction.
+ * a2b_multi (Dict[int, int]): A dictionary mapping indices in `tokens_a`
+ to indices in `tokens_b`, where multiple tokens of `tokens_a` align to
+ the same token of `tokens_b`.
+ * b2a_multi (Dict[int, int]): As with `a2b_multi`, but mapping the other
+ direction.
+ """
+ if not USE_NEW_ALIGN:
+ return _align_before_v2_2_2(tokens_a, tokens_b)
+ tokens_a = _normalize_for_alignment(tokens_a)
+ tokens_b = _normalize_for_alignment(tokens_b)
+ cost = 0
+ a2b = numpy.empty(len(tokens_a), dtype="i")
+ b2a = numpy.empty(len(tokens_b), dtype="i")
+ a2b_multi = {}
+ b2a_multi = {}
+ i = 0
+ j = 0
+ offset_a = 0
+ offset_b = 0
+ while i < len(tokens_a) and j < len(tokens_b):
+ a = tokens_a[i][offset_a:]
+ b = tokens_b[j][offset_b:]
+ a2b[i] = b2a[j] = -1
+ if a == b:
+ if offset_a == offset_b == 0:
+ a2b[i] = j
+ b2a[j] = i
+ elif offset_a == 0:
+ cost += 2
+ a2b_multi[i] = j
+ elif offset_b == 0:
+ cost += 2
+ b2a_multi[j] = i
+ offset_a = offset_b = 0
+ i += 1
+ j += 1
+ elif a == "":
+ assert offset_a == 0
+ cost += 1
+ i += 1
+ elif b == "":
+ assert offset_b == 0
+ cost += 1
+ j += 1
+ elif b.startswith(a):
+ cost += 1
+ if offset_a == 0:
+ a2b_multi[i] = j
+ i += 1
+ offset_a = 0
+ offset_b += len(a)
+ elif a.startswith(b):
+ cost += 1
+ if offset_b == 0:
+ b2a_multi[j] = i
+ j += 1
+ offset_b = 0
+ offset_a += len(b)
+ else:
+ assert "".join(tokens_a) != "".join(tokens_b)
+ raise AlignmentError(Errors.E186.format(tok_a=tokens_a, tok_b=tokens_b))
+ return cost, a2b, b2a, a2b_multi, b2a_multi
+
+
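To make the alignment tables above concrete, here is a minimal usage sketch (the token lists are invented; the commented output reflects the documented behavior rather than a verified run):

from spacy.gold import align

cand = ["I", "listened", "to", "obama", "'", "s", "podcasts", "."]
gold = ["i", "listened", "to", "obama", "'s", "podcasts", "."]
cost, a2b, b2a, a2b_multi, b2a_multi = align(cand, gold)
# Tokens that match one-to-one get their counterpart's index in a2b/b2a.
# "'" and "s" only align to "'s" as a group, so their entries in a2b are -1
# and both indices appear in a2b_multi, mapping to the index of "'s".
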
class GoldCorpus(object):
"""An annotated corpus, using the JSON file format. Manages
annotations for tagging, dependency parsing and NER.
@@ -176,6 +267,11 @@ class GoldCorpus(object):
gold_tuples = read_json_file(loc)
elif loc.parts[-1].endswith("jsonl"):
gold_tuples = srsly.read_jsonl(loc)
+ first_gold_tuple = next(gold_tuples)
+ gold_tuples = itertools.chain([first_gold_tuple], gold_tuples)
+ # TODO: proper format checks with schemas
+ if isinstance(first_gold_tuple, dict):
+ gold_tuples = read_json_object(gold_tuples)
elif loc.parts[-1].endswith("msg"):
gold_tuples = srsly.read_msgpack(loc)
else:
@@ -209,7 +305,8 @@ class GoldCorpus(object):
return n
def train_docs(self, nlp, gold_preproc=False, max_length=None,
- noise_level=0.0, orth_variant_level=0.0):
+ noise_level=0.0, orth_variant_level=0.0,
+ ignore_misaligned=False):
locs = list((self.tmp_dir / 'train').iterdir())
random.shuffle(locs)
train_tuples = self.read_tuples(locs, limit=self.limit)
@@ -217,20 +314,23 @@ class GoldCorpus(object):
max_length=max_length,
noise_level=noise_level,
orth_variant_level=orth_variant_level,
- make_projective=True)
+ make_projective=True,
+ ignore_misaligned=ignore_misaligned)
yield from gold_docs
def train_docs_without_preprocessing(self, nlp, gold_preproc=False):
gold_docs = self.iter_gold_docs(nlp, self.train_tuples, gold_preproc=gold_preproc)
yield from gold_docs
- def dev_docs(self, nlp, gold_preproc=False):
- gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc=gold_preproc)
+ def dev_docs(self, nlp, gold_preproc=False, ignore_misaligned=False):
+ gold_docs = self.iter_gold_docs(nlp, self.dev_tuples, gold_preproc=gold_preproc,
+ ignore_misaligned=ignore_misaligned)
yield from gold_docs
@classmethod
def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None,
- noise_level=0.0, orth_variant_level=0.0, make_projective=False):
+ noise_level=0.0, orth_variant_level=0.0, make_projective=False,
+ ignore_misaligned=False):
for raw_text, paragraph_tuples in tuples:
if gold_preproc:
raw_text = None
@@ -239,10 +339,12 @@ class GoldCorpus(object):
docs, paragraph_tuples = cls._make_docs(nlp, raw_text,
paragraph_tuples, gold_preproc, noise_level=noise_level,
orth_variant_level=orth_variant_level)
- golds = cls._make_golds(docs, paragraph_tuples, make_projective)
+ golds = cls._make_golds(docs, paragraph_tuples, make_projective,
+ ignore_misaligned=ignore_misaligned)
for doc, gold in zip(docs, golds):
- if (not max_length) or len(doc) < max_length:
- yield doc, gold
+ if gold is not None:
+ if (not max_length) or len(doc) < max_length:
+ yield doc, gold
@classmethod
def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc, noise_level=0.0, orth_variant_level=0.0):
@@ -257,14 +359,22 @@ class GoldCorpus(object):
@classmethod
- def _make_golds(cls, docs, paragraph_tuples, make_projective):
+ def _make_golds(cls, docs, paragraph_tuples, make_projective, ignore_misaligned=False):
if len(docs) != len(paragraph_tuples):
n_annots = len(paragraph_tuples)
raise ValueError(Errors.E070.format(n_docs=len(docs), n_annots=n_annots))
- return [GoldParse.from_annot_tuples(doc, sent_tuples, cats=cats,
- make_projective=make_projective)
- for doc, (sent_tuples, (cats, brackets))
- in zip(docs, paragraph_tuples)]
+ golds = []
+ for doc, (sent_tuples, (cats, brackets)) in zip(docs, paragraph_tuples):
+ try:
+ gold = GoldParse.from_annot_tuples(doc, sent_tuples, cats=cats,
+ make_projective=make_projective)
+ except AlignmentError:
+ if ignore_misaligned:
+ gold = None
+ else:
+ raise
+ golds.append(gold)
+ return golds
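A hedged sketch of how the new ignore_misaligned flag is meant to be used; the nlp object is assumed to exist and the corpus paths are placeholders:

from spacy.gold import GoldCorpus

corpus = GoldCorpus("train.json", "dev.json")   # hypothetical paths
# Examples whose gold tokenization cannot be aligned with nlp's tokenization
# raise AlignmentError by default; with ignore_misaligned=True they come back
# as gold == None from _make_golds and are skipped in iter_gold_docs.
for doc, gold in corpus.train_docs(nlp, ignore_misaligned=True):
    pass
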
def make_orth_variants(nlp, raw, paragraph_tuples, orth_variant_level=0.0):
@@ -494,7 +604,6 @@ def _json_iterate(loc):
def iob_to_biluo(tags):
out = []
- curr_label = None
tags = list(tags)
while tags:
out.extend(_consume_os(tags))
@@ -519,6 +628,8 @@ def _consume_ent(tags):
tags.pop(0)
label = tag[2:]
if length == 1:
+ if len(label) == 0:
+ raise ValueError(Errors.E177.format(tag=tag))
return ["U-" + label]
else:
start = "B-" + label
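For context, the calling helper iob_to_biluo behaves as sketched below (tags are illustrative); with the new check, an entity tag with an empty label raises E177 instead of producing a bare "U-" tag:

from spacy.gold import iob_to_biluo

print(iob_to_biluo(["B-PER", "I-PER", "O", "B-LOC"]))
# ['B-PER', 'L-PER', 'O', 'U-LOC']
iob_to_biluo(["B-"])   # a tag with an empty label now raises ValueError (E177)
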
@@ -542,7 +653,7 @@ cdef class GoldParse:
def __init__(self, doc, annot_tuples=None, words=None, tags=None, morphology=None,
heads=None, deps=None, entities=None, make_projective=False,
cats=None, links=None, **_):
- """Create a GoldParse.
+ """Create a GoldParse. The fields will not be initialized if len(doc) is zero.
doc (Doc): The document the annotations refer to.
words (iterable): A sequence of unicode word strings.
@@ -571,138 +682,144 @@ cdef class GoldParse:
negative examples respectively.
RETURNS (GoldParse): The newly constructed object.
"""
- if words is None:
- words = [token.text for token in doc]
- if tags is None:
- tags = [None for _ in words]
- if heads is None:
- heads = [None for _ in words]
- if deps is None:
- deps = [None for _ in words]
- if morphology is None:
- morphology = [None for _ in words]
- if entities is None:
- entities = ["-" for _ in doc]
- elif len(entities) == 0:
- entities = ["O" for _ in doc]
- else:
- # Translate the None values to '-', to make processing easier.
- # See Issue #2603
- entities = [(ent if ent is not None else "-") for ent in entities]
- if not isinstance(entities[0], basestring):
- # Assume we have entities specified by character offset.
- entities = biluo_tags_from_offsets(doc, entities)
self.mem = Pool()
self.loss = 0
self.length = len(doc)
- # These are filled by the tagger/parser/entity recogniser
- self.c.tags = self.mem.alloc(len(doc), sizeof(int))
- self.c.heads = self.mem.alloc(len(doc), sizeof(int))
- self.c.labels = self.mem.alloc(len(doc), sizeof(attr_t))
- self.c.has_dep = self.mem.alloc(len(doc), sizeof(int))
- self.c.sent_start = self.mem.alloc(len(doc), sizeof(int))
- self.c.ner = self.mem.alloc(len(doc), sizeof(Transition))
-
self.cats = {} if cats is None else dict(cats)
self.links = links
- self.words = [None] * len(doc)
- self.tags = [None] * len(doc)
- self.heads = [None] * len(doc)
- self.labels = [None] * len(doc)
- self.ner = [None] * len(doc)
- self.morphology = [None] * len(doc)
- # This needs to be done before we align the words
- if make_projective and heads is not None and deps is not None:
- heads, deps = nonproj.projectivize(heads, deps)
-
- # Do many-to-one alignment for misaligned tokens.
- # If we over-segment, we'll have one gold word that covers a sequence
- # of predicted words
- # If we under-segment, we'll have one predicted word that covers a
- # sequence of gold words.
- # If we "mis-segment", we'll have a sequence of predicted words covering
- # a sequence of gold words. That's many-to-many -- we don't do that.
- cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words)
-
- self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
- self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]
-
- annot_tuples = (range(len(words)), words, tags, heads, deps, entities)
- self.orig_annot = list(zip(*annot_tuples))
-
- for i, gold_i in enumerate(self.cand_to_gold):
- if doc[i].text.isspace():
- self.words[i] = doc[i].text
- self.tags[i] = "_SP"
- self.heads[i] = None
- self.labels[i] = None
- self.ner[i] = None
- self.morphology[i] = set()
- if gold_i is None:
- if i in i2j_multi:
- self.words[i] = words[i2j_multi[i]]
- self.tags[i] = tags[i2j_multi[i]]
- self.morphology[i] = morphology[i2j_multi[i]]
- is_last = i2j_multi[i] != i2j_multi.get(i+1)
- is_first = i2j_multi[i] != i2j_multi.get(i-1)
- # Set next word in multi-token span as head, until last
- if not is_last:
- self.heads[i] = i+1
- self.labels[i] = "subtok"
- else:
- self.heads[i] = self.gold_to_cand[heads[i2j_multi[i]]]
- self.labels[i] = deps[i2j_multi[i]]
- # Now set NER...This is annoying because if we've split
- # got an entity word split into two, we need to adjust the
- # BILUO tags. We can't have BB or LL etc.
- # Case 1: O -- easy.
- ner_tag = entities[i2j_multi[i]]
- if ner_tag == "O":
- self.ner[i] = "O"
- # Case 2: U. This has to become a B I* L sequence.
- elif ner_tag.startswith("U-"):
- if is_first:
- self.ner[i] = ner_tag.replace("U-", "B-", 1)
- elif is_last:
- self.ner[i] = ner_tag.replace("U-", "L-", 1)
- else:
- self.ner[i] = ner_tag.replace("U-", "I-", 1)
- # Case 3: L. If not last, change to I.
- elif ner_tag.startswith("L-"):
- if is_last:
- self.ner[i] = ner_tag
- else:
- self.ner[i] = ner_tag.replace("L-", "I-", 1)
- # Case 4: I. Stays correct
- elif ner_tag.startswith("I-"):
- self.ner[i] = ner_tag
+ # avoid allocating memory if the doc does not contain any tokens
+ if self.length > 0:
+ if words is None:
+ words = [token.text for token in doc]
+ if tags is None:
+ tags = [None for _ in words]
+ if heads is None:
+ heads = [None for _ in words]
+ if deps is None:
+ deps = [None for _ in words]
+ if morphology is None:
+ morphology = [None for _ in words]
+ if entities is None:
+ entities = ["-" for _ in words]
+ elif len(entities) == 0:
+ entities = ["O" for _ in words]
else:
- self.words[i] = words[gold_i]
- self.tags[i] = tags[gold_i]
- self.morphology[i] = morphology[gold_i]
- if heads[gold_i] is None:
+ # Translate the None values to '-', to make processing easier.
+ # See Issue #2603
+ entities = [(ent if ent is not None else "-") for ent in entities]
+ if not isinstance(entities[0], basestring):
+ # Assume we have entities specified by character offset.
+ entities = biluo_tags_from_offsets(doc, entities)
+
+ # These are filled by the tagger/parser/entity recogniser
+ self.c.tags = self.mem.alloc(len(doc), sizeof(int))
+ self.c.heads = self.mem.alloc(len(doc), sizeof(int))
+ self.c.labels = self.mem.alloc(len(doc), sizeof(attr_t))
+ self.c.has_dep = self.mem.alloc(len(doc), sizeof(int))
+ self.c.sent_start = self.mem.alloc(len(doc), sizeof(int))
+ self.c.ner = self.mem.alloc(len(doc), sizeof(Transition))
+
+ self.words = [None] * len(doc)
+ self.tags = [None] * len(doc)
+ self.heads = [None] * len(doc)
+ self.labels = [None] * len(doc)
+ self.ner = [None] * len(doc)
+ self.morphology = [None] * len(doc)
+
+ # This needs to be done before we align the words
+ if make_projective and heads is not None and deps is not None:
+ heads, deps = nonproj.projectivize(heads, deps)
+
+ # Do many-to-one alignment for misaligned tokens.
+ # If we over-segment, we'll have one gold word that covers a sequence
+ # of predicted words
+ # If we under-segment, we'll have one predicted word that covers a
+ # sequence of gold words.
+ # If we "mis-segment", we'll have a sequence of predicted words covering
+ # a sequence of gold words. That's many-to-many -- we don't do that.
+ cost, i2j, j2i, i2j_multi, j2i_multi = align([t.orth_ for t in doc], words)
+
+ self.cand_to_gold = [(j if j >= 0 else None) for j in i2j]
+ self.gold_to_cand = [(i if i >= 0 else None) for i in j2i]
+
+ annot_tuples = (range(len(words)), words, tags, heads, deps, entities)
+ self.orig_annot = list(zip(*annot_tuples))
+
+ for i, gold_i in enumerate(self.cand_to_gold):
+ if doc[i].text.isspace():
+ self.words[i] = doc[i].text
+ self.tags[i] = "_SP"
self.heads[i] = None
+ self.labels[i] = None
+ self.ner[i] = None
+ self.morphology[i] = set()
+ if gold_i is None:
+ if i in i2j_multi:
+ self.words[i] = words[i2j_multi[i]]
+ self.tags[i] = tags[i2j_multi[i]]
+ self.morphology[i] = morphology[i2j_multi[i]]
+ is_last = i2j_multi[i] != i2j_multi.get(i+1)
+ is_first = i2j_multi[i] != i2j_multi.get(i-1)
+ # Set next word in multi-token span as head, until last
+ if not is_last:
+ self.heads[i] = i+1
+ self.labels[i] = "subtok"
+ else:
+ head_i = heads[i2j_multi[i]]
+ if head_i:
+ self.heads[i] = self.gold_to_cand[head_i]
+ self.labels[i] = deps[i2j_multi[i]]
+                        # Now set NER. This is annoying, because if an entity
+                        # word has been split into two, we need to adjust the
+                        # BILUO tags: we can't produce B B or L L sequences.
+ # Case 1: O -- easy.
+ ner_tag = entities[i2j_multi[i]]
+ if ner_tag == "O":
+ self.ner[i] = "O"
+ # Case 2: U. This has to become a B I* L sequence.
+ elif ner_tag.startswith("U-"):
+ if is_first:
+ self.ner[i] = ner_tag.replace("U-", "B-", 1)
+ elif is_last:
+ self.ner[i] = ner_tag.replace("U-", "L-", 1)
+ else:
+ self.ner[i] = ner_tag.replace("U-", "I-", 1)
+ # Case 3: L. If not last, change to I.
+ elif ner_tag.startswith("L-"):
+ if is_last:
+ self.ner[i] = ner_tag
+ else:
+ self.ner[i] = ner_tag.replace("L-", "I-", 1)
+ # Case 4: I. Stays correct
+ elif ner_tag.startswith("I-"):
+ self.ner[i] = ner_tag
else:
- self.heads[i] = self.gold_to_cand[heads[gold_i]]
- self.labels[i] = deps[gold_i]
- self.ner[i] = entities[gold_i]
+ self.words[i] = words[gold_i]
+ self.tags[i] = tags[gold_i]
+ self.morphology[i] = morphology[gold_i]
+ if heads[gold_i] is None:
+ self.heads[i] = None
+ else:
+ self.heads[i] = self.gold_to_cand[heads[gold_i]]
+ self.labels[i] = deps[gold_i]
+ self.ner[i] = entities[gold_i]
- # Prevent whitespace that isn't within entities from being tagged as
- # an entity.
- for i in range(len(self.ner)):
- if self.tags[i] == "_SP":
- prev_ner = self.ner[i-1] if i >= 1 else None
- next_ner = self.ner[i+1] if (i+1) < len(self.ner) else None
- if prev_ner == "O" or next_ner == "O":
- self.ner[i] = "O"
+ # Prevent whitespace that isn't within entities from being tagged as
+ # an entity.
+ for i in range(len(self.ner)):
+ if self.tags[i] == "_SP":
+ prev_ner = self.ner[i-1] if i >= 1 else None
+ next_ner = self.ner[i+1] if (i+1) < len(self.ner) else None
+ if prev_ner == "O" or next_ner == "O":
+ self.ner[i] = "O"
- cycle = nonproj.contains_cycle(self.heads)
- if cycle is not None:
- raise ValueError(Errors.E069.format(cycle=cycle,
- cycle_tokens=" ".join(["'{}'".format(self.words[tok_id]) for tok_id in cycle]),
- doc_tokens=" ".join(words[:50])))
+ cycle = nonproj.contains_cycle(self.heads)
+ if cycle is not None:
+ raise ValueError(Errors.E069.format(cycle=cycle,
+ cycle_tokens=" ".join(["'{}'".format(self.words[tok_id]) for tok_id in cycle]),
+ doc_tokens=" ".join(words[:50])))
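A small constructor sketch (the sentence, entity span and the assumed nlp object are illustrative). Entity annotations given as character offsets are converted to BILUO tags via biluo_tags_from_offsets before alignment:

from spacy.gold import GoldParse

doc = nlp.make_doc("London is big")
gold = GoldParse(doc, entities=[(0, 6, "GPE")])  # character-offset annotation
# gold.ner == ['U-GPE', 'O', 'O'] for the tokens ["London", "is", "big"]
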
def __len__(self):
"""Get the number of gold-standard tokens.
@@ -740,7 +857,8 @@ def docs_to_json(docs, id=0):
docs (iterable / Doc): The Doc object(s) to convert.
id (int): Id for the JSON.
- RETURNS (list): The data in spaCy's JSON format.
+    RETURNS (dict): The data in spaCy's JSON format. Each input Doc is
+        treated as a separate paragraph in the output document.
"""
if isinstance(docs, Doc):
docs = [docs]
@@ -795,7 +913,7 @@ def biluo_tags_from_offsets(doc, entities, missing="O"):
"""
# Ensure no overlapping entity labels exist
tokens_in_ents = {}
-
+
starts = {token.idx: token.i for token in doc}
ends = {token.idx + len(token): token.i for token in doc}
biluo = ["-" for _ in doc]
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 6cbc06e2c..31fd1706e 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -142,6 +142,7 @@ cdef class KnowledgeBase:
i = 0
cdef KBEntryC entry
+ cdef hash_t entity_hash
while i < nr_entities:
entity_vector = vector_list[i]
if len(entity_vector) != self.entity_vector_length:
@@ -161,6 +162,14 @@ cdef class KnowledgeBase:
i += 1
+ def contains_entity(self, unicode entity):
+ cdef hash_t entity_hash = self.vocab.strings.add(entity)
+ return entity_hash in self._entry_index
+
+ def contains_alias(self, unicode alias):
+ cdef hash_t alias_hash = self.vocab.strings.add(alias)
+ return alias_hash in self._alias_index
+
def add_alias(self, unicode alias, entities, probabilities):
"""
         For a given alias, add its potential entities and prior probabilities to the KB.
@@ -190,7 +199,7 @@ cdef class KnowledgeBase:
for entity, prob in zip(entities, probabilities):
entity_hash = self.vocab.strings[entity]
if not entity_hash in self._entry_index:
- raise ValueError(Errors.E134.format(alias=alias, entity=entity))
+ raise ValueError(Errors.E134.format(entity=entity))
entry_index = self._entry_index.get(entity_hash)
entry_indices.push_back(int(entry_index))
@@ -201,8 +210,63 @@ cdef class KnowledgeBase:
return alias_hash
- def get_candidates(self, unicode alias):
+ def append_alias(self, unicode alias, unicode entity, float prior_prob, ignore_warnings=False):
+ """
+        For an alias already existing in the KB, extend its potential entities with one more.
+        Raise an error if the alias or the entity is unknown to the KB, or if adding this
+        prior probability would push the sum of probabilities for the alias above 1.
+        Throw a warning if the alias-entity combination has already been recorded.
+        For efficiency, it's best to use the method `add_alias` as much as possible instead of this one.
+ """
+ # Check if the alias exists in the KB
cdef hash_t alias_hash = self.vocab.strings[alias]
+ if not alias_hash in self._alias_index:
+ raise ValueError(Errors.E176.format(alias=alias))
+
+ # Check if the entity exists in the KB
+ cdef hash_t entity_hash = self.vocab.strings[entity]
+ if not entity_hash in self._entry_index:
+ raise ValueError(Errors.E134.format(entity=entity))
+ entry_index = self._entry_index.get(entity_hash)
+
+ # Throw an error if the prior probabilities (including the new one) sum up to more than 1
+ alias_index = self._alias_index.get(alias_hash)
+ alias_entry = self._aliases_table[alias_index]
+ current_sum = sum([p for p in alias_entry.probs])
+ new_sum = current_sum + prior_prob
+
+ if new_sum > 1.00001:
+ raise ValueError(Errors.E133.format(alias=alias, sum=new_sum))
+
+ entry_indices = alias_entry.entry_indices
+
+ is_present = False
+ for i in range(entry_indices.size()):
+ if entry_indices[i] == int(entry_index):
+ is_present = True
+
+ if is_present:
+ if not ignore_warnings:
+ user_warning(Warnings.W024.format(entity=entity, alias=alias))
+ else:
+ entry_indices.push_back(int(entry_index))
+ alias_entry.entry_indices = entry_indices
+
+ probs = alias_entry.probs
+ probs.push_back(float(prior_prob))
+ alias_entry.probs = probs
+ self._aliases_table[alias_index] = alias_entry
+
+
+ def get_candidates(self, unicode alias):
+ """
+ Return candidate entities for an alias. Each candidate defines the entity, the original alias,
+ and the prior probability of that alias resolving to that entity.
+        If the alias is not known in the KB, an empty list is returned.
+ """
+ cdef hash_t alias_hash = self.vocab.strings[alias]
+ if not alias_hash in self._alias_index:
+ return []
alias_index = self._alias_index.get(alias_hash)
alias_entry = self._aliases_table[alias_index]
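A hedged usage sketch of the new contains_entity, contains_alias and append_alias helpers; the entity ID, alias and numbers are invented, and the nlp object is assumed to exist:

from spacy.kb import KnowledgeBase

kb = KnowledgeBase(nlp.vocab, entity_vector_length=3)
kb.add_entity(entity="Q42", freq=12, entity_vector=[1.0, 2.0, 3.0])
kb.add_alias(alias="Douglas", entities=["Q42"], probabilities=[0.8])
assert kb.contains_entity("Q42")
assert kb.contains_alias("Douglas")
# Appending an alias-entity pair that is already recorded only warns (W024):
kb.append_alias(alias="Douglas", entity="Q42", prior_prob=0.1)
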
@@ -341,7 +405,6 @@ cdef class KnowledgeBase:
assert nr_entities == self.get_size_entities()
# STEP 3: load aliases
-
cdef int64_t nr_aliases
reader.read_alias_length(&nr_aliases)
self._alias_index = PreshMap(nr_aliases+1)
diff --git a/spacy/lang/char_classes.py b/spacy/lang/char_classes.py
index cb5b50ffc..5ed2a2a8c 100644
--- a/spacy/lang/char_classes.py
+++ b/spacy/lang/char_classes.py
@@ -184,7 +184,7 @@ _russian_lower = r"ёа-я"
_russian_upper = r"ЁА-Я"
_russian = r"ёа-яЁА-Я"
-_sinhala = r"\u0D85-\u0D96\u0D9A-\u0DB1\u0DB3-\u0DBB\u0DBD\u0DC0-\u0DC6"
+_sinhala = r"\u0D80-\u0DFF"
_tatar_lower = r"әөүҗңһ"
_tatar_upper = r"ӘӨҮҖҢҺ"
diff --git a/spacy/lang/de/tag_map.py b/spacy/lang/de/tag_map.py
index 394478145..c169501a9 100644
--- a/spacy/lang/de/tag_map.py
+++ b/spacy/lang/de/tag_map.py
@@ -1,8 +1,8 @@
# coding: utf8
from __future__ import unicode_literals
-from ...symbols import POS, PUNCT, ADJ, CONJ, SCONJ, NUM, DET, ADV, ADP, X, VERB
-from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX
+from ...symbols import POS, PUNCT, ADJ, CCONJ, SCONJ, NUM, DET, ADV, ADP, X
+from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX, VERB
TAG_MAP = {
@@ -20,8 +20,8 @@ TAG_MAP = {
"CARD": {POS: NUM, "NumType": "card"},
"FM": {POS: X, "Foreign": "yes"},
"ITJ": {POS: INTJ},
- "KOKOM": {POS: CONJ, "ConjType": "comp"},
- "KON": {POS: CONJ},
+ "KOKOM": {POS: CCONJ, "ConjType": "comp"},
+ "KON": {POS: CCONJ},
"KOUI": {POS: SCONJ},
"KOUS": {POS: SCONJ},
"NE": {POS: PROPN},
@@ -43,7 +43,7 @@ TAG_MAP = {
"PTKA": {POS: PART},
"PTKANT": {POS: PART, "PartType": "res"},
"PTKNEG": {POS: PART, "Polarity": "neg"},
- "PTKVZ": {POS: PART, "PartType": "vbp"},
+ "PTKVZ": {POS: ADP, "PartType": "vbp"},
"PTKZU": {POS: PART, "PartType": "inf"},
"PWAT": {POS: DET, "PronType": "int"},
"PWAV": {POS: ADV, "PronType": "int"},
diff --git a/spacy/lang/en/tag_map.py b/spacy/lang/en/tag_map.py
index 9bd884a3a..ecb3103cc 100644
--- a/spacy/lang/en/tag_map.py
+++ b/spacy/lang/en/tag_map.py
@@ -2,7 +2,7 @@
from __future__ import unicode_literals
from ...symbols import POS, PUNCT, SYM, ADJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
-from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON, AUX
+from ...symbols import NOUN, PROPN, PART, INTJ, SPACE, PRON
TAG_MAP = {
@@ -28,8 +28,8 @@ TAG_MAP = {
"JJR": {POS: ADJ, "Degree": "comp"},
"JJS": {POS: ADJ, "Degree": "sup"},
"LS": {POS: X, "NumType": "ord"},
- "MD": {POS: AUX, "VerbType": "mod"},
- "NIL": {POS: ""},
+ "MD": {POS: VERB, "VerbType": "mod"},
+ "NIL": {POS: X},
"NN": {POS: NOUN, "Number": "sing"},
"NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"},
"NNPS": {POS: PROPN, "NounType": "prop", "Number": "plur"},
@@ -37,7 +37,7 @@ TAG_MAP = {
"PDT": {POS: DET},
"POS": {POS: PART, "Poss": "yes"},
"PRP": {POS: PRON, "PronType": "prs"},
- "PRP$": {POS: PRON, "PronType": "prs", "Poss": "yes"},
+ "PRP$": {POS: DET, "PronType": "prs", "Poss": "yes"},
"RB": {POS: ADV, "Degree": "pos"},
"RBR": {POS: ADV, "Degree": "comp"},
"RBS": {POS: ADV, "Degree": "sup"},
@@ -58,9 +58,9 @@ TAG_MAP = {
"Number": "sing",
"Person": "three",
},
- "WDT": {POS: PRON},
+ "WDT": {POS: DET},
"WP": {POS: PRON},
- "WP$": {POS: PRON, "Poss": "yes"},
+ "WP$": {POS: DET, "Poss": "yes"},
"WRB": {POS: ADV},
"ADD": {POS: X},
"NFP": {POS: PUNCT},
diff --git a/spacy/lang/es/examples.py b/spacy/lang/es/examples.py
index 96ff9c1ed..0e31b56af 100644
--- a/spacy/lang/es/examples.py
+++ b/spacy/lang/es/examples.py
@@ -11,12 +11,12 @@ Example sentences to test spaCy and its language models.
sentences = [
- "Apple está buscando comprar una startup del Reino Unido por mil millones de dólares",
- "Los coches autónomos delegan la responsabilidad del seguro en sus fabricantes",
- "San Francisco analiza prohibir los robots delivery",
- "Londres es una gran ciudad del Reino Unido",
- "El gato come pescado",
- "Veo al hombre con el telescopio",
- "La araña come moscas",
- "El pingüino incuba en su nido",
+ "Apple está buscando comprar una startup del Reino Unido por mil millones de dólares.",
+ "Los coches autónomos delegan la responsabilidad del seguro en sus fabricantes.",
+ "San Francisco analiza prohibir los robots delivery.",
+ "Londres es una gran ciudad del Reino Unido.",
+ "El gato come pescado.",
+ "Veo al hombre con el telescopio.",
+ "La araña come moscas.",
+ "El pingüino incuba en su nido.",
]
diff --git a/spacy/lang/it/punctuation.py b/spacy/lang/it/punctuation.py
index 4439376c8..4fa931fde 100644
--- a/spacy/lang/it/punctuation.py
+++ b/spacy/lang/it/punctuation.py
@@ -5,7 +5,7 @@ from ..punctuation import TOKENIZER_INFIXES
from ..char_classes import ALPHA
-ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "")
+ELISION = " ' ’ ".strip().replace(" ", "")
_infixes = TOKENIZER_INFIXES + [
diff --git a/spacy/lang/lb/__init__.py b/spacy/lang/lb/__init__.py
new file mode 100644
index 000000000..4fcfaddb4
--- /dev/null
+++ b/spacy/lang/lb/__init__.py
@@ -0,0 +1,36 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS
+from .norm_exceptions import NORM_EXCEPTIONS
+from .punctuation import TOKENIZER_INFIXES
+from .lex_attrs import LEX_ATTRS
+from .tag_map import TAG_MAP
+from .stop_words import STOP_WORDS
+
+from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from ..norm_exceptions import BASE_NORMS
+from ...language import Language
+from ...attrs import LANG, NORM
+from ...util import update_exc, add_lookups
+
+
+class LuxembourgishDefaults(Language.Defaults):
+ lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+ lex_attr_getters.update(LEX_ATTRS)
+ lex_attr_getters[LANG] = lambda text: "lb"
+ lex_attr_getters[NORM] = add_lookups(
+ Language.Defaults.lex_attr_getters[NORM], NORM_EXCEPTIONS, BASE_NORMS
+ )
+ tokenizer_exceptions = update_exc(BASE_EXCEPTIONS, TOKENIZER_EXCEPTIONS)
+ stop_words = STOP_WORDS
+ tag_map = TAG_MAP
+ infixes = TOKENIZER_INFIXES
+
+
+class Luxembourgish(Language):
+ lang = "lb"
+ Defaults = LuxembourgishDefaults
+
+
+__all__ = ["Luxembourgish"]
diff --git a/spacy/lang/lb/examples.py b/spacy/lang/lb/examples.py
new file mode 100644
index 000000000..3cbba31d9
--- /dev/null
+++ b/spacy/lang/lb/examples.py
@@ -0,0 +1,18 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.lb.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+sentences = [
+ "An der Zäit hunn sech den Nordwand an d’Sonn gestridden, wie vun hinnen zwee wuel méi staark wier, wéi e Wanderer, deen an ee waarme Mantel agepak war, iwwert de Wee koum.",
+ "Si goufen sech eens, dass deejéinege fir de Stäerkste gëlle sollt, deen de Wanderer forcéiere géif, säi Mantel auszedoen.",
+ "Den Nordwand huet mat aller Force geblosen, awer wat e méi geblosen huet, wat de Wanderer sech méi a säi Mantel agewéckelt huet.",
+ "Um Enn huet den Nordwand säi Kampf opginn.",
+ "Dunn huet d’Sonn d’Loft mat hire frëndleche Strale gewiermt, a schonn no kuerzer Zäit huet de Wanderer säi Mantel ausgedoen.",
+ "Do huet den Nordwand missen zouginn, dass d’Sonn vun hinnen zwee de Stäerkste wier.",
+]
diff --git a/spacy/lang/lb/lex_attrs.py b/spacy/lang/lb/lex_attrs.py
new file mode 100644
index 000000000..e38c74974
--- /dev/null
+++ b/spacy/lang/lb/lex_attrs.py
@@ -0,0 +1,44 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...attrs import LIKE_NUM
+
+
+_num_words = set(
+ """
+null eent zwee dräi véier fënnef sechs ziwen aacht néng zéng eelef zwielef dräizéng
+véierzéng foffzéng siechzéng siwwenzéng uechtzeng uechzeng nonnzéng nongzéng zwanzeg drësseg véierzeg foffzeg sechzeg siechzeg siwenzeg achtzeg achzeg uechtzeg uechzeg nonnzeg
+honnert dausend millioun milliard billioun billiard trillioun triliard
+""".split()
+)
+
+_ordinal_words = set(
+ """
+éischten zweeten drëtten véierten fënneften sechsten siwenten aachten néngten zéngten eeleften
+zwieleften dräizéngten véierzéngten foffzéngten siechzéngten uechtzéngen uechzéngten nonnzéngten nongzéngten zwanzegsten
+drëssegsten véierzegsten foffzegsten siechzegsten siwenzegsten uechzegsten nonnzegsten
+honnertsten dausendsten milliounsten
+milliardsten billiounsten billiardsten trilliounsten trilliardsten
+""".split()
+)
+
+
+def like_num(text):
+ """
+    Check if the text resembles a number.
+ """
+ text = text.replace(",", "").replace(".", "")
+ if text.isdigit():
+ return True
+ if text.count("/") == 1:
+ num, denom = text.split("/")
+ if num.isdigit() and denom.isdigit():
+ return True
+ if text in _num_words:
+ return True
+ if text in _ordinal_words:
+ return True
+ return False
+
+
+LEX_ATTRS = {LIKE_NUM: like_num}
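Illustrative checks of the like_num heuristic above (inputs are invented; matching is case-sensitive because the token text is not lower-cased first):

from spacy.lang.lb.lex_attrs import like_num

assert like_num("1.000,5")    # separators are stripped, leaving digits
assert like_num("véier")      # cardinal number word
assert like_num("zweeten")    # ordinal number word
assert not like_num("Haus")   # ordinary noun
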
diff --git a/spacy/lang/lb/norm_exceptions.py b/spacy/lang/lb/norm_exceptions.py
new file mode 100644
index 000000000..101102ca4
--- /dev/null
+++ b/spacy/lang/lb/norm_exceptions.py
@@ -0,0 +1,16 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+# TODO
+# norm exceptions: find a way to deal with the zillions of spelling
+# variants (vläicht = vlaicht, vleicht, viläicht, viläischt, etc.)
+# here one could include the most common spelling mistakes
+
+_exc = {"datt": "dass", "wgl.": "weg.", "vläicht": "viläicht"}
+
+
+NORM_EXCEPTIONS = {}
+
+for string, norm in _exc.items():
+ NORM_EXCEPTIONS[string] = norm
+ NORM_EXCEPTIONS[string.title()] = norm
diff --git a/spacy/lang/lb/punctuation.py b/spacy/lang/lb/punctuation.py
new file mode 100644
index 000000000..53df6bcd9
--- /dev/null
+++ b/spacy/lang/lb/punctuation.py
@@ -0,0 +1,16 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ..punctuation import TOKENIZER_INFIXES
+from ..char_classes import ALPHA
+
+
+ELISION = " ' ’ ".strip().replace(" ", "")
+HYPHENS = r"- – — ‐ ‑".strip().replace(" ", "")
+
+
+_infixes = TOKENIZER_INFIXES + [
+ r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION)
+]
+
+TOKENIZER_INFIXES = _infixes
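The infix pattern above splits after an elided article, so a form like d'Sonn should come out as two tokens. A hedged sketch; the expected output reflects the intended behavior, not a verified run:

from spacy.lang.lb import Luxembourgish

nlp = Luxembourgish()
print([t.text for t in nlp("d'Sonn an den Nordwand")])
# expected: ["d'", 'Sonn', 'an', 'den', 'Nordwand']
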
diff --git a/spacy/lang/lb/stop_words.py b/spacy/lang/lb/stop_words.py
new file mode 100644
index 000000000..41e6f79d2
--- /dev/null
+++ b/spacy/lang/lb/stop_words.py
@@ -0,0 +1,214 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+STOP_WORDS = set(
+ """
+a
+à
+äis
+är
+ärt
+äert
+ären
+all
+allem
+alles
+alleguer
+als
+also
+am
+an
+anerefalls
+ass
+aus
+awer
+bei
+beim
+bis
+bis
+d'
+dach
+datt
+däin
+där
+dat
+de
+dee
+den
+deel
+deem
+deen
+deene
+déi
+den
+deng
+denger
+dem
+der
+dësem
+di
+dir
+do
+da
+dann
+domat
+dozou
+drop
+du
+duerch
+duerno
+e
+ee
+em
+een
+eent
+ë
+en
+ënner
+ëm
+ech
+eis
+eise
+eisen
+eiser
+eises
+eisereen
+esou
+een
+eng
+enger
+engem
+entweder
+et
+eréischt
+falls
+fir
+géint
+géif
+gëtt
+gët
+geet
+gi
+ginn
+gouf
+gouff
+goung
+hat
+haten
+hatt
+hätt
+hei
+hu
+huet
+hun
+hunn
+hiren
+hien
+hin
+hier
+hir
+jidderen
+jiddereen
+jiddwereen
+jiddereng
+jiddwerengen
+jo
+ins
+iech
+iwwer
+kann
+kee
+keen
+kënne
+kënnt
+kéng
+kéngen
+kéngem
+koum
+kuckt
+mam
+mat
+ma
+mä
+mech
+méi
+mécht
+meng
+menger
+mer
+mir
+muss
+nach
+nämmlech
+nämmelech
+näischt
+nawell
+nëmme
+nëmmen
+net
+nees
+nee
+no
+nu
+nom
+och
+oder
+ons
+onsen
+onser
+onsereen
+onst
+om
+op
+ouni
+säi
+säin
+schonn
+schonns
+si
+sid
+sie
+se
+sech
+seng
+senge
+sengem
+senger
+selwecht
+selwer
+sinn
+sollten
+souguer
+sou
+soss
+sot
+'t
+tëscht
+u
+un
+um
+virdrun
+vu
+vum
+vun
+wann
+war
+waren
+was
+wat
+wëllt
+weider
+wéi
+wéini
+wéinst
+wi
+wollt
+wou
+wouhin
+zanter
+ze
+zu
+zum
+zwar
+""".split()
+)
diff --git a/spacy/lang/lb/tag_map.py b/spacy/lang/lb/tag_map.py
new file mode 100644
index 000000000..424a83bb4
--- /dev/null
+++ b/spacy/lang/lb/tag_map.py
@@ -0,0 +1,28 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...symbols import POS, PUNCT, ADJ, CONJ, NUM, DET, ADV, ADP, X, VERB
+from ...symbols import NOUN, PART, SPACE, AUX
+
+# TODO: tag map is still using POS tags from an internal training set.
+# These POS tags have to be modified to match those from Universal Dependencies
+
+TAG_MAP = {
+ "$": {POS: PUNCT},
+ "ADJ": {POS: ADJ},
+ "AV": {POS: ADV},
+ "APPR": {POS: ADP, "AdpType": "prep"},
+ "APPRART": {POS: ADP, "AdpType": "prep", "PronType": "art"},
+ "D": {POS: DET, "PronType": "art"},
+ "KO": {POS: CONJ},
+ "N": {POS: NOUN},
+ "P": {POS: ADV},
+ "TRUNC": {POS: X, "Hyph": "yes"},
+ "AUX": {POS: AUX},
+ "V": {POS: VERB},
+ "MV": {POS: VERB, "VerbType": "mod"},
+ "PTK": {POS: PART},
+ "INTER": {POS: PART},
+ "NUM": {POS: NUM},
+ "_SP": {POS: SPACE},
+}
diff --git a/spacy/lang/lb/tokenizer_exceptions.py b/spacy/lang/lb/tokenizer_exceptions.py
new file mode 100644
index 000000000..18b58f2b1
--- /dev/null
+++ b/spacy/lang/lb/tokenizer_exceptions.py
@@ -0,0 +1,51 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...symbols import ORTH, LEMMA, NORM
+
+# TODO
+# treat other apostrophes within words as part of the word: [op d'mannst], [fir d'éischt] (= exceptions)
+
+_exc = {}
+
+# translate / delete what is not necessary
+for exc_data in [
+ {ORTH: "wgl.", LEMMA: "wann ech gelift", NORM: "wann ech gelieft"},
+ {ORTH: "M.", LEMMA: "Monsieur", NORM: "Monsieur"},
+ {ORTH: "Mme.", LEMMA: "Madame", NORM: "Madame"},
+ {ORTH: "Dr.", LEMMA: "Dokter", NORM: "Dokter"},
+ {ORTH: "Tel.", LEMMA: "Telefon", NORM: "Telefon"},
+ {ORTH: "asw.", LEMMA: "an sou weider", NORM: "an sou weider"},
+ {ORTH: "etc.", LEMMA: "et cetera", NORM: "et cetera"},
+ {ORTH: "bzw.", LEMMA: "bezéiungsweis", NORM: "bezéiungsweis"},
+ {ORTH: "Jan.", LEMMA: "Januar", NORM: "Januar"},
+]:
+ _exc[exc_data[ORTH]] = [exc_data]
+
+
+# to be extended
+for orth in [
+ "z.B.",
+ "Dipl.",
+ "Dr.",
+ "etc.",
+ "i.e.",
+ "o.k.",
+ "O.K.",
+ "p.a.",
+ "p.s.",
+ "P.S.",
+ "phil.",
+ "q.e.d.",
+ "R.I.P.",
+ "rer.",
+ "sen.",
+ "ë.a.",
+ "U.S.",
+ "U.S.A.",
+]:
+ _exc[orth] = [{ORTH: orth}]
+
+TOKENIZER_EXCEPTIONS = _exc
diff --git a/spacy/lang/nb/examples.py b/spacy/lang/nb/examples.py
index 72d6b5a71..c15426ded 100644
--- a/spacy/lang/nb/examples.py
+++ b/spacy/lang/nb/examples.py
@@ -11,8 +11,8 @@ Example sentences to test spaCy and its language models.
sentences = [
- "Apple vurderer å kjøpe britisk oppstartfirma for en milliard dollar",
- "Selvkjørende biler flytter forsikringsansvaret over på produsentene ",
- "San Francisco vurderer å forby robotbud på fortauene",
+ "Apple vurderer å kjøpe britisk oppstartfirma for en milliard dollar.",
+ "Selvkjørende biler flytter forsikringsansvaret over på produsentene.",
+ "San Francisco vurderer å forby robotbud på fortauene.",
"London er en stor by i Storbritannia.",
]
diff --git a/spacy/lang/sv/__init__.py b/spacy/lang/sv/__init__.py
index 55a0b97bc..671eefca0 100644
--- a/spacy/lang/sv/__init__.py
+++ b/spacy/lang/sv/__init__.py
@@ -14,6 +14,7 @@ from ..norm_exceptions import BASE_NORMS
from ...language import Language
from ...attrs import LANG, NORM
from ...util import update_exc, add_lookups
+from .syntax_iterators import SYNTAX_ITERATORS
class SwedishDefaults(Language.Defaults):
@@ -29,6 +30,7 @@ class SwedishDefaults(Language.Defaults):
suffixes = TOKENIZER_SUFFIXES
stop_words = STOP_WORDS
morph_rules = MORPH_RULES
+ syntax_iterators = SYNTAX_ITERATORS
class Swedish(Language):
diff --git a/spacy/lang/sv/syntax_iterators.py b/spacy/lang/sv/syntax_iterators.py
new file mode 100644
index 000000000..7a82e6b59
--- /dev/null
+++ b/spacy/lang/sv/syntax_iterators.py
@@ -0,0 +1,50 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...symbols import NOUN, PROPN, PRON
+
+
+def noun_chunks(obj):
+ """
+ Detect base noun phrases from a dependency parse. Works on both Doc and Span.
+ """
+ labels = [
+ "nsubj",
+ "nsubj:pass",
+ "dobj",
+ "obj",
+ "iobj",
+ "ROOT",
+ "appos",
+ "nmod",
+ "nmod:poss",
+ ]
+ doc = obj.doc # Ensure works on both Doc and Span.
+ np_deps = [doc.vocab.strings[label] for label in labels]
+ conj = doc.vocab.strings.add("conj")
+ np_label = doc.vocab.strings.add("NP")
+ seen = set()
+ for i, word in enumerate(obj):
+ if word.pos not in (NOUN, PROPN, PRON):
+ continue
+ # Prevent nested chunks from being produced
+ if word.i in seen:
+ continue
+ if word.dep in np_deps:
+ if any(w.i in seen for w in word.subtree):
+ continue
+ seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+ yield word.left_edge.i, word.right_edge.i + 1, np_label
+ elif word.dep == conj:
+ head = word.head
+ while head.dep == conj and head.head.i < head.i:
+ head = head.head
+ # If the head is an NP, and we're coordinated to it, we're an NP
+ if head.dep in np_deps:
+ if any(w.i in seen for w in word.subtree):
+ continue
+ seen.update(j for j in range(word.left_edge.i, word.right_edge.i + 1))
+ yield word.left_edge.i, word.right_edge.i + 1, np_label
+
+
+SYNTAX_ITERATORS = {"noun_chunks": noun_chunks}
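A usage sketch for the new Swedish noun chunk iterator; it requires a dependency parse, so a trained Swedish pipeline (the model name below is hypothetical) is assumed, and the sentence and chunking shown are illustrative:

import spacy

nlp = spacy.load("sv_pipeline")   # hypothetical Swedish model with a parser
doc = nlp("Jag har en röd bil")
print([chunk.text for chunk in doc.noun_chunks])
# e.g. ['Jag', 'en röd bil'], depending on the parse
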
diff --git a/spacy/lang/xx/examples.py b/spacy/lang/xx/examples.py
new file mode 100644
index 000000000..38cd5e0cd
--- /dev/null
+++ b/spacy/lang/xx/examples.py
@@ -0,0 +1,99 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+
+"""
+Example sentences to test spaCy and its language models.
+
+>>> from spacy.lang.de.examples import sentences
+>>> docs = nlp.pipe(sentences)
+"""
+
+# combined examples from de/en/es/fr/it/nl/pl/pt/ru
+
+sentences = [
+ "Die ganze Stadt ist ein Startup: Shenzhen ist das Silicon Valley für Hardware-Firmen",
+ "Wie deutsche Startups die Technologie vorantreiben wollen: Künstliche Intelligenz",
+ "Trend zum Urlaub in Deutschland beschert Gastwirten mehr Umsatz",
+ "Bundesanwaltschaft erhebt Anklage gegen mutmaßlichen Schweizer Spion",
+ "San Francisco erwägt Verbot von Lieferrobotern",
+ "Autonome Fahrzeuge verlagern Haftpflicht auf Hersteller",
+ "Wo bist du?",
+ "Was ist die Hauptstadt von Deutschland?",
+ "Apple is looking at buying U.K. startup for $1 billion",
+ "Autonomous cars shift insurance liability toward manufacturers",
+ "San Francisco considers banning sidewalk delivery robots",
+ "London is a big city in the United Kingdom.",
+ "Where are you?",
+ "Who is the president of France?",
+ "What is the capital of the United States?",
+ "When was Barack Obama born?",
+ "Apple está buscando comprar una startup del Reino Unido por mil millones de dólares.",
+ "Los coches autónomos delegan la responsabilidad del seguro en sus fabricantes.",
+ "San Francisco analiza prohibir los robots delivery.",
+ "Londres es una gran ciudad del Reino Unido.",
+ "El gato come pescado.",
+ "Veo al hombre con el telescopio.",
+ "La araña come moscas.",
+ "El pingüino incuba en su nido.",
+ "Apple cherche à acheter une start-up anglaise pour 1 milliard de dollars",
+ "Les voitures autonomes déplacent la responsabilité de l'assurance vers les constructeurs",
+ "San Francisco envisage d'interdire les robots coursiers sur les trottoirs",
+ "Londres est une grande ville du Royaume-Uni",
+ "L’Italie choisit ArcelorMittal pour reprendre la plus grande aciérie d’Europe",
+ "Apple lance HomePod parce qu'il se sent menacé par l'Echo d'Amazon",
+ "La France ne devrait pas manquer d'électricité cet été, même en cas de canicule",
+ "Nouvelles attaques de Trump contre le maire de Londres",
+ "Où es-tu ?",
+ "Qui est le président de la France ?",
+ "Où est la capitale des États-Unis ?",
+ "Quand est né Barack Obama ?",
+ "Apple vuole comprare una startup del Regno Unito per un miliardo di dollari",
+ "Le automobili a guida autonoma spostano la responsabilità assicurativa verso i produttori",
+ "San Francisco prevede di bandire i robot di consegna porta a porta",
+ "Londra è una grande città del Regno Unito.",
+ "Apple overweegt om voor 1 miljard een U.K. startup te kopen",
+ "Autonome auto's verschuiven de verzekeringverantwoordelijkheid naar producenten",
+ "San Francisco overweegt robots op voetpaden te verbieden",
+ "Londen is een grote stad in het Verenigd Koninkrijk",
+ "Poczuł przyjemną woń mocnej kawy.",
+ "Istnieje wiele dróg oddziaływania substancji psychoaktywnej na układ nerwowy.",
+ "Powitał mnie biało-czarny kot, płosząc siedzące na płocie trzy dorodne dudki.",
+ "Nowy abonament pod lupą Komisji Europejskiej",
+ "Czy w ciągu ostatnich 48 godzin spożyłeś leki zawierające paracetamol?",
+ "Kto ma ochotę zapoznać się z innymi niż w książkach przygodami Muminków i ich przyjaciół, temu polecam komiks Tove Jansson „Muminki i morze”.",
+ "Apple está querendo comprar uma startup do Reino Unido por 100 milhões de dólares.",
+ "Carros autônomos empurram a responsabilidade do seguro para os fabricantes.."
+ "São Francisco considera banir os robôs de entrega que andam pelas calçadas.",
+ "Londres é a maior cidade do Reino Unido.",
+ # Translations from English:
+ "Apple рассматривает возможность покупки стартапа из Соединённого Королевства за $1 млрд",
+ "Беспилотные автомобили перекладывают страховую ответственность на производителя",
+ "В Сан-Франциско рассматривается возможность запрета роботов-курьеров, которые перемещаются по тротуару",
+ "Лондон — это большой город в Соединённом Королевстве",
+ # Native Russian sentences:
+ # Colloquial:
+ "Да, нет, наверное!", # Typical polite refusal
+ "Обратите внимание на необыкновенную красоту этого города-героя Москвы, столицы нашей Родины!", # From a tour guide speech
+ # Examples of Bookish Russian:
+ # Quote from "The Golden Calf"
+ "Рио-де-Жанейро — это моя мечта, и не смейте касаться её своими грязными лапами!",
+ # Quotes from "Ivan Vasilievich changes his occupation"
+ "Ты пошто боярыню обидел, смерд?!!",
+ "Оставь меня, старушка, я в печали!",
+ # Quotes from Dostoevsky:
+ "Уж коли я, такой же, как и ты, человек грешный, над тобой умилился и пожалел тебя, кольми паче бог",
+ "В мечтах я нередко, говорит, доходил до страстных помыслов о служении человечеству и может быть действительно пошел бы на крест за людей, если б это вдруг как-нибудь потребовалось, а между тем я двух дней не в состоянии прожить ни с кем в одной комнате, о чем знаю из опыта",
+ "Зато всегда так происходило, что чем более я ненавидел людей в частности, тем пламеннее становилась любовь моя к человечеству вообще",
+ # Quotes from Chekhov:
+ "Ненужные дела и разговоры всё об одном отхватывают на свою долю лучшую часть времени, лучшие силы, и в конце концов остается какая-то куцая, бескрылая жизнь, какая-то чепуха, и уйти и бежать нельзя, точно сидишь в сумасшедшем доме или в арестантских ротах!",
+ # Quotes from Turgenev:
+ "Нравится тебе женщина, старайся добиться толку; а нельзя — ну, не надо, отвернись — земля не клином сошлась",
+ "Узенькое местечко, которое я занимаю, до того крохотно в сравнении с остальным пространством, где меня нет и где дела до меня нет; и часть времени, которую мне удастся прожить, так ничтожна перед вечностью, где меня не было и не будет...",
+ # Quotes from newspapers:
+ # Komsomolskaya Pravda:
+ "На заседании президиума правительства Москвы принято решение присвоить статус инвестиционного приоритетного проекта города Москвы киностудии Союзмультфильм",
+ "Глава Минобороны Сергей Шойгу заявил, что обстановка на этом стратегическом направлении требует непрерывного совершенствования боевого состава войск",
+ # Argumenty i Facty:
+ "На реплику лже-Говина — дескать, он (Волков) будет лучшим революционером — Стамп с энтузиазмом ответил: Непременно!",
+]
diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py
index 91daea099..5bd7b7335 100644
--- a/spacy/lang/zh/__init__.py
+++ b/spacy/lang/zh/__init__.py
@@ -4,19 +4,92 @@ from __future__ import unicode_literals
from ...attrs import LANG
from ...language import Language
from ...tokens import Doc
+from ...util import DummyTokenizer
from ..tokenizer_exceptions import BASE_EXCEPTIONS
+from .lex_attrs import LEX_ATTRS
from .stop_words import STOP_WORDS
from .tag_map import TAG_MAP
+def try_jieba_import(use_jieba):
+ try:
+ import jieba
+ return jieba
+ except ImportError:
+ if use_jieba:
+ msg = (
+ "Jieba not installed. Either set Chinese.use_jieba = False, "
+                "or install it from https://github.com/fxsjy/jieba"
+ )
+ raise ImportError(msg)
+
+
+class ChineseTokenizer(DummyTokenizer):
+ def __init__(self, cls, nlp=None):
+ self.vocab = nlp.vocab if nlp is not None else cls.create_vocab(nlp)
+ self.use_jieba = cls.use_jieba
+ self.jieba_seg = try_jieba_import(self.use_jieba)
+ self.tokenizer = Language.Defaults().create_tokenizer(nlp)
+
+ def __call__(self, text):
+ # use jieba
+ if self.use_jieba:
+            jieba_words = [x for x in self.jieba_seg.cut(text, cut_all=False) if x]
+ words = [jieba_words[0]]
+ spaces = [False]
+ for i in range(1, len(jieba_words)):
+ word = jieba_words[i]
+ if word.isspace():
+ # second token in adjacent whitespace following a
+ # non-space token
+ if spaces[-1]:
+ words.append(word)
+ spaces.append(False)
+ # first space token following non-space token
+ elif word == " " and not words[-1].isspace():
+ spaces[-1] = True
+ # token is non-space whitespace or any whitespace following
+ # a whitespace token
+ else:
+ # extend previous whitespace token with more whitespace
+ if words[-1].isspace():
+ words[-1] += word
+ # otherwise it's a new whitespace token
+ else:
+ words.append(word)
+ spaces.append(False)
+ else:
+ words.append(word)
+ spaces.append(False)
+ return Doc(self.vocab, words=words, spaces=spaces)
+
+ # split into individual characters
+ words = []
+ spaces = []
+ for token in self.tokenizer(text):
+ if token.text.isspace():
+ words.append(token.text)
+ spaces.append(False)
+ else:
+ words.extend(list(token.text))
+ spaces.extend([False] * len(token.text))
+ spaces[-1] = bool(token.whitespace_)
+ return Doc(self.vocab, words=words, spaces=spaces)
+
+
class ChineseDefaults(Language.Defaults):
lex_attr_getters = dict(Language.Defaults.lex_attr_getters)
+ lex_attr_getters.update(LEX_ATTRS)
lex_attr_getters[LANG] = lambda text: "zh"
- use_jieba = True
tokenizer_exceptions = BASE_EXCEPTIONS
stop_words = STOP_WORDS
tag_map = TAG_MAP
writing_system = {"direction": "ltr", "has_case": False, "has_letters": False}
+ use_jieba = True
+
+ @classmethod
+ def create_tokenizer(cls, nlp=None):
+ return ChineseTokenizer(cls, nlp)
class Chinese(Language):
@@ -24,26 +97,7 @@ class Chinese(Language):
Defaults = ChineseDefaults # override defaults
def make_doc(self, text):
- if self.Defaults.use_jieba:
- try:
- import jieba
- except ImportError:
- msg = (
- "Jieba not installed. Either set Chinese.use_jieba = False, "
- "or install it https://github.com/fxsjy/jieba"
- )
- raise ImportError(msg)
- words = list(jieba.cut(text, cut_all=False))
- words = [x for x in words if x]
- return Doc(self.vocab, words=words, spaces=[False] * len(words))
- else:
- words = []
- spaces = []
- for token in self.tokenizer(text):
- words.extend(list(token.text))
- spaces.extend([False] * len(token.text))
- spaces[-1] = bool(token.whitespace_)
- return Doc(self.vocab, words=words, spaces=spaces)
+ return self.tokenizer(text)
__all__ = ["Chinese"]
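A hedged usage sketch of the refactored Chinese tokenizer: with use_jieba left at its default of True, jieba must be installed; setting it to False on the Defaults before instantiation falls back to character-level segmentation:

from spacy.lang.zh import Chinese

Chinese.Defaults.use_jieba = False   # fall back to character-level segmentation
nlp = Chinese()
doc = nlp.make_doc("我爱北京")
print([t.text for t in doc])         # ['我', '爱', '北', '京']
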
diff --git a/spacy/lang/zh/tag_map.py b/spacy/lang/zh/tag_map.py
index 8d2f99d01..41e2d2158 100644
--- a/spacy/lang/zh/tag_map.py
+++ b/spacy/lang/zh/tag_map.py
@@ -1,11 +1,12 @@
# coding: utf8
from __future__ import unicode_literals
-from ...symbols import POS, PUNCT, ADJ, CONJ, CCONJ, NUM, DET, ADV, ADP, X, VERB
-from ...symbols import NOUN, PART, INTJ, PRON
+from ...symbols import POS, PUNCT, ADJ, SCONJ, CCONJ, NUM, DET, ADV, ADP, X
+from ...symbols import NOUN, PART, INTJ, PRON, VERB, SPACE
-# The Chinese part-of-speech tagger uses the OntoNotes 5 version of the Penn Treebank tag set.
-# We also map the tags to the simpler Google Universal POS tag set.
+# The Chinese part-of-speech tagger uses the OntoNotes 5 version of the Penn
+# Treebank tag set. We also map the tags to the simpler Universal Dependencies
+# v2 tag set.
TAG_MAP = {
"AS": {POS: PART},
@@ -38,10 +39,11 @@ TAG_MAP = {
"OD": {POS: NUM},
"DT": {POS: DET},
"CC": {POS: CCONJ},
- "CS": {POS: CONJ},
+ "CS": {POS: SCONJ},
"AD": {POS: ADV},
"JJ": {POS: ADJ},
"P": {POS: ADP},
"PN": {POS: PRON},
"PU": {POS: PUNCT},
+ "_SP": {POS: SPACE},
}
diff --git a/spacy/language.py b/spacy/language.py
index b2a81fc60..266a1727d 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -3,6 +3,7 @@ from __future__ import absolute_import, unicode_literals
import random
import itertools
+from spacy.util import minibatch
import weakref
import functools
from collections import OrderedDict
@@ -10,18 +11,15 @@ from contextlib import contextmanager
from copy import copy, deepcopy
from thinc.neural import Model
import srsly
+import multiprocessing as mp
+from itertools import chain, cycle
from .tokenizer import Tokenizer
from .vocab import Vocab
from .lemmatizer import Lemmatizer
from .lookups import Lookups
-from .pipeline import DependencyParser, Tagger
-from .pipeline import Tensorizer, EntityRecognizer, EntityLinker
-from .pipeline import SimilarityHook, TextCategorizer, Sentencizer
-from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens
-from .pipeline import EntityRuler
-from .pipeline import Morphologizer
-from .compat import izip, basestring_
+from .analysis import analyze_pipes, analyze_all_pipes, validate_attrs
+from .compat import izip, basestring_, is_python2, class_types
from .gold import GoldParse
from .scorer import Scorer
from ._ml import link_vectors_to_models, create_default_optimizer
@@ -30,12 +28,16 @@ from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES
from .lang.punctuation import TOKENIZER_INFIXES
from .lang.tokenizer_exceptions import TOKEN_MATCH
from .lang.tag_map import TAG_MAP
+from .tokens import Doc
from .lang.lex_attrs import LEX_ATTRS, is_stop
-from .errors import Errors, Warnings, deprecation_warning
+from .errors import Errors, Warnings, deprecation_warning, user_warning
from . import util
from . import about
+ENABLE_PIPELINE_ANALYSIS = False
+
+
class BaseDefaults(object):
@classmethod
def create_lemmatizer(cls, nlp=None, lookups=None):
@@ -49,8 +51,8 @@ class BaseDefaults(object):
filenames = {name: root / filename for name, filename in cls.resources}
if LANG in cls.lex_attr_getters:
lang = cls.lex_attr_getters[LANG](None)
- user_lookups = util.get_entry_point(util.ENTRY_POINTS.lookups, lang, {})
- filenames.update(user_lookups)
+ if lang in util.registry.lookups:
+ filenames.update(util.registry.lookups.get(lang))
lookups = Lookups()
for name, filename in filenames.items():
data = util.load_language_data(filename)
@@ -106,10 +108,6 @@ class BaseDefaults(object):
tag_map = dict(TAG_MAP)
tokenizer_exceptions = {}
stop_words = set()
- lemma_rules = {}
- lemma_exc = {}
- lemma_index = {}
- lemma_lookup = {}
morph_rules = {}
lex_attr_getters = LEX_ATTRS
syntax_iterators = {}
@@ -133,22 +131,7 @@ class Language(object):
Defaults = BaseDefaults
lang = None
- factories = {
- "tokenizer": lambda nlp: nlp.Defaults.create_tokenizer(nlp),
- "tensorizer": lambda nlp, **cfg: Tensorizer(nlp.vocab, **cfg),
- "tagger": lambda nlp, **cfg: Tagger(nlp.vocab, **cfg),
- "morphologizer": lambda nlp, **cfg: Morphologizer(nlp.vocab, **cfg),
- "parser": lambda nlp, **cfg: DependencyParser(nlp.vocab, **cfg),
- "ner": lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg),
- "entity_linker": lambda nlp, **cfg: EntityLinker(nlp.vocab, **cfg),
- "similarity": lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg),
- "textcat": lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg),
- "sentencizer": lambda nlp, **cfg: Sentencizer(**cfg),
- "merge_noun_chunks": lambda nlp, **cfg: merge_noun_chunks,
- "merge_entities": lambda nlp, **cfg: merge_entities,
- "merge_subtokens": lambda nlp, **cfg: merge_subtokens,
- "entity_ruler": lambda nlp, **cfg: EntityRuler(nlp, **cfg),
- }
+ factories = {"tokenizer": lambda nlp: nlp.Defaults.create_tokenizer(nlp)}
def __init__(
self, vocab=True, make_doc=True, max_length=10 ** 6, meta={}, **kwargs
@@ -172,7 +155,7 @@ class Language(object):
100,000 characters in one text.
RETURNS (Language): The newly constructed object.
"""
- user_factories = util.get_entry_points(util.ENTRY_POINTS.factories)
+ user_factories = util.registry.factories.get_all()
self.factories.update(user_factories)
self._meta = dict(meta)
self._path = None
@@ -218,6 +201,7 @@ class Language(object):
"name": self.vocab.vectors.name,
}
self._meta["pipeline"] = self.pipe_names
+ self._meta["factories"] = self.pipe_factories
self._meta["labels"] = self.pipe_labels
return self._meta
@@ -259,6 +243,17 @@ class Language(object):
"""
return [pipe_name for pipe_name, _ in self.pipeline]
+ @property
+ def pipe_factories(self):
+ """Get the component factories for the available pipeline components.
+
+ RETURNS (dict): Factory names, keyed by component names.
+ """
+ factories = {}
+ for pipe_name, pipe in self.pipeline:
+ factories[pipe_name] = getattr(pipe, "factory", pipe_name)
+ return factories
+
@property
def pipe_labels(self):
"""Get the labels set by the pipeline components, if available (if
@@ -327,33 +322,30 @@ class Language(object):
msg += Errors.E004.format(component=component)
raise ValueError(msg)
if name is None:
- if hasattr(component, "name"):
- name = component.name
- elif hasattr(component, "__name__"):
- name = component.__name__
- elif hasattr(component, "__class__") and hasattr(
- component.__class__, "__name__"
- ):
- name = component.__class__.__name__
- else:
- name = repr(component)
+ name = util.get_component_name(component)
if name in self.pipe_names:
raise ValueError(Errors.E007.format(name=name, opts=self.pipe_names))
if sum([bool(before), bool(after), bool(first), bool(last)]) >= 2:
raise ValueError(Errors.E006)
+ pipe_index = 0
pipe = (name, component)
if last or not any([first, before, after]):
+ pipe_index = len(self.pipeline)
self.pipeline.append(pipe)
elif first:
self.pipeline.insert(0, pipe)
elif before and before in self.pipe_names:
+ pipe_index = self.pipe_names.index(before)
self.pipeline.insert(self.pipe_names.index(before), pipe)
elif after and after in self.pipe_names:
+ pipe_index = self.pipe_names.index(after) + 1
self.pipeline.insert(self.pipe_names.index(after) + 1, pipe)
else:
raise ValueError(
Errors.E001.format(name=before or after, opts=self.pipe_names)
)
+ if ENABLE_PIPELINE_ANALYSIS:
+ analyze_pipes(self.pipeline, name, component, pipe_index)
def has_pipe(self, name):
"""Check if a component name is present in the pipeline. Equivalent to
@@ -382,6 +374,8 @@ class Language(object):
msg += Errors.E135.format(name=name)
raise ValueError(msg)
self.pipeline[self.pipe_names.index(name)] = (name, component)
+ if ENABLE_PIPELINE_ANALYSIS:
+ analyze_all_pipes(self.pipeline)
def rename_pipe(self, old_name, new_name):
"""Rename a pipeline component.
@@ -408,7 +402,10 @@ class Language(object):
"""
if name not in self.pipe_names:
raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names))
- return self.pipeline.pop(self.pipe_names.index(name))
+ removed = self.pipeline.pop(self.pipe_names.index(name))
+ if ENABLE_PIPELINE_ANALYSIS:
+ analyze_all_pipes(self.pipeline)
+ return removed
def __call__(self, text, disable=[], component_cfg=None):
"""Apply the pipeline to some text. The text can span multiple sentences,
@@ -448,6 +445,8 @@ class Language(object):
DOCS: https://spacy.io/api/language#disable_pipes
"""
+ if len(names) == 1 and isinstance(names[0], (list, tuple)):
+ names = names[0] # support list of names instead of spread
return DisabledPipes(self, *names)
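Both call styles are now accepted; the component names below are examples and must exist in the pipeline:

with nlp.disable_pipes("tagger", "parser"):    # spread form, as before
    doc = nlp("A sentence.")
with nlp.disable_pipes(["tagger", "parser"]):  # list form, newly supported
    doc = nlp("A sentence.")
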
def make_doc(self, text):
@@ -477,7 +476,7 @@ class Language(object):
docs (iterable): A batch of `Doc` objects.
golds (iterable): A batch of `GoldParse` objects.
- drop (float): The droput rate.
+ drop (float): The dropout rate.
sgd (callable): An optimizer.
losses (dict): Dictionary to update with the loss, keyed by component.
component_cfg (dict): Config parameters for specific pipeline
@@ -525,7 +524,7 @@ class Language(object):
even if you're updating it with a smaller set of examples.
docs (iterable): A batch of `Doc` objects.
- drop (float): The droput rate.
+ drop (float): The dropout rate.
sgd (callable): An optimizer.
RETURNS (dict): Results from the update.
@@ -679,7 +678,7 @@ class Language(object):
kwargs = component_cfg.get(name, {})
kwargs.setdefault("batch_size", batch_size)
if not hasattr(pipe, "pipe"):
- docs = _pipe(pipe, docs, kwargs)
+ docs = _pipe(docs, pipe, kwargs)
else:
docs = pipe.pipe(docs, **kwargs)
for doc, gold in zip(docs, golds):
@@ -733,6 +732,7 @@ class Language(object):
disable=[],
cleanup=False,
component_cfg=None,
+ n_process=1,
):
"""Process texts as a stream, and yield `Doc` objects in order.
@@ -746,12 +746,21 @@ class Language(object):
use. Experimental.
component_cfg (dict): An optional dictionary with extra keyword
arguments for specific components.
+        n_process (int): Number of processes to use when processing texts; only
+            supported in Python 3. If -1, `multiprocessing.cpu_count()` is used.
YIELDS (Doc): Documents in the order of the original text.
DOCS: https://spacy.io/api/language#pipe
"""
+        # raw_texts will be used later to stop the iterator.
+ texts, raw_texts = itertools.tee(texts)
+ if is_python2 and n_process != 1:
+ user_warning(Warnings.W023)
+ n_process = 1
if n_threads != -1:
deprecation_warning(Warnings.W016)
+ if n_process == -1:
+ n_process = mp.cpu_count()
if as_tuples:
text_context1, text_context2 = itertools.tee(texts)
texts = (tc[0] for tc in text_context1)
@@ -760,14 +769,18 @@ class Language(object):
texts,
batch_size=batch_size,
disable=disable,
+ n_process=n_process,
component_cfg=component_cfg,
)
for doc, context in izip(docs, contexts):
yield (doc, context)
return
- docs = (self.make_doc(text) for text in texts)
if component_cfg is None:
component_cfg = {}
+
+        # pipes holds functools.partial objects so that multiprocessing
+        # workers can be created easily.
+        pipes = []
for name, proc in self.pipeline:
if name in disable:
continue
@@ -775,10 +788,20 @@ class Language(object):
# Allow component_cfg to overwrite the top-level kwargs.
kwargs.setdefault("batch_size", batch_size)
if hasattr(proc, "pipe"):
- docs = proc.pipe(docs, **kwargs)
+ f = functools.partial(proc.pipe, **kwargs)
else:
# Apply the function, but yield the doc
- docs = _pipe(proc, docs, kwargs)
+ f = functools.partial(_pipe, proc=proc, kwargs=kwargs)
+ pipes.append(f)
+
+ if n_process != 1:
+ docs = self._multiprocessing_pipe(texts, pipes, n_process, batch_size)
+ else:
+ # if n_process == 1, no processes are forked.
+ docs = (self.make_doc(text) for text in texts)
+ for pipe in pipes:
+ docs = pipe(docs)
+
# Track weakrefs of "recent" documents, so that we can see when they
# expire from memory. When they do, we know we don't need old strings.
# This way, we avoid maintaining an unbounded growth in string entries
@@ -809,6 +832,46 @@ class Language(object):
self.tokenizer._reset_cache(keys)
nr_seen = 0
+ def _multiprocessing_pipe(self, texts, pipes, n_process, batch_size):
+ # raw_texts is used later to stop iteration.
+ texts, raw_texts = itertools.tee(texts)
+ # for sending texts to the workers
+ texts_q = [mp.Queue() for _ in range(n_process)]
+ # for receiving byte-encoded docs from the workers
+ bytedocs_recv_ch, bytedocs_send_ch = zip(
+ *[mp.Pipe(False) for _ in range(n_process)]
+ )
+
+ batch_texts = minibatch(texts, batch_size)
+ # Sender sends texts to the workers.
+ # This is necessary to properly handle texts of potentially infinite length,
+ # since all the data cannot be sent to the workers at once.
+ sender = _Sender(batch_texts, texts_q, chunk_size=n_process)
+ # send twice to keep the worker processes busy
+ sender.send()
+ sender.send()
+
+ procs = [
+ mp.Process(target=_apply_pipes, args=(self.make_doc, pipes, rch, sch))
+ for rch, sch in zip(texts_q, bytedocs_send_ch)
+ ]
+ for proc in procs:
+ proc.start()
+
+ # Cycle the channels so that the order of the docs is preserved.
+ # Each received object is a batch of byte-encoded docs, so flatten them with chain.from_iterable.
+ byte_docs = chain.from_iterable(recv.recv() for recv in cycle(bytedocs_recv_ch))
+ docs = (Doc(self.vocab).from_bytes(byte_doc) for byte_doc in byte_docs)
+ try:
+ for i, (_, doc) in enumerate(zip(raw_texts, docs), 1):
+ yield doc
+ if i % batch_size == 0:
+ # tell `sender` that one batch was consumed.
+ sender.step()
+ finally:
+ for proc in procs:
+ proc.terminate()
+
def to_disk(self, path, exclude=tuple(), disable=None):
"""Save the current state to a directory. If a model is loaded, this
will include the model.
@@ -936,6 +999,52 @@ class Language(object):
return self
+class component(object):
+ """Decorator for pipeline components. Can decorate both function components
+ and class components and will automatically register components in the
+ Language.factories. If the component is a class and needs access to the
+ nlp object or config parameters, it can expose a from_nlp classmethod
+ that takes the nlp object and **cfg arguments and returns the initialized
+ component.
+ """
+
+ # NB: This decorator needs to live here, because it needs to write to
+ # Language.factories. All other solutions would cause a circular import.
+
+ def __init__(self, name=None, assigns=tuple(), requires=tuple(), retokenizes=False):
+ """Decorate a pipeline component.
+
+ name (unicode): Default component and factory name.
+ assigns (list): Attributes assigned by component, e.g. `["token.pos"]`.
+ requires (list): Attributes required by component, e.g. `["token.dep"]`.
+ retokenizes (bool): Whether the component changes the tokenization.
+ """
+ self.name = name
+ self.assigns = validate_attrs(assigns)
+ self.requires = validate_attrs(requires)
+ self.retokenizes = retokenizes
+
+ def __call__(self, *args, **kwargs):
+ obj = args[0]
+ args = args[1:]
+ factory_name = self.name or util.get_component_name(obj)
+ obj.name = factory_name
+ obj.factory = factory_name
+ obj.assigns = self.assigns
+ obj.requires = self.requires
+ obj.retokenizes = self.retokenizes
+
+ def factory(nlp, **cfg):
+ if hasattr(obj, "from_nlp"):
+ return obj.from_nlp(nlp, **cfg)
+ elif isinstance(obj, class_types):
+ return obj()
+ return obj
+
+ Language.factories[obj.factory] = factory
+ return obj
+
+
def _fix_pretrained_vectors_name(nlp):
# TODO: Replace this once we handle vectors consistently as static
# data
@@ -987,12 +1096,56 @@ class DisabledPipes(list):
self[:] = []
-def _pipe(func, docs, kwargs):
+def _pipe(docs, proc, kwargs):
# We added some args for pipe that __call__ doesn't expect.
kwargs = dict(kwargs)
for arg in ["n_threads", "batch_size"]:
if arg in kwargs:
kwargs.pop(arg)
for doc in docs:
- doc = func(doc, **kwargs)
+ doc = proc(doc, **kwargs)
yield doc
+
+
+def _apply_pipes(make_doc, pipes, receiver, sender):
+ """Worker for Language.pipe
+
+ make_doc (callable): Function to create a Doc from raw text.
+ pipes (list): The pipeline callables to apply to each Doc.
+ receiver (multiprocessing.Queue): Queue to receive batches of raw texts,
+ filled by `_Sender`.
+ sender (multiprocessing.Connection): Pipe to send the byte-encoded docs back.
+ Usually created by `multiprocessing.Pipe()`
+ """
+ while True:
+ texts = receiver.get()
+ docs = (make_doc(text) for text in texts)
+ for pipe in pipes:
+ docs = pipe(docs)
+ # Connection does not accept unpicklable objects, so send a list of byte-encoded docs.
+ sender.send([doc.to_bytes() for doc in docs])
+
+
+class _Sender:
+ """Util for sending data to multiprocessing workers in Language.pipe"""
+
+ def __init__(self, data, queues, chunk_size):
+ self.data = iter(data)
+ self.queues = iter(cycle(queues))
+ self.chunk_size = chunk_size
+ self.count = 0
+
+ def send(self):
+ """Send chunk_size items from self.data to channels."""
+ for item, q in itertools.islice(
+ zip(self.data, cycle(self.queues)), self.chunk_size
+ ):
+ # cycle the channels to distribute the texts evenly
+ q.put(item)
+
+ def step(self):
+ """Tell sender that comsumed one item.
+
+ Data is sent to the workers after every chunk_size calls."""
+ self.count += 1
+ if self.count >= self.chunk_size:
+ self.count = 0
+ self.send()
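
The multiprocessing machinery above is driven entirely by the new `n_process` argument to `Language.pipe`; the queues, pipes and `_Sender` bookkeeping stay internal. A minimal usage sketch, assuming Python 3 and an installed `en_core_web_sm` model (any pipeline package would do):

    import spacy

    nlp = spacy.load("en_core_web_sm")
    texts = ["This is a text to process."] * 10000

    # Each worker runs make_doc plus the functools.partial-wrapped pipes and
    # streams byte-encoded Docs back; document order is preserved.
    for doc in nlp.pipe(texts, n_process=2, batch_size=1000):
        pass  # consume the Doc objects, e.g. collect doc.ents
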
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 5b88e8fcc..5c981bc25 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -375,7 +375,7 @@ cdef class Lexeme:
Lexeme.c_set_flag(self.c, IS_STOP, x)
property is_alpha:
- """RETURNS (bool): Whether the lexeme consists of alphanumeric
+ """RETURNS (bool): Whether the lexeme consists of alphabetic
characters. Equivalent to `lexeme.text.isalpha()`.
"""
def __get__(self):
diff --git a/spacy/matcher/_schemas.py b/spacy/matcher/_schemas.py
index 471e2b7b5..1b10f0dd5 100644
--- a/spacy/matcher/_schemas.py
+++ b/spacy/matcher/_schemas.py
@@ -111,7 +111,7 @@ TOKEN_PATTERN_SCHEMA = {
"$ref": "#/definitions/integer_value",
},
"IS_ALPHA": {
- "title": "Token consists of alphanumeric characters",
+ "title": "Token consists of alphabetic characters",
"$ref": "#/definitions/boolean_value",
},
"IS_ASCII": {
diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx
index b58d36d62..56d27024d 100644
--- a/spacy/matcher/dependencymatcher.pyx
+++ b/spacy/matcher/dependencymatcher.pyx
@@ -102,7 +102,10 @@ cdef class DependencyMatcher:
visitedNodes[relation["SPEC"]["NBOR_NAME"]] = True
idx = idx + 1
- def add(self, key, on_match, *patterns):
+ def add(self, key, patterns, *_patterns, on_match=None):
+ if patterns is None or hasattr(patterns, "__call__"): # old API
+ on_match = patterns
+ patterns = _patterns
for pattern in patterns:
if len(pattern) == 0:
raise ValueError(Errors.E012.format(key=key))
@@ -237,7 +240,7 @@ cdef class DependencyMatcher:
for i, (ent_id, nodes) in enumerate(matched_key_trees):
on_match = self._callbacks.get(ent_id)
if on_match is not None:
- on_match(self, doc, i, matches)
+ on_match(self, doc, i, matched_key_trees)
return matched_key_trees
def recurse(self,tree,id_to_position,_node_operator_map,int patternLength,visitedNodes,matched_trees):
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 950a7b977..6f6848102 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -74,7 +74,7 @@ cdef class Matcher:
"""
return self._normalize_key(key) in self._patterns
- def add(self, key, on_match, *patterns):
+ def add(self, key, patterns, *_patterns, on_match=None):
"""Add a match-rule to the matcher. A match-rule consists of: an ID
key, an on_match callback, and one or more patterns.
@@ -98,16 +98,29 @@ cdef class Matcher:
operator will behave non-greedily. This quirk in the semantics makes
the matcher more efficient, by avoiding the need for back-tracking.
+ As of spaCy v2.2.2, Matcher.add supports the future API, which makes
+ the patterns the second argument, provided as a list (instead of a
+ variable number of arguments). The on_match callback becomes an
+ optional keyword argument.
+
key (unicode): The match ID.
- on_match (callable): Callback executed on match.
- *patterns (list): List of token descriptions.
+ patterns (list): The patterns to add for the given key.
+ on_match (callable): Optional callback executed on match.
+ *_patterns (list): For backwards compatibility: list of patterns to add
+ as variable arguments. Will be ignored if a list of patterns is
+ provided as the second argument.
"""
errors = {}
if on_match is not None and not hasattr(on_match, "__call__"):
raise ValueError(Errors.E171.format(arg_type=type(on_match)))
+ if patterns is None or hasattr(patterns, "__call__"): # old API
+ on_match = patterns
+ patterns = _patterns
for i, pattern in enumerate(patterns):
if len(pattern) == 0:
raise ValueError(Errors.E012.format(key=key))
+ if not isinstance(pattern, list):
+ raise ValueError(Errors.E178.format(pat=pattern, key=key))
if self.validator:
errors[i] = validate_json(pattern, self.validator)
if any(err for err in errors.values()):
@@ -133,13 +146,15 @@ cdef class Matcher:
key (unicode): The ID of the match rule.
"""
- key = self._normalize_key(key)
- self._patterns.pop(key)
- self._callbacks.pop(key)
+ norm_key = self._normalize_key(key)
+ if not norm_key in self._patterns:
+ raise ValueError(Errors.E175.format(key=key))
+ self._patterns.pop(norm_key)
+ self._callbacks.pop(norm_key)
cdef int i = 0
while i < self.patterns.size():
- pattern_key = get_pattern_key(self.patterns.at(i))
- if pattern_key == key:
+ pattern_key = get_ent_id(self.patterns.at(i))
+ if pattern_key == norm_key:
self.patterns.erase(self.patterns.begin()+i)
else:
i += 1
@@ -252,7 +267,12 @@ cdef find_matches(TokenPatternC** patterns, int n, Doc doc, extensions=None,
cdef PatternStateC state
cdef int i, j, nr_extra_attr
cdef Pool mem = Pool()
- predicate_cache = mem.alloc(doc.length * len(predicates), sizeof(char))
+ output = []
+ if doc.length == 0:
+ # avoid any processing or mem alloc if the document is empty
+ return output
+ if len(predicates) > 0:
+ predicate_cache = mem.alloc(doc.length * len(predicates), sizeof(char))
if extensions is not None and len(extensions) >= 1:
nr_extra_attr = max(extensions.values()) + 1
extra_attr_values = mem.alloc(doc.length * nr_extra_attr, sizeof(attr_t))
@@ -276,7 +296,6 @@ cdef find_matches(TokenPatternC** patterns, int n, Doc doc, extensions=None,
predicate_cache += len(predicates)
# Handle matches that end in 0-width patterns
finish_states(matches, states)
- output = []
seen = set()
for i in range(matches.size()):
match = (
@@ -293,18 +312,6 @@ cdef find_matches(TokenPatternC** patterns, int n, Doc doc, extensions=None,
return output
-cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
- # There have been a few bugs here.
- # The code was originally designed to always have pattern[1].attrs.value
- # be the ent_id when we get to the end of a pattern. However, Issue #2671
- # showed this wasn't the case when we had a reject-and-continue before a
- # match.
- # The patch to #2671 was wrong though, which came up in #3839.
- while pattern.attrs.attr != ID:
- pattern += 1
- return pattern.attrs.value
-
-
cdef void transition_states(vector[PatternStateC]& states, vector[MatchC]& matches,
char* cached_py_predicates,
Token token, const attr_t* extra_attrs, py_predicates) except *:
@@ -533,9 +540,10 @@ cdef char get_is_match(PatternStateC state,
if predicate_matches[state.pattern.py_predicates[i]] == -1:
return 0
spec = state.pattern
- for attr in spec.attrs[:spec.nr_attr]:
- if get_token_attr(token, attr.attr) != attr.value:
- return 0
+ if spec.nr_attr > 0:
+ for attr in spec.attrs[:spec.nr_attr]:
+ if get_token_attr(token, attr.attr) != attr.value:
+ return 0
for i in range(spec.nr_extra_attr):
if spec.extra_attrs[i].value != extra_attrs[spec.extra_attrs[i].index]:
return 0
@@ -543,7 +551,11 @@ cdef char get_is_match(PatternStateC state,
cdef char get_is_final(PatternStateC state) nogil:
- if state.pattern[1].attrs[0].attr == ID and state.pattern[1].nr_attr == 0:
+ if state.pattern[1].nr_attr == 0 and state.pattern[1].attrs != NULL:
+ id_attr = state.pattern[1].attrs[0]
+ if id_attr.attr != ID:
+ with gil:
+ raise ValueError(Errors.E074.format(attr=ID, bad_attr=id_attr.attr))
return 1
else:
return 0
@@ -558,22 +570,27 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, object token_specs)
cdef int i, index
for i, (quantifier, spec, extensions, predicates) in enumerate(token_specs):
pattern[i].quantifier = quantifier
- pattern[i].attrs = mem.alloc(len(spec), sizeof(AttrValueC))
+ # Ensure attrs refers to a null pointer if nr_attr == 0
+ if len(spec) > 0:
+ pattern[i].attrs = mem.alloc(len(spec), sizeof(AttrValueC))
pattern[i].nr_attr = len(spec)
for j, (attr, value) in enumerate(spec):
pattern[i].attrs[j].attr = attr
pattern[i].attrs[j].value = value
- pattern[i].extra_attrs = mem.alloc(len(extensions), sizeof(IndexValueC))
+ if len(extensions) > 0:
+ pattern[i].extra_attrs = mem.alloc(len(extensions), sizeof(IndexValueC))
for j, (index, value) in enumerate(extensions):
pattern[i].extra_attrs[j].index = index
pattern[i].extra_attrs[j].value = value
pattern[i].nr_extra_attr = len(extensions)
- pattern[i].py_predicates = mem.alloc(len(predicates), sizeof(int32_t))
+ if len(predicates) > 0:
+ pattern[i].py_predicates = mem.alloc(len(predicates), sizeof(int32_t))
for j, index in enumerate(predicates):
pattern[i].py_predicates[j] = index
pattern[i].nr_py = len(predicates)
pattern[i].key = hash64(pattern[i].attrs, pattern[i].nr_attr * sizeof(AttrValueC), 0)
i = len(token_specs)
+ # Even though nr_attr == 0 here, we're storing the ID value in attrs[0] (bug-prone, tread carefully!)
pattern[i].attrs = mem.alloc(2, sizeof(AttrValueC))
pattern[i].attrs[0].attr = ID
pattern[i].attrs[0].value = entity_id
@@ -583,8 +600,26 @@ cdef TokenPatternC* init_pattern(Pool mem, attr_t entity_id, object token_specs)
return pattern
-cdef attr_t get_pattern_key(const TokenPatternC* pattern) nogil:
- while pattern.nr_attr != 0 or pattern.nr_extra_attr != 0 or pattern.nr_py != 0:
+cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
+ # There have been a few bugs here. We used to have two functions,
+ # get_ent_id and get_pattern_key that tried to do the same thing. These
+ # are now unified to try to solve the "ghost match" problem.
+ # Below is the previous implementation of get_ent_id and the comment on it,
+ # preserved for reference while we figure out whether the heisenbug in the
+ # matcher is resolved.
+ #
+ #
+ # cdef attr_t get_ent_id(const TokenPatternC* pattern) nogil:
+ # # The code was originally designed to always have pattern[1].attrs.value
+ # # be the ent_id when we get to the end of a pattern. However, Issue #2671
+ # # showed this wasn't the case when we had a reject-and-continue before a
+ # # match.
+ # # The patch to #2671 was wrong though, which came up in #3839.
+ # while pattern.attrs.attr != ID:
+ # pattern += 1
+ # return pattern.attrs.value
+ while pattern.nr_attr != 0 or pattern.nr_extra_attr != 0 or pattern.nr_py != 0 \
+ or pattern.quantifier != ZERO:
pattern += 1
id_attr = pattern[0].attrs[0]
if id_attr.attr != ID:
@@ -642,7 +677,7 @@ def _get_attr_values(spec, string_store):
value = string_store.add(value)
elif isinstance(value, bool):
value = int(value)
- elif isinstance(value, dict):
+ elif isinstance(value, (dict, int)):
continue
else:
raise ValueError(Errors.E153.format(vtype=type(value).__name__))
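
To make the `Matcher.add` change concrete, here is a short sketch of the new call order next to the old one (the backwards-compatibility branch above keeps the old order working for now):

    import spacy
    from spacy.matcher import Matcher

    nlp = spacy.blank("en")
    matcher = Matcher(nlp.vocab)
    pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]

    matcher.add("HELLO_WORLD", [pattern])        # new API: list of patterns
    # matcher.add("HELLO_WORLD", None, pattern)  # old API, still accepted

    doc = nlp("hello, world")
    for match_id, start, end in matcher(doc):
        print(nlp.vocab.strings[match_id], doc[start:end].text)
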
diff --git a/spacy/matcher/phrasematcher.pxd b/spacy/matcher/phrasematcher.pxd
index 1bce70260..a8e5e5085 100644
--- a/spacy/matcher/phrasematcher.pxd
+++ b/spacy/matcher/phrasematcher.pxd
@@ -4,6 +4,7 @@ from cymem.cymem cimport Pool
from preshed.maps cimport key_t, MapStruct
from ..attrs cimport attr_id_t
+from ..structs cimport SpanC
from ..tokens.doc cimport Doc
from ..vocab cimport Vocab
@@ -18,10 +19,4 @@ cdef class PhraseMatcher:
cdef Pool mem
cdef key_t _terminal_hash
- cdef void find_matches(self, Doc doc, vector[MatchStruct] *matches) nogil
-
-
-cdef struct MatchStruct:
- key_t match_id
- int start
- int end
+ cdef void find_matches(self, Doc doc, vector[SpanC] *matches) nogil
diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx
index b6c9e01d2..4de5782f9 100644
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -9,6 +9,7 @@ from preshed.maps cimport map_init, map_set, map_get, map_clear, map_iter
from ..attrs cimport ORTH, POS, TAG, DEP, LEMMA
from ..structs cimport TokenC
from ..tokens.token cimport Token
+from ..typedefs cimport attr_t
from ._schemas import TOKEN_PATTERN_SCHEMA
from ..errors import Errors, Warnings, deprecation_warning, user_warning
@@ -102,8 +103,10 @@ cdef class PhraseMatcher:
cdef vector[MapStruct*] path_nodes
cdef vector[key_t] path_keys
cdef key_t key_to_remove
- for keyword in self._docs[key]:
+ for keyword in sorted(self._docs[key], key=lambda x: len(x), reverse=True):
current_node = self.c_map
+ path_nodes.clear()
+ path_keys.clear()
for token in keyword:
result = map_get(current_node, token)
if result:
@@ -149,16 +152,27 @@ cdef class PhraseMatcher:
del self._callbacks[key]
del self._docs[key]
- def add(self, key, on_match, *docs):
+ def add(self, key, docs, *_docs, on_match=None):
"""Add a match-rule to the phrase-matcher. A match-rule consists of: an ID
key, an on_match callback, and one or more patterns.
+ As of spaCy v2.2.2, PhraseMatcher.add supports the future API, which
+ makes the patterns the second argument, provided as a list (instead of
+ a variable number of arguments). The on_match callback becomes an
+ optional keyword argument.
+
key (unicode): The match ID.
+ docs (list): List of `Doc` objects representing match patterns.
on_match (callable): Callback executed on match.
- *docs (Doc): `Doc` objects representing match patterns.
+ *_docs (Doc): For backwards compatibility: list of patterns to add
+ as variable arguments. Will be ignored if a list of patterns is
+ provided as the second argument.
DOCS: https://spacy.io/api/phrasematcher#add
"""
+ if docs is None or hasattr(docs, "__call__"): # old API
+ on_match = docs
+ docs = _docs
_ = self.vocab[key]
self._callbacks[key] = on_match
@@ -168,6 +182,8 @@ cdef class PhraseMatcher:
cdef MapStruct* internal_node
cdef void* result
+ if isinstance(docs, Doc):
+ raise ValueError(Errors.E179.format(key=key))
for doc in docs:
if len(doc) == 0:
continue
@@ -220,17 +236,17 @@ cdef class PhraseMatcher:
# if doc is empty or None just return empty list
return matches
- cdef vector[MatchStruct] c_matches
+ cdef vector[SpanC] c_matches
self.find_matches(doc, &c_matches)
for i in range(c_matches.size()):
- matches.append((c_matches[i].match_id, c_matches[i].start, c_matches[i].end))
+ matches.append((c_matches[i].label, c_matches[i].start, c_matches[i].end))
for i, (ent_id, start, end) in enumerate(matches):
- on_match = self._callbacks.get(ent_id)
+ on_match = self._callbacks.get(self.vocab.strings[ent_id])
if on_match is not None:
on_match(self, doc, i, matches)
return matches
- cdef void find_matches(self, Doc doc, vector[MatchStruct] *matches) nogil:
+ cdef void find_matches(self, Doc doc, vector[SpanC] *matches) nogil:
cdef MapStruct* current_node = self.c_map
cdef int start = 0
cdef int idx = 0
@@ -238,7 +254,7 @@ cdef class PhraseMatcher:
cdef key_t key
cdef void* value
cdef int i = 0
- cdef MatchStruct ms
+ cdef SpanC ms
cdef void* result
while idx < doc.length:
start = idx
@@ -253,7 +269,7 @@ cdef class PhraseMatcher:
if result:
i = 0
while map_iter(result, &i, &key, &value):
- ms = make_matchstruct(key, start, idy)
+ ms = make_spanstruct(key, start, idy)
matches.push_back(ms)
inner_token = Token.get_struct_attr(&doc.c[idy], self.attr)
result = map_get(current_node, inner_token)
@@ -268,7 +284,7 @@ cdef class PhraseMatcher:
if result:
i = 0
while map_iter(result, &i, &key, &value):
- ms = make_matchstruct(key, start, idy)
+ ms = make_spanstruct(key, start, idy)
matches.push_back(ms)
current_node = self.c_map
idx += 1
@@ -318,9 +334,9 @@ def unpickle_matcher(vocab, docs, callbacks, attr):
return matcher
-cdef MatchStruct make_matchstruct(key_t match_id, int start, int end) nogil:
- cdef MatchStruct ms
- ms.match_id = match_id
- ms.start = start
- ms.end = end
- return ms
+cdef SpanC make_spanstruct(attr_t label, int start, int end) nogil:
+ cdef SpanC spanc
+ spanc.label = label
+ spanc.start = start
+ spanc.end = end
+ return spanc
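
The same calling convention now applies to `PhraseMatcher.add`, and passing a single `Doc` instead of a list raises E179. A short sketch:

    import spacy
    from spacy.matcher import PhraseMatcher

    nlp = spacy.blank("en")
    matcher = PhraseMatcher(nlp.vocab)
    patterns = [nlp.make_doc(name) for name in ["Barack Obama", "Angela Merkel"]]

    matcher.add("PERSON", patterns)            # new API
    # matcher.add("PERSON", None, *patterns)   # old API, still accepted

    doc = nlp("Angela Merkel met Barack Obama.")
    for match_id, start, end in matcher(doc):
        print(nlp.vocab.strings[match_id], doc[start:end].text)
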
diff --git a/spacy/ml/__init__.py b/spacy/ml/__init__.py
new file mode 100644
index 000000000..57e7ef571
--- /dev/null
+++ b/spacy/ml/__init__.py
@@ -0,0 +1,5 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from .tok2vec import Tok2Vec # noqa: F401
+from .common import FeedForward, LayerNormalizedMaxout # noqa: F401
diff --git a/spacy/ml/_legacy_tok2vec.py b/spacy/ml/_legacy_tok2vec.py
new file mode 100644
index 000000000..b077a46b7
--- /dev/null
+++ b/spacy/ml/_legacy_tok2vec.py
@@ -0,0 +1,131 @@
+# coding: utf8
+from __future__ import unicode_literals
+from thinc.v2v import Model, Maxout
+from thinc.i2v import HashEmbed, StaticVectors
+from thinc.t2t import ExtractWindow
+from thinc.misc import Residual
+from thinc.misc import LayerNorm as LN
+from thinc.misc import FeatureExtracter
+from thinc.api import layerize, chain, clone, concatenate, with_flatten
+from thinc.api import uniqued, wrap, noop
+
+from ..attrs import ID, ORTH, NORM, PREFIX, SUFFIX, SHAPE
+
+
+def Tok2Vec(width, embed_size, **kwargs):
+ # Circular imports :(
+ from .._ml import CharacterEmbed
+ from .._ml import PyTorchBiLSTM
+
+ pretrained_vectors = kwargs.get("pretrained_vectors", None)
+ cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 3)
+ subword_features = kwargs.get("subword_features", True)
+ char_embed = kwargs.get("char_embed", False)
+ if char_embed:
+ subword_features = False
+ conv_depth = kwargs.get("conv_depth", 4)
+ bilstm_depth = kwargs.get("bilstm_depth", 0)
+ cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
+ with Model.define_operators({">>": chain, "|": concatenate, "**": clone}):
+ norm = HashEmbed(width, embed_size, column=cols.index(NORM), name="embed_norm")
+ if subword_features:
+ prefix = HashEmbed(
+ width, embed_size // 2, column=cols.index(PREFIX), name="embed_prefix"
+ )
+ suffix = HashEmbed(
+ width, embed_size // 2, column=cols.index(SUFFIX), name="embed_suffix"
+ )
+ shape = HashEmbed(
+ width, embed_size // 2, column=cols.index(SHAPE), name="embed_shape"
+ )
+ else:
+ prefix, suffix, shape = (None, None, None)
+ if pretrained_vectors is not None:
+ glove = StaticVectors(pretrained_vectors, width, column=cols.index(ID))
+
+ if subword_features:
+ embed = uniqued(
+ (glove | norm | prefix | suffix | shape)
+ >> LN(Maxout(width, width * 5, pieces=3)),
+ column=cols.index(ORTH),
+ )
+ else:
+ embed = uniqued(
+ (glove | norm) >> LN(Maxout(width, width * 2, pieces=3)),
+ column=cols.index(ORTH),
+ )
+ elif subword_features:
+ embed = uniqued(
+ (norm | prefix | suffix | shape)
+ >> LN(Maxout(width, width * 4, pieces=3)),
+ column=cols.index(ORTH),
+ )
+ elif char_embed:
+ embed = concatenate_lists(
+ CharacterEmbed(nM=64, nC=8),
+ FeatureExtracter(cols) >> with_flatten(norm),
+ )
+ reduce_dimensions = LN(
+ Maxout(width, 64 * 8 + width, pieces=cnn_maxout_pieces)
+ )
+ else:
+ embed = norm
+
+ convolution = Residual(
+ ExtractWindow(nW=1)
+ >> LN(Maxout(width, width * 3, pieces=cnn_maxout_pieces))
+ )
+ if char_embed:
+ tok2vec = embed >> with_flatten(
+ reduce_dimensions >> convolution ** conv_depth, pad=conv_depth
+ )
+ else:
+ tok2vec = FeatureExtracter(cols) >> with_flatten(
+ embed >> convolution ** conv_depth, pad=conv_depth
+ )
+
+ if bilstm_depth >= 1:
+ tok2vec = tok2vec >> PyTorchBiLSTM(width, width, bilstm_depth)
+ # Work around thinc API limitations :(. TODO: Revise in Thinc 7
+ tok2vec.nO = width
+ tok2vec.embed = embed
+ return tok2vec
+
+
+@layerize
+def flatten(seqs, drop=0.0):
+ ops = Model.ops
+ lengths = ops.asarray([len(seq) for seq in seqs], dtype="i")
+
+ def finish_update(d_X, sgd=None):
+ return ops.unflatten(d_X, lengths, pad=0)
+
+ X = ops.flatten(seqs, pad=0)
+ return X, finish_update
+
+
+def concatenate_lists(*layers, **kwargs): # pragma: no cover
+ """Compose two or more models `f`, `g`, etc, such that their outputs are
+ concatenated, i.e. `concatenate(f, g)(x)` computes `hstack(f(x), g(x))`
+ """
+ if not layers:
+ return noop()
+ drop_factor = kwargs.get("drop_factor", 1.0)
+ ops = layers[0].ops
+ layers = [chain(layer, flatten) for layer in layers]
+ concat = concatenate(*layers)
+
+ def concatenate_lists_fwd(Xs, drop=0.0):
+ if drop is not None:
+ drop *= drop_factor
+ lengths = ops.asarray([len(X) for X in Xs], dtype="i")
+ flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop)
+ ys = ops.unflatten(flat_y, lengths)
+
+ def concatenate_lists_bwd(d_ys, sgd=None):
+ return bp_flat_y(ops.flatten(d_ys), sgd=sgd)
+
+ return ys, concatenate_lists_bwd
+
+ model = wrap(concatenate_lists_fwd, concat)
+ return model
diff --git a/spacy/ml/_wire.py b/spacy/ml/_wire.py
new file mode 100644
index 000000000..fa271b37c
--- /dev/null
+++ b/spacy/ml/_wire.py
@@ -0,0 +1,42 @@
+from __future__ import unicode_literals
+from thinc.api import layerize, wrap, noop, chain, concatenate
+from thinc.v2v import Model
+
+
+def concatenate_lists(*layers, **kwargs): # pragma: no cover
+ """Compose two or more models `f`, `g`, etc, such that their outputs are
+ concatenated, i.e. `concatenate(f, g)(x)` computes `hstack(f(x), g(x))`
+ """
+ if not layers:
+ return layerize(noop())
+ drop_factor = kwargs.get("drop_factor", 1.0)
+ ops = layers[0].ops
+ layers = [chain(layer, flatten) for layer in layers]
+ concat = concatenate(*layers)
+
+ def concatenate_lists_fwd(Xs, drop=0.0):
+ if drop is not None:
+ drop *= drop_factor
+ lengths = ops.asarray([len(X) for X in Xs], dtype="i")
+ flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop)
+ ys = ops.unflatten(flat_y, lengths)
+
+ def concatenate_lists_bwd(d_ys, sgd=None):
+ return bp_flat_y(ops.flatten(d_ys), sgd=sgd)
+
+ return ys, concatenate_lists_bwd
+
+ model = wrap(concatenate_lists_fwd, concat)
+ return model
+
+
+@layerize
+def flatten(seqs, drop=0.0):
+ ops = Model.ops
+ lengths = ops.asarray([len(seq) for seq in seqs], dtype="i")
+
+ def finish_update(d_X, sgd=None):
+ return ops.unflatten(d_X, lengths, pad=0)
+
+ X = ops.flatten(seqs, pad=0)
+ return X, finish_update
diff --git a/spacy/ml/common.py b/spacy/ml/common.py
new file mode 100644
index 000000000..f90b53a15
--- /dev/null
+++ b/spacy/ml/common.py
@@ -0,0 +1,23 @@
+from __future__ import unicode_literals
+
+from thinc.api import chain
+from thinc.v2v import Maxout
+from thinc.misc import LayerNorm
+from ..util import registry, make_layer
+
+
+@registry.architectures.register("thinc.FeedForward.v1")
+def FeedForward(config):
+ layers = [make_layer(layer_cfg) for layer_cfg in config["layers"]]
+ model = chain(*layers)
+ model.cfg = config
+ return model
+
+
+@registry.architectures.register("spacy.LayerNormalizedMaxout.v1")
+def LayerNormalizedMaxout(config):
+ width = config["width"]
+ pieces = config["pieces"]
+ layer = LayerNorm(Maxout(width, pieces=pieces))
+ layer.nO = width
+ return layer
diff --git a/spacy/ml/tok2vec.py b/spacy/ml/tok2vec.py
new file mode 100644
index 000000000..8f86475ef
--- /dev/null
+++ b/spacy/ml/tok2vec.py
@@ -0,0 +1,176 @@
+from __future__ import unicode_literals
+
+from thinc.api import chain, layerize, clone, concatenate, with_flatten, uniqued
+from thinc.api import noop, with_square_sequences
+from thinc.v2v import Maxout, Model
+from thinc.i2v import HashEmbed, StaticVectors
+from thinc.t2t import ExtractWindow
+from thinc.misc import Residual, LayerNorm, FeatureExtracter
+from ..util import make_layer, registry
+from ._wire import concatenate_lists
+
+
+@registry.architectures.register("spacy.Tok2Vec.v1")
+def Tok2Vec(config):
+ doc2feats = make_layer(config["@doc2feats"])
+ embed = make_layer(config["@embed"])
+ encode = make_layer(config["@encode"])
+ field_size = getattr(encode, "receptive_field", 0)
+ tok2vec = chain(doc2feats, with_flatten(chain(embed, encode), pad=field_size))
+ tok2vec.cfg = config
+ tok2vec.nO = encode.nO
+ tok2vec.embed = embed
+ tok2vec.encode = encode
+ return tok2vec
+
+
+@registry.architectures.register("spacy.Doc2Feats.v1")
+def Doc2Feats(config):
+ columns = config["columns"]
+ return FeatureExtracter(columns)
+
+
+@registry.architectures.register("spacy.MultiHashEmbed.v1")
+def MultiHashEmbed(config):
+ # For backwards compatibility with models before the architecture registry,
+ # we have to be careful to get exactly the same model structure. One subtle
+ # trick is that the | operator used for concatenation is binary and applied
+ # left to right. So when we write (a | b | c), we're actually
+ # getting concatenate(concatenate(a, b), c). That's why the implementation
+ # is a bit ugly here.
+ cols = config["columns"]
+ width = config["width"]
+ rows = config["rows"]
+
+ norm = HashEmbed(width, rows, column=cols.index("NORM"), name="embed_norm")
+ if config["use_subwords"]:
+ prefix = HashEmbed(
+ width, rows // 2, column=cols.index("PREFIX"), name="embed_prefix"
+ )
+ suffix = HashEmbed(
+ width, rows // 2, column=cols.index("SUFFIX"), name="embed_suffix"
+ )
+ shape = HashEmbed(
+ width, rows // 2, column=cols.index("SHAPE"), name="embed_shape"
+ )
+ if config.get("@pretrained_vectors"):
+ glove = make_layer(config["@pretrained_vectors"])
+ mix = make_layer(config["@mix"])
+
+ with Model.define_operators({">>": chain, "|": concatenate}):
+ if config["use_subwords"] and config["@pretrained_vectors"]:
+ mix._layers[0].nI = width * 5
+ layer = uniqued(
+ (glove | norm | prefix | suffix | shape) >> mix,
+ column=cols.index("ORTH"),
+ )
+ elif config["use_subwords"]:
+ mix._layers[0].nI = width * 4
+ layer = uniqued(
+ (norm | prefix | suffix | shape) >> mix, column=cols.index("ORTH")
+ )
+ elif config["@pretrained_vectors"]:
+ mix._layers[0].nI = width * 2
+ layer = uniqued((glove | norm) >> mix, column=cols.index("ORTH"),)
+ else:
+ layer = norm
+ layer.cfg = config
+ return layer
+
+
+@registry.architectures.register("spacy.CharacterEmbed.v1")
+def CharacterEmbed(config):
+ from .. import _ml
+
+ width = config["width"]
+ chars = config["chars"]
+
+ chr_embed = _ml.CharacterEmbedModel(nM=width, nC=chars)
+ other_tables = make_layer(config["@embed_features"])
+ mix = make_layer(config["@mix"])
+
+ model = chain(concatenate_lists(chr_embed, other_tables), mix)
+ model.cfg = config
+ return model
+
+
+@registry.architectures.register("spacy.MaxoutWindowEncoder.v1")
+def MaxoutWindowEncoder(config):
+ nO = config["width"]
+ nW = config["window_size"]
+ nP = config["pieces"]
+ depth = config["depth"]
+
+ cnn = chain(
+ ExtractWindow(nW=nW), LayerNorm(Maxout(nO, nO * ((nW * 2) + 1), pieces=nP))
+ )
+ model = clone(Residual(cnn), depth)
+ model.nO = nO
+ model.receptive_field = nW * depth
+ return model
+
+
+@registry.architectures.register("spacy.MishWindowEncoder.v1")
+def MishWindowEncoder(config):
+ from thinc.v2v import Mish
+
+ nO = config["width"]
+ nW = config["window_size"]
+ depth = config["depth"]
+
+ cnn = chain(ExtractWindow(nW=nW), LayerNorm(Mish(nO, nO * ((nW * 2) + 1))))
+ model = clone(Residual(cnn), depth)
+ model.nO = nO
+ return model
+
+
+@registry.architectures.register("spacy.PretrainedVectors.v1")
+def PretrainedVectors(config):
+ return StaticVectors(config["vectors_name"], config["width"], config["column"])
+
+
+@registry.architectures.register("spacy.TorchBiLSTMEncoder.v1")
+def TorchBiLSTMEncoder(config):
+ import torch.nn
+ from thinc.extra.wrappers import PyTorchWrapperRNN
+
+ width = config["width"]
+ depth = config["depth"]
+ if depth == 0:
+ return layerize(noop())
+ return with_square_sequences(
+ PyTorchWrapperRNN(torch.nn.LSTM(width, width // 2, depth, bidirectional=True))
+ )
+
+
+_EXAMPLE_CONFIG = {
+ "@doc2feats": {
+ "arch": "Doc2Feats",
+ "config": {"columns": ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"]},
+ },
+ "@embed": {
+ "arch": "spacy.MultiHashEmbed.v1",
+ "config": {
+ "width": 96,
+ "rows": 2000,
+ "columns": ["ID", "NORM", "PREFIX", "SUFFIX", "SHAPE", "ORTH"],
+ "use_subwords": True,
+ "@pretrained_vectors": {
+ "arch": "TransformedStaticVectors",
+ "config": {
+ "vectors_name": "en_vectors_web_lg.vectors",
+ "width": 96,
+ "column": 0,
+ },
+ },
+ "@mix": {
+ "arch": "LayerNormalizedMaxout",
+ "config": {"width": 96, "pieces": 3},
+ },
+ },
+ },
+ "@encode": {
+ "arch": "MaxoutWindowEncode",
+ "config": {"width": 96, "window_size": 1, "depth": 4, "pieces": 3},
+ },
+}
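
A rough sketch of how these registered architectures are meant to be resolved. The exact behaviour of `make_layer` is an assumption inferred from its usage above and from `_EXAMPLE_CONFIG`: the `"arch"` name is looked up in `registry.architectures` and the nested `"config"` dict is passed to the registered function.

    from spacy.util import make_layer

    encoder = make_layer({
        "arch": "spacy.MaxoutWindowEncoder.v1",
        "config": {"width": 96, "window_size": 1, "pieces": 3, "depth": 4},
    })
    print(encoder.nO)  # 96, set by MaxoutWindowEncoder above
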
diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py
index 956d67291..d926b987b 100644
--- a/spacy/pipeline/entityruler.py
+++ b/spacy/pipeline/entityruler.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
from collections import defaultdict, OrderedDict
import srsly
+from ..language import component
from ..errors import Errors
from ..compat import basestring_
from ..util import ensure_path, to_disk, from_disk
@@ -13,6 +14,7 @@ from ..matcher import Matcher, PhraseMatcher
DEFAULT_ENT_ID_SEP = "||"
+@component("entity_ruler", assigns=["doc.ents", "token.ent_type", "token.ent_iob"])
class EntityRuler(object):
"""The EntityRuler lets you add spans to the `Doc.ents` using token-based
rules or exact phrase matches. It can be combined with the statistical
@@ -24,8 +26,6 @@ class EntityRuler(object):
USAGE: https://spacy.io/usage/rule-based-matching#entityruler
"""
- name = "entity_ruler"
-
def __init__(self, nlp, phrase_matcher_attr=None, validate=False, **cfg):
"""Initialize the entitiy ruler. If patterns are supplied here, they
need to be a list of dictionaries with a `"label"` and `"pattern"`
@@ -64,10 +64,15 @@ class EntityRuler(object):
self.phrase_matcher_attr = None
self.phrase_matcher = PhraseMatcher(nlp.vocab, validate=validate)
self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
+ self._ent_ids = defaultdict(dict)
patterns = cfg.get("patterns")
if patterns is not None:
self.add_patterns(patterns)
+ @classmethod
+ def from_nlp(cls, nlp, **cfg):
+ return cls(nlp, **cfg)
+
def __len__(self):
"""The number of all patterns added to the entity ruler."""
n_token_patterns = sum(len(p) for p in self.token_patterns.values())
@@ -100,10 +105,9 @@ class EntityRuler(object):
continue
# check for end - 1 here because boundaries are inclusive
if start not in seen_tokens and end - 1 not in seen_tokens:
- if self.ent_ids:
- label_ = self.nlp.vocab.strings[match_id]
- ent_label, ent_id = self._split_label(label_)
- span = Span(doc, start, end, label=ent_label)
+ if match_id in self._ent_ids:
+ label, ent_id = self._ent_ids[match_id]
+ span = Span(doc, start, end, label=label)
if ent_id:
for token in span:
token.ent_id_ = ent_id
@@ -131,11 +135,11 @@ class EntityRuler(object):
@property
def ent_ids(self):
- """All entity ids present in the match patterns meta dicts.
+ """All entity ids present in the match patterns `id` properties.
RETURNS (set): The string entity ids.
- DOCS: https://spacy.io/api/entityruler#labels
+ DOCS: https://spacy.io/api/entityruler#ent_ids
"""
all_ent_ids = set()
for l in self.labels:
@@ -147,7 +151,6 @@ class EntityRuler(object):
@property
def patterns(self):
"""Get all patterns that were added to the entity ruler.
-
RETURNS (list): The original patterns, one dictionary per pattern.
DOCS: https://spacy.io/api/entityruler#patterns
@@ -183,14 +186,20 @@ class EntityRuler(object):
# disable the nlp components after this one in case they hadn't been initialized / deserialised yet
try:
current_index = self.nlp.pipe_names.index(self.name)
- subsequent_pipes = [pipe for pipe in self.nlp.pipe_names[current_index + 1:]]
+ subsequent_pipes = [
+ pipe for pipe in self.nlp.pipe_names[current_index + 1 :]
+ ]
except ValueError:
subsequent_pipes = []
- with self.nlp.disable_pipes(*subsequent_pipes):
+ with self.nlp.disable_pipes(subsequent_pipes):
for entry in patterns:
label = entry["label"]
if "id" in entry:
+ ent_label = label
label = self._create_label(label, entry["id"])
+ key = self.matcher._normalize_key(label)
+ self._ent_ids[key] = (ent_label, entry["id"])
+
pattern = entry["pattern"]
if isinstance(pattern, basestring_):
self.phrase_patterns[label].append(self.nlp(pattern))
@@ -199,9 +208,9 @@ class EntityRuler(object):
else:
raise ValueError(Errors.E097.format(pattern=pattern))
for label, patterns in self.token_patterns.items():
- self.matcher.add(label, None, *patterns)
+ self.matcher.add(label, patterns)
for label, patterns in self.phrase_patterns.items():
- self.phrase_matcher.add(label, None, *patterns)
+ self.phrase_matcher.add(label, patterns)
def _split_label(self, label):
"""Split Entity label into ent_label and ent_id if it contains self.ent_id_sep
diff --git a/spacy/pipeline/functions.py b/spacy/pipeline/functions.py
index 0f7d94df2..69e638da2 100644
--- a/spacy/pipeline/functions.py
+++ b/spacy/pipeline/functions.py
@@ -1,9 +1,16 @@
# coding: utf8
from __future__ import unicode_literals
+from ..language import component
from ..matcher import Matcher
+from ..util import filter_spans
+@component(
+ "merge_noun_chunks",
+ requires=["token.dep", "token.tag", "token.pos"],
+ retokenizes=True,
+)
def merge_noun_chunks(doc):
"""Merge noun chunks into a single token.
@@ -21,6 +28,11 @@ def merge_noun_chunks(doc):
return doc
+@component(
+ "merge_entities",
+ requires=["doc.ents", "token.ent_iob", "token.ent_type"],
+ retokenizes=True,
+)
def merge_entities(doc):
"""Merge entities into a single token.
@@ -36,6 +48,7 @@ def merge_entities(doc):
return doc
+@component("merge_subtokens", requires=["token.dep"], retokenizes=True)
def merge_subtokens(doc, label="subtok"):
"""Merge subtokens into a single token.
@@ -48,7 +61,7 @@ def merge_subtokens(doc, label="subtok"):
merger = Matcher(doc.vocab)
merger.add("SUBTOK", None, [{"DEP": label, "op": "+"}])
matches = merger(doc)
- spans = [doc[start : end + 1] for _, start, end in matches]
+ spans = filter_spans([doc[start : end + 1] for _, start, end in matches])
with doc.retokenize() as retokenizer:
for span in spans:
retokenizer.merge(span)
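
The `@component` decorator used above is not limited to built-ins; a hedged sketch of registering a custom function component the same way (the component name and assigned attribute are made up for illustration):

    from spacy.language import component

    @component("lower_case_lemmas", assigns=["token.lemma"])
    def lower_case_lemmas(doc):
        for token in doc:
            token.lemma_ = token.text.lower()
        return doc

    # The decorator writes a factory into Language.factories, so the component
    # can later be created by name, e.g.:
    # nlp.add_pipe(nlp.create_pipe("lower_case_lemmas"))
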
diff --git a/spacy/pipeline/hooks.py b/spacy/pipeline/hooks.py
index 38672cde0..b61a34c0e 100644
--- a/spacy/pipeline/hooks.py
+++ b/spacy/pipeline/hooks.py
@@ -5,9 +5,11 @@ from thinc.t2v import Pooling, max_pool, mean_pool
from thinc.neural._classes.difference import Siamese, CauchySimilarity
from .pipes import Pipe
+from ..language import component
from .._ml import link_vectors_to_models
+@component("sentencizer_hook", assigns=["doc.user_hooks"])
class SentenceSegmenter(object):
"""A simple spaCy hook, to allow custom sentence boundary detection logic
(that doesn't require the dependency parse). To change the sentence
@@ -17,8 +19,6 @@ class SentenceSegmenter(object):
and yield `Span` objects for each sentence.
"""
- name = "sentencizer"
-
def __init__(self, vocab, strategy=None):
self.vocab = vocab
if strategy is None or strategy == "on_punct":
@@ -44,6 +44,7 @@ class SentenceSegmenter(object):
yield doc[start : len(doc)]
+@component("similarity", assigns=["doc.user_hooks"])
class SimilarityHook(Pipe):
"""
Experimental: A pipeline component to install a hook for supervised
@@ -58,8 +59,6 @@ class SimilarityHook(Pipe):
Where W is a vector of dimension weights, initialized to 1.
"""
- name = "similarity"
-
def __init__(self, vocab, model=True, **cfg):
self.vocab = vocab
self.model = model
diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx
index b14e2bec7..72e31f120 100644
--- a/spacy/pipeline/morphologizer.pyx
+++ b/spacy/pipeline/morphologizer.pyx
@@ -8,6 +8,7 @@ from thinc.api import chain
from thinc.neural.util import to_categorical, copy_array, get_array_module
from .. import util
from .pipes import Pipe
+from ..language import component
from .._ml import Tok2Vec, build_morphologizer_model
from .._ml import link_vectors_to_models, zero_init, flatten
from .._ml import create_default_optimizer
@@ -18,9 +19,9 @@ from ..vocab cimport Vocab
from ..morphology cimport Morphology
+@component("morphologizer", assigns=["token.morph", "token.pos"])
class Morphologizer(Pipe):
- name = 'morphologizer'
-
+
@classmethod
def Model(cls, **cfg):
if cfg.get('pretrained_dims') and not cfg.get('pretrained_vectors'):
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index bc7d67d07..69cbb6fb5 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -13,7 +13,6 @@ from thinc.misc import LayerNorm
from thinc.neural.util import to_categorical
from thinc.neural.util import get_array_module
-from .functions import merge_subtokens
from ..tokens.doc cimport Doc
from ..syntax.nn_parser cimport Parser
from ..syntax.ner cimport BiluoPushDown
@@ -21,6 +20,8 @@ from ..syntax.arc_eager cimport ArcEager
from ..morphology cimport Morphology
from ..vocab cimport Vocab
+from .functions import merge_subtokens
+from ..language import Language, component
from ..syntax import nonproj
from ..attrs import POS, ID
from ..parts_of_speech import X
@@ -55,6 +56,10 @@ class Pipe(object):
"""Initialize a model for the pipe."""
raise NotImplementedError
+ @classmethod
+ def from_nlp(cls, nlp, **cfg):
+ return cls(nlp.vocab, **cfg)
+
def __init__(self, vocab, model=True, **cfg):
"""Create a new pipe instance."""
raise NotImplementedError
@@ -224,11 +229,10 @@ class Pipe(object):
return self
+@component("tensorizer", assigns=["doc.tensor"])
class Tensorizer(Pipe):
"""Pre-train position-sensitive vectors for tokens."""
- name = "tensorizer"
-
@classmethod
def Model(cls, output_size=300, **cfg):
"""Create a new statistical model for the class.
@@ -363,14 +367,13 @@ class Tensorizer(Pipe):
return sgd
+@component("tagger", assigns=["token.tag", "token.pos"])
class Tagger(Pipe):
"""Pipeline component for part-of-speech tagging.
DOCS: https://spacy.io/api/tagger
"""
- name = "tagger"
-
def __init__(self, vocab, model=True, **cfg):
self.vocab = vocab
self.model = model
@@ -515,7 +518,6 @@ class Tagger(Pipe):
orig_tag_map = dict(self.vocab.morphology.tag_map)
new_tag_map = OrderedDict()
for raw_text, annots_brackets in get_gold_tuples():
- _ = annots_brackets.pop()
for annots, brackets in annots_brackets:
ids, words, tags, heads, deps, ents = annots
for tag in tags:
@@ -658,13 +660,12 @@ class Tagger(Pipe):
return self
+@component("nn_labeller")
class MultitaskObjective(Tagger):
"""Experimental: Assist training of a parser or tagger, by training a
side-objective.
"""
- name = "nn_labeller"
-
def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg):
self.vocab = vocab
self.model = model
@@ -923,12 +924,12 @@ class ClozeMultitask(Pipe):
return words
+@component("textcat", assigns=["doc.cats"])
class TextCategorizer(Pipe):
"""Pipeline component for text classification.
DOCS: https://spacy.io/api/textcategorizer
"""
- name = 'textcat'
@classmethod
def Model(cls, nr_class=1, **cfg):
@@ -1057,9 +1058,10 @@ class TextCategorizer(Pipe):
return 1
def begin_training(self, get_gold_tuples=lambda: [], pipeline=None, sgd=None, **kwargs):
- for raw_text, (_, (cats, _2)) in get_gold_tuples():
- for cat in cats:
- self.add_label(cat)
+ for raw_text, annot_brackets in get_gold_tuples():
+ for _, (cats, _2) in annot_brackets:
+ for cat in cats:
+ self.add_label(cat)
if self.model is True:
self.cfg["pretrained_vectors"] = kwargs.get("pretrained_vectors")
self.require_labels()
@@ -1075,8 +1077,11 @@ cdef class DependencyParser(Parser):
DOCS: https://spacy.io/api/dependencyparser
"""
-
+ # cdef classes can't have decorators, so we're defining this here
name = "parser"
+ factory = "parser"
+ assigns = ["token.dep", "token.is_sent_start", "doc.sents"]
+ requires = []
TransitionSystem = ArcEager
nr_feature = 8
@@ -1122,8 +1127,10 @@ cdef class EntityRecognizer(Parser):
DOCS: https://spacy.io/api/entityrecognizer
"""
-
name = "ner"
+ factory = "ner"
+ assigns = ["doc.ents", "token.ent_iob", "token.ent_type"]
+ requires = []
TransitionSystem = BiluoPushDown
nr_feature = 3
@@ -1154,12 +1161,16 @@ cdef class EntityRecognizer(Parser):
return tuple(sorted(labels))
+@component(
+ "entity_linker",
+ requires=["doc.ents", "token.ent_iob", "token.ent_type"],
+ assigns=["token.ent_kb_id"]
+)
class EntityLinker(Pipe):
"""Pipeline component for named entity linking.
DOCS: https://spacy.io/api/entitylinker
"""
- name = 'entity_linker'
NIL = "NIL" # string used to refer to a non-existing link
@classmethod
@@ -1220,23 +1231,26 @@ class EntityLinker(Pipe):
docs = [docs]
golds = [golds]
- context_docs = []
+ sentence_docs = []
for doc, gold in zip(docs, golds):
ents_by_offset = dict()
for ent in doc.ents:
- ents_by_offset["{}_{}".format(ent.start_char, ent.end_char)] = ent
+ ents_by_offset[(ent.start_char, ent.end_char)] = ent
+
for entity, kb_dict in gold.links.items():
start, end = entity
mention = doc.text[start:end]
+ # the gold annotations should link to proper entities - if this fails, the dataset is likely corrupt
+ ent = ents_by_offset[(start, end)]
for kb_id, value in kb_dict.items():
# Currently only training on the positive instances
if value:
- context_docs.append(doc)
+ sentence_docs.append(ent.sent.as_doc())
- context_encodings, bp_context = self.model.begin_update(context_docs, drop=drop)
- loss, d_scores = self.get_similarity_loss(scores=context_encodings, golds=golds, docs=None)
+ sentence_encodings, bp_context = self.model.begin_update(sentence_docs, drop=drop)
+ loss, d_scores = self.get_similarity_loss(scores=sentence_encodings, golds=golds, docs=None)
bp_context(d_scores, sgd=sgd)
if losses is not None:
@@ -1305,50 +1319,69 @@ class EntityLinker(Pipe):
if isinstance(docs, Doc):
docs = [docs]
- context_encodings = self.model(docs)
- xp = get_array_module(context_encodings)
-
for i, doc in enumerate(docs):
if len(doc) > 0:
- # currently, the context is the same for each entity in a sentence (should be refined)
- context_encoding = context_encodings[i]
- context_enc_t = context_encoding.T
- norm_1 = xp.linalg.norm(context_enc_t)
- for ent in doc.ents:
- entity_count += 1
+ # Looping through each sentence and each entity
+ # This may go wrong if there are entities across sentences - because they might not get a KB ID
+ for sent in doc.sents:
+ sent_doc = sent.as_doc()
+ # currently, the context is the same for each entity in a sentence (should be refined)
+ sentence_encoding = self.model([sent_doc])[0]
+ xp = get_array_module(sentence_encoding)
+ sentence_encoding_t = sentence_encoding.T
+ sentence_norm = xp.linalg.norm(sentence_encoding_t)
- candidates = self.kb.get_candidates(ent.text)
- if not candidates:
- final_kb_ids.append(self.NIL) # no prediction possible for this entity
- final_tensors.append(context_encoding)
- else:
- random.shuffle(candidates)
+ for ent in sent_doc.ents:
+ entity_count += 1
- # this will set all prior probabilities to 0 if they should be excluded from the model
- prior_probs = xp.asarray([c.prior_prob for c in candidates])
- if not self.cfg.get("incl_prior", True):
- prior_probs = xp.asarray([0.0 for c in candidates])
- scores = prior_probs
+ to_discard = self.cfg.get("labels_discard", [])
+ if to_discard and ent.label_ in to_discard:
+ # ignoring this entity - setting to NIL
+ final_kb_ids.append(self.NIL)
+ final_tensors.append(sentence_encoding)
- # add in similarity from the context
- if self.cfg.get("incl_context", True):
- entity_encodings = xp.asarray([c.entity_vector for c in candidates])
- norm_2 = xp.linalg.norm(entity_encodings, axis=1)
+ else:
+ candidates = self.kb.get_candidates(ent.text)
+ if not candidates:
+ # no prediction possible for this entity - setting to NIL
+ final_kb_ids.append(self.NIL)
+ final_tensors.append(sentence_encoding)
- if len(entity_encodings) != len(prior_probs):
- raise RuntimeError(Errors.E147.format(method="predict", msg="vectors not of equal length"))
+ elif len(candidates) == 1:
+ # shortcut for efficiency reasons: take the 1 candidate
- # cosine similarity
- sims = xp.dot(entity_encodings, context_enc_t) / (norm_1 * norm_2)
- if sims.shape != prior_probs.shape:
- raise ValueError(Errors.E161)
- scores = prior_probs + sims - (prior_probs*sims)
+ # TODO: thresholding
+ final_kb_ids.append(candidates[0].entity_)
+ final_tensors.append(sentence_encoding)
- # TODO: thresholding
- best_index = scores.argmax()
- best_candidate = candidates[best_index]
- final_kb_ids.append(best_candidate.entity_)
- final_tensors.append(context_encoding)
+ else:
+ random.shuffle(candidates)
+
+ # this will set all prior probabilities to 0 if they should be excluded from the model
+ prior_probs = xp.asarray([c.prior_prob for c in candidates])
+ if not self.cfg.get("incl_prior", True):
+ prior_probs = xp.asarray([0.0 for c in candidates])
+ scores = prior_probs
+
+ # add in similarity from the context
+ if self.cfg.get("incl_context", True):
+ entity_encodings = xp.asarray([c.entity_vector for c in candidates])
+ entity_norm = xp.linalg.norm(entity_encodings, axis=1)
+
+ if len(entity_encodings) != len(prior_probs):
+ raise RuntimeError(Errors.E147.format(method="predict", msg="vectors not of equal length"))
+
+ # cosine similarity
+ sims = xp.dot(entity_encodings, sentence_encoding_t) / (sentence_norm * entity_norm)
+ if sims.shape != prior_probs.shape:
+ raise ValueError(Errors.E161)
+ scores = prior_probs + sims - (prior_probs*sims)
+
+ # TODO: thresholding
+ best_index = scores.argmax()
+ best_candidate = candidates[best_index]
+ final_kb_ids.append(best_candidate.entity_)
+ final_tensors.append(sentence_encoding)
if not (len(final_tensors) == len(final_kb_ids) == entity_count):
raise RuntimeError(Errors.E147.format(method="predict", msg="result variables not of equal length"))
@@ -1408,13 +1441,13 @@ class EntityLinker(Pipe):
raise NotImplementedError
+@component("sentencizer", assigns=["token.is_sent_start", "doc.sents"])
class Sentencizer(object):
"""Segment the Doc into sentences using a rule-based strategy.
DOCS: https://spacy.io/api/sentencizer
"""
- name = "sentencizer"
default_punct_chars = ['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹',
'।', '॥', '၊', '။', '።', '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄',
'᥅', '᪨', '᪩', '᪪', '᪫', '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿',
@@ -1440,6 +1473,10 @@ class Sentencizer(object):
else:
self.punct_chars = set(self.default_punct_chars)
+ @classmethod
+ def from_nlp(cls, nlp, **cfg):
+ return cls(**cfg)
+
def __call__(self, doc):
"""Apply the sentencizer to a Doc and set Token.is_sent_start.
@@ -1506,4 +1543,9 @@ class Sentencizer(object):
return self
+# Cython classes can't be decorated, so we need to add the factories here
+Language.factories["parser"] = lambda nlp, **cfg: DependencyParser.from_nlp(nlp, **cfg)
+Language.factories["ner"] = lambda nlp, **cfg: EntityRecognizer.from_nlp(nlp, **cfg)
+
+
__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "EntityLinker", "Sentencizer"]
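
Because `DependencyParser` and `EntityRecognizer` are cdef classes, they can't take the `@component` decorator, hence the manual factory entries above. From the user's side nothing changes:

    import spacy

    nlp = spacy.blank("en")
    parser = nlp.create_pipe("parser")  # resolved via Language.factories["parser"]
    nlp.add_pipe(parser)
    print(nlp.pipe_names)  # ['parser']
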
diff --git a/spacy/scorer.py b/spacy/scorer.py
index 9c057d0a3..0b4843f41 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -82,6 +82,7 @@ class Scorer(object):
self.sbd = PRFScore()
self.unlabelled = PRFScore()
self.labelled = PRFScore()
+ self.labelled_per_dep = dict()
self.tags = PRFScore()
self.ner = PRFScore()
self.ner_per_ents = dict()
@@ -124,9 +125,18 @@ class Scorer(object):
@property
def las(self):
- """RETURNS (float): Labelled depdendency score."""
+ """RETURNS (float): Labelled dependency score."""
return self.labelled.fscore * 100
+ @property
+ def las_per_type(self):
+ """RETURNS (dict): Scores per dependency label.
+ """
+ return {
+ k: {"p": v.precision * 100, "r": v.recall * 100, "f": v.fscore * 100}
+ for k, v in self.labelled_per_dep.items()
+ }
+
@property
def ents_p(self):
"""RETURNS (float): Named entity accuracy (precision)."""
@@ -196,6 +206,7 @@ class Scorer(object):
return {
"uas": self.uas,
"las": self.las,
+ "las_per_type": self.las_per_type,
"ents_p": self.ents_p,
"ents_r": self.ents_r,
"ents_f": self.ents_f,
@@ -219,15 +230,24 @@ class Scorer(object):
DOCS: https://spacy.io/api/scorer#score
"""
if len(doc) != len(gold):
- gold = GoldParse.from_annot_tuples(doc, zip(*gold.orig_annot))
+ gold = GoldParse.from_annot_tuples(
+ doc, tuple(zip(*gold.orig_annot)) + (gold.cats,)
+ )
gold_deps = set()
+ gold_deps_per_dep = {}
gold_tags = set()
gold_ents = set(tags_to_entities([annot[-1] for annot in gold.orig_annot]))
for id_, word, tag, head, dep, ner in gold.orig_annot:
gold_tags.add((id_, tag))
if dep not in (None, "") and dep.lower() not in punct_labels:
gold_deps.add((id_, head, dep.lower()))
+ if dep.lower() not in self.labelled_per_dep:
+ self.labelled_per_dep[dep.lower()] = PRFScore()
+ if dep.lower() not in gold_deps_per_dep:
+ gold_deps_per_dep[dep.lower()] = set()
+ gold_deps_per_dep[dep.lower()].add((id_, head, dep.lower()))
cand_deps = set()
+ cand_deps_per_dep = {}
cand_tags = set()
for token in doc:
if token.orth_.isspace():
@@ -247,6 +267,11 @@ class Scorer(object):
self.labelled.fp += 1
else:
cand_deps.add((gold_i, gold_head, token.dep_.lower()))
+ if token.dep_.lower() not in self.labelled_per_dep:
+ self.labelled_per_dep[token.dep_.lower()] = PRFScore()
+ if token.dep_.lower() not in cand_deps_per_dep:
+ cand_deps_per_dep[token.dep_.lower()] = set()
+ cand_deps_per_dep[token.dep_.lower()].add((gold_i, gold_head, token.dep_.lower()))
if "-" not in [token[-1] for token in gold.orig_annot]:
# Find all NER labels in gold and doc
ent_labels = set([x[0] for x in gold_ents] + [k.label_ for k in doc.ents])
@@ -278,6 +303,8 @@ class Scorer(object):
self.ner.score_set(cand_ents, gold_ents)
self.tags.score_set(cand_tags, gold_tags)
self.labelled.score_set(cand_deps, gold_deps)
+ for dep in self.labelled_per_dep:
+ self.labelled_per_dep[dep].score_set(cand_deps_per_dep.get(dep, set()), gold_deps_per_dep.get(dep, set()))
self.unlabelled.score_set(
set(item[:2] for item in cand_deps), set(item[:2] for item in gold_deps)
)
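
A sketch of what the new per-label dependency scores look like from the user's side; `dev_data` is a hypothetical list of `(doc, GoldParse)` pairs passed to `Language.evaluate`:

    scorer = nlp.evaluate(dev_data)
    print(scorer.scores["las"])
    # Per-dependency-label precision/recall/F-score, keyed by the lowercased label:
    print(scorer.scores["las_per_type"].get("nsubj"))
    # e.g. {'p': 91.3, 'r': 89.7, 'f': 90.5}
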
diff --git a/spacy/structs.pxd b/spacy/structs.pxd
index 468277f6b..b3878db3f 100644
--- a/spacy/structs.pxd
+++ b/spacy/structs.pxd
@@ -47,11 +47,14 @@ cdef struct SerializedLexemeC:
# + sizeof(float) # l2_norm
-cdef struct Entity:
+cdef struct SpanC:
hash_t id
int start
int end
+ int start_char
+ int end_char
attr_t label
+ attr_t kb_id
cdef struct TokenC:
diff --git a/spacy/syntax/_parser_model.pxd b/spacy/syntax/_parser_model.pxd
index 5aec986d2..9c72f3415 100644
--- a/spacy/syntax/_parser_model.pxd
+++ b/spacy/syntax/_parser_model.pxd
@@ -36,7 +36,9 @@ cdef WeightsC get_c_weights(model) except *
cdef SizesC get_c_sizes(model, int batch_size) except *
-cdef void resize_activations(ActivationsC* A, SizesC n) nogil
+cdef ActivationsC alloc_activations(SizesC n) nogil
+
+cdef void free_activations(const ActivationsC* A) nogil
cdef void predict_states(ActivationsC* A, StateC** states,
const WeightsC* W, SizesC n) nogil
diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx
index e95cc980f..f5369fbd1 100644
--- a/spacy/syntax/_parser_model.pyx
+++ b/spacy/syntax/_parser_model.pyx
@@ -71,6 +71,21 @@ cdef SizesC get_c_sizes(model, int batch_size) except *:
return output
+cdef ActivationsC alloc_activations(SizesC n) nogil:
+ cdef ActivationsC A
+ memset(&A, 0, sizeof(A))
+ resize_activations(&A, n)
+ return A
+
+
+cdef void free_activations(const ActivationsC* A) nogil:
+ free(A.token_ids)
+ free(A.scores)
+ free(A.unmaxed)
+ free(A.hiddens)
+ free(A.is_valid)
+
+
cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
if n.states <= A._max_size:
A._curr_size = n.states
@@ -484,6 +499,8 @@ cdef class precompute_hiddens:
ops = NumpyOps()
else:
ops = CupyOps()
+ mask_ = ops.asarray(mask)
+
# This will usually be on GPU
d_best = ops.asarray(d_best)
# Fix nans (which can occur from unseen classes.)
diff --git a/spacy/syntax/_state.pxd b/spacy/syntax/_state.pxd
index 0c1ad930b..141d796a4 100644
--- a/spacy/syntax/_state.pxd
+++ b/spacy/syntax/_state.pxd
@@ -7,7 +7,7 @@ from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
from murmurhash.mrmr cimport hash64
from ..vocab cimport EMPTY_LEXEME
-from ..structs cimport TokenC, Entity
+from ..structs cimport TokenC, SpanC
from ..lexeme cimport Lexeme
from ..symbols cimport punct
from ..attrs cimport IS_SPACE
@@ -40,7 +40,7 @@ cdef cppclass StateC:
int* _buffer
bint* shifted
TokenC* _sent
- Entity* _ents
+ SpanC* _ents
TokenC _empty_token
RingBufferC _hist
int length
@@ -56,7 +56,7 @@ cdef cppclass StateC:
this._stack = calloc(length + (PADDING * 2), sizeof(int))
this.shifted = calloc(length + (PADDING * 2), sizeof(bint))
this._sent = calloc(length + (PADDING * 2), sizeof(TokenC))
- this._ents = calloc(length + (PADDING * 2), sizeof(Entity))
+ this._ents = calloc(length + (PADDING * 2), sizeof(SpanC))
if not (this._buffer and this._stack and this.shifted
and this._sent and this._ents):
with gil:
@@ -426,7 +426,7 @@ cdef cppclass StateC:
memcpy(this._sent, src._sent, this.length * sizeof(TokenC))
memcpy(this._stack, src._stack, this.length * sizeof(int))
memcpy(this._buffer, src._buffer, this.length * sizeof(int))
- memcpy(this._ents, src._ents, this.length * sizeof(Entity))
+ memcpy(this._ents, src._ents, this.length * sizeof(SpanC))
memcpy(this.shifted, src.shifted, this.length * sizeof(this.shifted[0]))
this._b_i = src._b_i
this._s_i = src._s_i
diff --git a/spacy/syntax/arc_eager.pyx b/spacy/syntax/arc_eager.pyx
index 5a7355061..eb39124ce 100644
--- a/spacy/syntax/arc_eager.pyx
+++ b/spacy/syntax/arc_eager.pyx
@@ -342,7 +342,6 @@ cdef class ArcEager(TransitionSystem):
actions[RIGHT][label] = 1
actions[REDUCE][label] = 1
for raw_text, sents in kwargs.get('gold_parses', []):
- _ = sents.pop()
for (ids, words, tags, heads, labels, iob), ctnts in sents:
heads, labels = nonproj.projectivize(heads, labels)
for child, head, label in zip(ids, heads, labels):
diff --git a/spacy/syntax/ner.pyx b/spacy/syntax/ner.pyx
index 3bd096463..9f8ad418c 100644
--- a/spacy/syntax/ner.pyx
+++ b/spacy/syntax/ner.pyx
@@ -73,7 +73,6 @@ cdef class BiluoPushDown(TransitionSystem):
actions[action][entity_type] = 1
moves = ('M', 'B', 'I', 'L', 'U')
for raw_text, sents in kwargs.get('gold_parses', []):
- _ = sents.pop()
for (ids, words, tags, heads, labels, biluo), _ in sents:
for i, ner_tag in enumerate(biluo):
if ner_tag != 'O' and ner_tag != '-':
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 235751de4..21852a1c6 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -27,7 +27,8 @@ from thinc.neural.util import get_array_module
from thinc.linalg cimport Vec, VecVec
import srsly
-from ._parser_model cimport resize_activations, predict_states, arg_max_if_valid
+from ._parser_model cimport alloc_activations, free_activations
+from ._parser_model cimport predict_states, arg_max_if_valid
from ._parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
from ._parser_model cimport get_c_weights, get_c_sizes
from ._parser_model import ParserModel
@@ -145,6 +146,10 @@ cdef class Parser:
self._multitasks = []
self._rehearsal_model = None
+ @classmethod
+ def from_nlp(cls, nlp, **cfg):
+ return cls(nlp.vocab, **cfg)
+
def __reduce__(self):
return (Parser, (self.vocab, self.moves, self.model), None, None)
@@ -330,8 +335,7 @@ cdef class Parser:
WeightsC weights, SizesC sizes) nogil:
cdef int i, j
cdef vector[StateC*] unfinished
- cdef ActivationsC activations
- memset(&activations, 0, sizeof(activations))
+ cdef ActivationsC activations = alloc_activations(sizes)
while sizes.states >= 1:
predict_states(&activations,
states, &weights, sizes)
@@ -345,6 +349,7 @@ cdef class Parser:
states[i] = unfinished[i]
sizes.states = unfinished.size()
unfinished.clear()
+ free_activations(&activations)
def set_annotations(self, docs, states_or_beams, tensors=None):
cdef StateClass state
@@ -381,6 +386,9 @@ cdef class Parser:
cdef void c_transition_batch(self, StateC** states, const float* scores,
int nr_class, int batch_size) nogil:
+ # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
+ with gil:
+ assert self.moves.n_moves > 0
is_valid = calloc(self.moves.n_moves, sizeof(int))
cdef int i, guess
cdef Transition action
@@ -564,6 +572,10 @@ cdef class Parser:
cdef GoldParse gold
cdef Pool mem = Pool()
cdef int i
+
+ # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
+ assert self.moves.n_moves > 0
+
is_valid = mem.alloc(self.moves.n_moves, sizeof(int))
costs = mem.alloc(self.moves.n_moves, sizeof(float))
cdef np.ndarray d_scores = numpy.zeros((len(states), self.moves.n_moves),
@@ -612,12 +624,11 @@ cdef class Parser:
doc_sample = []
gold_sample = []
for raw_text, annots_brackets in islice(get_gold_tuples(), 1000):
- _ = annots_brackets.pop()
for annots, brackets in annots_brackets:
ids, words, tags, heads, deps, ents = annots
doc_sample.append(Doc(self.vocab, words=words))
gold_sample.append(GoldParse(doc_sample[-1], words=words, tags=tags,
- heads=heads, deps=deps, ents=ents))
+ heads=heads, deps=deps, entities=ents))
self.model.begin_training(doc_sample, gold_sample)
if pipeline is not None:
self.init_multitask_objectives(get_gold_tuples, pipeline, sgd=sgd, **cfg)
diff --git a/spacy/syntax/stateclass.pxd b/spacy/syntax/stateclass.pxd
index 0a9be3b7f..567982a3f 100644
--- a/spacy/syntax/stateclass.pxd
+++ b/spacy/syntax/stateclass.pxd
@@ -3,7 +3,7 @@ from libc.string cimport memcpy, memset
from cymem.cymem cimport Pool
cimport cython
-from ..structs cimport TokenC, Entity
+from ..structs cimport TokenC, SpanC
from ..typedefs cimport attr_t
from ..vocab cimport EMPTY_LEXEME
diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx
index 58b3a6993..7876813e0 100644
--- a/spacy/syntax/transition_system.pyx
+++ b/spacy/syntax/transition_system.pyx
@@ -83,6 +83,8 @@ cdef class TransitionSystem:
def get_oracle_sequence(self, doc, GoldParse gold):
cdef Pool mem = Pool()
+ # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
+ assert self.n_moves > 0
costs = mem.alloc(self.n_moves, sizeof(float))
is_valid = mem.alloc(self.n_moves, sizeof(int))
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index ba36eeb8f..d6b9ba11f 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -135,6 +135,11 @@ def ko_tokenizer():
return get_lang_class("ko").Defaults.create_tokenizer()
+@pytest.fixture(scope="session")
+def lb_tokenizer():
+ return get_lang_class("lb").Defaults.create_tokenizer()
+
+
@pytest.fixture(scope="session")
def lt_tokenizer():
return get_lang_class("lt").Defaults.create_tokenizer()
@@ -213,3 +218,9 @@ def uk_tokenizer():
@pytest.fixture(scope="session")
def ur_tokenizer():
return get_lang_class("ur").Defaults.create_tokenizer()
+
+
+@pytest.fixture(scope="session")
+def zh_tokenizer():
+ pytest.importorskip("jieba")
+ return get_lang_class("zh").Defaults.create_tokenizer()
diff --git a/spacy/tests/doc/test_retokenize_split.py b/spacy/tests/doc/test_retokenize_split.py
index 6c41a59be..d074fddc6 100644
--- a/spacy/tests/doc/test_retokenize_split.py
+++ b/spacy/tests/doc/test_retokenize_split.py
@@ -183,3 +183,18 @@ def test_doc_retokenizer_split_lex_attrs(en_vocab):
retokenizer.split(doc[0], ["Los", "Angeles"], heads, attrs=attrs)
assert doc[0].is_stop
assert not doc[1].is_stop
+
+
+def test_doc_retokenizer_realloc(en_vocab):
+ """#4604: realloc correctly when new tokens outnumber original tokens"""
+ text = "Hyperglycemic adverse events following antipsychotic drug administration in the"
+ doc = Doc(en_vocab, words=text.split()[:-1])
+ with doc.retokenize() as retokenizer:
+ token = doc[0]
+ heads = [(token, 0)] * len(token)
+ retokenizer.split(doc[token.i], list(token.text), heads=heads)
+ doc = Doc(en_vocab, words=text.split())
+ with doc.retokenize() as retokenizer:
+ token = doc[0]
+ heads = [(token, 0)] * len(token)
+ retokenizer.split(doc[token.i], list(token.text), heads=heads)
diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index c8c809d24..f813a9743 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -253,3 +253,11 @@ def test_filter_spans(doc):
assert len(filtered[1]) == 5
assert filtered[0].start == 1 and filtered[0].end == 4
assert filtered[1].start == 5 and filtered[1].end == 10
+ # Test that for overlapping spans of identical length, the earlier span is preferred
+ spans = [doc[1:4], doc[2:5], doc[5:10], doc[7:9], doc[1:4]]
+ filtered = filter_spans(spans)
+ assert len(filtered) == 2
+ assert len(filtered[0]) == 3
+ assert len(filtered[1]) == 5
+ assert filtered[0].start == 1 and filtered[0].end == 4
+ assert filtered[1].start == 5 and filtered[1].end == 10
diff --git a/examples/pipeline/dummy_entity_linking.py b/spacy/tests/lang/lb/__init__.py
similarity index 100%
rename from examples/pipeline/dummy_entity_linking.py
rename to spacy/tests/lang/lb/__init__.py
diff --git a/spacy/tests/lang/lb/test_exceptions.py b/spacy/tests/lang/lb/test_exceptions.py
new file mode 100644
index 000000000..ca38c2c38
--- /dev/null
+++ b/spacy/tests/lang/lb/test_exceptions.py
@@ -0,0 +1,10 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize("text", ["z.B.", "Jan."])
+def test_lb_tokenizer_handles_abbr(lb_tokenizer, text):
+ tokens = lb_tokenizer(text)
+ assert len(tokens) == 1
diff --git a/spacy/tests/lang/lb/test_prefix_suffix_infix.py b/spacy/tests/lang/lb/test_prefix_suffix_infix.py
new file mode 100644
index 000000000..d85f932be
--- /dev/null
+++ b/spacy/tests/lang/lb/test_prefix_suffix_infix.py
@@ -0,0 +1,22 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+@pytest.mark.parametrize("text,length", [("z.B.", 1), ("zb.", 2), ("(z.B.", 2)])
+def test_lb_tokenizer_splits_prefix_interact(lb_tokenizer, text, length):
+ tokens = lb_tokenizer(text)
+ assert len(tokens) == length
+
+
+@pytest.mark.parametrize("text", ["z.B.)"])
+def test_lb_tokenizer_splits_suffix_interact(lb_tokenizer, text):
+ tokens = lb_tokenizer(text)
+ assert len(tokens) == 2
+
+
+@pytest.mark.parametrize("text", ["(z.B.)"])
+def test_lb_tokenizer_splits_even_wrap_interact(lb_tokenizer, text):
+ tokens = lb_tokenizer(text)
+ assert len(tokens) == 3
diff --git a/spacy/tests/lang/lb/test_text.py b/spacy/tests/lang/lb/test_text.py
new file mode 100644
index 000000000..387cf448d
--- /dev/null
+++ b/spacy/tests/lang/lb/test_text.py
@@ -0,0 +1,23 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+def test_lb_tokenizer_handles_long_text(lb_tokenizer):
+ text = """Den Nordwand an d'Sonn An der Zäit hunn sech den Nordwand an d'Sonn gestridden, wie vun hinnen zwee wuel méi staark wier, wéi e Wanderer, deen an ee waarme Mantel agepak war, iwwert de Wee koum. Si goufen sech eens, dass deejéinege fir de Stäerkste gëlle sollt, deen de Wanderer forcéiere géif, säi Mantel auszedoen. Den Nordwand huet mat aller Force geblosen, awer wat e méi geblosen huet, wat de Wanderer sech méi a säi Mantel agewéckelt huet. Um Enn huet den Nordwand säi Kampf opginn. Dunn huet d'Sonn d'Loft mat hire frëndleche Strale gewiermt, a schonn no kuerzer Zäit huet de Wanderer säi Mantel ausgedoen. Do huet den Nordwand missen zouginn, dass d'Sonn vun hinnen zwee de Stäerkste wier."""
+
+ tokens = lb_tokenizer(text)
+ assert len(tokens) == 142
+
+
+@pytest.mark.parametrize(
+ "text,length",
+ [
+ ("»Wat ass mat mir geschitt?«, huet hie geduecht.", 13),
+ ("“Dëst fréi Opstoen”, denkt hien, “mécht ee ganz duercherneen. ", 15),
+ ],
+)
+def test_lb_tokenizer_handles_examples(lb_tokenizer, text, length):
+ tokens = lb_tokenizer(text)
+ assert len(tokens) == length
diff --git a/spacy/tests/lang/sv/test_noun_chunks.py b/spacy/tests/lang/sv/test_noun_chunks.py
new file mode 100644
index 000000000..ac7c066ba
--- /dev/null
+++ b/spacy/tests/lang/sv/test_noun_chunks.py
@@ -0,0 +1,47 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+from ...util import get_doc
+
+
+SV_NP_TEST_EXAMPLES = [
+ (
+ "En student läste en bok", # A student read a book
+ ["DET", "NOUN", "VERB", "DET", "NOUN"],
+ ["det", "nsubj", "ROOT", "det", "dobj"],
+ [1, 1, 0, 1, -2],
+ ["En student", "en bok"],
+ ),
+ (
+ "Studenten läste den bästa boken.", # The student read the best book
+ ["NOUN", "VERB", "DET", "ADJ", "NOUN", "PUNCT"],
+ ["nsubj", "ROOT", "det", "amod", "dobj", "punct"],
+ [1, 0, 2, 1, -3, -4],
+ ["Studenten", "den bästa boken"],
+ ),
+ (
+ "De samvetslösa skurkarna hade stulit de största juvelerna på söndagen", # The remorseless crooks had stolen the largest jewels that sunday
+ ["DET", "ADJ", "NOUN", "VERB", "VERB", "DET", "ADJ", "NOUN", "ADP", "NOUN"],
+ ["det", "amod", "nsubj", "aux", "root", "det", "amod", "dobj", "case", "nmod"],
+ [2, 1, 2, 1, 0, 2, 1, -3, 1, -5],
+ ["De samvetslösa skurkarna", "de största juvelerna", "på söndagen"],
+ ),
+]
+
+
+@pytest.mark.parametrize(
+ "text,pos,deps,heads,expected_noun_chunks", SV_NP_TEST_EXAMPLES
+)
+def test_sv_noun_chunks(sv_tokenizer, text, pos, deps, heads, expected_noun_chunks):
+ tokens = sv_tokenizer(text)
+
+ assert len(heads) == len(pos)
+ doc = get_doc(
+ tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps, pos=pos
+ )
+
+ noun_chunks = list(doc.noun_chunks)
+ assert len(noun_chunks) == len(expected_noun_chunks)
+ for i, np in enumerate(noun_chunks):
+ assert np.text == expected_noun_chunks[i]
diff --git a/examples/pipeline/wikidata_entity_linking.py b/spacy/tests/lang/zh/__init__.py
similarity index 100%
rename from examples/pipeline/wikidata_entity_linking.py
rename to spacy/tests/lang/zh/__init__.py
diff --git a/spacy/tests/lang/zh/test_text.py b/spacy/tests/lang/zh/test_text.py
new file mode 100644
index 000000000..235f597a5
--- /dev/null
+++ b/spacy/tests/lang/zh/test_text.py
@@ -0,0 +1,25 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+
+import pytest
+
+
+@pytest.mark.parametrize(
+ "text,match",
+ [
+ ("10", True),
+ ("1", True),
+ ("999.0", True),
+ ("一", True),
+ ("二", True),
+ ("〇", True),
+ ("十一", True),
+ ("狗", False),
+ (",", False),
+ ],
+)
+def test_lex_attrs_like_number(zh_tokenizer, text, match):
+ tokens = zh_tokenizer(text)
+ assert len(tokens) == 1
+ assert tokens[0].like_num == match
diff --git a/spacy/tests/lang/zh/test_tokenizer.py b/spacy/tests/lang/zh/test_tokenizer.py
new file mode 100644
index 000000000..36d94beb5
--- /dev/null
+++ b/spacy/tests/lang/zh/test_tokenizer.py
@@ -0,0 +1,31 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+
+# fmt: off
+TOKENIZER_TESTS = [
+ ("作为语言而言,为世界使用人数最多的语言,目前世界有五分之一人口做为母语。",
+ ['作为', '语言', '而言', ',', '为', '世界', '使用', '人', '数最多',
+ '的', '语言', ',', '目前', '世界', '有', '五分之一', '人口', '做',
+ '为', '母语', '。']),
+]
+# fmt: on
+
+
+@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
+def test_zh_tokenizer(zh_tokenizer, text, expected_tokens):
+ zh_tokenizer.use_jieba = False
+ tokens = [token.text for token in zh_tokenizer(text)]
+ assert tokens == list(text)
+
+ zh_tokenizer.use_jieba = True
+ tokens = [token.text for token in zh_tokenizer(text)]
+ assert tokens == expected_tokens
+
+
+def test_extra_spaces(zh_tokenizer):
+ # note: three spaces after "I"
+ tokens = zh_tokenizer("I like cheese.")
+ assert tokens[1].orth_ == " "
diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py
index 0d640e1a2..e4584d03a 100644
--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
import pytest
import re
+from mock import Mock
from spacy.matcher import Matcher, DependencyMatcher
from spacy.tokens import Doc, Token
@@ -16,7 +17,7 @@ def matcher(en_vocab):
}
matcher = Matcher(en_vocab)
for key, patterns in rules.items():
- matcher.add(key, None, *patterns)
+ matcher.add(key, patterns)
return matcher
@@ -24,11 +25,11 @@ def test_matcher_from_api_docs(en_vocab):
matcher = Matcher(en_vocab)
pattern = [{"ORTH": "test"}]
assert len(matcher) == 0
- matcher.add("Rule", None, pattern)
+ matcher.add("Rule", [pattern])
assert len(matcher) == 1
matcher.remove("Rule")
assert "Rule" not in matcher
- matcher.add("Rule", None, pattern)
+ matcher.add("Rule", [pattern])
assert "Rule" in matcher
on_match, patterns = matcher.get("Rule")
assert len(patterns[0])
@@ -51,7 +52,7 @@ def test_matcher_from_usage_docs(en_vocab):
token.vocab[token.text].norm_ = "happy emoji"
matcher = Matcher(en_vocab)
- matcher.add("HAPPY", label_sentiment, *pos_patterns)
+ matcher.add("HAPPY", pos_patterns, on_match=label_sentiment)
matcher(doc)
assert doc.sentiment != 0
assert doc[1].norm_ == "happy emoji"
@@ -59,11 +60,33 @@ def test_matcher_from_usage_docs(en_vocab):
def test_matcher_len_contains(matcher):
assert len(matcher) == 3
- matcher.add("TEST", None, [{"ORTH": "test"}])
+ matcher.add("TEST", [[{"ORTH": "test"}]])
assert "TEST" in matcher
assert "TEST2" not in matcher
+def test_matcher_add_new_old_api(en_vocab):
+ doc = Doc(en_vocab, words=["a", "b"])
+ patterns = [[{"TEXT": "a"}], [{"TEXT": "a"}, {"TEXT": "b"}]]
+ matcher = Matcher(en_vocab)
+ matcher.add("OLD_API", None, *patterns)
+ assert len(matcher(doc)) == 2
+ matcher = Matcher(en_vocab)
+ on_match = Mock()
+ matcher.add("OLD_API_CALLBACK", on_match, *patterns)
+ assert len(matcher(doc)) == 2
+ assert on_match.call_count == 2
+ # New API: add(key: str, patterns: List[List[dict]], on_match: Callable)
+ matcher = Matcher(en_vocab)
+ matcher.add("NEW_API", patterns)
+ assert len(matcher(doc)) == 2
+ matcher = Matcher(en_vocab)
+ on_match = Mock()
+ matcher.add("NEW_API_CALLBACK", patterns, on_match=on_match)
+ assert len(matcher(doc)) == 2
+ assert on_match.call_count == 2
+
+
def test_matcher_no_match(matcher):
doc = Doc(matcher.vocab, words=["I", "like", "cheese", "."])
assert matcher(doc) == []
@@ -99,12 +122,12 @@ def test_matcher_empty_dict(en_vocab):
"""Test matcher allows empty token specs, meaning match on any token."""
matcher = Matcher(en_vocab)
doc = Doc(matcher.vocab, words=["a", "b", "c"])
- matcher.add("A.C", None, [{"ORTH": "a"}, {}, {"ORTH": "c"}])
+ matcher.add("A.C", [[{"ORTH": "a"}, {}, {"ORTH": "c"}]])
matches = matcher(doc)
assert len(matches) == 1
assert matches[0][1:] == (0, 3)
matcher = Matcher(en_vocab)
- matcher.add("A.", None, [{"ORTH": "a"}, {}])
+ matcher.add("A.", [[{"ORTH": "a"}, {}]])
matches = matcher(doc)
assert matches[0][1:] == (0, 2)
@@ -113,7 +136,7 @@ def test_matcher_operator_shadow(en_vocab):
matcher = Matcher(en_vocab)
doc = Doc(matcher.vocab, words=["a", "b", "c"])
pattern = [{"ORTH": "a"}, {"IS_ALPHA": True, "OP": "+"}, {"ORTH": "c"}]
- matcher.add("A.C", None, pattern)
+ matcher.add("A.C", [pattern])
matches = matcher(doc)
assert len(matches) == 1
assert matches[0][1:] == (0, 3)
@@ -135,12 +158,12 @@ def test_matcher_match_zero(matcher):
{"IS_PUNCT": True},
{"ORTH": '"'},
]
- matcher.add("Quote", None, pattern1)
+ matcher.add("Quote", [pattern1])
doc = Doc(matcher.vocab, words=words1)
assert len(matcher(doc)) == 1
doc = Doc(matcher.vocab, words=words2)
assert len(matcher(doc)) == 0
- matcher.add("Quote", None, pattern2)
+ matcher.add("Quote", [pattern2])
assert len(matcher(doc)) == 0
@@ -148,7 +171,7 @@ def test_matcher_match_zero_plus(matcher):
words = 'He said , " some words " ...'.split()
pattern = [{"ORTH": '"'}, {"OP": "*", "IS_PUNCT": False}, {"ORTH": '"'}]
matcher = Matcher(matcher.vocab)
- matcher.add("Quote", None, pattern)
+ matcher.add("Quote", [pattern])
doc = Doc(matcher.vocab, words=words)
assert len(matcher(doc)) == 1
@@ -159,11 +182,8 @@ def test_matcher_match_one_plus(matcher):
doc = Doc(control.vocab, words=["Philippe", "Philippe"])
m = control(doc)
assert len(m) == 2
- matcher.add(
- "KleenePhilippe",
- None,
- [{"ORTH": "Philippe", "OP": "1"}, {"ORTH": "Philippe", "OP": "+"}],
- )
+ pattern = [{"ORTH": "Philippe", "OP": "1"}, {"ORTH": "Philippe", "OP": "+"}]
+ matcher.add("KleenePhilippe", [pattern])
m = matcher(doc)
assert len(m) == 1
@@ -171,7 +191,7 @@ def test_matcher_match_one_plus(matcher):
def test_matcher_any_token_operator(en_vocab):
"""Test that patterns with "any token" {} work with operators."""
matcher = Matcher(en_vocab)
- matcher.add("TEST", None, [{"ORTH": "test"}, {"OP": "*"}])
+ matcher.add("TEST", [[{"ORTH": "test"}, {"OP": "*"}]])
doc = Doc(en_vocab, words=["test", "hello", "world"])
matches = [doc[start:end].text for _, start, end in matcher(doc)]
assert len(matches) == 3
@@ -185,7 +205,7 @@ def test_matcher_extension_attribute(en_vocab):
get_is_fruit = lambda token: token.text in ("apple", "banana")
Token.set_extension("is_fruit", getter=get_is_fruit, force=True)
pattern = [{"ORTH": "an"}, {"_": {"is_fruit": True}}]
- matcher.add("HAVING_FRUIT", None, pattern)
+ matcher.add("HAVING_FRUIT", [pattern])
doc = Doc(en_vocab, words=["an", "apple"])
matches = matcher(doc)
assert len(matches) == 1
@@ -197,7 +217,7 @@ def test_matcher_extension_attribute(en_vocab):
def test_matcher_set_value(en_vocab):
matcher = Matcher(en_vocab)
pattern = [{"ORTH": {"IN": ["an", "a"]}}]
- matcher.add("A_OR_AN", None, pattern)
+ matcher.add("A_OR_AN", [pattern])
doc = Doc(en_vocab, words=["an", "a", "apple"])
matches = matcher(doc)
assert len(matches) == 2
@@ -209,7 +229,7 @@ def test_matcher_set_value(en_vocab):
def test_matcher_set_value_operator(en_vocab):
matcher = Matcher(en_vocab)
pattern = [{"ORTH": {"IN": ["a", "the"]}, "OP": "?"}, {"ORTH": "house"}]
- matcher.add("DET_HOUSE", None, pattern)
+ matcher.add("DET_HOUSE", [pattern])
doc = Doc(en_vocab, words=["In", "a", "house"])
matches = matcher(doc)
assert len(matches) == 2
@@ -221,7 +241,7 @@ def test_matcher_set_value_operator(en_vocab):
def test_matcher_regex(en_vocab):
matcher = Matcher(en_vocab)
pattern = [{"ORTH": {"REGEX": r"(?:a|an)"}}]
- matcher.add("A_OR_AN", None, pattern)
+ matcher.add("A_OR_AN", [pattern])
doc = Doc(en_vocab, words=["an", "a", "hi"])
matches = matcher(doc)
assert len(matches) == 2
@@ -233,7 +253,7 @@ def test_matcher_regex(en_vocab):
def test_matcher_regex_shape(en_vocab):
matcher = Matcher(en_vocab)
pattern = [{"SHAPE": {"REGEX": r"^[^x]+$"}}]
- matcher.add("NON_ALPHA", None, pattern)
+ matcher.add("NON_ALPHA", [pattern])
doc = Doc(en_vocab, words=["99", "problems", "!"])
matches = matcher(doc)
assert len(matches) == 2
@@ -245,7 +265,7 @@ def test_matcher_regex_shape(en_vocab):
def test_matcher_compare_length(en_vocab):
matcher = Matcher(en_vocab)
pattern = [{"LENGTH": {">=": 2}}]
- matcher.add("LENGTH_COMPARE", None, pattern)
+ matcher.add("LENGTH_COMPARE", [pattern])
doc = Doc(en_vocab, words=["a", "aa", "aaa"])
matches = matcher(doc)
assert len(matches) == 2
@@ -259,7 +279,7 @@ def test_matcher_extension_set_membership(en_vocab):
get_reversed = lambda token: "".join(reversed(token.text))
Token.set_extension("reversed", getter=get_reversed, force=True)
pattern = [{"_": {"reversed": {"IN": ["eyb", "ih"]}}}]
- matcher.add("REVERSED", None, pattern)
+ matcher.add("REVERSED", [pattern])
doc = Doc(en_vocab, words=["hi", "bye", "hello"])
matches = matcher(doc)
assert len(matches) == 2
@@ -327,9 +347,9 @@ def dependency_matcher(en_vocab):
]
matcher = DependencyMatcher(en_vocab)
- matcher.add("pattern1", None, pattern1)
- matcher.add("pattern2", None, pattern2)
- matcher.add("pattern3", None, pattern3)
+ matcher.add("pattern1", [pattern1])
+ matcher.add("pattern2", [pattern2])
+ matcher.add("pattern3", [pattern3])
return matcher
@@ -346,6 +366,14 @@ def test_dependency_matcher_compile(dependency_matcher):
# assert matches[2][1] == [[4, 3, 2]]
+def test_matcher_basic_check(en_vocab):
+ matcher = Matcher(en_vocab)
+ # Potential mistake: pass in pattern instead of list of patterns
+ pattern = [{"TEXT": "hello"}, {"TEXT": "world"}]
+ with pytest.raises(ValueError):
+ matcher.add("TEST", pattern)
+
+
def test_attr_pipeline_checks(en_vocab):
doc1 = Doc(en_vocab, words=["Test"])
doc1.is_parsed = True
@@ -354,7 +382,7 @@ def test_attr_pipeline_checks(en_vocab):
doc3 = Doc(en_vocab, words=["Test"])
# DEP requires is_parsed
matcher = Matcher(en_vocab)
- matcher.add("TEST", None, [{"DEP": "a"}])
+ matcher.add("TEST", [[{"DEP": "a"}]])
matcher(doc1)
with pytest.raises(ValueError):
matcher(doc2)
@@ -363,7 +391,7 @@ def test_attr_pipeline_checks(en_vocab):
# TAG, POS, LEMMA require is_tagged
for attr in ("TAG", "POS", "LEMMA"):
matcher = Matcher(en_vocab)
- matcher.add("TEST", None, [{attr: "a"}])
+ matcher.add("TEST", [[{attr: "a"}]])
matcher(doc2)
with pytest.raises(ValueError):
matcher(doc1)
@@ -371,12 +399,12 @@ def test_attr_pipeline_checks(en_vocab):
matcher(doc3)
# TEXT/ORTH only require tokens
matcher = Matcher(en_vocab)
- matcher.add("TEST", None, [{"ORTH": "a"}])
+ matcher.add("TEST", [[{"ORTH": "a"}]])
matcher(doc1)
matcher(doc2)
matcher(doc3)
matcher = Matcher(en_vocab)
- matcher.add("TEST", None, [{"TEXT": "a"}])
+ matcher.add("TEST", [[{"TEXT": "a"}]])
matcher(doc1)
matcher(doc2)
matcher(doc3)
@@ -406,7 +434,7 @@ def test_attr_pipeline_checks(en_vocab):
def test_matcher_schema_token_attributes(en_vocab, pattern, text):
matcher = Matcher(en_vocab)
doc = Doc(en_vocab, words=text.split(" "))
- matcher.add("Rule", None, pattern)
+ matcher.add("Rule", [pattern])
assert len(matcher) == 1
matches = matcher(doc)
assert len(matches) == 1
@@ -416,5 +444,15 @@ def test_matcher_valid_callback(en_vocab):
"""Test that on_match can only be None or callable."""
matcher = Matcher(en_vocab)
with pytest.raises(ValueError):
- matcher.add("TEST", [], [{"TEXT": "test"}])
+ matcher.add("TEST", [[{"TEXT": "test"}]], on_match=[])
matcher(Doc(en_vocab, words=["test"]))
+
+
+def test_matcher_callback(en_vocab):
+ mock = Mock()
+ matcher = Matcher(en_vocab)
+ pattern = [{"ORTH": "test"}]
+ matcher.add("Rule", [pattern], on_match=mock)
+ doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
+ matches = matcher(doc)
+ mock.assert_called_once_with(matcher, doc, 0, matches)
diff --git a/spacy/tests/matcher/test_matcher_logic.py b/spacy/tests/matcher/test_matcher_logic.py
index 21b5ac52d..240ace537 100644
--- a/spacy/tests/matcher/test_matcher_logic.py
+++ b/spacy/tests/matcher/test_matcher_logic.py
@@ -3,6 +3,8 @@ from __future__ import unicode_literals
import pytest
import re
+
+from spacy.lang.en import English
from spacy.matcher import Matcher
from spacy.tokens import Doc, Span
@@ -53,7 +55,7 @@ def test_greedy_matching(doc, text, pattern, re_pattern):
"""Test that the greedy matching behavior of the * op is consistant with
other re implementations."""
matcher = Matcher(doc.vocab)
- matcher.add(re_pattern, None, pattern)
+ matcher.add(re_pattern, [pattern])
matches = matcher(doc)
re_matches = [m.span() for m in re.finditer(re_pattern, text)]
for match, re_match in zip(matches, re_matches):
@@ -75,7 +77,7 @@ def test_match_consuming(doc, text, pattern, re_pattern):
"""Test that matcher.__call__ consumes tokens on a match similar to
re.findall."""
matcher = Matcher(doc.vocab)
- matcher.add(re_pattern, None, pattern)
+ matcher.add(re_pattern, [pattern])
matches = matcher(doc)
re_matches = [m.span() for m in re.finditer(re_pattern, text)]
assert len(matches) == len(re_matches)
@@ -109,7 +111,7 @@ def test_operator_combos(en_vocab):
pattern.append({"ORTH": part[0], "OP": "+"})
else:
pattern.append({"ORTH": part})
- matcher.add("PATTERN", None, pattern)
+ matcher.add("PATTERN", [pattern])
matches = matcher(doc)
if result:
assert matches, (string, pattern_str)
@@ -121,7 +123,7 @@ def test_matcher_end_zero_plus(en_vocab):
"""Test matcher works when patterns end with * operator. (issue 1450)"""
matcher = Matcher(en_vocab)
pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}]
- matcher.add("TSTEND", None, pattern)
+ matcher.add("TSTEND", [pattern])
nlp = lambda string: Doc(matcher.vocab, words=string.split())
assert len(matcher(nlp("a"))) == 1
assert len(matcher(nlp("a b"))) == 2
@@ -138,8 +140,34 @@ def test_matcher_sets_return_correct_tokens(en_vocab):
[{"LOWER": {"IN": ["one"]}}],
[{"LOWER": {"IN": ["two"]}}],
]
- matcher.add("TEST", None, *patterns)
+ matcher.add("TEST", patterns)
doc = Doc(en_vocab, words="zero one two three".split())
matches = matcher(doc)
texts = [Span(doc, s, e, label=L).text for L, s, e in matches]
assert texts == ["zero", "one", "two"]
+
+
+def test_matcher_remove():
+ nlp = English()
+ matcher = Matcher(nlp.vocab)
+ text = "This is a test case."
+
+ pattern = [{"ORTH": "test"}, {"OP": "?"}]
+ assert len(matcher) == 0
+ matcher.add("Rule", [pattern])
+ assert "Rule" in matcher
+
+ # should give two matches
+ results1 = matcher(nlp(text))
+ assert len(results1) == 2
+
+ # removing once should work
+ matcher.remove("Rule")
+
+ # should not return any matches anymore
+ results2 = matcher(nlp(text))
+ assert len(results2) == 0
+
+ # removing again should throw an error
+ with pytest.raises(ValueError):
+ matcher.remove("Rule")
diff --git a/spacy/tests/matcher/test_pattern_validation.py b/spacy/tests/matcher/test_pattern_validation.py
index 665bcf935..2db2f9eb3 100644
--- a/spacy/tests/matcher/test_pattern_validation.py
+++ b/spacy/tests/matcher/test_pattern_validation.py
@@ -12,24 +12,25 @@ from spacy.util import get_json_validator, validate_json
TEST_PATTERNS = [
# Bad patterns flagged in all cases
([{"XX": "foo"}], 1, 1),
- ([{"LENGTH": "2", "TEXT": 2}, {"LOWER": "test"}], 2, 1),
([{"IS_ALPHA": {"==": True}}, {"LIKE_NUM": None}], 2, 1),
([{"IS_PUNCT": True, "OP": "$"}], 1, 1),
- ([{"IS_DIGIT": -1}], 1, 1),
- ([{"ORTH": -1}], 1, 1),
([{"_": "foo"}], 1, 1),
('[{"TEXT": "foo"}, {"LOWER": "bar"}]', 1, 1),
([1, 2, 3], 3, 1),
# Bad patterns flagged outside of Matcher
([{"_": {"foo": "bar", "baz": {"IN": "foo"}}}], 1, 0),
# Bad patterns not flagged with minimal checks
+ ([{"LENGTH": "2", "TEXT": 2}, {"LOWER": "test"}], 2, 0),
([{"LENGTH": {"IN": [1, 2, "3"]}}, {"POS": {"IN": "VERB"}}], 2, 0),
([{"LENGTH": {"VALUE": 5}}], 1, 0),
([{"TEXT": {"VALUE": "foo"}}], 1, 0),
+ ([{"IS_DIGIT": -1}], 1, 0),
+ ([{"ORTH": -1}], 1, 0),
# Good patterns
([{"TEXT": "foo"}, {"LOWER": "bar"}], 0, 0),
([{"LEMMA": {"IN": ["love", "like"]}}, {"POS": "DET", "OP": "?"}], 0, 0),
([{"LIKE_NUM": True, "LENGTH": {">=": 5}}], 0, 0),
+ ([{"LENGTH": 2}], 0, 0),
([{"LOWER": {"REGEX": "^X", "NOT_IN": ["XXX", "XY"]}}], 0, 0),
([{"NORM": "a"}, {"POS": {"IN": ["NOUN"]}}], 0, 0),
([{"_": {"foo": {"NOT_IN": ["bar", "baz"]}, "a": 5, "b": {">": 10}}}], 0, 0),
@@ -49,7 +50,7 @@ def validator():
def test_matcher_pattern_validation(en_vocab, pattern):
matcher = Matcher(en_vocab, validate=True)
with pytest.raises(MatchPatternError):
- matcher.add("TEST", None, pattern)
+ matcher.add("TEST", [pattern])
@pytest.mark.parametrize("pattern,n_errors,_", TEST_PATTERNS)
@@ -70,6 +71,6 @@ def test_minimal_pattern_validation(en_vocab, pattern, n_errors, n_min_errors):
matcher = Matcher(en_vocab)
if n_min_errors > 0:
with pytest.raises(ValueError):
- matcher.add("TEST", None, pattern)
+ matcher.add("TEST", [pattern])
elif n_errors == 0:
- matcher.add("TEST", None, pattern)
+ matcher.add("TEST", [pattern])
diff --git a/spacy/tests/matcher/test_phrase_matcher.py b/spacy/tests/matcher/test_phrase_matcher.py
index 486cbb984..7a6585e06 100644
--- a/spacy/tests/matcher/test_phrase_matcher.py
+++ b/spacy/tests/matcher/test_phrase_matcher.py
@@ -2,6 +2,7 @@
from __future__ import unicode_literals
import pytest
+from mock import Mock
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc
from ..util import get_doc
@@ -12,53 +13,75 @@ def test_matcher_phrase_matcher(en_vocab):
# intermediate phrase
pattern = Doc(en_vocab, words=["Google", "Now"])
matcher = PhraseMatcher(en_vocab)
- matcher.add("COMPANY", None, pattern)
+ matcher.add("COMPANY", [pattern])
assert len(matcher(doc)) == 1
# initial token
pattern = Doc(en_vocab, words=["I"])
matcher = PhraseMatcher(en_vocab)
- matcher.add("I", None, pattern)
+ matcher.add("I", [pattern])
assert len(matcher(doc)) == 1
# initial phrase
pattern = Doc(en_vocab, words=["I", "like"])
matcher = PhraseMatcher(en_vocab)
- matcher.add("ILIKE", None, pattern)
+ matcher.add("ILIKE", [pattern])
assert len(matcher(doc)) == 1
# final token
pattern = Doc(en_vocab, words=["best"])
matcher = PhraseMatcher(en_vocab)
- matcher.add("BEST", None, pattern)
+ matcher.add("BEST", [pattern])
assert len(matcher(doc)) == 1
# final phrase
pattern = Doc(en_vocab, words=["Now", "best"])
matcher = PhraseMatcher(en_vocab)
- matcher.add("NOWBEST", None, pattern)
+ matcher.add("NOWBEST", [pattern])
assert len(matcher(doc)) == 1
def test_phrase_matcher_length(en_vocab):
matcher = PhraseMatcher(en_vocab)
assert len(matcher) == 0
- matcher.add("TEST", None, Doc(en_vocab, words=["test"]))
+ matcher.add("TEST", [Doc(en_vocab, words=["test"])])
assert len(matcher) == 1
- matcher.add("TEST2", None, Doc(en_vocab, words=["test2"]))
+ matcher.add("TEST2", [Doc(en_vocab, words=["test2"])])
assert len(matcher) == 2
def test_phrase_matcher_contains(en_vocab):
matcher = PhraseMatcher(en_vocab)
- matcher.add("TEST", None, Doc(en_vocab, words=["test"]))
+ matcher.add("TEST", [Doc(en_vocab, words=["test"])])
assert "TEST" in matcher
assert "TEST2" not in matcher
+def test_phrase_matcher_add_new_api(en_vocab):
+ doc = Doc(en_vocab, words=["a", "b"])
+ patterns = [Doc(en_vocab, words=["a"]), Doc(en_vocab, words=["a", "b"])]
+ matcher = PhraseMatcher(en_vocab)
+ matcher.add("OLD_API", None, *patterns)
+ assert len(matcher(doc)) == 2
+ matcher = PhraseMatcher(en_vocab)
+ on_match = Mock()
+ matcher.add("OLD_API_CALLBACK", on_match, *patterns)
+ assert len(matcher(doc)) == 2
+ assert on_match.call_count == 2
+ # New API: add(key: str, patterns: List[Doc], on_match: Callable)
+ matcher = PhraseMatcher(en_vocab)
+ matcher.add("NEW_API", patterns)
+ assert len(matcher(doc)) == 2
+ matcher = PhraseMatcher(en_vocab)
+ on_match = Mock()
+ matcher.add("NEW_API_CALLBACK", patterns, on_match=on_match)
+ assert len(matcher(doc)) == 2
+ assert on_match.call_count == 2
+
+
def test_phrase_matcher_repeated_add(en_vocab):
matcher = PhraseMatcher(en_vocab)
# match ID only gets added once
- matcher.add("TEST", None, Doc(en_vocab, words=["like"]))
- matcher.add("TEST", None, Doc(en_vocab, words=["like"]))
- matcher.add("TEST", None, Doc(en_vocab, words=["like"]))
- matcher.add("TEST", None, Doc(en_vocab, words=["like"]))
+ matcher.add("TEST", [Doc(en_vocab, words=["like"])])
+ matcher.add("TEST", [Doc(en_vocab, words=["like"])])
+ matcher.add("TEST", [Doc(en_vocab, words=["like"])])
+ matcher.add("TEST", [Doc(en_vocab, words=["like"])])
doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
assert "TEST" in matcher
assert "TEST2" not in matcher
@@ -67,8 +90,8 @@ def test_phrase_matcher_repeated_add(en_vocab):
def test_phrase_matcher_remove(en_vocab):
matcher = PhraseMatcher(en_vocab)
- matcher.add("TEST1", None, Doc(en_vocab, words=["like"]))
- matcher.add("TEST2", None, Doc(en_vocab, words=["best"]))
+ matcher.add("TEST1", [Doc(en_vocab, words=["like"])])
+ matcher.add("TEST2", [Doc(en_vocab, words=["best"])])
doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
assert "TEST1" in matcher
assert "TEST2" in matcher
@@ -94,9 +117,9 @@ def test_phrase_matcher_remove(en_vocab):
def test_phrase_matcher_overlapping_with_remove(en_vocab):
matcher = PhraseMatcher(en_vocab)
- matcher.add("TEST", None, Doc(en_vocab, words=["like"]))
+ matcher.add("TEST", [Doc(en_vocab, words=["like"])])
# TEST2 is added alongside TEST
- matcher.add("TEST2", None, Doc(en_vocab, words=["like"]))
+ matcher.add("TEST2", [Doc(en_vocab, words=["like"])])
doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
assert "TEST" in matcher
assert len(matcher) == 2
@@ -121,7 +144,7 @@ def test_phrase_matcher_string_attrs(en_vocab):
pos2 = ["INTJ", "PUNCT", "PRON", "VERB", "NOUN", "ADV", "ADV"]
pattern = get_doc(en_vocab, words=words1, pos=pos1)
matcher = PhraseMatcher(en_vocab, attr="POS")
- matcher.add("TEST", None, pattern)
+ matcher.add("TEST", [pattern])
doc = get_doc(en_vocab, words=words2, pos=pos2)
matches = matcher(doc)
assert len(matches) == 1
@@ -139,7 +162,7 @@ def test_phrase_matcher_string_attrs_negative(en_vocab):
pos2 = ["X", "X", "X"]
pattern = get_doc(en_vocab, words=words1, pos=pos1)
matcher = PhraseMatcher(en_vocab, attr="POS")
- matcher.add("TEST", None, pattern)
+ matcher.add("TEST", [pattern])
doc = get_doc(en_vocab, words=words2, pos=pos2)
matches = matcher(doc)
assert len(matches) == 0
@@ -150,7 +173,7 @@ def test_phrase_matcher_bool_attrs(en_vocab):
words2 = ["No", "problem", ",", "he", "said", "."]
pattern = Doc(en_vocab, words=words1)
matcher = PhraseMatcher(en_vocab, attr="IS_PUNCT")
- matcher.add("TEST", None, pattern)
+ matcher.add("TEST", [pattern])
doc = Doc(en_vocab, words=words2)
matches = matcher(doc)
assert len(matches) == 2
@@ -172,15 +195,15 @@ def test_phrase_matcher_validation(en_vocab):
doc3 = Doc(en_vocab, words=["Test"])
matcher = PhraseMatcher(en_vocab, validate=True)
with pytest.warns(UserWarning):
- matcher.add("TEST1", None, doc1)
+ matcher.add("TEST1", [doc1])
with pytest.warns(UserWarning):
- matcher.add("TEST2", None, doc2)
+ matcher.add("TEST2", [doc2])
with pytest.warns(None) as record:
- matcher.add("TEST3", None, doc3)
+ matcher.add("TEST3", [doc3])
assert not record.list
matcher = PhraseMatcher(en_vocab, attr="POS", validate=True)
with pytest.warns(None) as record:
- matcher.add("TEST4", None, doc2)
+ matcher.add("TEST4", [doc2])
assert not record.list
@@ -197,21 +220,49 @@ def test_attr_pipeline_checks(en_vocab):
doc3 = Doc(en_vocab, words=["Test"])
# DEP requires is_parsed
matcher = PhraseMatcher(en_vocab, attr="DEP")
- matcher.add("TEST1", None, doc1)
+ matcher.add("TEST1", [doc1])
with pytest.raises(ValueError):
- matcher.add("TEST2", None, doc2)
+ matcher.add("TEST2", [doc2])
with pytest.raises(ValueError):
- matcher.add("TEST3", None, doc3)
+ matcher.add("TEST3", [doc3])
# TAG, POS, LEMMA require is_tagged
for attr in ("TAG", "POS", "LEMMA"):
matcher = PhraseMatcher(en_vocab, attr=attr)
- matcher.add("TEST2", None, doc2)
+ matcher.add("TEST2", [doc2])
with pytest.raises(ValueError):
- matcher.add("TEST1", None, doc1)
+ matcher.add("TEST1", [doc1])
with pytest.raises(ValueError):
- matcher.add("TEST3", None, doc3)
+ matcher.add("TEST3", [doc3])
# TEXT/ORTH only require tokens
matcher = PhraseMatcher(en_vocab, attr="ORTH")
- matcher.add("TEST3", None, doc3)
+ matcher.add("TEST3", [doc3])
matcher = PhraseMatcher(en_vocab, attr="TEXT")
- matcher.add("TEST3", None, doc3)
+ matcher.add("TEST3", [doc3])
+
+
+def test_phrase_matcher_callback(en_vocab):
+ mock = Mock()
+ doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
+ pattern = Doc(en_vocab, words=["Google", "Now"])
+ matcher = PhraseMatcher(en_vocab)
+ matcher.add("COMPANY", [pattern], on_match=mock)
+ matches = matcher(doc)
+ mock.assert_called_once_with(matcher, doc, 0, matches)
+
+
+def test_phrase_matcher_remove_overlapping_patterns(en_vocab):
+ matcher = PhraseMatcher(en_vocab)
+ pattern1 = Doc(en_vocab, words=["this"])
+ pattern2 = Doc(en_vocab, words=["this", "is"])
+ pattern3 = Doc(en_vocab, words=["this", "is", "a"])
+ pattern4 = Doc(en_vocab, words=["this", "is", "a", "word"])
+ matcher.add("THIS", [pattern1, pattern2, pattern3, pattern4])
+ matcher.remove("THIS")
+
+
+def test_phrase_matcher_basic_check(en_vocab):
+ matcher = PhraseMatcher(en_vocab)
+ # Potential mistake: pass in pattern instead of list of patterns
+ pattern = Doc(en_vocab, words=["hello", "world"])
+ with pytest.raises(ValueError):
+ matcher.add("TEST", pattern)
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index 4dc7542ed..d05403891 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -103,7 +103,7 @@ def test_oracle_moves_missing_B(en_vocab):
moves.add_action(move_types.index("L"), label)
moves.add_action(move_types.index("U"), label)
moves.preprocess_gold(gold)
- seq = moves.get_oracle_sequence(doc, gold)
+ moves.get_oracle_sequence(doc, gold)
def test_oracle_moves_whitespace(en_vocab):
diff --git a/spacy/tests/pipeline/test_analysis.py b/spacy/tests/pipeline/test_analysis.py
new file mode 100644
index 000000000..198f11bcd
--- /dev/null
+++ b/spacy/tests/pipeline/test_analysis.py
@@ -0,0 +1,168 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import spacy.language
+from spacy.language import Language, component
+from spacy.analysis import print_summary, validate_attrs
+from spacy.analysis import get_assigns_for_attr, get_requires_for_attr
+from spacy.compat import is_python2
+from mock import Mock, ANY
+import pytest
+
+
+def test_component_decorator_function():
+ @component(name="test")
+ def test_component(doc):
+ """docstring"""
+ return doc
+
+ assert test_component.name == "test"
+ if not is_python2:
+ assert test_component.__doc__ == "docstring"
+ assert test_component("foo") == "foo"
+
+
+def test_component_decorator_class():
+ @component(name="test")
+ class TestComponent(object):
+ """docstring1"""
+
+ foo = "bar"
+
+ def __call__(self, doc):
+ """docstring2"""
+ return doc
+
+ def custom(self, x):
+ """docstring3"""
+ return x
+
+ assert TestComponent.name == "test"
+ assert TestComponent.foo == "bar"
+ assert hasattr(TestComponent, "custom")
+ test_component = TestComponent()
+ assert test_component.foo == "bar"
+ assert test_component("foo") == "foo"
+ assert hasattr(test_component, "custom")
+ assert test_component.custom("bar") == "bar"
+ if not is_python2:
+ assert TestComponent.__doc__ == "docstring1"
+ assert TestComponent.__call__.__doc__ == "docstring2"
+ assert TestComponent.custom.__doc__ == "docstring3"
+ assert test_component.__doc__ == "docstring1"
+ assert test_component.__call__.__doc__ == "docstring2"
+ assert test_component.custom.__doc__ == "docstring3"
+
+
+def test_component_decorator_assigns():
+ spacy.language.ENABLE_PIPELINE_ANALYSIS = True
+
+ @component("c1", assigns=["token.tag", "doc.tensor"])
+ def test_component1(doc):
+ return doc
+
+ @component(
+ "c2", requires=["token.tag", "token.pos"], assigns=["token.lemma", "doc.tensor"]
+ )
+ def test_component2(doc):
+ return doc
+
+ @component("c3", requires=["token.lemma"], assigns=["token._.custom_lemma"])
+ def test_component3(doc):
+ return doc
+
+ assert "c1" in Language.factories
+ assert "c2" in Language.factories
+ assert "c3" in Language.factories
+
+ nlp = Language()
+ nlp.add_pipe(test_component1)
+ with pytest.warns(UserWarning):
+ nlp.add_pipe(test_component2)
+ nlp.add_pipe(test_component3)
+ assigns_tensor = get_assigns_for_attr(nlp.pipeline, "doc.tensor")
+ assert [name for name, _ in assigns_tensor] == ["c1", "c2"]
+ test_component4 = nlp.create_pipe("c1")
+ assert test_component4.name == "c1"
+ assert test_component4.factory == "c1"
+ nlp.add_pipe(test_component4, name="c4")
+ assert nlp.pipe_names == ["c1", "c2", "c3", "c4"]
+ assert "c4" not in Language.factories
+ assert nlp.pipe_factories["c1"] == "c1"
+ assert nlp.pipe_factories["c4"] == "c1"
+ assigns_tensor = get_assigns_for_attr(nlp.pipeline, "doc.tensor")
+ assert [name for name, _ in assigns_tensor] == ["c1", "c2", "c4"]
+ requires_pos = get_requires_for_attr(nlp.pipeline, "token.pos")
+ assert [name for name, _ in requires_pos] == ["c2"]
+ assert print_summary(nlp, no_print=True)
+ assert nlp("hello world")
+
+
+def test_component_factories_from_nlp():
+ """Test that class components can implement a from_nlp classmethod that
+ gives them access to the nlp object and config via the factory."""
+
+ class TestComponent5(object):
+ def __call__(self, doc):
+ return doc
+
+ mock = Mock()
+ mock.return_value = TestComponent5()
+ TestComponent5.from_nlp = classmethod(mock)
+ TestComponent5 = component("c5")(TestComponent5)
+
+ assert "c5" in Language.factories
+ nlp = Language()
+ pipe = nlp.create_pipe("c5", config={"foo": "bar"})
+ nlp.add_pipe(pipe)
+ assert nlp("hello world")
+ # The first argument here is the class itself, so we're accepting any here
+ mock.assert_called_once_with(ANY, nlp, foo="bar")
+
+
+def test_analysis_validate_attrs_valid():
+ attrs = ["doc.sents", "doc.ents", "token.tag", "token._.xyz", "span._.xyz"]
+ assert validate_attrs(attrs)
+ for attr in attrs:
+ assert validate_attrs([attr])
+ with pytest.raises(ValueError):
+ validate_attrs(["doc.sents", "doc.xyz"])
+
+
+@pytest.mark.parametrize(
+ "attr",
+ [
+ "doc",
+ "doc_ents",
+ "doc.xyz",
+ "token.xyz",
+ "token.tag_",
+ "token.tag.xyz",
+ "token._.xyz.abc",
+ "span.label",
+ ],
+)
+def test_analysis_validate_attrs_invalid(attr):
+ with pytest.raises(ValueError):
+ validate_attrs([attr])
+
+
+def test_analysis_validate_attrs_remove_pipe():
+ """Test that attributes are validated correctly on remove."""
+ spacy.language.ENABLE_PIPELINE_ANALYSIS = True
+
+ @component("c1", assigns=["token.tag"])
+ def c1(doc):
+ return doc
+
+ @component("c2", requires=["token.pos"])
+ def c2(doc):
+ return doc
+
+ nlp = Language()
+ nlp.add_pipe(c1)
+ with pytest.warns(UserWarning):
+ nlp.add_pipe(c2)
+ with pytest.warns(None) as record:
+ nlp.remove_pipe("c2")
+ assert not record.list
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 0c89a2e14..8023f72a6 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -131,6 +131,54 @@ def test_candidate_generation(nlp):
assert_almost_equal(mykb.get_candidates("adam")[0].prior_prob, 0.9)
+def test_append_alias(nlp):
+ """Test that we can append additional alias-entity pairs"""
+ mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+
+ # adding entities
+ mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
+ mykb.add_entity(entity="Q2", freq=12, entity_vector=[2])
+ mykb.add_entity(entity="Q3", freq=5, entity_vector=[3])
+
+ # adding aliases
+ mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.4, 0.1])
+ mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
+
+ # test the size of the relevant candidates
+ assert len(mykb.get_candidates("douglas")) == 2
+
+ # append an alias
+ mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2)
+
+ # test that the number of relevant candidates has been incremented
+ assert len(mykb.get_candidates("douglas")) == 3
+
+ # appending the same alias-entity pair again should not work (will throw a warning)
+ with pytest.warns(UserWarning):
+ mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.3)
+
+ # test that the number of relevant candidates remained unchanged
+ assert len(mykb.get_candidates("douglas")) == 3
+
+
+def test_append_invalid_alias(nlp):
+ """Test that append an alias will throw an error if prior probs are exceeding 1"""
+ mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+
+ # adding entities
+ mykb.add_entity(entity="Q1", freq=27, entity_vector=[1])
+ mykb.add_entity(entity="Q2", freq=12, entity_vector=[2])
+ mykb.add_entity(entity="Q3", freq=5, entity_vector=[3])
+
+ # adding aliases
+ mykb.add_alias(alias="douglas", entities=["Q2", "Q3"], probabilities=[0.8, 0.1])
+ mykb.add_alias(alias="adam", entities=["Q2"], probabilities=[0.9])
+
+ # append an alias - should fail because the prior probabilities for this alias would sum to more than 1
+ with pytest.raises(ValueError):
+ mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2)
+
+
def test_preserving_links_asdoc(nlp):
"""Test that Span.as_doc preserves the existing entity links"""
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
diff --git a/spacy/tests/pipeline/test_functions.py b/spacy/tests/pipeline/test_functions.py
new file mode 100644
index 000000000..5b5fcd2fd
--- /dev/null
+++ b/spacy/tests/pipeline/test_functions.py
@@ -0,0 +1,34 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+from spacy.pipeline.functions import merge_subtokens
+from ..util import get_doc
+
+
+@pytest.fixture
+def doc(en_tokenizer):
+ # fmt: off
+ text = "This is a sentence. This is another sentence. And a third."
+ heads = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3, 1, 1, 1, 0]
+ deps = ["nsubj", "ROOT", "subtok", "attr", "punct", "nsubj", "ROOT",
+ "subtok", "attr", "punct", "subtok", "subtok", "subtok", "ROOT"]
+ # fmt: on
+ tokens = en_tokenizer(text)
+ return get_doc(tokens.vocab, words=[t.text for t in tokens], heads=heads, deps=deps)
+
+
+def test_merge_subtokens(doc):
+ doc = merge_subtokens(doc)
+ # get_doc() doesn't set spaces, so the result is "And a third ."
+ assert [t.text for t in doc] == [
+ "This",
+ "is",
+ "a sentence",
+ ".",
+ "This",
+ "is",
+ "another sentence",
+ ".",
+ "And a third .",
+ ]
diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py
index 5f1fa5cfe..27fb57b18 100644
--- a/spacy/tests/pipeline/test_pipe_methods.py
+++ b/spacy/tests/pipeline/test_pipe_methods.py
@@ -105,6 +105,16 @@ def test_disable_pipes_context(nlp, name):
assert nlp.has_pipe(name)
+def test_disable_pipes_list_arg(nlp):
+ for name in ["c1", "c2", "c3"]:
+ nlp.add_pipe(new_pipe, name=name)
+ assert nlp.has_pipe(name)
+ with nlp.disable_pipes(["c1", "c2"]):
+ assert not nlp.has_pipe("c1")
+ assert not nlp.has_pipe("c2")
+ assert nlp.has_pipe("c3")
+
+
@pytest.mark.parametrize("n_pipes", [100])
def test_add_lots_of_pipes(nlp, n_pipes):
for i in range(n_pipes):
diff --git a/spacy/tests/pipeline/test_sentencizer.py b/spacy/tests/pipeline/test_sentencizer.py
index 1e03dc743..d91fdd198 100644
--- a/spacy/tests/pipeline/test_sentencizer.py
+++ b/spacy/tests/pipeline/test_sentencizer.py
@@ -2,6 +2,7 @@
from __future__ import unicode_literals
import pytest
+import spacy
from spacy.pipeline import Sentencizer
from spacy.tokens import Doc
@@ -85,3 +86,26 @@ def test_sentencizer_serialize_bytes(en_vocab):
bytes_data = sentencizer.to_bytes()
new_sentencizer = Sentencizer().from_bytes(bytes_data)
assert new_sentencizer.punct_chars == set(punct_chars)
+
+
+@pytest.mark.parametrize(
+ # fmt: off
+ "lang,text",
+ [
+ ('bn', 'বাংলা ভাষা (বাঙলা, বাঙ্গলা, তথা বাঙ্গালা নামগুলোতেও পরিচিত) একটি ইন্দো-আর্য ভাষা, যা দক্ষিণ এশিয়ার বাঙালি জাতির প্রধান কথ্য ও লেখ্য ভাষা। মাতৃভাষীর সংখ্যায় বাংলা ইন্দো-ইউরোপীয় ভাষা পরিবারের চতুর্থ ও বিশ্বের ষষ্ঠ বৃহত্তম ভাষা।[৫] মোট ব্যবহারকারীর সংখ্যা অনুসারে বাংলা বিশ্বের সপ্তম বৃহত্তম ভাষা। বাংলা সার্বভৌম ভাষাভিত্তিক জাতিরাষ্ট্র বাংলাদেশের একমাত্র রাষ্ট্রভাষা তথা সরকারি ভাষা[৬] এবং ভারতের পশ্চিমবঙ্গ, ত্রিপুরা, আসামের বরাক উপত্যকার সরকারি ভাষা। বঙ্গোপসাগরে অবস্থিত আন্দামান দ্বীপপুঞ্জের প্রধান কথ্য ভাষা বাংলা। এছাড়া ভারতের ঝাড়খণ্ড, বিহার, মেঘালয়, মিজোরাম, উড়িষ্যা রাজ্যগুলোতে উল্লেখযোগ্য পরিমাণে বাংলাভাষী জনগণ রয়েছে। ভারতে হিন্দির পরেই সর্বাধিক প্রচলিত ভাষা বাংলা।[৭][৮] এছাড়াও মধ্য প্রাচ্য, আমেরিকা ও ইউরোপে উল্লেখযোগ্য পরিমাণে বাংলাভাষী অভিবাসী রয়েছে।[৯] সারা বিশ্বে সব মিলিয়ে ২৬ কোটির অধিক লোক দৈনন্দিন জীবনে বাংলা ব্যবহার করে।[২] বাংলাদেশের জাতীয় সঙ্গীত এবং ভারতের জাতীয় সঙ্গীত ও স্তোত্র বাংলাতে রচিত।'),
+ ('de', 'Die deutsche Sprache bzw. Deutsch ([dɔʏ̯t͡ʃ]; abgekürzt dt. oder dtsch.) ist eine westgermanische Sprache. Ihr Sprachraum umfasst Deutschland, Österreich, die Deutschschweiz, Liechtenstein, Luxemburg, Ostbelgien, Südtirol, das Elsass und Lothringen sowie Nordschleswig. Außerdem ist sie eine Minderheitensprache in einigen europäischen und außereuropäischen Ländern, z. B. in Rumänien und Südafrika, sowie Nationalsprache im afrikanischen Namibia.'),
+ ('hi', 'हिन्दी विश्व की एक प्रमुख भाषा है एवं भारत की राजभाषा है। केन्द्रीय स्तर पर भारत में दूसरी आधिकारिक भाषा अंग्रेजी है। यह हिंदुस्तानी भाषा की एक मानकीकृत रूप है जिसमें संस्कृत के तत्सम तथा तद्भव शब्दों का प्रयोग अधिक है और अरबी-फ़ारसी शब्द कम हैं। हिंदी संवैधानिक रूप से भारत की राजभाषा और भारत की सबसे अधिक बोली और समझी जाने वाली भाषा है। हालाँकि, हिन्दी भारत की राष्ट्रभाषा नहीं है,[3] क्योंकि भारत के संविधान में कोई भी भाषा को ऐसा दर्जा नहीं दिया गया था।[4][5] चीनी के बाद यह विश्व में सबसे अधिक बोली जाने वाली भाषा भी है। विश्व आर्थिक मंच की गणना के अनुसार यह विश्व की दस शक्तिशाली भाषाओं में से एक है।[6]'),
+ ('kn', 'ದ್ರಾವಿಡ ಭಾಷೆಗಳಲ್ಲಿ ಪ್ರಾಮುಖ್ಯವುಳ್ಳ ಭಾಷೆಯೂ ಭಾರತದ ಪುರಾತನವಾದ ಭಾಷೆಗಳಲ್ಲಿ ಒಂದೂ ಆಗಿರುವ ಕನ್ನಡ ಭಾಷೆಯನ್ನು ಅದರ ವಿವಿಧ ರೂಪಗಳಲ್ಲಿ ಸುಮಾರು ೪೫ ದಶಲಕ್ಷ ಜನರು ಆಡು ನುಡಿಯಾಗಿ ಬಳಸುತ್ತಲಿದ್ದಾರೆ. ಕನ್ನಡ ಕರ್ನಾಟಕ ರಾಜ್ಯದ ಆಡಳಿತ ಭಾಷೆ.[೧೧] ಜಗತ್ತಿನಲ್ಲಿ ಅತ್ಯಂತ ಹೆಚ್ಚು ಮಂದಿ ಮಾತನಾಡುವ ಭಾಷೆಯೆಂಬ ನೆಲೆಯಲ್ಲಿ ಇಪ್ಪತೊಂಬತ್ತನೆಯ ಸ್ಥಾನ ಕನ್ನಡಕ್ಕಿದೆ. ೨೦೧೧ರ ಜನಗಣತಿಯ ಪ್ರಕಾರ ಜಗತ್ತಿನಲ್ಲಿ ೬.೪ ಕೋಟಿ ಜನಗಳು ಕನ್ನಡ ಮಾತನಾಡುತ್ತಾರೆ ಎಂದು ತಿಳಿದುಬಂದಿದೆ. ಇವರಲ್ಲಿ ೫.೫ ಕೋಟಿ ಜನಗಳ ಮಾತೃಭಾಷೆ ಕನ್ನಡವಾಗಿದೆ. ಬ್ರಾಹ್ಮಿ ಲಿಪಿಯಿಂದ ರೂಪುಗೊಂಡ ಕನ್ನಡ ಲಿಪಿಯನ್ನು ಉಪಯೋಗಿಸಿ ಕನ್ನಡ ಭಾಷೆಯನ್ನು ಬರೆಯಲಾಗುತ್ತದೆ. ಕನ್ನಡ ಬರಹದ ಮಾದರಿಗಳಿಗೆ ಸಾವಿರದ ಐನೂರು ವರುಷಗಳ ಚರಿತ್ರೆಯಿದೆ. ಕ್ರಿ.ಶ. ಆರನೆಯ ಶತಮಾನದ ಪಶ್ಚಿಮ ಗಂಗ ಸಾಮ್ರಾಜ್ಯದ ಕಾಲದಲ್ಲಿ [೧೨] ಮತ್ತು ಒಂಬತ್ತನೆಯ ಶತಮಾನದ ರಾಷ್ಟ್ರಕೂಟ ಸಾಮ್ರಾಜ್ಯದ ಕಾಲದಲ್ಲಿ ಹಳಗನ್ನಡ ಸಾಹಿತ್ಯ ಅತ್ಯಂತ ಹೆಚ್ಚಿನ ರಾಜಾಶ್ರಯ ಪಡೆಯಿತು.[೧೩][೧೪] ಅದಲ್ಲದೆ ಸಾವಿರ ವರುಷಗಳ ಸಾಹಿತ್ಯ ಪರಂಪರೆ ಕನ್ನಡಕ್ಕಿದೆ.[೧೫]ವಿನೋಬಾ ಭಾವೆ ಕನ್ನಡ ಲಿಪಿಯನ್ನು ಲಿಪಿಗಳ ರಾಣಿಯೆಂದು ಹೊಗಳಿದ್ದಾರೆ.[ಸೂಕ್ತ ಉಲ್ಲೇಖನ ಬೇಕು]'),
+ ('si', 'ශ්රී ලංකාවේ ප්රධාන ජාතිය වන සිංහල ජනයාගේ මව් බස සිංහල වෙයි. අද වන විට මිලියන 20 කට අධික සිංහල සහ මිලියන 3කට අධික සිංහල නොවන ජනගහනයක් සිංහල භාෂාව භාවිත කරති. සිංහල ඉන්දු-යුරෝපීය භාෂාවල උප ගණයක් වන ඉන්දු-ආර්ය භාෂා ගණයට අයිති වන අතර මාල දිවයින භාවිත කරන දිවෙහි භාෂාව සිංහලයෙන් පැවත එන්නකි. සිංහල ශ්රී ලංකාවේ නිල භාෂාවයි .'),
+ ('ta', 'தமிழ் மொழி (Tamil language) தமிழர்களினதும், தமிழ் பேசும் பலரதும் தாய்மொழி ஆகும். தமிழ் திராவிட மொழிக் குடும்பத்தின் முதன்மையான மொழிகளில் ஒன்றும் செம்மொழியும் ஆகும். இந்தியா, இலங்கை, மலேசியா, சிங்கப்பூர் ஆகிய நாடுகளில் அதிக அளவிலும், ஐக்கிய அரபு அமீரகம், தென்னாப்பிரிக்கா, மொரிசியசு, பிஜி, ரீயூனியன், டிரினிடாட் போன்ற நாடுகளில் சிறிய அளவிலும் தமிழ் பேசப்படுகிறது. 1997ஆம் ஆண்டுப் புள்ளி விவரப்படி உலகம் முழுவதிலும் 8 கோடி (80 மில்லியன்) மக்களால் பேசப்படும் தமிழ்[13], ஒரு மொழியைத் தாய்மொழியாகக் கொண்டு பேசும் மக்களின் எண்ணிக்கை அடிப்படையில் பதினெட்டாவது இடத்தில் உள்ளது.[14] இணையத்தில் அதிகம் பயன்படுத்தப்படும் இந்திய மொழிகளில் தமிழ் முதன்மையாக உள்ளதாக 2017 ஆவது ஆண்டில் நடைபெற்ற கூகுள் கணக்கெடுப்பில் தெரிய வந்தது.[15]'),
+ ('te', 'ఆంధ్ర ప్రదేశ్, తెలంగాణ రాష్ట్రాల అధికార భాష తెలుగు. భారత దేశంలో తెలుగు మాతృభాషగా మాట్లాడే 8.7 కోట్ల (2001) జనాభాతో [1] ప్రాంతీయ భాషలలో మొదటి స్థానంలో ఉంది. ప్రపంచంలోని ప్రజలు అత్యధికముగా మాట్లాడే భాషలలో 15 స్థానములోనూ, భారత దేశములో హిందీ, తర్వాత స్థానములోనూ నిలుస్తుంది. పాతవైన ప్రపంచ భాష గణాంకాల (ఎథ్నోలాగ్) ప్రకారం ప్రపంచవ్యాప్తంగా 7.4 కోట్లు మందికి మాతృభాషగా ఉంది.[2] మొదటి భాషగా మాట్లాడతారు. అతి ప్రాచీన దేశ భాషలలో సంస్కృతము తమిళముతో బాటు తెలుగు భాషను 2008 అక్టోబరు 31న భారత ప్రభుత్వము గుర్తించింది.'),
+ ('ur', 'اُردُو لشکری زبان[8] (یا جدید معیاری اردو) برصغیر کی معیاری زبانوں میں سے ایک ہے۔ یہ پاکستان کی قومی اور رابطہ عامہ کی زبان ہے، جبکہ بھارت کی چھے ریاستوں کی دفتری زبان کا درجہ رکھتی ہے۔ آئین ہند کے مطابق اسے 22 دفتری شناخت زبانوں میں شامل کیا جاچکا ہے۔ 2001ء کی مردم شماری کے مطابق اردو کو بطور مادری زبان بھارت میں 5.01% فیصد لوگ بولتے ہیں اور اس لحاظ سے یہ بھارت کی چھٹی بڑی زبان ہے جبکہ پاکستان میں اسے بطور مادری زبان 7.59% فیصد لوگ استعمال کرتے ہیں، یہ پاکستان کی پانچویں بڑی زبان ہے۔ اردو تاریخی طور پر ہندوستان کی مسلم آبادی سے جڑی ہے۔[حوالہ درکار] بعض ذخیرہ الفاظ کے علاوہ یہ زبان معیاری ہندی سے قابل فہم ہے جو اس خطے کی ہندوؤں سے منسوب ہے۔[حوالہ درکار] زبانِ اردو کو پہچان و ترقی اس وقت ملی جب برطانوی دور میں انگریز حکمرانوں نے اسے فارسی کی بجائے انگریزی کے ساتھ شمالی ہندوستان کے علاقوں اور جموں و کشمیر میں اسے سنہ 1846ء اور پنجاب میں سنہ 1849ء میں بطور دفتری زبان نافذ کیا۔ اس کے علاوہ خلیجی، یورپی، ایشیائی اور امریکی علاقوں میں اردو بولنے والوں کی ایک بڑی تعداد آباد ہے جو بنیادی طور پر جنوبی ایشیاء سے کوچ کرنے والے اہلِ اردو ہیں۔ 1999ء کے اعداد وشمار کے مطابق اردو زبان کے مجموعی متکلمین کی تعداد دس کروڑ ساٹھ لاکھ کے لگ بھگ تھی۔ اس لحاظ سے یہ دنیا کی نویں بڑی زبان ہے۔'),
+ ],
+ # fmt: on
+)
+def test_sentencizer_across_scripts(lang, text):
+ nlp = spacy.blank(lang)
+ sentencizer = Sentencizer()
+ nlp.add_pipe(sentencizer)
+ doc = nlp(text)
+ assert len(list(doc.sents)) > 1
diff --git a/spacy/tests/regression/test_issue1-1000.py b/spacy/tests/regression/test_issue1-1000.py
index dca3d624f..6d88d68c2 100644
--- a/spacy/tests/regression/test_issue1-1000.py
+++ b/spacy/tests/regression/test_issue1-1000.py
@@ -30,7 +30,7 @@ def test_issue118(en_tokenizer, patterns):
doc = en_tokenizer(text)
ORG = doc.vocab.strings["ORG"]
matcher = Matcher(doc.vocab)
- matcher.add("BostonCeltics", None, *patterns)
+ matcher.add("BostonCeltics", patterns)
assert len(list(doc.ents)) == 0
matches = [(ORG, start, end) for _, start, end in matcher(doc)]
assert matches == [(ORG, 9, 11), (ORG, 10, 11)]
@@ -57,7 +57,7 @@ def test_issue118_prefix_reorder(en_tokenizer, patterns):
doc = en_tokenizer(text)
ORG = doc.vocab.strings["ORG"]
matcher = Matcher(doc.vocab)
- matcher.add("BostonCeltics", None, *patterns)
+ matcher.add("BostonCeltics", patterns)
assert len(list(doc.ents)) == 0
matches = [(ORG, start, end) for _, start, end in matcher(doc)]
doc.ents += tuple(matches)[1:]
@@ -78,7 +78,7 @@ def test_issue242(en_tokenizer):
]
doc = en_tokenizer(text)
matcher = Matcher(doc.vocab)
- matcher.add("FOOD", None, *patterns)
+ matcher.add("FOOD", patterns)
matches = [(ent_type, start, end) for ent_type, start, end in matcher(doc)]
match1, match2 = matches
assert match1[1] == 3
@@ -127,17 +127,13 @@ def test_issue587(en_tokenizer):
"""Test that Matcher doesn't segfault on particular input"""
doc = en_tokenizer("a b; c")
matcher = Matcher(doc.vocab)
- matcher.add("TEST1", None, [{ORTH: "a"}, {ORTH: "b"}])
+ matcher.add("TEST1", [[{ORTH: "a"}, {ORTH: "b"}]])
matches = matcher(doc)
assert len(matches) == 1
- matcher.add(
- "TEST2", None, [{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "c"}]
- )
+ matcher.add("TEST2", [[{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "c"}]])
matches = matcher(doc)
assert len(matches) == 2
- matcher.add(
- "TEST3", None, [{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "d"}]
- )
+ matcher.add("TEST3", [[{ORTH: "a"}, {ORTH: "b"}, {IS_PUNCT: True}, {ORTH: "d"}]])
matches = matcher(doc)
assert len(matches) == 2
@@ -145,7 +141,7 @@ def test_issue587(en_tokenizer):
def test_issue588(en_vocab):
matcher = Matcher(en_vocab)
with pytest.raises(ValueError):
- matcher.add("TEST", None, [])
+ matcher.add("TEST", [[]])
@pytest.mark.xfail
@@ -161,11 +157,9 @@ def test_issue590(en_vocab):
doc = Doc(en_vocab, words=["n", "=", "1", ";", "a", ":", "5", "%"])
matcher = Matcher(en_vocab)
matcher.add(
- "ab",
- None,
- [{"IS_ALPHA": True}, {"ORTH": ":"}, {"LIKE_NUM": True}, {"ORTH": "%"}],
+ "ab", [[{"IS_ALPHA": True}, {"ORTH": ":"}, {"LIKE_NUM": True}, {"ORTH": "%"}]]
)
- matcher.add("ab", None, [{"IS_ALPHA": True}, {"ORTH": "="}, {"LIKE_NUM": True}])
+ matcher.add("ab", [[{"IS_ALPHA": True}, {"ORTH": "="}, {"LIKE_NUM": True}]])
matches = matcher(doc)
assert len(matches) == 2
@@ -221,7 +215,7 @@ def test_issue615(en_tokenizer):
label = "Sport_Equipment"
doc = en_tokenizer(text)
matcher = Matcher(doc.vocab)
- matcher.add(label, merge_phrases, pattern)
+ matcher.add(label, [pattern], on_match=merge_phrases)
matcher(doc)
entities = list(doc.ents)
assert entities != []
@@ -339,7 +333,7 @@ def test_issue850():
vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()})
matcher = Matcher(vocab)
pattern = [{"LOWER": "bob"}, {"OP": "*"}, {"LOWER": "frank"}]
- matcher.add("FarAway", None, pattern)
+ matcher.add("FarAway", [pattern])
doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"])
match = matcher(doc)
assert len(match) == 1
@@ -353,7 +347,7 @@ def test_issue850_basic():
vocab = Vocab(lex_attr_getters={LOWER: lambda string: string.lower()})
matcher = Matcher(vocab)
pattern = [{"LOWER": "bob"}, {"OP": "*", "LOWER": "and"}, {"LOWER": "frank"}]
- matcher.add("FarAway", None, pattern)
+ matcher.add("FarAway", [pattern])
doc = Doc(matcher.vocab, words=["bob", "and", "and", "frank"])
match = matcher(doc)
assert len(match) == 1
@@ -430,7 +424,7 @@ def test_issue957(en_tokenizer):
def test_issue999(train_data):
"""Test that adding entities and resuming training works passably OK.
There are two issues here:
- 1) We have to read labels. This isn't very nice.
+ 1) We have to re-add labels. This isn't very nice.
2) There's no way to set the learning rate for the weight update, so we
end up out-of-scale, causing it to learn too fast.
"""
diff --git a/spacy/tests/regression/test_issue1001-1500.py b/spacy/tests/regression/test_issue1001-1500.py
index 889a5dc71..924c5aa3e 100644
--- a/spacy/tests/regression/test_issue1001-1500.py
+++ b/spacy/tests/regression/test_issue1001-1500.py
@@ -111,7 +111,7 @@ def test_issue1434():
hello_world = Doc(vocab, words=["Hello", "World"])
hello = Doc(vocab, words=["Hello"])
matcher = Matcher(vocab)
- matcher.add("MyMatcher", None, pattern)
+ matcher.add("MyMatcher", [pattern])
matches = matcher(hello_world)
assert matches
matches = matcher(hello)
@@ -133,7 +133,7 @@ def test_issue1450(string, start, end):
"""Test matcher works when patterns end with * operator."""
pattern = [{"ORTH": "a"}, {"ORTH": "b", "OP": "*"}]
matcher = Matcher(Vocab())
- matcher.add("TSTEND", None, pattern)
+ matcher.add("TSTEND", [pattern])
doc = Doc(Vocab(), words=string.split())
matches = matcher(doc)
if start is None or end is None:
diff --git a/spacy/tests/regression/test_issue1501-2000.py b/spacy/tests/regression/test_issue1501-2000.py
index a9cf070cd..e498417d1 100644
--- a/spacy/tests/regression/test_issue1501-2000.py
+++ b/spacy/tests/regression/test_issue1501-2000.py
@@ -224,7 +224,7 @@ def test_issue1868():
def test_issue1883():
matcher = Matcher(Vocab())
- matcher.add("pat1", None, [{"orth": "hello"}])
+ matcher.add("pat1", [[{"orth": "hello"}]])
doc = Doc(matcher.vocab, words=["hello"])
assert len(matcher(doc)) == 1
new_matcher = copy.deepcopy(matcher)
@@ -249,7 +249,7 @@ def test_issue1915():
def test_issue1945():
"""Test regression in Matcher introduced in v2.0.6."""
matcher = Matcher(Vocab())
- matcher.add("MWE", None, [{"orth": "a"}, {"orth": "a"}])
+ matcher.add("MWE", [[{"orth": "a"}, {"orth": "a"}]])
doc = Doc(matcher.vocab, words=["a", "a", "a"])
matches = matcher(doc) # we should see two overlapping matches here
assert len(matches) == 2
@@ -285,7 +285,7 @@ def test_issue1971(en_vocab):
{"ORTH": "!", "OP": "?"},
]
Token.set_extension("optional", default=False)
- matcher.add("TEST", None, pattern)
+ matcher.add("TEST", [pattern])
doc = Doc(en_vocab, words=["Hello", "John", "Doe", "!"])
# We could also assert length 1 here, but this is more conclusive, because
# the real problem here is that it returns a duplicate match for a match_id
@@ -299,7 +299,7 @@ def test_issue_1971_2(en_vocab):
pattern1 = [{"ORTH": "EUR", "LOWER": {"IN": ["eur"]}}, {"LIKE_NUM": True}]
pattern2 = [{"LIKE_NUM": True}, {"ORTH": "EUR"}] # {"IN": ["EUR"]}}]
doc = Doc(en_vocab, words=["EUR", "10", "is", "10", "EUR"])
- matcher.add("TEST1", None, pattern1, pattern2)
+ matcher.add("TEST1", [pattern1, pattern2])
matches = matcher(doc)
assert len(matches) == 2
@@ -310,8 +310,8 @@ def test_issue_1971_3(en_vocab):
Token.set_extension("b", default=2, force=True)
doc = Doc(en_vocab, words=["hello", "world"])
matcher = Matcher(en_vocab)
- matcher.add("A", None, [{"_": {"a": 1}}])
- matcher.add("B", None, [{"_": {"b": 2}}])
+ matcher.add("A", [[{"_": {"a": 1}}]])
+ matcher.add("B", [[{"_": {"b": 2}}]])
matches = sorted((en_vocab.strings[m_id], s, e) for m_id, s, e in matcher(doc))
assert len(matches) == 4
assert matches == sorted([("A", 0, 1), ("A", 1, 2), ("B", 0, 1), ("B", 1, 2)])
@@ -326,7 +326,7 @@ def test_issue_1971_4(en_vocab):
matcher = Matcher(en_vocab)
doc = Doc(en_vocab, words=["this", "is", "text"])
pattern = [{"_": {"ext_a": "str_a", "ext_b": "str_b"}}] * 3
- matcher.add("TEST", None, pattern)
+ matcher.add("TEST", [pattern])
matches = matcher(doc)
# Uncommenting this caused a segmentation fault
assert len(matches) == 1
diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py
index 4292c8d23..e95c1a9b9 100644
--- a/spacy/tests/regression/test_issue2001-2500.py
+++ b/spacy/tests/regression/test_issue2001-2500.py
@@ -128,7 +128,7 @@ def test_issue2464(en_vocab):
"""Test problem with successive ?. This is the same bug, so putting it here."""
matcher = Matcher(en_vocab)
doc = Doc(en_vocab, words=["a", "b"])
- matcher.add("4", None, [{"OP": "?"}, {"OP": "?"}])
+ matcher.add("4", [[{"OP": "?"}, {"OP": "?"}]])
matches = matcher(doc)
assert len(matches) == 3
diff --git a/spacy/tests/regression/test_issue2501-3000.py b/spacy/tests/regression/test_issue2501-3000.py
index e26ccbf4b..73ff7376a 100644
--- a/spacy/tests/regression/test_issue2501-3000.py
+++ b/spacy/tests/regression/test_issue2501-3000.py
@@ -37,7 +37,7 @@ def test_issue2569(en_tokenizer):
doc = en_tokenizer("It is May 15, 1993.")
doc.ents = [Span(doc, 2, 6, label=doc.vocab.strings["DATE"])]
matcher = Matcher(doc.vocab)
- matcher.add("RULE", None, [{"ENT_TYPE": "DATE", "OP": "+"}])
+ matcher.add("RULE", [[{"ENT_TYPE": "DATE", "OP": "+"}]])
matched = [doc[start:end] for _, start, end in matcher(doc)]
matched = sorted(matched, key=len, reverse=True)
assert len(matched) == 10
@@ -89,7 +89,7 @@ def test_issue2671():
{"IS_PUNCT": True, "OP": "?"},
{"LOWER": "adrenaline"},
]
- matcher.add(pattern_id, None, pattern)
+ matcher.add(pattern_id, [pattern])
doc1 = nlp("This is a high-adrenaline situation.")
doc2 = nlp("This is a high adrenaline situation.")
matches1 = matcher(doc1)
diff --git a/spacy/tests/regression/test_issue3001-3500.py b/spacy/tests/regression/test_issue3001-3500.py
index 35011b532..d05759c31 100644
--- a/spacy/tests/regression/test_issue3001-3500.py
+++ b/spacy/tests/regression/test_issue3001-3500.py
@@ -15,6 +15,7 @@ from spacy.util import decaying
import numpy
import re
+from spacy.vectors import Vectors
from ..util import get_doc
@@ -51,7 +52,7 @@ def test_issue3009(en_vocab):
doc = get_doc(en_vocab, words=words, tags=tags)
matcher = Matcher(en_vocab)
for i, pattern in enumerate(patterns):
- matcher.add(str(i), None, pattern)
+ matcher.add(str(i), [pattern])
matches = matcher(doc)
assert matches
@@ -115,8 +116,8 @@ def test_issue3248_1():
total number of patterns."""
nlp = English()
matcher = PhraseMatcher(nlp.vocab)
- matcher.add("TEST1", None, nlp("a"), nlp("b"), nlp("c"))
- matcher.add("TEST2", None, nlp("d"))
+ matcher.add("TEST1", [nlp("a"), nlp("b"), nlp("c")])
+ matcher.add("TEST2", [nlp("d")])
assert len(matcher) == 2
@@ -124,8 +125,8 @@ def test_issue3248_2():
"""Test that the PhraseMatcher can be pickled correctly."""
nlp = English()
matcher = PhraseMatcher(nlp.vocab)
- matcher.add("TEST1", None, nlp("a"), nlp("b"), nlp("c"))
- matcher.add("TEST2", None, nlp("d"))
+ matcher.add("TEST1", [nlp("a"), nlp("b"), nlp("c")])
+ matcher.add("TEST2", [nlp("d")])
data = pickle.dumps(matcher)
new_matcher = pickle.loads(data)
assert len(new_matcher) == len(matcher)
@@ -169,21 +170,20 @@ def test_issue3328(en_vocab):
[{"LOWER": {"IN": ["hello", "how"]}}],
[{"LOWER": {"IN": ["you", "doing"]}}],
]
- matcher.add("TEST", None, *patterns)
+ matcher.add("TEST", patterns)
matches = matcher(doc)
assert len(matches) == 4
matched_texts = [doc[start:end].text for _, start, end in matches]
assert matched_texts == ["Hello", "how", "you", "doing"]
-@pytest.mark.xfail
def test_issue3331(en_vocab):
"""Test that duplicate patterns for different rules result in multiple
matches, one per rule.
"""
matcher = PhraseMatcher(en_vocab)
- matcher.add("A", None, Doc(en_vocab, words=["Barack", "Obama"]))
- matcher.add("B", None, Doc(en_vocab, words=["Barack", "Obama"]))
+ matcher.add("A", [Doc(en_vocab, words=["Barack", "Obama"])])
+ matcher.add("B", [Doc(en_vocab, words=["Barack", "Obama"])])
doc = Doc(en_vocab, words=["Barack", "Obama", "lifts", "America"])
matches = matcher(doc)
assert len(matches) == 2
@@ -293,6 +293,15 @@ def test_issue3410():
list(phrasematcher.pipe(docs, n_threads=4))
+def test_issue3412():
+ data = numpy.asarray([[0, 0, 0], [1, 2, 3], [9, 8, 7]], dtype="f")
+ vectors = Vectors(data=data)
+ keys, best_rows, scores = vectors.most_similar(
+ numpy.asarray([[9, 8, 7], [0, 0, 0]], dtype="f")
+ )
+ assert best_rows[0] == 2
+
+
def test_issue3447():
sizes = decaying(10.0, 1.0, 0.5)
size = next(sizes)
@@ -318,12 +327,13 @@ def test_issue3449():
assert t3[5].text == "I"
+@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3456():
# this crashed because of a padding error in layer.ops.unflatten in thinc
nlp = English()
nlp.add_pipe(nlp.create_pipe("tagger"))
nlp.begin_training()
- list(nlp.pipe(['hi', '']))
+ list(nlp.pipe(["hi", ""]))
def test_issue3468():
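The `PhraseMatcher.add` calls above follow the same migration and now take a list of `Doc` patterns instead of variadic arguments. A minimal sketch with a blank English pipeline (the example phrases are illustrative only):

```python
from spacy.lang.en import English
from spacy.matcher import PhraseMatcher

nlp = English()
matcher = PhraseMatcher(nlp.vocab)
# New signature: a single list of Doc patterns per rule.
matcher.add("GPE", [nlp("New York"), nlp("San Francisco")])
doc = nlp("I flew to New York")
assert len(matcher(doc)) == 1  # matches the "New York" tokens by ORTH
```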
diff --git a/spacy/tests/regression/test_issue3549.py b/spacy/tests/regression/test_issue3549.py
index 3932bf19c..587b3a857 100644
--- a/spacy/tests/regression/test_issue3549.py
+++ b/spacy/tests/regression/test_issue3549.py
@@ -10,6 +10,6 @@ def test_issue3549(en_vocab):
"""Test that match pattern validation doesn't raise on empty errors."""
matcher = Matcher(en_vocab, validate=True)
pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
- matcher.add("GOOD", None, pattern)
+ matcher.add("GOOD", [pattern])
with pytest.raises(MatchPatternError):
- matcher.add("BAD", None, [{"X": "Y"}])
+ matcher.add("BAD", [[{"X": "Y"}]])
diff --git a/spacy/tests/regression/test_issue3555.py b/spacy/tests/regression/test_issue3555.py
index 096b33367..8444f11f2 100644
--- a/spacy/tests/regression/test_issue3555.py
+++ b/spacy/tests/regression/test_issue3555.py
@@ -12,6 +12,6 @@ def test_issue3555(en_vocab):
Token.set_extension("issue3555", default=None)
matcher = Matcher(en_vocab)
pattern = [{"LEMMA": "have"}, {"_": {"issue3555": True}}]
- matcher.add("TEST", None, pattern)
+ matcher.add("TEST", [pattern])
doc = Doc(en_vocab, words=["have", "apple"])
matcher(doc)
diff --git a/spacy/tests/regression/test_issue3611.py b/spacy/tests/regression/test_issue3611.py
index c0ee83e1b..3c4836264 100644
--- a/spacy/tests/regression/test_issue3611.py
+++ b/spacy/tests/regression/test_issue3611.py
@@ -34,8 +34,7 @@ def test_issue3611():
nlp.add_pipe(textcat, last=True)
# training the network
- other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
- with nlp.disable_pipes(*other_pipes):
+ with nlp.disable_pipes([p for p in nlp.pipe_names if p != "textcat"]):
optimizer = nlp.begin_training()
for i in range(3):
losses = {}
diff --git a/spacy/tests/regression/test_issue3839.py b/spacy/tests/regression/test_issue3839.py
index c24c60b6d..fe722a681 100644
--- a/spacy/tests/regression/test_issue3839.py
+++ b/spacy/tests/regression/test_issue3839.py
@@ -12,10 +12,10 @@ def test_issue3839(en_vocab):
match_id = "PATTERN"
pattern1 = [{"LOWER": "terrific"}, {"OP": "?"}, {"LOWER": "group"}]
pattern2 = [{"LOWER": "terrific"}, {"OP": "?"}, {"OP": "?"}, {"LOWER": "group"}]
- matcher.add(match_id, None, pattern1)
+ matcher.add(match_id, [pattern1])
matches = matcher(doc)
assert matches[0][0] == en_vocab.strings[match_id]
matcher = Matcher(en_vocab)
- matcher.add(match_id, None, pattern2)
+ matcher.add(match_id, [pattern2])
matches = matcher(doc)
assert matches[0][0] == en_vocab.strings[match_id]
diff --git a/spacy/tests/regression/test_issue3879.py b/spacy/tests/regression/test_issue3879.py
index 123e9fce3..5cd245231 100644
--- a/spacy/tests/regression/test_issue3879.py
+++ b/spacy/tests/regression/test_issue3879.py
@@ -10,5 +10,5 @@ def test_issue3879(en_vocab):
assert len(doc) == 5
pattern = [{"ORTH": "This", "OP": "?"}, {"OP": "?"}, {"ORTH": "test"}]
matcher = Matcher(en_vocab)
- matcher.add("TEST", None, pattern)
+ matcher.add("TEST", [pattern])
assert len(matcher(doc)) == 2 # fails because of a FP match 'is a test'
diff --git a/spacy/tests/regression/test_issue3880.py b/spacy/tests/regression/test_issue3880.py
index 6de373f11..c060473f5 100644
--- a/spacy/tests/regression/test_issue3880.py
+++ b/spacy/tests/regression/test_issue3880.py
@@ -2,8 +2,10 @@
from __future__ import unicode_literals
from spacy.lang.en import English
+import pytest
+@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue3880():
"""Test that `nlp.pipe()` works when an empty string ends the batch.
diff --git a/spacy/tests/regression/test_issue3951.py b/spacy/tests/regression/test_issue3951.py
index e07ffd36e..33230112f 100644
--- a/spacy/tests/regression/test_issue3951.py
+++ b/spacy/tests/regression/test_issue3951.py
@@ -14,7 +14,7 @@ def test_issue3951(en_vocab):
{"OP": "?"},
{"LOWER": "world"},
]
- matcher.add("TEST", None, pattern)
+ matcher.add("TEST", [pattern])
doc = Doc(en_vocab, words=["Hello", "my", "new", "world"])
matches = matcher(doc)
assert len(matches) == 0
diff --git a/spacy/tests/regression/test_issue3972.py b/spacy/tests/regression/test_issue3972.py
index a7f76e4d7..22b8d486e 100644
--- a/spacy/tests/regression/test_issue3972.py
+++ b/spacy/tests/regression/test_issue3972.py
@@ -9,8 +9,8 @@ def test_issue3972(en_vocab):
"""Test that the PhraseMatcher returns duplicates for duplicate match IDs.
"""
matcher = PhraseMatcher(en_vocab)
- matcher.add("A", None, Doc(en_vocab, words=["New", "York"]))
- matcher.add("B", None, Doc(en_vocab, words=["New", "York"]))
+ matcher.add("A", [Doc(en_vocab, words=["New", "York"])])
+ matcher.add("B", [Doc(en_vocab, words=["New", "York"])])
doc = Doc(en_vocab, words=["I", "live", "in", "New", "York"])
matches = matcher(doc)
diff --git a/spacy/tests/regression/test_issue4002.py b/spacy/tests/regression/test_issue4002.py
index 37e054b3e..d075128aa 100644
--- a/spacy/tests/regression/test_issue4002.py
+++ b/spacy/tests/regression/test_issue4002.py
@@ -11,7 +11,7 @@ def test_issue4002(en_vocab):
matcher = PhraseMatcher(en_vocab, attr="NORM")
pattern1 = Doc(en_vocab, words=["c", "d"])
assert [t.norm_ for t in pattern1] == ["c", "d"]
- matcher.add("TEST", None, pattern1)
+ matcher.add("TEST", [pattern1])
doc = Doc(en_vocab, words=["a", "b", "c", "d"])
assert [t.norm_ for t in doc] == ["a", "b", "c", "d"]
matches = matcher(doc)
@@ -21,6 +21,6 @@ def test_issue4002(en_vocab):
pattern2[0].norm_ = "c"
pattern2[1].norm_ = "d"
assert [t.norm_ for t in pattern2] == ["c", "d"]
- matcher.add("TEST", None, pattern2)
+ matcher.add("TEST", [pattern2])
matches = matcher(doc)
assert len(matches) == 1
diff --git a/spacy/tests/regression/test_issue4030.py b/spacy/tests/regression/test_issue4030.py
index c331fa1d2..ed219573f 100644
--- a/spacy/tests/regression/test_issue4030.py
+++ b/spacy/tests/regression/test_issue4030.py
@@ -34,8 +34,7 @@ def test_issue4030():
nlp.add_pipe(textcat, last=True)
# training the network
- other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
- with nlp.disable_pipes(*other_pipes):
+ with nlp.disable_pipes([p for p in nlp.pipe_names if p != "textcat"]):
optimizer = nlp.begin_training()
for i in range(3):
losses = {}
diff --git a/spacy/tests/regression/test_issue4042.py b/spacy/tests/regression/test_issue4042.py
index 500be9f2a..00a8882d3 100644
--- a/spacy/tests/regression/test_issue4042.py
+++ b/spacy/tests/regression/test_issue4042.py
@@ -76,7 +76,6 @@ def test_issue4042_bug2():
output_dir.mkdir()
ner1.to_disk(output_dir)
- nlp2 = English(vocab)
ner2 = EntityRecognizer(vocab)
ner2.from_disk(output_dir)
assert len(ner2.labels) == 2
diff --git a/spacy/tests/regression/test_issue4120.py b/spacy/tests/regression/test_issue4120.py
index 2ce5aec6a..d288f46c4 100644
--- a/spacy/tests/regression/test_issue4120.py
+++ b/spacy/tests/regression/test_issue4120.py
@@ -8,7 +8,7 @@ from spacy.tokens import Doc
def test_issue4120(en_vocab):
"""Test that matches without a final {OP: ?} token are returned."""
matcher = Matcher(en_vocab)
- matcher.add("TEST", None, [{"ORTH": "a"}, {"OP": "?"}])
+ matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}]])
doc1 = Doc(en_vocab, words=["a"])
assert len(matcher(doc1)) == 1 # works
@@ -16,11 +16,11 @@ def test_issue4120(en_vocab):
assert len(matcher(doc2)) == 2 # fixed
matcher = Matcher(en_vocab)
- matcher.add("TEST", None, [{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}])
+ matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b"}]])
doc3 = Doc(en_vocab, words=["a", "b", "b", "c"])
assert len(matcher(doc3)) == 2 # works
matcher = Matcher(en_vocab)
- matcher.add("TEST", None, [{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}])
+ matcher.add("TEST", [[{"ORTH": "a"}, {"OP": "?"}, {"ORTH": "b", "OP": "?"}]])
doc4 = Doc(en_vocab, words=["a", "b", "b", "c"])
assert len(matcher(doc4)) == 3 # fixed
diff --git a/spacy/tests/regression/test_issue4267.py b/spacy/tests/regression/test_issue4267.py
index 5fc61e142..ef871bf9f 100644
--- a/spacy/tests/regression/test_issue4267.py
+++ b/spacy/tests/regression/test_issue4267.py
@@ -1,13 +1,8 @@
# coding: utf8
from __future__ import unicode_literals
-import pytest
-
-import spacy
-
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
-from spacy.tokens import Span
def test_issue4267():
diff --git a/spacy/tests/regression/test_issue4348.py b/spacy/tests/regression/test_issue4348.py
index 9391c3529..d2e27d563 100644
--- a/spacy/tests/regression/test_issue4348.py
+++ b/spacy/tests/regression/test_issue4348.py
@@ -3,8 +3,10 @@ from __future__ import unicode_literals
from spacy.lang.en import English
from spacy.util import minibatch, compounding
+import pytest
+@pytest.mark.filterwarnings("ignore::UserWarning")
def test_issue4348():
"""Test that training the tagger with empty data, doesn't throw errors"""
diff --git a/spacy/tests/regression/test_issue4367.py b/spacy/tests/regression/test_issue4367.py
index 6c9e54cdb..ab6192744 100644
--- a/spacy/tests/regression/test_issue4367.py
+++ b/spacy/tests/regression/test_issue4367.py
@@ -6,6 +6,6 @@ from spacy.tokens import DocBin
def test_issue4367():
"""Test that docbin init goes well"""
- doc_bin_1 = DocBin()
- doc_bin_2 = DocBin(attrs=["LEMMA"])
- doc_bin_3 = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])
+ DocBin()
+ DocBin(attrs=["LEMMA"])
+ DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"])
diff --git a/spacy/tests/regression/test_issue4402.py b/spacy/tests/regression/test_issue4402.py
new file mode 100644
index 000000000..2e1b69000
--- /dev/null
+++ b/spacy/tests/regression/test_issue4402.py
@@ -0,0 +1,96 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import srsly
+from spacy.gold import GoldCorpus
+
+from spacy.lang.en import English
+from spacy.tests.util import make_tempdir
+
+
+def test_issue4402():
+ nlp = English()
+ with make_tempdir() as tmpdir:
+ print("temp", tmpdir)
+ json_path = tmpdir / "test4402.json"
+ srsly.write_json(json_path, json_data)
+
+ corpus = GoldCorpus(str(json_path), str(json_path))
+
+ train_docs = list(corpus.train_docs(nlp, gold_preproc=True, max_length=0))
+ # assert that the data got split into 4 sentences
+ assert len(train_docs) == 4
+
+
+json_data = [
+ {
+ "id": 0,
+ "paragraphs": [
+ {
+ "raw": "How should I cook bacon in an oven?\nI've heard of people cooking bacon in an oven.",
+ "sentences": [
+ {
+ "tokens": [
+ {"id": 0, "orth": "How", "ner": "O"},
+ {"id": 1, "orth": "should", "ner": "O"},
+ {"id": 2, "orth": "I", "ner": "O"},
+ {"id": 3, "orth": "cook", "ner": "O"},
+ {"id": 4, "orth": "bacon", "ner": "O"},
+ {"id": 5, "orth": "in", "ner": "O"},
+ {"id": 6, "orth": "an", "ner": "O"},
+ {"id": 7, "orth": "oven", "ner": "O"},
+ {"id": 8, "orth": "?", "ner": "O"},
+ ],
+ "brackets": [],
+ },
+ {
+ "tokens": [
+ {"id": 9, "orth": "\n", "ner": "O"},
+ {"id": 10, "orth": "I", "ner": "O"},
+ {"id": 11, "orth": "'ve", "ner": "O"},
+ {"id": 12, "orth": "heard", "ner": "O"},
+ {"id": 13, "orth": "of", "ner": "O"},
+ {"id": 14, "orth": "people", "ner": "O"},
+ {"id": 15, "orth": "cooking", "ner": "O"},
+ {"id": 16, "orth": "bacon", "ner": "O"},
+ {"id": 17, "orth": "in", "ner": "O"},
+ {"id": 18, "orth": "an", "ner": "O"},
+ {"id": 19, "orth": "oven", "ner": "O"},
+ {"id": 20, "orth": ".", "ner": "O"},
+ ],
+ "brackets": [],
+ },
+ ],
+ "cats": [
+ {"label": "baking", "value": 1.0},
+ {"label": "not_baking", "value": 0.0},
+ ],
+ },
+ {
+ "raw": "What is the difference between white and brown eggs?\n",
+ "sentences": [
+ {
+ "tokens": [
+ {"id": 0, "orth": "What", "ner": "O"},
+ {"id": 1, "orth": "is", "ner": "O"},
+ {"id": 2, "orth": "the", "ner": "O"},
+ {"id": 3, "orth": "difference", "ner": "O"},
+ {"id": 4, "orth": "between", "ner": "O"},
+ {"id": 5, "orth": "white", "ner": "O"},
+ {"id": 6, "orth": "and", "ner": "O"},
+ {"id": 7, "orth": "brown", "ner": "O"},
+ {"id": 8, "orth": "eggs", "ner": "O"},
+ {"id": 9, "orth": "?", "ner": "O"},
+ ],
+ "brackets": [],
+ },
+ {"tokens": [{"id": 10, "orth": "\n", "ner": "O"}], "brackets": []},
+ ],
+ "cats": [
+ {"label": "baking", "value": 0.0},
+ {"label": "not_baking", "value": 1.0},
+ ],
+ },
+ ],
+ }
+]
diff --git a/spacy/tests/regression/test_issue4528.py b/spacy/tests/regression/test_issue4528.py
new file mode 100644
index 000000000..460449003
--- /dev/null
+++ b/spacy/tests/regression/test_issue4528.py
@@ -0,0 +1,19 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from spacy.tokens import Doc, DocBin
+
+
+def test_issue4528(en_vocab):
+ """Test that user_data is correctly serialized in DocBin."""
+ doc = Doc(en_vocab, words=["hello", "world"])
+ doc.user_data["foo"] = "bar"
+ # This is how extension attribute values are stored in the user data
+ doc.user_data[("._.", "foo", None, None)] = "bar"
+ doc_bin = DocBin(store_user_data=True)
+ doc_bin.add(doc)
+ doc_bin_bytes = doc_bin.to_bytes()
+ new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes)
+ new_doc = list(new_doc_bin.get_docs(en_vocab))[0]
+ assert new_doc.user_data["foo"] == "bar"
+ assert new_doc.user_data[("._.", "foo", None, None)] == "bar"
diff --git a/spacy/tests/regression/test_issue4529.py b/spacy/tests/regression/test_issue4529.py
new file mode 100644
index 000000000..381957be6
--- /dev/null
+++ b/spacy/tests/regression/test_issue4529.py
@@ -0,0 +1,13 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+from spacy.gold import GoldParse
+
+
+@pytest.mark.parametrize(
+ "text,words", [("A'B C", ["A", "'", "B", "C"]), ("A-B", ["A-B"])]
+)
+def test_gold_misaligned(en_tokenizer, text, words):
+ doc = en_tokenizer(text)
+ GoldParse(doc, words=words)
diff --git a/spacy/tests/regression/test_issue4590.py b/spacy/tests/regression/test_issue4590.py
new file mode 100644
index 000000000..6a43dfea9
--- /dev/null
+++ b/spacy/tests/regression/test_issue4590.py
@@ -0,0 +1,34 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+from mock import Mock
+from spacy.matcher import DependencyMatcher
+from ..util import get_doc
+
+
+def test_issue4590(en_vocab):
+ """Test that matches param in on_match method are the same as matches run with no on_match method"""
+ pattern = [
+ {"SPEC": {"NODE_NAME": "jumped"}, "PATTERN": {"ORTH": "jumped"}},
+ {"SPEC": {"NODE_NAME": "fox", "NBOR_RELOP": ">", "NBOR_NAME": "jumped"}, "PATTERN": {"ORTH": "fox"}},
+ {"SPEC": {"NODE_NAME": "quick", "NBOR_RELOP": ".", "NBOR_NAME": "jumped"}, "PATTERN": {"ORTH": "fox"}},
+ ]
+
+ on_match = Mock()
+
+ matcher = DependencyMatcher(en_vocab)
+ matcher.add("pattern", on_match, pattern)
+
+ text = "The quick brown fox jumped over the lazy fox"
+ heads = [3, 2, 1, 1, 0, -1, 2, 1, -3]
+ deps = ["det", "amod", "amod", "nsubj", "prep", "pobj", "det", "amod"]
+
+ doc = get_doc(en_vocab, text.split(), heads=heads, deps=deps)
+
+ matches = matcher(doc)
+
+ on_match_args = on_match.call_args
+
+ assert on_match_args[0][3] == matches
+
diff --git a/spacy/tests/serialize/test_serialize_doc.py b/spacy/tests/serialize/test_serialize_doc.py
index de91a50b6..87b087760 100644
--- a/spacy/tests/serialize/test_serialize_doc.py
+++ b/spacy/tests/serialize/test_serialize_doc.py
@@ -74,4 +74,4 @@ def test_serialize_doc_bin():
# Deserialize later, e.g. in a new process
nlp = spacy.blank("en")
doc_bin = DocBin().from_bytes(bytes_data)
- docs = list(doc_bin.get_docs(nlp.vocab))
+ list(doc_bin.get_docs(nlp.vocab))
diff --git a/spacy/tests/test_architectures.py b/spacy/tests/test_architectures.py
new file mode 100644
index 000000000..77f1af020
--- /dev/null
+++ b/spacy/tests/test_architectures.py
@@ -0,0 +1,19 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+from spacy import registry
+from thinc.v2v import Affine
+from catalogue import RegistryError
+
+
+@registry.architectures.register("my_test_function")
+def create_model(nr_in, nr_out):
+ return Affine(nr_in, nr_out)
+
+
+def test_get_architecture():
+ arch = registry.architectures.get("my_test_function")
+ assert arch is create_model
+ with pytest.raises(RegistryError):
+ registry.architectures.get("not_an_existing_key")
diff --git a/spacy/tests/test_gold.py b/spacy/tests/test_gold.py
index 4f79c4463..fbdb3155b 100644
--- a/spacy/tests/test_gold.py
+++ b/spacy/tests/test_gold.py
@@ -2,8 +2,8 @@
from __future__ import unicode_literals
from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags
-from spacy.gold import spans_from_biluo_tags, GoldParse
-from spacy.gold import GoldCorpus, docs_to_json
+from spacy.gold import spans_from_biluo_tags, GoldParse, iob_to_biluo
+from spacy.gold import GoldCorpus, docs_to_json, align
from spacy.lang.en import English
from spacy.tokens import Doc
from .util import make_tempdir
@@ -87,16 +87,35 @@ def test_gold_ner_missing_tags(en_tokenizer):
gold = GoldParse(doc, entities=biluo_tags) # noqa: F841
+def test_iob_to_biluo():
+ good_iob = ["O", "O", "B-LOC", "I-LOC", "O", "B-PERSON"]
+ good_biluo = ["O", "O", "B-LOC", "L-LOC", "O", "U-PERSON"]
+ bad_iob = ["O", "O", '"', "B-LOC", "I-LOC"]
+ converted_biluo = iob_to_biluo(good_iob)
+ assert good_biluo == converted_biluo
+ with pytest.raises(ValueError):
+ iob_to_biluo(bad_iob)
+
+
def test_roundtrip_docs_to_json():
text = "I flew to Silicon Valley via London."
+ tags = ["PRP", "VBD", "IN", "NNP", "NNP", "IN", "NNP", "."]
+ heads = [1, 1, 1, 4, 2, 1, 5, 1]
+ deps = ["nsubj", "ROOT", "prep", "compound", "pobj", "prep", "pobj", "punct"]
+ biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
cats = {"TRAVEL": 1.0, "BAKING": 0.0}
nlp = English()
doc = nlp(text)
+ for i in range(len(tags)):
+ doc[i].tag_ = tags[i]
+ doc[i].dep_ = deps[i]
+ doc[i].head = doc[heads[i]]
+ doc.ents = spans_from_biluo_tags(doc, biluo_tags)
doc.cats = cats
- doc[0].is_sent_start = True
- for i in range(1, len(doc)):
- doc[i].is_sent_start = False
+ doc.is_tagged = True
+ doc.is_parsed = True
+ # roundtrip to JSON
with make_tempdir() as tmpdir:
json_file = tmpdir / "roundtrip.json"
srsly.write_json(json_file, [docs_to_json(doc)])
@@ -106,7 +125,94 @@ def test_roundtrip_docs_to_json():
assert len(doc) == goldcorpus.count_train()
assert text == reloaded_doc.text
+ assert tags == goldparse.tags
+ assert deps == goldparse.labels
+ assert heads == goldparse.heads
+ assert biluo_tags == goldparse.ner
assert "TRAVEL" in goldparse.cats
assert "BAKING" in goldparse.cats
assert cats["TRAVEL"] == goldparse.cats["TRAVEL"]
assert cats["BAKING"] == goldparse.cats["BAKING"]
+
+ # roundtrip to JSONL train dicts
+ with make_tempdir() as tmpdir:
+ jsonl_file = tmpdir / "roundtrip.jsonl"
+ srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
+ goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
+
+ reloaded_doc, goldparse = next(goldcorpus.train_docs(nlp))
+
+ assert len(doc) == goldcorpus.count_train()
+ assert text == reloaded_doc.text
+ assert tags == goldparse.tags
+ assert deps == goldparse.labels
+ assert heads == goldparse.heads
+ assert biluo_tags == goldparse.ner
+ assert "TRAVEL" in goldparse.cats
+ assert "BAKING" in goldparse.cats
+ assert cats["TRAVEL"] == goldparse.cats["TRAVEL"]
+ assert cats["BAKING"] == goldparse.cats["BAKING"]
+
+ # roundtrip to JSONL tuples
+ with make_tempdir() as tmpdir:
+ jsonl_file = tmpdir / "roundtrip.jsonl"
+ # write to JSONL train dicts
+ srsly.write_jsonl(jsonl_file, [docs_to_json(doc)])
+ goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
+ # load and rewrite as JSONL tuples
+ srsly.write_jsonl(jsonl_file, goldcorpus.train_tuples)
+ goldcorpus = GoldCorpus(str(jsonl_file), str(jsonl_file))
+
+ reloaded_doc, goldparse = next(goldcorpus.train_docs(nlp))
+
+ assert len(doc) == goldcorpus.count_train()
+ assert text == reloaded_doc.text
+ assert tags == goldparse.tags
+ assert deps == goldparse.labels
+ assert heads == goldparse.heads
+ assert biluo_tags == goldparse.ner
+ assert "TRAVEL" in goldparse.cats
+ assert "BAKING" in goldparse.cats
+ assert cats["TRAVEL"] == goldparse.cats["TRAVEL"]
+ assert cats["BAKING"] == goldparse.cats["BAKING"]
+
+
+@pytest.mark.skip(reason="skip while we have backwards-compatible alignment")
+@pytest.mark.parametrize(
+ "tokens_a,tokens_b,expected",
+ [
+ (["a", "b", "c"], ["ab", "c"], (3, [-1, -1, 1], [-1, 2], {0: 0, 1: 0}, {})),
+ (
+ ["a", "b", "``", "c"],
+ ['ab"', "c"],
+ (4, [-1, -1, -1, 1], [-1, 3], {0: 0, 1: 0, 2: 0}, {}),
+ ),
+ (["a", "bc"], ["ab", "c"], (4, [-1, -1], [-1, -1], {0: 0}, {1: 1})),
+ (
+ ["ab", "c", "d"],
+ ["a", "b", "cd"],
+ (6, [-1, -1, -1], [-1, -1, -1], {1: 2, 2: 2}, {0: 0, 1: 0}),
+ ),
+ (
+ ["a", "b", "cd"],
+ ["a", "b", "c", "d"],
+ (3, [0, 1, -1], [0, 1, -1, -1], {}, {2: 2, 3: 2}),
+ ),
+ ([" ", "a"], ["a"], (1, [-1, 0], [1], {}, {})),
+ ],
+)
+def test_align(tokens_a, tokens_b, expected):
+ cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_a, tokens_b)
+ assert (cost, list(a2b), list(b2a), a2b_multi, b2a_multi) == expected
+ # check symmetry
+ cost, a2b, b2a, a2b_multi, b2a_multi = align(tokens_b, tokens_a)
+ assert (cost, list(b2a), list(a2b), b2a_multi, a2b_multi) == expected
+
+
+def test_goldparse_startswith_space(en_tokenizer):
+ text = " a"
+ doc = en_tokenizer(text)
+ g = GoldParse(doc, words=["a"], entities=["U-DATE"], deps=["ROOT"], heads=[0])
+ assert g.words == [" ", "a"]
+ assert g.ner == [None, "U-DATE"]
+ assert g.labels == [None, "ROOT"]
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index 94c37d4ab..7106cef74 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -1,11 +1,16 @@
# coding: utf-8
from __future__ import unicode_literals
+import itertools
+
import pytest
-from spacy.vocab import Vocab
-from spacy.language import Language
-from spacy.tokens import Doc
+from spacy.compat import is_python2
from spacy.gold import GoldParse
+from spacy.language import Language
+from spacy.tokens import Doc, Span
+from spacy.vocab import Vocab
+
+from .util import add_vecs_to_vocab, assert_docs_equal
@pytest.fixture
@@ -58,3 +63,88 @@ def test_language_evaluate(nlp):
# Evaluate badly
with pytest.raises(Exception):
nlp.evaluate([text, gold])
+
+
+def test_evaluate_no_pipe(nlp):
+ """Test that docs are processed correctly within Language.pipe if the
+ component doesn't expose a .pipe method."""
+
+ def pipe(doc):
+ return doc
+
+ text = "hello world"
+ annots = {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}
+ nlp = Language(Vocab())
+ nlp.add_pipe(pipe)
+ nlp.evaluate([(text, annots)])
+
+
+def vector_modification_pipe(doc):
+ doc.vector += 1
+ return doc
+
+
+def userdata_pipe(doc):
+ doc.user_data["foo"] = "bar"
+ return doc
+
+
+def ner_pipe(doc):
+ span = Span(doc, 0, 1, label="FIRST")
+ doc.ents += (span,)
+ return doc
+
+
+@pytest.fixture
+def sample_vectors():
+ return [
+ ("spacy", [-0.1, -0.2, -0.3]),
+ ("world", [-0.2, -0.3, -0.4]),
+ ("pipe", [0.7, 0.8, 0.9]),
+ ]
+
+
+@pytest.fixture
+def nlp2(nlp, sample_vectors):
+ add_vecs_to_vocab(nlp.vocab, sample_vectors)
+ nlp.add_pipe(vector_modification_pipe)
+ nlp.add_pipe(ner_pipe)
+ nlp.add_pipe(userdata_pipe)
+ return nlp
+
+
+@pytest.fixture
+def texts():
+ data = [
+ "Hello world.",
+ "This is spacy.",
+ "You can use multiprocessing with pipe method.",
+ "Please try!",
+ ]
+ return data
+
+
+@pytest.mark.parametrize("n_process", [1, 2])
+def test_language_pipe(nlp2, n_process, texts):
+ texts = texts * 10
+ expecteds = [nlp2(text) for text in texts]
+ docs = nlp2.pipe(texts, n_process=n_process, batch_size=2)
+
+ for doc, expected_doc in zip(docs, expecteds):
+ assert_docs_equal(doc, expected_doc)
+
+
+@pytest.mark.skipif(
+ is_python2, reason="python2 seems to be unable to handle iterator properly"
+)
+@pytest.mark.parametrize("n_process", [1, 2])
+def test_language_pipe_stream(nlp2, n_process, texts):
+ # check if nlp.pipe can handle infinite length iterator properly.
+ stream_texts = itertools.cycle(texts)
+ texts0, texts1 = itertools.tee(stream_texts)
+ expecteds = (nlp2(text) for text in texts0)
+ docs = nlp2.pipe(texts1, n_process=n_process, batch_size=2)
+
+ n_fetch = 20
+ for doc, expected_doc in itertools.islice(zip(docs, expecteds), n_fetch):
+ assert_docs_equal(doc, expected_doc)
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index a033b6dd0..4075ccf64 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -95,12 +95,18 @@ def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
def test_prefer_gpu():
- assert not prefer_gpu()
+ try:
+ import cupy # noqa: F401
+ except ImportError:
+ assert not prefer_gpu()
def test_require_gpu():
- with pytest.raises(ValueError):
- require_gpu()
+ try:
+ import cupy # noqa: F401
+ except ImportError:
+ with pytest.raises(ValueError):
+ require_gpu()
def test_create_symlink_windows(
diff --git a/spacy/tests/test_scorer.py b/spacy/tests/test_scorer.py
index 9cc4f75b2..c59358a6b 100644
--- a/spacy/tests/test_scorer.py
+++ b/spacy/tests/test_scorer.py
@@ -9,6 +9,14 @@ from spacy.scorer import Scorer, ROCAUCScore
from spacy.scorer import _roc_auc_score, _roc_curve
from .util import get_doc
+test_las_apple = [
+ [
+ "Apple is looking at buying U.K. startup for $ 1 billion",
+ {"heads": [2, 2, 2, 2, 3, 6, 4, 4, 10, 10, 7],
+ "deps": ['nsubj', 'aux', 'ROOT', 'prep', 'pcomp', 'compound', 'dobj', 'prep', 'quantmod', 'compound', 'pobj']},
+ ]
+]
+
test_ner_cardinal = [
["100 - 200", {"entities": [[0, 3, "CARDINAL"], [6, 9, "CARDINAL"]]}]
]
@@ -21,6 +29,53 @@ test_ner_apple = [
]
+def test_las_per_type(en_vocab):
+ # Gold and Doc are identical
+ scorer = Scorer()
+ for input_, annot in test_las_apple:
+ doc = get_doc(
+ en_vocab,
+ words=input_.split(" "),
+ heads=([h - i for i, h in enumerate(annot["heads"])]),
+ deps=annot["deps"],
+ )
+ gold = GoldParse(doc, heads=annot["heads"], deps=annot["deps"])
+ scorer.score(doc, gold)
+ results = scorer.scores
+
+ assert results["uas"] == 100
+ assert results["las"] == 100
+ assert results["las_per_type"]["nsubj"]["p"] == 100
+ assert results["las_per_type"]["nsubj"]["r"] == 100
+ assert results["las_per_type"]["nsubj"]["f"] == 100
+ assert results["las_per_type"]["compound"]["p"] == 100
+ assert results["las_per_type"]["compound"]["r"] == 100
+ assert results["las_per_type"]["compound"]["f"] == 100
+
+ # One dep is incorrect in Doc
+ scorer = Scorer()
+ for input_, annot in test_las_apple:
+ doc = get_doc(
+ en_vocab,
+ words=input_.split(" "),
+ heads=([h - i for i, h in enumerate(annot["heads"])]),
+ deps=annot["deps"]
+ )
+ gold = GoldParse(doc, heads=annot["heads"], deps=annot["deps"])
+ doc[0].dep_ = "compound"
+ scorer.score(doc, gold)
+ results = scorer.scores
+
+ assert results["uas"] == 100
+ assert_almost_equal(results["las"], 90.9090909)
+ assert results["las_per_type"]["nsubj"]["p"] == 0
+ assert results["las_per_type"]["nsubj"]["r"] == 0
+ assert results["las_per_type"]["nsubj"]["f"] == 0
+ assert_almost_equal(results["las_per_type"]["compound"]["p"], 66.6666666)
+ assert results["las_per_type"]["compound"]["r"] == 100
+ assert results["las_per_type"]["compound"]["f"] == 80
+
+
def test_ner_per_type(en_vocab):
# Gold and Doc are identical
scorer = Scorer()
diff --git a/spacy/tests/test_tok2vec.py b/spacy/tests/test_tok2vec.py
new file mode 100644
index 000000000..ddaa71059
--- /dev/null
+++ b/spacy/tests/test_tok2vec.py
@@ -0,0 +1,66 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import pytest
+
+from spacy._ml import Tok2Vec
+from spacy.vocab import Vocab
+from spacy.tokens import Doc
+from spacy.compat import unicode_
+
+
+def get_batch(batch_size):
+ vocab = Vocab()
+ docs = []
+ start = 0
+ for size in range(1, batch_size + 1):
+ # Make the words numbers, so that they're distinct
+ # across the batch, and easy to track.
+ numbers = [unicode_(i) for i in range(start, start + size)]
+ docs.append(Doc(vocab, words=numbers))
+ start += size
+ return docs
+
+
+# This fails in Thinc v7.3.1. Need to push patch
+@pytest.mark.xfail
+def test_empty_doc():
+ width = 128
+ embed_size = 2000
+ vocab = Vocab()
+ doc = Doc(vocab, words=[])
+ tok2vec = Tok2Vec(width, embed_size)
+ vectors, backprop = tok2vec.begin_update([doc])
+ assert len(vectors) == 1
+ assert vectors[0].shape == (0, width)
+
+
+@pytest.mark.parametrize(
+ "batch_size,width,embed_size", [[1, 128, 2000], [2, 128, 2000], [3, 8, 63]]
+)
+def test_tok2vec_batch_sizes(batch_size, width, embed_size):
+ batch = get_batch(batch_size)
+ tok2vec = Tok2Vec(width, embed_size)
+ vectors, backprop = tok2vec.begin_update(batch)
+ assert len(vectors) == len(batch)
+ for doc_vec, doc in zip(vectors, batch):
+ assert doc_vec.shape == (len(doc), width)
+
+
+@pytest.mark.parametrize(
+ "tok2vec_config",
+ [
+ {"width": 8, "embed_size": 100, "char_embed": False},
+ {"width": 8, "embed_size": 100, "char_embed": True},
+ {"width": 8, "embed_size": 100, "conv_depth": 6},
+ {"width": 8, "embed_size": 100, "conv_depth": 6},
+ {"width": 8, "embed_size": 100, "subword_features": False},
+ ],
+)
+def test_tok2vec_configs(tok2vec_config):
+ docs = get_batch(3)
+ tok2vec = Tok2Vec(**tok2vec_config)
+ vectors, backprop = tok2vec.begin_update(docs)
+ assert len(vectors) == len(docs)
+ assert vectors[0].shape == (len(docs[0]), tok2vec_config["width"])
+ backprop(vectors)
diff --git a/spacy/tests/tokenizer/test_urls.py b/spacy/tests/tokenizer/test_urls.py
index bf59ae4d7..21e1819b7 100644
--- a/spacy/tests/tokenizer/test_urls.py
+++ b/spacy/tests/tokenizer/test_urls.py
@@ -48,12 +48,15 @@ URLS_SHOULD_MATCH = [
"http://a.b--c.de/", # this is a legit domain name see: https://gist.github.com/dperini/729294 comment on 9/9/2014
"ssh://login@server.com:12345/repository.git",
"svn+ssh://user@ssh.yourdomain.com/path",
- pytest.param("chrome://extensions/?id=mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail()),
- pytest.param("chrome-extension://mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail()),
- pytest.param("http://foo.com/blah_blah_(wikipedia)", marks=pytest.mark.xfail()),
pytest.param(
- "http://foo.com/blah_blah_(wikipedia)_(again)", marks=pytest.mark.xfail()
+ "chrome://extensions/?id=mhjfbmdgcfjbbpaeojofohoefgiehjai",
+ marks=pytest.mark.xfail(),
),
+ pytest.param(
+ "chrome-extension://mhjfbmdgcfjbbpaeojofohoefgiehjai", marks=pytest.mark.xfail()
+ ),
+ "http://foo.com/blah_blah_(wikipedia)",
+ "http://foo.com/blah_blah_(wikipedia)_(again)",
pytest.param("http://⌘.ws", marks=pytest.mark.xfail()),
pytest.param("http://⌘.ws/", marks=pytest.mark.xfail()),
pytest.param("http://☺.damowmow.com/", marks=pytest.mark.xfail()),
@@ -100,8 +103,8 @@ URLS_SHOULD_NOT_MATCH = [
"NASDAQ:GOOG",
"http://-a.b.co",
pytest.param("foo.com", marks=pytest.mark.xfail()),
- pytest.param("http://1.1.1.1.1", marks=pytest.mark.xfail()),
- pytest.param("http://www.foo.bar./", marks=pytest.mark.xfail()),
+ "http://1.1.1.1.1",
+ "http://www.foo.bar./",
]
diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py
index 4b2e171a6..b688ab9dd 100644
--- a/spacy/tests/vocab_vectors/test_vectors.py
+++ b/spacy/tests/vocab_vectors/test_vectors.py
@@ -51,6 +51,14 @@ def data():
return numpy.asarray([[0.0, 1.0, 2.0], [3.0, -2.0, 4.0]], dtype="f")
+@pytest.fixture
+def most_similar_vectors_data():
+ return numpy.asarray(
+ [[0.0, 1.0, 2.0], [1.0, -2.0, 4.0], [1.0, 1.0, -1.0], [2.0, 3.0, 1.0]],
+ dtype="f",
+ )
+
+
@pytest.fixture
def resize_data():
return numpy.asarray([[0.0, 1.0], [2.0, 3.0]], dtype="f")
@@ -127,6 +135,24 @@ def test_set_vector(strings, data):
assert list(v[strings[0]]) != list(orig[0])
+def test_vectors_most_similar(most_similar_vectors_data):
+ v = Vectors(data=most_similar_vectors_data)
+ _, best_rows, _ = v.most_similar(v.data, batch_size=2, n=2, sort=True)
+ assert all(row[0] == i for i, row in enumerate(best_rows))
+
+
+def test_vectors_most_similar_identical():
+ """Test that most similar identical vectors are assigned a score of 1.0."""
+ data = numpy.asarray([[4, 2, 2, 2], [4, 2, 2, 2], [1, 1, 1, 1]], dtype="f")
+ v = Vectors(data=data, keys=["A", "B", "C"])
+ keys, _, scores = v.most_similar(numpy.asarray([[4, 2, 2, 2]], dtype="f"))
+ assert scores[0][0] == 1.0 # not 1.0000002
+ data = numpy.asarray([[1, 2, 3], [1, 2, 3], [1, 1, 1]], dtype="f")
+ v = Vectors(data=data, keys=["A", "B", "C"])
+ keys, _, scores = v.most_similar(numpy.asarray([[1, 2, 3]], dtype="f"))
+ assert scores[0][0] == 1.0 # not 0.9999999
+
+
@pytest.mark.parametrize("text", ["apple and orange"])
def test_vectors_token_vector(tokenizer_v, vectors, text):
doc = tokenizer_v(text)
@@ -284,8 +310,8 @@ def test_vocab_prune_vectors():
vocab.set_vector("dog", data[1])
vocab.set_vector("kitten", data[2])
- remap = vocab.prune_vectors(2)
+ remap = vocab.prune_vectors(2, batch_size=2)
assert list(remap.keys()) == ["kitten"]
neighbour, similarity = list(remap.values())[0]
assert neighbour == "cat", remap
- assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-6)
+ assert_allclose(similarity, cosine(data[0], data[2]), atol=1e-4, rtol=1e-3)
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index cdfa55dcb..b39bb1ecb 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -331,6 +331,9 @@ cdef class Tokenizer:
cdef int _save_cached(self, const TokenC* tokens, hash_t key,
int has_special, int n) except -1:
cdef int i
+ if n <= 0:
+ # avoid mem alloc of zero length
+ return 0
for i in range(n):
if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL:
return 0
diff --git a/spacy/tokens/_retokenize.pyx b/spacy/tokens/_retokenize.pyx
index f8b13dd78..a5d06491a 100644
--- a/spacy/tokens/_retokenize.pyx
+++ b/spacy/tokens/_retokenize.pyx
@@ -157,6 +157,9 @@ def _merge(Doc doc, merges):
cdef TokenC* token
cdef Pool mem = Pool()
cdef int merged_iob = 0
+
+ # merges should not be empty, but make sure to avoid zero-length mem alloc
+ assert len(merges) > 0
tokens = mem.alloc(len(merges), sizeof(TokenC))
spans = []
@@ -326,7 +329,7 @@ def _split(Doc doc, int token_index, orths, heads, attrs):
doc.c[i].head += offset
# Double doc.c max_length if necessary (until big enough for all new tokens)
while doc.length + nb_subtokens - 1 >= doc.max_length:
- doc._realloc(doc.length * 2)
+ doc._realloc(doc.max_length * 2)
# Move tokens after the split to create space for the new tokens
doc.length = len(doc) + nb_subtokens -1
to_process_tensor = (doc.tensor is not None and doc.tensor.size != 0)
diff --git a/spacy/tokens/_serialize.py b/spacy/tokens/_serialize.py
index 67ad9a21a..18cb8a234 100644
--- a/spacy/tokens/_serialize.py
+++ b/spacy/tokens/_serialize.py
@@ -103,7 +103,8 @@ class DocBin(object):
doc = Doc(vocab, words=words, spaces=spaces)
doc = doc.from_array(self.attrs, tokens)
if self.store_user_data:
- doc.user_data.update(srsly.msgpack_loads(self.user_data[i]))
+ user_data = srsly.msgpack_loads(self.user_data[i], use_list=False)
+ doc.user_data.update(user_data)
yield doc
def merge(self, other):
@@ -155,9 +156,9 @@ class DocBin(object):
msg = srsly.msgpack_loads(zlib.decompress(bytes_data))
self.attrs = msg["attrs"]
self.strings = set(msg["strings"])
- lengths = numpy.fromstring(msg["lengths"], dtype="int32")
- flat_spaces = numpy.fromstring(msg["spaces"], dtype=bool)
- flat_tokens = numpy.fromstring(msg["tokens"], dtype="uint64")
+ lengths = numpy.frombuffer(msg["lengths"], dtype="int32")
+ flat_spaces = numpy.frombuffer(msg["spaces"], dtype=bool)
+ flat_tokens = numpy.frombuffer(msg["tokens"], dtype="uint64")
shape = (flat_tokens.size // len(self.attrs), len(self.attrs))
flat_tokens = flat_tokens.reshape(shape)
flat_spaces = flat_spaces.reshape((flat_spaces.size, 1))
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 80a808bae..6afe89e05 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -791,6 +791,8 @@ cdef class Doc:
# Get set up for fast loading
cdef Pool mem = Pool()
cdef int n_attrs = len(attrs)
+ # attrs should not be empty, but make sure to avoid zero-length mem alloc
+ assert n_attrs > 0
attr_ids = mem.alloc(n_attrs, sizeof(attr_id_t))
for i, attr_id in enumerate(attrs):
attr_ids[i] = attr_id
diff --git a/spacy/util.py b/spacy/util.py
index b4c4d3fab..b505a0e01 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -2,7 +2,6 @@
from __future__ import unicode_literals, print_function
import os
-import pkg_resources
import importlib
import re
from pathlib import Path
@@ -14,6 +13,7 @@ import functools
import itertools
import numpy.random
import srsly
+import catalogue
import sys
try:
@@ -32,18 +32,16 @@ from .compat import import_file
from .errors import Errors, Warnings, deprecation_warning
-LANGUAGES = {}
_data_path = Path(__file__).parent / "data"
_PRINT_ENV = False
-class ENTRY_POINTS(object):
- """Available entry points to register extensions."""
-
- factories = "spacy_factories"
- languages = "spacy_languages"
- displacy_colors = "spacy_displacy_colors"
- lookups = "spacy_lookups"
+class registry(object):
+ languages = catalogue.create("spacy", "languages", entry_points=True)
+ architectures = catalogue.create("spacy", "architectures", entry_points=True)
+ lookups = catalogue.create("spacy", "lookups", entry_points=True)
+ factories = catalogue.create("spacy", "factories", entry_points=True)
+ displacy_colors = catalogue.create("spacy", "displacy_colors", entry_points=True)
def set_env_log(value):
@@ -59,8 +57,7 @@ def lang_class_is_loaded(lang):
lang (unicode): Two-letter language code, e.g. 'en'.
RETURNS (bool): Whether a Language class has been loaded.
"""
- global LANGUAGES
- return lang in LANGUAGES
+ return lang in registry.languages
def get_lang_class(lang):
@@ -69,19 +66,16 @@ def get_lang_class(lang):
lang (unicode): Two-letter language code, e.g. 'en'.
RETURNS (Language): Language class.
"""
- global LANGUAGES
- # Check if an entry point is exposed for the language code
- entry_point = get_entry_point(ENTRY_POINTS.languages, lang)
- if entry_point is not None:
- LANGUAGES[lang] = entry_point
- return entry_point
- if lang not in LANGUAGES:
+ # Check if language is registered / entry point is available
+ if lang in registry.languages:
+ return registry.languages.get(lang)
+ else:
try:
module = importlib.import_module(".lang.%s" % lang, "spacy")
except ImportError as err:
raise ImportError(Errors.E048.format(lang=lang, err=err))
- LANGUAGES[lang] = getattr(module, module.__all__[0])
- return LANGUAGES[lang]
+ set_lang_class(lang, getattr(module, module.__all__[0]))
+ return registry.languages.get(lang)
def set_lang_class(name, cls):
@@ -90,8 +84,7 @@ def set_lang_class(name, cls):
name (unicode): Name of Language class.
cls (Language): Language class.
"""
- global LANGUAGES
- LANGUAGES[name] = cls
+ registry.languages.register(name, func=cls)
def get_data_path(require_exists=True):
@@ -115,6 +108,11 @@ def set_data_path(path):
_data_path = ensure_path(path)
+def make_layer(arch_config):
+ arch_func = registry.architectures.get(arch_config["arch"])
+ return arch_func(arch_config["config"])
+
+
def ensure_path(path):
"""Ensure string is converted to a Path.
@@ -198,6 +196,7 @@ def load_model_from_path(model_path, meta=False, **overrides):
cls = get_lang_class(lang)
nlp = cls(meta=meta, **overrides)
pipeline = meta.get("pipeline", [])
+ factories = meta.get("factories", {})
disable = overrides.get("disable", [])
if pipeline is True:
pipeline = nlp.Defaults.pipe_names
@@ -206,7 +205,8 @@ def load_model_from_path(model_path, meta=False, **overrides):
for name in pipeline:
if name not in disable:
config = meta.get("pipeline_args", {}).get(name, {})
- component = nlp.create_pipe(name, config=config)
+ factory = factories.get(name, name)
+ component = nlp.create_pipe(factory, config=config)
nlp.add_pipe(component, name=name)
return nlp.from_disk(model_path)
@@ -253,6 +253,8 @@ def is_package(name):
name (unicode): Name of package.
RETURNS (bool): True if installed package, False if not.
"""
+ import pkg_resources
+
name = name.lower() # compare package name against lowercase name
packages = pkg_resources.working_set.by_key.keys()
for package in packages:
@@ -274,34 +276,6 @@ def get_package_path(name):
return Path(pkg.__file__).parent
-def get_entry_points(key):
- """Get registered entry points from other packages for a given key, e.g.
- 'spacy_factories' and return them as a dictionary, keyed by name.
-
- key (unicode): Entry point name.
- RETURNS (dict): Entry points, keyed by name.
- """
- result = {}
- for entry_point in pkg_resources.iter_entry_points(key):
- result[entry_point.name] = entry_point.load()
- return result
-
-
-def get_entry_point(key, value, default=None):
- """Check if registered entry point is available for a given name and
- load it. Otherwise, return None.
-
- key (unicode): Entry point name.
- value (unicode): Name of entry point to load.
- default: Optional default value to return.
- RETURNS: The loaded entry point or None.
- """
- for entry_point in pkg_resources.iter_entry_points(key):
- if entry_point.name == value:
- return entry_point.load()
- return default
-
-
def is_in_jupyter():
"""Check if user is running spaCy from a Jupyter notebook by detecting the
IPython kernel. Mainly used for the displaCy visualizer.
@@ -317,6 +291,16 @@ def is_in_jupyter():
return False
+def get_component_name(component):
+ if hasattr(component, "name"):
+ return component.name
+ if hasattr(component, "__name__"):
+ return component.__name__
+ if hasattr(component, "__class__") and hasattr(component.__class__, "__name__"):
+ return component.__class__.__name__
+ return repr(component)
+
+
def get_cuda_stream(require=False, non_blocking=True):
if CudaStream is None:
return None
@@ -358,7 +342,7 @@ def env_opt(name, default=None):
def read_regex(path):
path = ensure_path(path)
- with path.open() as file_:
+ with path.open(encoding="utf8") as file_:
entries = file_.read().split("\n")
expression = "|".join(
["^" + re.escape(piece) for piece in entries if piece.strip()]
@@ -620,7 +604,7 @@ def filter_spans(spans):
spans (iterable): The spans to filter.
RETURNS (list): The filtered spans.
"""
- get_sort_key = lambda span: (span.end - span.start, span.start)
+ get_sort_key = lambda span: (span.end - span.start, -span.start)
sorted_spans = sorted(spans, key=get_sort_key, reverse=True)
result = []
seen_tokens = set()
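The last hunk above changes the `filter_spans` sort key to `(length, -start)`, so ties between equal-length overlapping spans now go to the span that starts earlier. A small sketch of that behaviour as I read the new key (the toy doc and spans are illustrative only):

```python
from spacy.lang.en import English
from spacy.util import filter_spans

nlp = English()
doc = nlp("a b c d")
spans = [doc[1:3], doc[0:2]]  # same length, overlapping
# With the new tie-breaking, the earlier span "a b" is kept.
assert [span.text for span in filter_spans(spans)] == ["a b"]
```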
diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index 75716617c..44dddb30c 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -321,14 +321,18 @@ cdef class Vectors:
"""
xp = get_array_module(self.data)
- vectors = self.data / xp.linalg.norm(self.data, axis=1, keepdims=True)
+ norms = xp.linalg.norm(self.data, axis=1, keepdims=True)
+ norms[norms == 0] = 1
+ vectors = self.data / norms
best_rows = xp.zeros((queries.shape[0], n), dtype='i')
scores = xp.zeros((queries.shape[0], n), dtype='f')
# Work in batches, to avoid memory problems.
for i in range(0, queries.shape[0], batch_size):
batch = queries[i : i+batch_size]
- batch /= xp.linalg.norm(batch, axis=1, keepdims=True)
+ batch_norms = xp.linalg.norm(batch, axis=1, keepdims=True)
+ batch_norms[batch_norms == 0] = 1
+ batch /= batch_norms
# batch e.g. (1024, 300)
# vectors e.g. (10000, 300)
# sims e.g. (1024, 10000)
@@ -336,12 +340,16 @@ cdef class Vectors:
best_rows[i:i+batch_size] = xp.argpartition(sims, -n, axis=1)[:,-n:]
scores[i:i+batch_size] = xp.partition(sims, -n, axis=1)[:,-n:]
- if sort:
- sorted_index = xp.arange(scores.shape[0])[:,None],xp.argsort(scores, axis=1)[:,::-1]
+ if sort and n >= 2:
+ sorted_index = xp.arange(scores.shape[0])[:,None][i:i+batch_size],xp.argsort(scores[i:i+batch_size], axis=1)[:,::-1]
scores[i:i+batch_size] = scores[sorted_index]
best_rows[i:i+batch_size] = best_rows[sorted_index]
-
+
xp = get_array_module(self.data)
+ # Round values really close to 1 or -1
+ scores = xp.around(scores, decimals=4, out=scores)
+ # Account for numerical error: we want to return scores in the range [-1, 1]
+ scores = xp.clip(scores, a_min=-1, a_max=1, out=scores)
row2key = {row: key for key, row in self.key2row.items()}
keys = xp.asarray(
[[row2key[row] for row in best_rows[i] if row in row2key]
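The `most_similar` changes above guard against zero-norm rows and queries and clip the returned scores into `[-1, 1]`. A short sketch that mirrors the new regression tests earlier in this diff:

```python
import numpy
from spacy.vectors import Vectors

data = numpy.asarray([[0, 0, 0], [1, 2, 3], [9, 8, 7]], dtype="f")
vectors = Vectors(data=data)
# Query with one zero vector: this used to produce NaN scores.
keys, best_rows, scores = vectors.most_similar(
    numpy.asarray([[9, 8, 7], [0, 0, 0]], dtype="f")
)
assert best_rows[0] == 2            # [9, 8, 7] is most similar to its own row
assert float(scores[0][0]) <= 1.0   # scores are clipped, never > 1.0
```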
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 67317a9ac..4a21537cb 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -336,7 +336,15 @@ cdef class Vocab:
"""Retrieve a vector for a word in the vocabulary. Words can be looked
up by string or int ID. If no vectors data is loaded, ValueError is
raised.
+
+ If `minn` is defined, then the resulting vector uses fastText's
+ subword features, averaging over the character n-grams of `orth`.
+ orth (int / unicode): The hash value of a word, or its unicode string.
+ minn (int): Minimum n-gram length used for fastText's n-gram computation.
+ Defaults to the length of `orth`.
+ maxn (int): Maximum n-gram length used for fastText's n-gram computation.
+ Defaults to the length of `orth`.
RETURNS (numpy.ndarray): A word vector. Size
and shape determined by the `vocab.vectors` instance. Usually, a
numpy ndarray of shape (300,) and dtype float32.
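The docstring above documents the new `minn`/`maxn` arguments to `Vocab.get_vector`. A hypothetical usage sketch only: the package name `xx_vectors_ft` is made up, and it assumes a vocab whose vectors table also contains character n-gram keys, as a fastText-style export would:

```python
import spacy

nlp = spacy.load("xx_vectors_ft")  # made-up package name, for illustration
# Average over the vectors of character n-grams of length 3 to 5.
vec = nlp.vocab.get_vector("bananenbrot", minn=3, maxn=5)
print(vec.shape)
```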
diff --git a/website/docs/api/annotation.md b/website/docs/api/annotation.md
index fac7e79b6..5ca5e91d9 100644
--- a/website/docs/api/annotation.md
+++ b/website/docs/api/annotation.md
@@ -48,14 +48,14 @@ be installed if needed via `pip install spacy[lookups]`. Some languages provide
full lemmatization rules and exceptions, while other languages currently only
rely on simple lookup tables.
-
+
-spaCy adds a **special case for pronouns**: all pronouns are lemmatized to the
-special token `-PRON-`. Unlike verbs and common nouns, there's no clear base
-form of a personal pronoun. Should the lemma of "me" be "I", or should we
-normalize person as well, giving "it" — or maybe "he"? spaCy's solution is to
-introduce a novel symbol, `-PRON-`, which is used as the lemma for all personal
-pronouns.
+spaCy adds a **special case for English pronouns**: all English pronouns are
+lemmatized to the special token `-PRON-`. Unlike verbs and common nouns,
+there's no clear base form of a personal pronoun. Should the lemma of "me" be
+"I", or should we normalize person as well, giving "it" — or maybe "he"?
+spaCy's solution is to introduce a novel symbol, `-PRON-`, which is used as the
+lemma for all personal pronouns.
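A minimal illustration of the English-only `-PRON-` behaviour described above; it assumes the `en_core_web_sm` package is installed, which is not part of this diff:

```python
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("She told me")
print([token.lemma_ for token in doc])
# In spaCy v2.x this prints ['-PRON-', 'tell', '-PRON-']
```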
@@ -117,76 +117,72 @@ type. They're available as the [`Token.pos`](/api/token#attributes) and
The English part-of-speech tagger uses the
[OntoNotes 5](https://catalog.ldc.upenn.edu/LDC2013T19) version of the Penn
-Treebank tag set. We also map the tags to the simpler Google Universal POS tag
-set.
-
-| Tag | POS | Morphology | Description |
-| ----------------------------------- | ------- | ---------------------------------------------- | ----------------------------------------- |
-| `-LRB-` | `PUNCT` | `PunctType=brck PunctSide=ini` | left round bracket |
-| `-RRB-` | `PUNCT` | `PunctType=brck PunctSide=fin` | right round bracket |
-| `,` | `PUNCT` | `PunctType=comm` | punctuation mark, comma |
-| `:` | `PUNCT` | | punctuation mark, colon or ellipsis |
-| `.` | `PUNCT` | `PunctType=peri` | punctuation mark, sentence closer |
-| `''` | `PUNCT` | `PunctType=quot PunctSide=fin` | closing quotation mark |
-| `""` | `PUNCT` | `PunctType=quot PunctSide=fin` | closing quotation mark |
-| `` | `PUNCT` | `PunctType=quot PunctSide=ini` | opening quotation mark |
-| `#` | `SYM` | `SymType=numbersign` | symbol, number sign |
-| `$` | `SYM` | `SymType=currency` | symbol, currency |
-| `ADD` | `X` | | email |
-| `AFX` | `ADJ` | `Hyph=yes` | affix |
-| `BES` | `VERB` | | auxiliary "be" |
-| `CC` | `CONJ` | `ConjType=coor` | conjunction, coordinating |
-| `CD` | `NUM` | `NumType=card` | cardinal number |
-| `DT` | `DET` | | determiner |
-| `EX` | `ADV` | `AdvType=ex` | existential there |
-| `FW` | `X` | `Foreign=yes` | foreign word |
-| `GW` | `X` | | additional word in multi-word expression |
-| `HVS` | `VERB` | | forms of "have" |
-| `HYPH` | `PUNCT` | `PunctType=dash` | punctuation mark, hyphen |
-| `IN` | `ADP` | | conjunction, subordinating or preposition |
-| `JJ` | `ADJ` | `Degree=pos` | adjective |
-| `JJR` | `ADJ` | `Degree=comp` | adjective, comparative |
-| `JJS` | `ADJ` | `Degree=sup` | adjective, superlative |
-| `LS` | `PUNCT` | `NumType=ord` | list item marker |
-| `MD` | `VERB` | `VerbType=mod` | verb, modal auxiliary |
-| `NFP` | `PUNCT` | | superfluous punctuation |
-| `NIL` | | | missing tag |
-| `NN` | `NOUN` | `Number=sing` | noun, singular or mass |
-| `NNP` | `PROPN` | `NounType=prop Number=sign` | noun, proper singular |
-| `NNPS` | `PROPN` | `NounType=prop Number=plur` | noun, proper plural |
-| `NNS` | `NOUN` | `Number=plur` | noun, plural |
-| `PDT` | `ADJ` | `AdjType=pdt PronType=prn` | predeterminer |
-| `POS` | `PART` | `Poss=yes` | possessive ending |
-| `PRP` | `PRON` | `PronType=prs` | pronoun, personal |
-| `PRP$` | `ADJ` | `PronType=prs Poss=yes` | pronoun, possessive |
-| `RB` | `ADV` | `Degree=pos` | adverb |
-| `RBR` | `ADV` | `Degree=comp` | adverb, comparative |
-| `RBS` | `ADV` | `Degree=sup` | adverb, superlative |
-| `RP` | `PART` | | adverb, particle |
-| `_SP` | `SPACE` | | space |
-| `SYM` | `SYM` | | symbol |
-| `TO` | `PART` | `PartType=inf VerbForm=inf` | infinitival "to" |
-| `UH` | `INTJ` | | interjection |
-| `VB` | `VERB` | `VerbForm=inf` | verb, base form |
-| `VBD` | `VERB` | `VerbForm=fin Tense=past` | verb, past tense |
-| `VBG` | `VERB` | `VerbForm=part Tense=pres Aspect=prog` | verb, gerund or present participle |
-| `VBN` | `VERB` | `VerbForm=part Tense=past Aspect=perf` | verb, past participle |
-| `VBP` | `VERB` | `VerbForm=fin Tense=pres` | verb, non-3rd person singular present |
-| `VBZ` | `VERB` | `VerbForm=fin Tense=pres Number=sing Person=3` | verb, 3rd person singular present |
-| `WDT` | `ADJ` | `PronType=int|rel` | wh-determiner |
-| `WP` | `NOUN` | `PronType=int|rel` | wh-pronoun, personal |
-| `WP$` | `ADJ` | `Poss=yes PronType=int|rel` | wh-pronoun, possessive |
-| `WRB` | `ADV` | `PronType=int|rel` | wh-adverb |
-| `XX` | `X` | | unknown |
+Treebank tag set. We also map the tags to the simpler Universal Dependencies v2
+POS tag set.
+
+| Tag | POS | Morphology | Description |
+| ------------------------------------- | ------- | --------------------------------------- | ----------------------------------------- |
+| `$` | `SYM` | | symbol, currency |
+| `` | `PUNCT` | `PunctType=quot PunctSide=ini` | opening quotation mark |
+| `''` | `PUNCT` | `PunctType=quot PunctSide=fin` | closing quotation mark |
+| `,` | `PUNCT` | `PunctType=comm` | punctuation mark, comma |
+| `-LRB-` | `PUNCT` | `PunctType=brck PunctSide=ini` | left round bracket |
+| `-RRB-` | `PUNCT` | `PunctType=brck PunctSide=fin` | right round bracket |
+| `.` | `PUNCT` | `PunctType=peri` | punctuation mark, sentence closer |
+| `:` | `PUNCT` | | punctuation mark, colon or ellipsis |
+| `ADD` | `X` | | email |
+| `AFX` | `ADJ` | `Hyph=yes` | affix |
+| `CC` | `CCONJ` | `ConjType=comp` | conjunction, coordinating |
+| `CD` | `NUM` | `NumType=card` | cardinal number |
+| `DT` | `DET` | | determiner |
+| `EX` | `PRON` | `AdvType=ex` | existential there |
+| `FW` | `X` | `Foreign=yes` | foreign word |
+| `GW` | `X` | | additional word in multi-word expression |
+| `HYPH` | `PUNCT` | `PunctType=dash` | punctuation mark, hyphen |
+| `IN` | `ADP` | | conjunction, subordinating or preposition |
+| `JJ` | `ADJ` | `Degree=pos` | adjective |
+| `JJR` | `ADJ` | `Degree=comp` | adjective, comparative |
+| `JJS` | `ADJ` | `Degree=sup` | adjective, superlative |
+| `LS` | `X` | `NumType=ord` | list item marker |
+| `MD` | `VERB` | `VerbType=mod` | verb, modal auxiliary |
+| `NFP` | `PUNCT` | | superfluous punctuation |
+| `NIL` | `X` | | missing tag |
+| `NN` | `NOUN` | `Number=sing` | noun, singular or mass |
+| `NNP` | `PROPN` | `NounType=prop Number=sing` | noun, proper singular |
+| `NNPS` | `PROPN` | `NounType=prop Number=plur` | noun, proper plural |
+| `NNS` | `NOUN` | `Number=plur` | noun, plural |
+| `PDT` | `DET` | | predeterminer |
+| `POS` | `PART` | `Poss=yes` | possessive ending |
+| `PRP` | `PRON` | `PronType=prs` | pronoun, personal |
+| `PRP$` | `DET` | `PronType=prs Poss=yes` | pronoun, possessive |
+| `RB` | `ADV` | `Degree=pos` | adverb |
+| `RBR` | `ADV` | `Degree=comp` | adverb, comparative |
+| `RBS` | `ADV` | `Degree=sup` | adverb, superlative |
+| `RP` | `ADP` | | adverb, particle |
+| `SP` | `SPACE` | | space |
+| `SYM` | `SYM` | | symbol |
+| `TO` | `PART` | `PartType=inf VerbForm=inf` | infinitival "to" |
+| `UH` | `INTJ` | | interjection |
+| `VB` | `VERB` | `VerbForm=inf` | verb, base form |
+| `VBD` | `VERB` | `VerbForm=fin Tense=past` | verb, past tense |
+| `VBG` | `VERB` | `VerbForm=part Tense=pres Aspect=prog` | verb, gerund or present participle |
+| `VBN` | `VERB` | `VerbForm=part Tense=past Aspect=perf` | verb, past participle |
+| `VBP` | `VERB` | `VerbForm=fin Tense=pres` | verb, non-3rd person singular present |
+| `VBZ` | `VERB` | `VerbForm=fin Tense=pres Number=sing Person=three` | verb, 3rd person singular present |
+| `WDT` | `DET` | | wh-determiner |
+| `WP` | `PRON` | | wh-pronoun, personal |
+| `WP$` | `DET` | `Poss=yes` | wh-pronoun, possessive |
+| `WRB` | `ADV` | | wh-adverb |
+| `XX` | `X` | | unknown |
+| `_SP` | `SPACE` | | |
The German part-of-speech tagger uses the
[TIGER Treebank](http://www.ims.uni-stuttgart.de/forschung/ressourcen/korpora/TIGERCorpus/annotation/index.html)
-annotation scheme. We also map the tags to the simpler Google Universal POS tag
-set.
+annotation scheme. We also map the tags to the simpler Universal Dependencies
+v2 POS tag set.
| Tag | POS | Morphology | Description |
| --------- | ------- | ---------------------------------------- | ------------------------------------------------- |
@@ -194,7 +190,7 @@ set.
| `$,` | `PUNCT` | `PunctType=comm` | comma |
| `$.` | `PUNCT` | `PunctType=peri` | sentence-final punctuation mark |
| `ADJA` | `ADJ` | | adjective, attributive |
-| `ADJD` | `ADJ` | `Variant=short` | adjective, adverbial or predicative |
+| `ADJD` | `ADJ` | | adjective, adverbial or predicative |
| `ADV` | `ADV` | | adverb |
| `APPO` | `ADP` | `AdpType=post` | postposition |
| `APPR` | `ADP` | `AdpType=prep` | preposition; circumposition left |
@@ -204,28 +200,28 @@ set.
| `CARD` | `NUM` | `NumType=card` | cardinal number |
| `FM` | `X` | `Foreign=yes` | foreign language material |
| `ITJ` | `INTJ` | | interjection |
-| `KOKOM` | `CONJ` | `ConjType=comp` | comparative conjunction |
-| `KON` | `CONJ` | | coordinate conjunction |
+| `KOKOM` | `CCONJ` | `ConjType=comp` | comparative conjunction |
+| `KON` | `CCONJ` | | coordinate conjunction |
| `KOUI` | `SCONJ` | | subordinate conjunction with "zu" and infinitive |
| `KOUS` | `SCONJ` | | subordinate conjunction with sentence |
| `NE` | `PROPN` | | proper noun |
-| `NNE` | `PROPN` | | proper noun |
| `NN` | `NOUN` | | noun, singular or mass |
-| `PROAV` | `ADV` | `PronType=dem` | pronominal adverb |
+| `NNE` | `PROPN` | | proper noun |
| `PDAT` | `DET` | `PronType=dem` | attributive demonstrative pronoun |
| `PDS` | `PRON` | `PronType=dem` | substituting demonstrative pronoun |
-| `PIAT` | `DET` | `PronType=ind\|neg\|tot` | attributive indefinite pronoun without determiner |
-| `PIS` | `PRON` | `PronType=ind\|neg\|tot` | substituting indefinite pronoun |
+| `PIAT` | `DET` | `PronType=ind|neg|tot` | attributive indefinite pronoun without determiner |
+| `PIS` | `PRON` | `PronType=ind|neg|tot` | substituting indefinite pronoun |
| `PPER` | `PRON` | `PronType=prs` | non-reflexive personal pronoun |
| `PPOSAT` | `DET` | `Poss=yes PronType=prs` | attributive possessive pronoun |
-| `PPOSS` | `PRON` | `PronType=rel` | substituting possessive pronoun |
+| `PPOSS` | `PRON` | `Poss=yes PronType=prs` | substituting possessive pronoun |
| `PRELAT` | `DET` | `PronType=rel` | attributive relative pronoun |
| `PRELS` | `PRON` | `PronType=rel` | substituting relative pronoun |
| `PRF` | `PRON` | `PronType=prs Reflex=yes` | reflexive personal pronoun |
+| `PROAV` | `ADV` | `PronType=dem` | pronominal adverb |
| `PTKA` | `PART` | | particle with adjective or adverb |
| `PTKANT` | `PART` | `PartType=res` | answer particle |
-| `PTKNEG` | `PART` | `Negative=yes` | negative particle |
-| `PTKVZ` | `PART` | `PartType=vbp` | separable verbal particle |
+| `PTKNEG` | `PART` | `Polarity=neg` | negative particle |
+| `PTKVZ` | `ADP` | `PartType=vbp` | separable verbal particle |
| `PTKZU` | `PART` | `PartType=inf` | "zu" before infinitive |
| `PWAT` | `DET` | `PronType=int` | attributive interrogative pronoun |
| `PWAV` | `ADV` | `PronType=int` | adverbial interrogative or relative pronoun |
@@ -234,9 +230,9 @@ set.
| `VAFIN` | `AUX` | `Mood=ind VerbForm=fin` | finite verb, auxiliary |
| `VAIMP` | `AUX` | `Mood=imp VerbForm=fin` | imperative, auxiliary |
| `VAINF` | `AUX` | `VerbForm=inf` | infinitive, auxiliary |
-| `VAPP` | `AUX` | `Aspect=perf VerbForm=fin` | perfect participle, auxiliary |
+| `VAPP` | `AUX` | `Aspect=perf VerbForm=part` | perfect participle, auxiliary |
| `VMFIN` | `VERB` | `Mood=ind VerbForm=fin VerbType=mod` | finite verb, modal |
-| `VMINF` | `VERB` | `VerbForm=fin VerbType=mod` | infinitive, modal |
+| `VMINF` | `VERB` | `VerbForm=inf VerbType=mod` | infinitive, modal |
| `VMPP` | `VERB` | `Aspect=perf VerbForm=part VerbType=mod` | perfect participle, modal |
| `VVFIN` | `VERB` | `Mood=ind VerbForm=fin` | finite verb, full |
| `VVIMP` | `VERB` | `Mood=imp VerbForm=fin` | imperative, full |
@@ -244,8 +240,7 @@ set.
| `VVIZU` | `VERB` | `VerbForm=inf` | infinitive with "zu", full |
| `VVPP` | `VERB` | `Aspect=perf VerbForm=part` | perfect participle, full |
| `XY` | `X` | | non-word containing non-letter |
-| `SP` | `SPACE` | | space |
-
+| `_SP` | `SPACE` | | |
---
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index aa28a14d1..a37921f3c 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -155,21 +155,14 @@ $ python -m spacy convert [input_file] [output_dir] [--file-type] [--converter]
### Output file types {new="2.1"}
-> #### Which format should I choose?
->
-> If you're not sure, go with the default `jsonl`. Newline-delimited JSON means
-> that there's one JSON object per line. Unlike a regular JSON file, it can also
-> be read in line-by-line and you won't have to parse the _entire file_ first.
-> This makes it a very convenient format for larger corpora.
-
All output files generated by this command are compatible with
[`spacy train`](/api/cli#train).
-| ID | Description |
-| ------- | --------------------------------- |
-| `jsonl` | Newline-delimited JSON (default). |
-| `json` | Regular JSON. |
-| `msg` | Binary MessagePack format. |
+| ID | Description |
+| ------- | -------------------------- |
+| `json` | Regular JSON (default). |
+| `jsonl` | Newline-delimited JSON. |
+| `msg` | Binary MessagePack format. |
### Converter options
@@ -453,8 +446,10 @@ improvement.
```bash
$ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir]
-[--width] [--depth] [--embed-rows] [--loss_func] [--dropout] [--batch-size] [--max-length] [--min-length]
-[--seed] [--n-iter] [--use-vectors] [--n-save_every] [--init-tok2vec] [--epoch-start]
+[--width] [--depth] [--cnn-window] [--cnn-pieces] [--use-chars] [--sa-depth]
+[--embed-rows] [--loss_func] [--dropout] [--batch-size] [--max-length]
+[--min-length] [--seed] [--n-iter] [--use-vectors] [--n-save_every]
+[--init-tok2vec] [--epoch-start]
```
| Argument | Type | Description |
@@ -464,6 +459,10 @@ $ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir]
| `output_dir` | positional | Directory to write models to on each epoch. |
| `--width`, `-cw` | option | Width of CNN layers. |
| `--depth`, `-cd` | option | Depth of CNN layers. |
+| `--cnn-window`, `-cW` 2.2.2 | option | Window size for CNN layers. |
+| `--cnn-pieces`, `-cP` 2.2.2 | option | Maxout size for CNN layers. `1` for [Mish](https://github.com/digantamisra98/Mish). |
+| `--use-chars`, `-chr` 2.2.2 | flag | Whether to use character-based embedding. |
+| `--sa-depth`, `-sa` 2.2.2 | option | Depth of self-attention layers. |
| `--embed-rows`, `-er` | option | Number of embedding rows. |
| `--loss-func`, `-L` | option | Loss function to use for the objective. Either `"L2"` or `"cosine"`. |
| `--dropout`, `-d` | option | Dropout rate. |
@@ -476,7 +475,7 @@ $ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir]
| `--n-save-every`, `-se` | option | Save model every X batches. |
| `--init-tok2vec`, `-t2v` 2.1 | option | Path to pretrained weights for the token-to-vector parts of the models. See `spacy pretrain`. Experimental. |
| `--epoch-start`, `-es` 2.1.5 | option | The epoch to start counting at. Only relevant when using `--init-tok2vec` and the given weight file has been renamed. Prevents unintended overwriting of existing weight files. |
-| **CREATES** | weights | The pretrained weights that can be used to initialize `spacy train`. |
+| **CREATES** | weights | The pretrained weights that can be used to initialize `spacy train`. |
### JSONL format for raw text {#pretrain-jsonl}
diff --git a/website/docs/api/docbin.md b/website/docs/api/docbin.md
index 41ebb6075..9f12a07e6 100644
--- a/website/docs/api/docbin.md
+++ b/website/docs/api/docbin.md
@@ -109,8 +109,8 @@ raise an error if the pre-defined attrs of the two `DocBin`s don't match.
> doc_bin1.add(nlp("Hello world"))
> doc_bin2 = DocBin(attrs=["LEMMA", "POS"])
> doc_bin2.add(nlp("This is a sentence"))
-> merged_bins = doc_bin1.merge(doc_bin2)
-> assert len(merged_bins) == 2
+> doc_bin1.merge(doc_bin2)
+> assert len(doc_bin1) == 2
> ```
| Argument | Type | Description |
diff --git a/website/docs/api/entityruler.md b/website/docs/api/entityruler.md
index 607cb28ce..af3db0dcb 100644
--- a/website/docs/api/entityruler.md
+++ b/website/docs/api/entityruler.md
@@ -202,6 +202,14 @@ All labels present in the match patterns.
| ----------- | ----- | ------------------ |
| **RETURNS** | tuple | The string labels. |
+## EntityRuler.ent_ids {#ent_ids tag="property" new="2.2.2"}
+
+All entity IDs present in the `id` properties of the match patterns.
+
+| Name        | Type  | Description            |
+| ----------- | ----- | ---------------------- |
+| **RETURNS** | tuple | The string entity IDs. |
+
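A short usage sketch for the new property, based on the pattern `id` support documented elsewhere in this changeset (the printed tuple shown in the comment is indicative only):

```python
from spacy.lang.en import English
from spacy.pipeline import EntityRuler

nlp = English()
ruler = EntityRuler(nlp)
ruler.add_patterns([
    {"label": "ORG", "pattern": "Apple", "id": "apple"},
    {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "fran"}], "id": "san-francisco"},
])
print(ruler.ent_ids)  # e.g. ("apple", "san-francisco")
```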
## EntityRuler.patterns {#patterns tag="property"}
Get all patterns that were added to the entity ruler.
diff --git a/website/docs/api/goldparse.md b/website/docs/api/goldparse.md
index 2dd24316f..1ef6f0362 100644
--- a/website/docs/api/goldparse.md
+++ b/website/docs/api/goldparse.md
@@ -62,7 +62,7 @@ Whether the provided syntactic annotations form a projective dependency tree.
Convert a list of Doc objects into the
[JSON-serializable format](/api/annotation#json-input) used by the
-[`spacy train`](/api/cli#train) command.
+[`spacy train`](/api/cli#train) command. Each input `Doc` will be treated as a
+'paragraph' in the output.
> #### Example
>
@@ -77,7 +77,7 @@ Convert a list of Doc objects into the
| ----------- | ---------------- | ------------------------------------------ |
| `docs` | iterable / `Doc` | The `Doc` object(s) to convert. |
| `id` | int | ID to assign to the JSON. Defaults to `0`. |
-| **RETURNS** | list | The data in spaCy's JSON format. |
+| **RETURNS** | dict | The data in spaCy's JSON format. |
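A minimal sketch of calling the function as documented in the table above, assuming an `nlp` pipeline is already loaded:

```python
from spacy.gold import docs_to_json

docs = [nlp("I like London."), nlp("Berlin is nice too.")]
json_data = docs_to_json(docs, id=0)
# A single dict comes back; each input Doc becomes one entry in its "paragraphs" list.
assert len(json_data["paragraphs"]) == 2
```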
### gold.align {#align tag="function"}
diff --git a/website/docs/api/language.md b/website/docs/api/language.md
index c44339ff5..6e7f6be3e 100644
--- a/website/docs/api/language.md
+++ b/website/docs/api/language.md
@@ -323,18 +323,38 @@ you can use to undo your changes.
> #### Example
>
> ```python
-> with nlp.disable_pipes('tagger', 'parser'):
+> # New API as of v2.2.2
+> with nlp.disable_pipes(["tagger", "parser"]):
+> nlp.begin_training()
+>
+> with nlp.disable_pipes("tagger", "parser"):
> nlp.begin_training()
>
-> disabled = nlp.disable_pipes('tagger', 'parser')
+> disabled = nlp.disable_pipes("tagger", "parser")
> nlp.begin_training()
> disabled.restore()
> ```
-| Name | Type | Description |
-| ----------- | --------------- | ------------------------------------------------------------------------------------ |
-| `*disabled` | unicode | Names of pipeline components to disable. |
-| **RETURNS** | `DisabledPipes` | The disabled pipes that can be restored by calling the object's `.restore()` method. |
+| Name | Type | Description |
+| ----------------------------------------- | --------------- | ------------------------------------------------------------------------------------ |
+| `disabled` 2.2.2 | list | Names of pipeline components to disable. |
+| `*disabled` | unicode | Names of pipeline components to disable. |
+| **RETURNS** | `DisabledPipes` | The disabled pipes that can be restored by calling the object's `.restore()` method. |
+
+
+
+As of spaCy v2.2.2, the `Language.disable_pipes` method can also take a list of
+component names as its first argument (instead of a variable number of
+arguments). This is especially useful if you're generating the component names
+to disable programmatically. The new syntax will become the default in the
+future.
+
+```diff
+- disabled = nlp.disable_pipes("tagger", "parser")
++ disabled = nlp.disable_pipes(["tagger", "parser"])
+```
+
+
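For instance, a sketch of the programmatic case the note above mentions, disabling everything except the tagger on an already-loaded `nlp` object:

```python
# Build the list of component names dynamically, then pass it as a single argument
other_pipes = [name for name in nlp.pipe_names if name != "tagger"]
with nlp.disable_pipes(other_pipes):  # list form, spaCy v2.2.2+
    nlp.begin_training()
```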
## Language.to_disk {#to_disk tag="method" new="2"}
diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md
index 7570e4ea2..f43e17fd3 100644
--- a/website/docs/api/lemmatizer.md
+++ b/website/docs/api/lemmatizer.md
@@ -54,7 +54,7 @@ Lemmatize a string.
> ```python
> from spacy.lemmatizer import Lemmatizer
> from spacy.lookups import Lookups
-> lookups = Loookups()
+> lookups = Lookups()
> lookups.add_table("lemma_rules", {"noun": [["s", ""]]})
> lemmatizer = Lemmatizer(lookups)
> lemmas = lemmatizer("ducks", "NOUN")
diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md
index 84d9ed888..bfd4fb0ec 100644
--- a/website/docs/api/matcher.md
+++ b/website/docs/api/matcher.md
@@ -157,16 +157,19 @@ overwritten.
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
| `*patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |
-
+
-As of spaCy 2.0, `Matcher.add_pattern` and `Matcher.add_entity` are deprecated
-and have been replaced with a simpler [`Matcher.add`](/api/matcher#add) that
-lets you add a list of patterns and a callback for a given match ID.
+As of spaCy 2.2.2, `Matcher.add` also supports the new API, which will become
+the default in the future. The patterns are now the second argument and a list
+(instead of a variable number of arguments). The `on_match` callback becomes an
+optional keyword argument.
```diff
-- matcher.add_entity("GoogleNow", on_match=merge_phrases)
-- matcher.add_pattern("GoogleNow", [{ORTH: "Google"}, {ORTH: "Now"}])
-+ matcher.add('GoogleNow', merge_phrases, [{"ORTH": "Google"}, {"ORTH": "Now"}])
+patterns = [[{"TEXT": "Google"}, {"TEXT": "Now"}], [{"TEXT": "GoogleNow"}]]
+- matcher.add("GoogleNow", None, *patterns)
++ matcher.add("GoogleNow", patterns)
+- matcher.add("GoogleNow", on_match, *patterns)
++ matcher.add("GoogleNow", patterns, on_match=on_match)
```
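Put together as a runnable sketch of the new call signature (a blank English pipeline is assumed here just to keep the example self-contained):

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
patterns = [[{"TEXT": "Google"}, {"TEXT": "Now"}], [{"TEXT": "GoogleNow"}]]
matcher.add("GoogleNow", patterns)  # patterns as one list; on_match is optional

doc = nlp("I use Google Now, not GoogleNow.")
print([doc[start:end].text for match_id, start, end in matcher(doc)])
```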
diff --git a/website/docs/api/phrasematcher.md b/website/docs/api/phrasematcher.md
index 40b8d6c1a..c7311a401 100644
--- a/website/docs/api/phrasematcher.md
+++ b/website/docs/api/phrasematcher.md
@@ -151,7 +151,24 @@ overwritten.
| ---------- | ------------------ | --------------------------------------------------------------------------------------------- |
| `match_id` | unicode | An ID for the thing you're matching. |
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
-| `*docs` | list | `Doc` objects of the phrases to match. |
+| `*docs` | `Doc` | `Doc` objects of the phrases to match. |
+
+
+
+As of spaCy 2.2.2, `PhraseMatcher.add` also supports the new API, which will
+become the default in the future. The `Doc` patterns are now the second argument
+and a list (instead of a variable number of arguments). The `on_match` callback
+becomes an optional keyword argument.
+
+```diff
+patterns = [nlp("health care reform"), nlp("healthcare reform")]
+- matcher.add("HEALTH", None, *patterns)
++ matcher.add("HEALTH", patterns)
+- matcher.add("HEALTH", on_match, *patterns)
++ matcher.add("HEALTH", patterns, on_match=on_match)
+```
+
+
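The same shape of call works for the `PhraseMatcher`; building the pattern `Doc`s with `nlp.pipe` is just a convenience here, not part of the API change:

```python
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")
matcher = PhraseMatcher(nlp.vocab)
patterns = list(nlp.pipe(["health care reform", "healthcare reform"]))
matcher.add("HEALTH", patterns)  # patterns second, on_match as a keyword argument

doc = nlp("He voted against healthcare reform.")
print([doc[start:end].text for match_id, start, end in matcher(doc)])
```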
## PhraseMatcher.remove {#remove tag="method" new="2.2"}
diff --git a/website/docs/api/sentencizer.md b/website/docs/api/sentencizer.md
index 237cd6a8a..c9b935f22 100644
--- a/website/docs/api/sentencizer.md
+++ b/website/docs/api/sentencizer.md
@@ -60,7 +60,7 @@ the component has been added to the pipeline using
> sentencizer = nlp.create_pipe("sentencizer")
> nlp.add_pipe(sentencizer)
> doc = nlp("This is a sentence. This is another sentence.")
-> assert list(doc.sents) == 2
+> assert len(list(doc.sents)) == 2
> ```
| Name | Type | Description |
diff --git a/website/docs/api/vocab.md b/website/docs/api/vocab.md
index ea0c2d219..e024ab54a 100644
--- a/website/docs/api/vocab.md
+++ b/website/docs/api/vocab.md
@@ -166,18 +166,23 @@ cosines are calculated in minibatches, to reduce memory usage.
## Vocab.get_vector {#get_vector tag="method" new="2"}
Retrieve a vector for a word in the vocabulary. Words can be looked up by string
-or hash value. If no vectors data is loaded, a `ValueError` is raised.
+or hash value. If no vectors data is loaded, a `ValueError` is raised. If `minn`
+is defined, then the resulting vector uses [FastText](https://fasttext.cc/)'s
+subword features by averaging over the ngrams of `orth` (introduced in spaCy `v2.1`).
> #### Example
>
> ```python
> nlp.vocab.get_vector("apple")
+> nlp.vocab.get_vector("apple", minn=1, maxn=5)
> ```
-| Name | Type | Description |
-| ----------- | ---------------------------------------- | ----------------------------------------------------------------------------- |
-| `orth` | int / unicode | The hash value of a word, or its unicode string. |
-| **RETURNS** | `numpy.ndarray[ndim=1, dtype='float32']` | A word vector. Size and shape are determined by the `Vocab.vectors` instance. |
+| Name | Type | Description |
+| ----------------------------------- | ---------------------------------------- | ---------------------------------------------------------------------------------------------- |
+| `orth` | int / unicode | The hash value of a word, or its unicode string. |
+| `minn` 2.1 | int | Minimum n-gram length used for FastText's ngram computation. Defaults to the length of `orth`. |
+| `maxn` 2.1 | int | Maximum n-gram length used for FastText's ngram computation. Defaults to the length of `orth`. |
+| **RETURNS** | `numpy.ndarray[ndim=1, dtype='float32']` | A word vector. Size and shape are determined by the `Vocab.vectors` instance. |
## Vocab.set_vector {#set_vector tag="method" new="2"}
diff --git a/website/docs/images/displacy-ent-custom.html b/website/docs/images/displacy-ent-custom.html
index 15294db49..709c6f631 100644
--- a/website/docs/images/displacy-ent-custom.html
+++ b/website/docs/images/displacy-ent-custom.html
@@ -1,9 +1,33 @@
-But
-Google
-ORGis starting from behind. The company made a late push into hardware,
-and
-Apple
-ORG’s Siri, available on iPhones, and
-Amazon
-ORG’s Alexa software, which runs on its Echo and Dot devices, have clear
-leads in consumer adoption.
+But
+ Google
+ ORGis starting from behind. The company made a late push into hardware, and
+ Apple
+ ORG’s Siri, available on iPhones, and
+ Amazon
+ ORG’s Alexa software, which runs on its Echo and Dot devices, have clear leads in consumer
+ adoption.
diff --git a/website/docs/images/displacy-ent-snek.html b/website/docs/images/displacy-ent-snek.html
index 1e4920fb5..c8b416d8d 100644
--- a/website/docs/images/displacy-ent-snek.html
+++ b/website/docs/images/displacy-ent-snek.html
@@ -2,17 +2,25 @@
class="entities"
style="line-height: 2.5; font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'; font-size: 16px"
>
- 🌱🌿 🐍 SNEK ____ 🌳🌲 ____ 👨🌾 HUMAN 🏘️
+ 🌱🌿
+ 🐍
+ SNEK
+ ____ 🌳🌲 ____
+ 👨🌾
+ HUMAN
+ 🏘️
diff --git a/website/docs/images/displacy-ent1.html b/website/docs/images/displacy-ent1.html
index 6e3de2675..708df8093 100644
--- a/website/docs/images/displacy-ent1.html
+++ b/website/docs/images/displacy-ent1.html
@@ -1,16 +1,37 @@
-
-
+
+
Apple
- ORG
+ ORG
is looking at buying
-
+
U.K.
- GPE
+ GPE
startup for
-
+
$1 billion
- MONEY
+ MONEY
diff --git a/website/docs/images/displacy-ent2.html b/website/docs/images/displacy-ent2.html
index e72640b51..5e1833ca0 100644
--- a/website/docs/images/displacy-ent2.html
+++ b/website/docs/images/displacy-ent2.html
@@ -1,18 +1,39 @@
-
+
When
-
+
Sebastian Thrun
- PERSON
+ PERSON
started working on self-driving cars at
-
+
Google
- ORG
+ ORG
in
-
+
2007
- DATE
+ DATE
, few people outside of the company took him seriously.
diff --git a/website/docs/usage/101/_named-entities.md b/website/docs/usage/101/_named-entities.md
index 1ecaf9fe7..0dfee8636 100644
--- a/website/docs/usage/101/_named-entities.md
+++ b/website/docs/usage/101/_named-entities.md
@@ -1,9 +1,10 @@
A named entity is a "real-world object" that's assigned a name – for example, a
-person, a country, a product or a book title. spaCy can **recognize**
-[various types](/api/annotation#named-entities) of named entities in a document,
-by asking the model for a **prediction**. Because models are statistical and
-strongly depend on the examples they were trained on, this doesn't always work
-_perfectly_ and might need some tuning later, depending on your use case.
+person, a country, a product or a book title. spaCy can **recognize
+[various types](/api/annotation#named-entities)** of named entities in a
+document, by asking the model for a **prediction**. Because models are
+statistical and strongly depend on the examples they were trained on, this
+doesn't always work _perfectly_ and might need some tuning later, depending on
+your use case.
Named entities are available as the `ents` property of a `Doc`:
diff --git a/website/docs/usage/adding-languages.md b/website/docs/usage/adding-languages.md
index 36e6e6809..4b12c6be1 100644
--- a/website/docs/usage/adding-languages.md
+++ b/website/docs/usage/adding-languages.md
@@ -402,12 +402,17 @@ iterators:
> assert chunks[1].text == "another phrase"
> ```
-| Language | Code | Source |
-| -------- | ---- | ----------------------------------------------------------------------------------------------------------------- |
-| English | `en` | [`lang/en/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/en/syntax_iterators.py) |
-| German | `de` | [`lang/de/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/de/syntax_iterators.py) |
-| French | `fr` | [`lang/fr/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/fr/syntax_iterators.py) |
-| Spanish | `es` | [`lang/es/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/es/syntax_iterators.py) |
+| Language | Code | Source |
+| ---------------- | ---- | ----------------------------------------------------------------------------------------------------------------- |
+| English | `en` | [`lang/en/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/en/syntax_iterators.py) |
+| German | `de` | [`lang/de/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/de/syntax_iterators.py) |
+| French | `fr` | [`lang/fr/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/fr/syntax_iterators.py) |
+| Spanish | `es` | [`lang/es/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/es/syntax_iterators.py) |
+| Greek | `el` | [`lang/el/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/el/syntax_iterators.py) |
+| Norwegian Bokmål | `nb` | [`lang/nb/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/nb/syntax_iterators.py) |
+| Swedish | `sv` | [`lang/sv/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/sv/syntax_iterators.py) |
+| Indonesian | `id` | [`lang/id/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/id/syntax_iterators.py) |
+| Persian | `fa` | [`lang/fa/syntax_iterators.py`](https://github.com/explosion/spaCy/tree/master/spacy/lang/fa/syntax_iterators.py) |
### Lemmatizer {#lemmatizer new="2"}
diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index e8326cdc9..039534fb7 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -5,6 +5,7 @@ menu:
- ['POS Tagging', 'pos-tagging']
- ['Dependency Parse', 'dependency-parse']
- ['Named Entities', 'named-entities']
+ - ['Entity Linking', 'entity-linking']
- ['Tokenization', 'tokenization']
- ['Merging & Splitting', 'retokenization']
- ['Sentence Segmentation', 'sbd']
diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md
index 9c3a43f1d..663ac5e5a 100644
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@@ -163,7 +163,7 @@ rule-based matching are:
| `TEXT` 2.1 | unicode | The exact verbatim text of a token. |
| `LOWER` | unicode | The lowercase form of the token text. |
| `LENGTH` | int | The length of the token text. |
-| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | bool | Token text consists of alphanumeric characters, ASCII characters, digits. |
+| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | bool | Token text consists of alphabetic characters, ASCII characters, digits. |
| `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | bool | Token text is in lowercase, uppercase, titlecase. |
| `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | bool | Token is punctuation, whitespace, stop word. |
| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | bool | Token text resembles a number, URL, email. |
@@ -986,6 +986,37 @@ doc = nlp("Apple is opening its first big office in San Francisco.")
print([(ent.text, ent.label_) for ent in doc.ents])
```
+### Adding IDs to patterns {#entityruler-ent-ids new="2.2.2"}
+
+The [`EntityRuler`](/api/entityruler) can also accept an `id` attribute for each
+pattern. Using the `id` attribute allows multiple patterns to be associated with
+the same entity.
+
+```python
+### {executable="true"}
+from spacy.lang.en import English
+from spacy.pipeline import EntityRuler
+
+nlp = English()
+ruler = EntityRuler(nlp)
+patterns = [{"label": "ORG", "pattern": "Apple", "id": "apple"},
+ {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}], "id": "san-francisco"},
+ {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "fran"}], "id": "san-francisco"}]
+ruler.add_patterns(patterns)
+nlp.add_pipe(ruler)
+
+doc1 = nlp("Apple is opening its first big office in San Francisco.")
+print([(ent.text, ent.label_, ent.ent_id_) for ent in doc1.ents])
+
+doc2 = nlp("Apple is opening its first big office in San Fran.")
+print([(ent.text, ent.label_, ent.ent_id_) for ent in doc2.ents])
+```
+
+If the `id` attribute is included in the [`EntityRuler`](/api/entityruler)
+patterns, the `ent_id_` property of the matched entity is set to the `id` given
+in the patterns. So in the example above, it's easy to identify that "San
+Francisco" and "San Fran" both refer to the same entity.
+
The entity ruler is designed to integrate with spaCy's existing statistical
models and enhance the named entity recognizer. If it's added **before the
`"ner"` component**, the entity recognizer will respect the existing entity
@@ -1135,6 +1166,8 @@ def expand_person_entities(doc):
if prev_token.text in ("Dr", "Dr.", "Mr", "Mr.", "Ms", "Ms."):
new_ent = Span(doc, ent.start - 1, ent.end, label=ent.label)
new_ents.append(new_ent)
+ else:
+ new_ents.append(ent)
else:
new_ents.append(ent)
doc.ents = new_ents
diff --git a/website/docs/usage/spacy-101.md b/website/docs/usage/spacy-101.md
index 379535cf4..da56f2397 100644
--- a/website/docs/usage/spacy-101.md
+++ b/website/docs/usage/spacy-101.md
@@ -573,7 +573,7 @@ apple = doc[0]
print("Fine-grained POS tag", apple.pos_, apple.pos)
print("Coarse-grained POS tag", apple.tag_, apple.tag)
print("Word shape", apple.shape_, apple.shape)
-print("Alphanumeric characters?", apple.is_alpha)
+print("Alphabetic characters?", apple.is_alpha)
print("Punctuation mark?", apple.is_punct)
billion = doc[10]
diff --git a/website/meta/languages.json b/website/meta/languages.json
index 09a17b568..dbb300fbf 100644
--- a/website/meta/languages.json
+++ b/website/meta/languages.json
@@ -8,10 +8,10 @@
"en_core_web_md",
"en_core_web_lg",
"en_vectors_web_lg",
- "en_pytt_bertbaseuncased_lg",
- "en_pytt_robertabase_lg",
- "en_pytt_distilbertbaseuncased_lg",
- "en_pytt_xlnetbasecased_lg"
+ "en_trf_bertbaseuncased_lg",
+ "en_trf_robertabase_lg",
+ "en_trf_distilbertbaseuncased_lg",
+ "en_trf_xlnetbasecased_lg"
],
"example": "This is a sentence.",
"has_examples": true
@@ -19,7 +19,7 @@
{
"code": "de",
"name": "German",
- "models": ["de_core_news_sm", "de_core_news_md", "de_pytt_bertbasecased_lg"],
+ "models": ["de_core_news_sm", "de_core_news_md", "de_trf_bertbasecased_lg"],
"example": "Dies ist ein Satz.",
"has_examples": true
},
@@ -127,6 +127,7 @@
{ "code": "sr", "name": "Serbian" },
{ "code": "sk", "name": "Slovak" },
{ "code": "sl", "name": "Slovenian" },
+ { "code": "lb", "name": "Luxembourgish" },
{
"code": "sq",
"name": "Albanian",
diff --git a/website/meta/universe.json b/website/meta/universe.json
index d30b77ca4..40ebfaaa7 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -1,5 +1,25 @@
{
"resources": [
+ {
+ "id": "spacy-server",
+ "title": "spaCy Server",
+ "slogan": "\uD83E\uDD9C Containerized HTTP API for spaCy NLP",
+ "description": "For developers who need programming language agnostic NLP, spaCy Server is a containerized HTTP API that provides industrial-strength natural language processing. Unlike other servers, our server is fast, idiomatic, and well documented.",
+ "github": "neelkamath/spacy-server",
+ "code_example": [
+ "docker run --rm -dp 8080:8080 neelkamath/spacy-server",
+ "curl http://localhost:8080/ner -H 'Content-Type: application/json' -d '{\"sections\": [\"My name is John Doe. I grew up in California.\"]}'"
+ ],
+ "code_language": "shell",
+ "url": "https://hub.docker.com/r/neelkamath/spacy-server",
+ "author": "Neel Kamath",
+ "author_links": {
+ "github": "neelkamath",
+ "website": "https://neelkamath.com"
+ },
+ "category": ["apis"],
+ "tags": ["docker"]
+ },
{
"id": "nlp-architect",
"title": "NLP Architect",
@@ -578,13 +598,13 @@
},
{
"id": "rasa",
- "title": "Rasa NLU",
+ "title": "Rasa",
"slogan": "Turn natural language into structured data",
- "description": "Rasa NLU (Natural Language Understanding) is a tool for understanding what is being said in short pieces of text. Rasa NLU is primarily used to build chatbots and voice apps, where this is called intent classification and entity extraction. To use Rasa, *you have to provide some training data*.",
- "github": "RasaHQ/rasa_nlu",
- "pip": "rasa_nlu",
- "thumb": "https://i.imgur.com/ndCfKNq.png",
- "url": "https://nlu.rasa.com/",
+ "description": "Machine learning tools for developers to build, improve, and deploy contextual chatbots and assistants. Powered by open source.",
+ "github": "RasaHQ/rasa",
+ "pip": "rasa",
+ "thumb": "https://i.imgur.com/TyZnpwL.png",
+ "url": "https://rasa.com/",
"author": "Rasa",
"author_links": {
"github": "RasaHQ"
@@ -1675,21 +1695,21 @@
}
},
{
- "id": "spacy-pytorch-transformers",
- "title": "spacy-pytorch-transformers",
+ "id": "spacy-transformers",
+ "title": "spacy-transformers",
"slogan": "spaCy pipelines for pretrained BERT, XLNet and GPT-2",
- "description": "This package provides spaCy model pipelines that wrap [Hugging Face's `pytorch-transformers`](https://github.com/huggingface/pytorch-transformers) package, so you can use them in spaCy. The result is convenient access to state-of-the-art transformer architectures, such as BERT, GPT-2, XLNet, etc.",
- "github": "explosion/spacy-pytorch-transformers",
- "url": "https://explosion.ai/blog/spacy-pytorch-transformers",
- "pip": "spacy-pytorch-transformers",
+ "description": "This package provides spaCy model pipelines that wrap [Hugging Face's `transformers`](https://github.com/huggingface/transformers) package, so you can use them in spaCy. The result is convenient access to state-of-the-art transformer architectures, such as BERT, GPT-2, XLNet, etc.",
+ "github": "explosion/spacy-transformers",
+ "url": "https://explosion.ai/blog/spacy-transformers",
+ "pip": "spacy-transformers",
"category": ["pipeline", "models", "research"],
"code_example": [
"import spacy",
"",
- "nlp = spacy.load(\"en_pytt_bertbaseuncased_lg\")",
+ "nlp = spacy.load(\"en_trf_bertbaseuncased_lg\")",
"doc = nlp(\"Apple shares rose on the news. Apple pie is delicious.\")",
"print(doc[0].similarity(doc[7]))",
- "print(doc._.pytt_last_hidden_state.shape)"
+ "print(doc._.trf_last_hidden_state.shape)"
],
"author": "Explosion",
"author_links": {
@@ -1800,6 +1820,71 @@
"author_links": {
"github": "microsoft"
}
+ },
+ {
+ "id": "python-sentence-boundary-disambiguation",
+ "title": "pySBD - python Sentence Boundary Disambiguation",
+ "slogan": "Rule-based sentence boundary detection that works out-of-the-box",
+ "github": "nipunsadvilkar/pySBD",
+ "description": "pySBD is 'real-world' sentence segmenter which extracts reasonable sentences when the format and domain of the input text are unknown. It is a rules-based algorithm based on [The Golden Rules](https://s3.amazonaws.com/tm-town-nlp-resources/golden_rules.txt) - a set of tests to check accuracy of segmenter in regards to edge case scenarios developed by [TM-Town](https://www.tm-town.com/) dev team. pySBD is python port of ruby gem [Pragmatic Segmenter](https://github.com/diasks2/pragmatic_segmenter).",
+ "pip": "pysbd",
+ "category": ["scientific"],
+ "tags": ["sentence segmentation"],
+ "code_example": [
+ "from pysbd.util import PySBDFactory",
+ "",
+ "nlp = spacy.blank('en')",
+ "nlp.add_pipe(PySBDFactory(nlp))",
+ "",
+ "doc = nlp('My name is Jonas E. Smith. Please turn to p. 55.')",
+ "print(list(doc.sents))",
+ "# [My name is Jonas E. Smith., Please turn to p. 55.]"
+ ],
+ "author": "Nipun Sadvilkar",
+ "author_links": {
+ "twitter": "nipunsadvilkar",
+ "github": "nipunsadvilkar",
+ "website": "https://nipunsadvilkar.github.io"
+ }
+ },
+ {
+ "id": "cookiecutter-spacy-fastapi",
+ "title": "cookiecutter-spacy-fastapi",
+ "slogan": "Docker-based cookiecutter for easy spaCy APIs using FastAPI",
+ "description": "Docker-based cookiecutter for easy spaCy APIs using FastAPI. The default endpoints expect batch requests with a list of Records in the Azure Search Cognitive Skill format. So out of the box, this cookiecutter can be setup as a Custom Cognitive Skill. For more on Azure Search and Cognitive Skills [see this page](https://docs.microsoft.com/en-us/azure/search/cognitive-search-custom-skill-interface).",
+ "url": "https://github.com/microsoft/cookiecutter-spacy-fastapi",
+ "image": "https://raw.githubusercontent.com/microsoft/cookiecutter-spacy-fastapi/master/images/cookiecutter-docs.png",
+ "github": "microsoft/cookiecutter-spacy-fastapi",
+ "category": ["apis"],
+ "thumb": "https://avatars0.githubusercontent.com/u/6154722",
+ "author": "Microsoft",
+ "author_links": {
+ "github": "microsoft"
+ }
+ },
+ {
+ "id": "dframcy",
+ "title": "Dframcy",
+ "slogan": "Dataframe Integration with spaCy NLP",
+ "github": "yash1994/dframcy",
+ "description": "DframCy is a light-weight utility module to integrate Pandas Dataframe to spaCy's linguistic annotation and training tasks.",
+ "pip": "dframcy",
+ "category": ["pipeline", "training"],
+ "tags": ["pandas"],
+ "code_example": [
+ "import spacy",
+ "from dframcy import DframCy",
+ "",
+ "nlp = spacy.load('en_core_web_sm')",
+ "dframcy = DframCy(nlp)",
+ "doc = dframcy.nlp(u'Apple is looking at buying U.K. startup for $1 billion')",
+ "annotation_dataframe = dframcy.to_dataframe(doc)"
+ ],
+ "author": "Yash Patadia",
+ "author_links": {
+ "twitter": "PatadiaYash",
+ "github": "yash1994"
+ }
}
],
diff --git a/website/src/components/util.js b/website/src/components/util.js
index 0d3f7bda3..1935a8085 100644
--- a/website/src/components/util.js
+++ b/website/src/components/util.js
@@ -45,6 +45,14 @@ export function isString(obj) {
return typeof obj === 'string' || obj instanceof String
}
+/**
+ * @param obj - The object to check.
+ * @returns {boolean} - Whether the object is empty.
+ */
+export function isEmptyObj(obj) {
+ return Object.entries(obj).length === 0 && obj.constructor === Object
+}
+
/**
* Convert raw HTML to React elements
* @param {string} html - The HTML markup to convert.
diff --git a/website/src/templates/models.js b/website/src/templates/models.js
index 3ab701727..3ac5e6ebf 100644
--- a/website/src/templates/models.js
+++ b/website/src/templates/models.js
@@ -15,7 +15,8 @@ import Link from '../components/link'
import Grid from '../components/grid'
import Infobox from '../components/infobox'
import Accordion from '../components/accordion'
-import { join, arrayToObj, abbrNum, markdownToReact, isString } from '../components/util'
+import { join, arrayToObj, abbrNum, markdownToReact } from '../components/util'
+import { isString, isEmptyObj } from '../components/util'
const MODEL_META = {
core: 'Vocabulary, syntax, entities, vectors',
@@ -23,6 +24,7 @@ const MODEL_META = {
dep: 'Vocabulary, syntax',
ent: 'Named entities',
pytt: 'PyTorch Transformers',
+ trf: 'Transformers',
vectors: 'Word vectors',
web: 'written text (blogs, news, comments)',
news: 'written text (news, media)',
@@ -104,7 +106,7 @@ function formatModelMeta(data) {
author: data.author,
url: data.url,
license: data.license,
- labels: data.labels,
+ labels: isEmptyObj(data.labels) ? null : data.labels,
vectors: formatVectors(data.vectors),
accuracy: formatAccuracy(data.accuracy),
}