Merge branch 'develop' of https://github.com/explosion/spaCy into develop

2025-09-15 08:32:37 +03:00 · 2017-11-06 19:20:41 +01:00 · 2017-11-06 19:20:41 +01:00 · ffb9101f3f
commit ffb9101f3f
parent 8fea512ac8 64d0f97c67
18 changed files with 99 additions and 96 deletions
--- a/spacy/deprecated.py
+++ b/spacy/deprecated.py
@ -1,4 +0,0 @@
-# coding: utf8
-from __future__ import unicode_literals
-
-PRON_LEMMA = "-PRON-"
--- a/spacy/lang/bn/morph_rules.py
+++ b/spacy/lang/bn/morph_rules.py
@ -1,8 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ...deprecated import PRON_LEMMA
-from ...symbols import LEMMA
+from ...symbols import LEMMA, PRON_LEMMA


 MORPH_RULES = {
--- a/spacy/lang/da/morph_rules.py
+++ b/spacy/lang/da/morph_rules.py
@ -1,8 +1,8 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ...symbols import LEMMA
-from ...deprecated import PRON_LEMMA
+from ...symbols import LEMMA, PRON_LEMMA
+

 MORPH_RULES = {
    "PRON": {
--- a/spacy/lang/de/tokenizer_exceptions.py
+++ b/spacy/lang/de/tokenizer_exceptions.py
@ -1,8 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ...symbols import ORTH, LEMMA, TAG, NORM
-from ...deprecated import PRON_LEMMA
+from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA


 _exc = {
--- a/spacy/lang/en/morph_rules.py
+++ b/spacy/lang/en/morph_rules.py
@ -1,8 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ...symbols import LEMMA
-from ...deprecated import PRON_LEMMA
+from ...symbols import LEMMA, PRON_LEMMA


 MORPH_RULES = {
--- a/spacy/lang/en/tokenizer_exceptions.py
+++ b/spacy/lang/en/tokenizer_exceptions.py
@ -1,8 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ...symbols import ORTH, LEMMA, TAG, NORM
-from ...deprecated import PRON_LEMMA
+from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA


 _exc = {}
--- a/spacy/lang/es/tokenizer_exceptions.py
+++ b/spacy/lang/es/tokenizer_exceptions.py
@ -1,8 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ...symbols import ORTH, LEMMA, TAG, NORM, ADP, DET
-from ...deprecated import PRON_LEMMA
+from ...symbols import ORTH, LEMMA, TAG, NORM, ADP, DET, PRON_LEMMA


 _exc = {
--- a/spacy/lang/fr/tokenizer_exceptions.py
+++ b/spacy/lang/fr/tokenizer_exceptions.py
@ -7,8 +7,7 @@ from ._tokenizer_exceptions_list import FR_BASE_EXCEPTIONS
 from .punctuation import ELISION, HYPHENS
 from ..tokenizer_exceptions import URL_PATTERN
 from ..char_classes import ALPHA_LOWER
-from ...symbols import ORTH, LEMMA, TAG, NORM
-from ...deprecated import PRON_LEMMA
+from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA


 def upper_first_letter(text):
--- a/spacy/lang/nb/morph_rules.py
+++ b/spacy/lang/nb/morph_rules.py
@ -1,8 +1,7 @@
 # encoding: utf8
 from __future__ import unicode_literals

-from ...symbols import LEMMA
-from ...deprecated import PRON_LEMMA
+from ...symbols import LEMMA, PRON_LEMMA


 # Used the table of pronouns at https://no.wiktionary.org/wiki/Tillegg:Pronomen_i_norsk
--- a/spacy/lang/pt/tokenizer_exceptions.py
+++ b/spacy/lang/pt/tokenizer_exceptions.py
@ -1,8 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ...symbols import ORTH, LEMMA, NORM
-from ...deprecated import PRON_LEMMA
+from ...symbols import ORTH, LEMMA, NORM, PRON_LEMMA


 _exc = {
--- a/spacy/lang/sv/morph_rules.py
+++ b/spacy/lang/sv/morph_rules.py
@ -1,8 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ...symbols import LEMMA
-from ...deprecated import PRON_LEMMA
+from ...symbols import LEMMA, PRON_LEMMA


 # Used the table of pronouns at https://sv.wiktionary.org/wiki/deras
--- a/spacy/lang/sv/tokenizer_exceptions.py
+++ b/spacy/lang/sv/tokenizer_exceptions.py
@ -1,8 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals

-from ...symbols import ORTH, LEMMA, TAG, NORM
-from ...deprecated import PRON_LEMMA
+from ...symbols import ORTH, LEMMA, TAG, NORM, PRON_LEMMA


 _exc = {}
--- a/website/_includes/_mixins.jade
+++ b/website/_includes/_mixins.jade
@ -452,8 +452,8 @@ mixin head-cell()

 //- Table cell (only used within +row in +table)

-mixin cell()
-    td.c-table__cell.u-text&attributes(attributes)
+mixin cell(align)
+    td.c-table__cell.u-text(class=align ? "u-text-" + align : null)&attributes(attributes)
        block


--- a/website/_includes/_page_models.jade
+++ b/website/_includes/_page_models.jade
@ -71,7 +71,7 @@ for id in CURRENT_MODELS
                                        +label=label
                                            if MODEL_META[field]
                                                |  #[+help(MODEL_META[field]).u-color-subtle]
-                                    +cell.u-text-right(data-tpl=id data-tpl-key=field)
+                                    +cell("right")(data-tpl=id data-tpl-key=field)
                                        |  n/a

        p.u-text-small.u-color-dark(data-tpl=id data-tpl-key="notes")
--- a/website/models/_data.json
+++ b/website/models/_data.json
@ -43,7 +43,7 @@
        "en": ["en_core_web_sm", "en_core_web_lg", "en_vectors_web_lg"],
        "de": ["de_core_news_sm"],
        "es": ["es_core_news_sm", "es_core_news_md"],
-        "pt": [],
+        "pt": ["pt_core_news_sm"],
        "fr": ["fr_core_news_sm"],
        "it": ["it_core_news_sm"],
        "nl": ["nl_core_news_sm"],
--- a/website/usage/_facts-figures/_benchmarks-choi-2015.jade
+++ b/website/usage/_facts-figures/_benchmarks-choi-2015.jade
@ -5,41 +5,41 @@
        +cell #[strong spaCy v2.x]
        +cell 2017
        +cell Python / Cython
-        +cell.u-text-right #[strong 92.6]
-        +cell.u-text-right #[em n/a]
+        +cell("right") #[strong 92.6]
+        +cell("right") #[em n/a]
            |  #[+help("This table shows speed as benchmarked by Choi et al. We therefore can't provide comparable figures, as we'd be running the benchmark on different hardware.").u-color-dark]

    +row
        +cell #[strong spaCy v1.x]
        +cell 2015
        +cell Python / Cython
-        +cell.u-text-right 91.8
-        +cell.u-text-right 13,963
+        +cell("right") 91.8
+        +cell("right") 13,963

    +row
        +cell ClearNLP
        +cell 2015
        +cell Java
-        +cell.u-text-right 91.7
-        +cell.u-text-right 10,271
+        +cell("right") 91.7
+        +cell("right") 10,271

    +row
        +cell CoreNLP
        +cell 2015
        +cell Java
-        +cell.u-text-right 89.6
-        +cell.u-text-right 8,602
+        +cell("right") 89.6
+        +cell("right") 8,602

    +row
        +cell MATE
        +cell 2015
        +cell Java
-        +cell.u-text-right 92.5
-        +cell.u-text-right 550
+        +cell("right") 92.5
+        +cell("right") 550

    +row
        +cell Turbo
        +cell 2015
        +cell C++
-        +cell.u-text-right 92.4
-        +cell.u-text-right 349
+        +cell("right") 92.4
+        +cell("right") 349
--- a/website/usage/_facts-figures/_benchmarks-models.jade
+++ b/website/usage/_facts-figures/_benchmarks-models.jade
@ -4,7 +4,9 @@ p
    |  In this section, we provide benchmark accuracies for the pre-trained
    |  model pipelines we distribute with spaCy. Evaluations are conducted
    |  end-to-end from raw text, with no "gold standard" pre-processing, over
-    |  text from a mix of genres where possible.
+    |  text from a mix of genres where possible. For a more detailed
+    |  comparison of the available models, see the new
+    |  #[+a("/models/comparison") model comparison tool].

 +aside("Methodology")
    |  The evaluation was conducted on raw text with no gold standard
@ -16,55 +18,70 @@ p

 +table(["Model", "spaCy", "Type", "UAS", "NER F", "POS", "WPS", "Size"])
    +row
-        +cell #[+a("/models/en#en_core_web_sm") #[code en_core_web_sm]] 2.0.0a5
+        +cell #[+a("/models/en#en_core_web_sm") #[code en_core_web_sm]] 2.0.0a8
        each data in ["2.x", "neural"]
-            +cell.u-text-right=data
-        +cell.u-text-right 91.4
-        +cell.u-text-right 85.5
-        +cell.u-text-right 97.0
-        +cell.u-text-right 8.2k
-        +cell.u-text-right #[strong 36 MB]
+            +cell("right")=data
+        +cell("right") 91.7
+        +cell("right") 85.3
+        +cell("right") 97.0
+        +cell("right") 10.1k
+        +cell("right") #[strong 35 MB]

    +row
-        +cell #[+a("/models/en#en_core_web_lg") #[code en_core_web_lg]] 2.0.0a0
+        +cell #[+a("/models/en#en_core_web_lg") #[code en_core_web_lg]] 2.0.0a3
        each data in ["2.x", "neural"]
-            +cell.u-text-right=data
-        +cell.u-text-right #[strong 91.9]
-        +cell.u-text-right #[strong 86.4]
-        +cell.u-text-right #[strong 97.2]
-        +cell.u-text-right #[em n/a]
-        +cell.u-text-right 667 MB
+            +cell("right")=data
+        +cell("right") #[strong 91.9]
+        +cell("right") #[strong 85.9]
+        +cell("right") #[strong 97.2]
+        +cell("right") 5.0k
+        +cell("right") 812 MB

    +row("divider")
        +cell #[code en_core_web_sm] 1.2.0
        each data in ["1.x", "linear", 86.6, 78.5, 96.6]
-            +cell.u-text-right=data
-        +cell.u-text-right #[strong 25.7k]
-        +cell.u-text-right 50 MB
+            +cell("right")=data
+        +cell("right") #[strong 25.7k]
+        +cell("right") 50 MB

    +row
        +cell #[code en_core_web_md] 1.2.1
        each data in ["1.x", "linear", 90.6, 81.4, 96.7, "18.8k", "1 GB"]
-            +cell.u-text-right=data
+            +cell("right")=data

 +h(4, "benchmarks-models-spanish") Spanish

+aside("Evaluation note")
+    |  The NER accuracy refers to the "silver standard" annotations in the
+    |  WikiNER corpus. Accuracy on these annotations tends to be higher than on
+    |  correct human annotations.
+
 +table(["Model", "spaCy", "Type", "UAS", "NER F", "POS", "WPS", "Size"])
    +row
-        +cell #[+a("/models/es#es_core_web_sm") #[code es_core_web_sm]] 2.0.0a0
-        +cell.u-text-right 2.x
-        +cell.u-text-right neural
-        +cell.u-text-right #[strong 90.1]
-        +cell.u-text-right 89.0
-        +cell.u-text-right #[strong 96.7]
-        +cell.u-text-right #[em n/a]
-        +cell.u-text-right #[strong 36 MB]
+        +cell #[+a("/models/es#es_core_news_sm") #[code es_core_news_sm]] 2.0.0a0
+        +cell("right") 2.x
+        +cell("right") neural
+        +cell("right") 89.8
+        +cell("right") 88.7
+        +cell("right") #[strong 96.9]
+        +cell("right") #[em n/a]
+        +cell("right") #[strong 35 MB]
+
+    +row
+        +cell #[+a("/models/es#es_core_news_md") #[code es_core_news_md]] 2.0.0a0
+        +cell("right") 2.x
+        +cell("right") neural
+        +cell("right") #[strong 90.2]
+        +cell("right") 89.0
+        +cell("right") 97.8
+        +cell("right") #[em n/a]
+        +cell("right") 93 MB

    +row("divider")
        +cell #[code es_core_web_md] 1.1.0
        each data in ["1.x", "linear", 87.5]
-            +cell.u-text-right=data
-        +cell #[strong 94.2]
-        +cell #[strong 96.7]
-        +cell.u-text-right #[em n/a]
-        +cell.u-text-right 377 MB
+            +cell("right")=data
+        +cell("right") #[strong 94.2]
+        +cell("right") 96.7
+        +cell("right") #[em n/a]
+        +cell("right") 377 MB
--- a/website/usage/_facts-figures/_benchmarks.jade
+++ b/website/usage/_facts-figures/_benchmarks.jade
@ -50,55 +50,55 @@ p
        +cell spaCy v2.0.0
        +cell 2017
        +cell neural
-        +cell.u-text-right 94.48
+        +cell("right") 94.48

    +row
        +cell spaCy v1.1.0
        +cell 2016
        +cell linear
-        +cell.u-text-right 92.80
+        +cell("right") 92.80

    +row("divider")
        +cell
            +a("https://arxiv.org/pdf/1611.01734.pdf") Dozat and Manning
            +cell 2017
            +cell neural
-            +cell.u-text-right #[strong 95.75]
+            +cell("right") #[strong 95.75]

    +row
        +cell
            +a("http://arxiv.org/abs/1603.06042") Andor et al.
        +cell 2016
        +cell neural
-        +cell.u-text-right 94.44
+        +cell("right") 94.44

    +row
        +cell
            +a("https://github.com/tensorflow/models/tree/master/research/syntaxnet") SyntaxNet Parsey McParseface
        +cell 2016
        +cell neural
-        +cell.u-text-right 94.15
+        +cell("right") 94.15

    +row
        +cell
            +a("http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43800.pdf") Weiss et al.
        +cell 2015
        +cell neural
-        +cell.u-text-right 93.91
+        +cell("right") 93.91

    +row
        +cell
            +a("http://research.google.com/pubs/archive/38148.pdf") Zhang and McDonald
        +cell 2014
        +cell linear
-        +cell.u-text-right 93.32
+        +cell("right") 93.32

    +row
        +cell
            +a("http://www.cs.cmu.edu/~ark/TurboParser/") Martins et al.
        +cell 2013
        +cell linear
-        +cell.u-text-right 93.10
+        +cell("right") 93.10

 +h(4, "ner-accuracy-ontonotes5") NER accuracy (OntoNotes 5, no pre-process)

@ -110,38 +110,38 @@ p

 +table(["System", "Year", "Type", "Accuracy"])
    +row
-        +cell spaCy #[+a("/models/en#en_core_web_lg") #[code en_core_web_lg]] v2.0.0
+        +cell spaCy #[+a("/models/en#en_core_web_lg") #[code en_core_web_lg]] v2.0.0a3
        +cell 2017
        +cell neural
-        +cell.u-text-right 86.45
+        +cell("right") 85.85

    +row("divider")
        +cell
            +a("https://arxiv.org/pdf/1702.02098.pdf") Strubell et al.
        +cell 2017
        +cell neural
-        +cell.u-text-right #[strong 86.81]
+        +cell("right") #[strong 86.81]

    +row
        +cell
            +a("https://www.semanticscholar.org/paper/Named-Entity-Recognition-with-Bidirectional-LSTM-C-Chiu-Nichols/10a4db59e81d26b2e0e896d3186ef81b4458b93f") Chiu and Nichols
        +cell 2016
        +cell neural
-        +cell.u-text-right 86.19
+        +cell("right") 86.19

    +row
        +cell
            +a("https://www.semanticscholar.org/paper/A-Joint-Model-for-Entity-Analysis-Coreference-Typi-Durrett-Klein/28eb033eee5f51c5e5389cbb6b777779203a6778") Durrett and Klein
        +cell 2014
        +cell neural
-        +cell.u-text-right 84.04
+        +cell("right") 84.04

    +row
        +cell
            +a("http://www.aclweb.org/anthology/W09-1119") Ratinov and Roth
        +cell 2009
        +cell linear
-        +cell.u-text-right 83.45
+        +cell("right") 83.45

 +h(3, "spacy-models") Model comparison

@ -183,24 +183,24 @@ p
    +row
        +cell #[strong spaCy]
        each data in [ "0.2ms", "1ms", "19ms"]
-            +cell.u-text-right #[strong=data]
+            +cell("right") #[strong=data]

        each data in ["1x", "1x", "1x"]
-            +cell.u-text-right=data
+            +cell("right")=data

    +row
        +cell CoreNLP
        each data in ["2ms", "10ms", "49ms", "10x", "10x", "2.6x"]
-            +cell.u-text-right=data
+            +cell("right")=data
    +row
        +cell ZPar
        each data in ["1ms", "8ms", "850ms", "5x", "8x", "44.7x"]
-            +cell.u-text-right=data
+            +cell("right")=data
    +row
        +cell NLTK
        each data in ["4ms", "443ms"]
-            +cell.u-text-right=data
-        +cell.u-text-right #[em n/a]
+            +cell("right")=data
+        +cell("right") #[em n/a]
        each data in ["20x", "443x"]
-            +cell.u-text-right=data
-        +cell.u-text-right #[em n/a]
+            +cell("right")=data
+        +cell("right") #[em n/a]