Merge branch 'master' of github.com:pmbaumgartner/spaCy

pmbaumgartner · 2019-07-14 20:25:37 -04:00 · commit 040bb061fd
11 changed files with 104 additions and 16 deletions

pyproject.toml

@@ -5,6 +5,6 @@ requires = ["setuptools",
     "cymem>=2.0.2,<2.1.0",
     "preshed>=2.0.1,<2.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=7.0.6,<7.1.0",
+    "thinc>=7.0.8,<7.1.0",
 ]
 build-backend = "setuptools.build_meta"

requirements.txt

@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=2.0.1,<2.1.0
-thinc>=7.0.6,<7.1.0
+thinc>=7.0.8,<7.1.0
 blis>=0.2.2,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.2.0,<1.1.0

setup.py

@@ -228,7 +228,7 @@ def setup_package():
         "murmurhash>=0.28.0,<1.1.0",
         "cymem>=2.0.2,<2.1.0",
         "preshed>=2.0.1,<2.1.0",
-        "thinc>=7.0.6,<7.1.0",
+        "thinc>=7.0.8,<7.1.0",
         "blis>=0.2.2,<0.3.0",
         "plac<1.0.0,>=0.9.6",
         "requests>=2.13.0,<3.0.0",

spacy/about.py

@@ -4,13 +4,13 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "2.1.5.dev0"
+__version__ = "2.1.6"
 __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython"
 __uri__ = "https://spacy.io"
 __author__ = "Explosion AI"
 __email__ = "contact@explosion.ai"
 __license__ = "MIT"
-__release__ = False
+__release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
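
Bumping __version__ to 2.1.6 and flipping __release__ to True marks this as a tagged release rather than a .dev build. A small sketch of how that metadata surfaces at runtime, assuming spaCy 2.1.6 is installed:

    import spacy
    from spacy import about

    print(spacy.__version__)  # "2.1.6", re-exported from spacy.about
    print(about.__release__)  # True for tagged releases, False for .dev builds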

spacy/attrs.pxd

@@ -1,4 +1,6 @@
 # Reserve 64 values for flag features
+from . cimport symbols
+
 cdef enum attr_id_t:
     NULL_ATTR
     IS_ALPHA
@@ -82,10 +84,10 @@ cdef enum attr_id_t:
     DEP
     ENT_IOB
     ENT_TYPE
-    ENT_KB_ID
     HEAD
     SENT_START
     SPACY
     PROB
     LANG
+    ENT_KB_ID = symbols.ENT_KB_ID
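
ENT_KB_ID is removed from the middle of attr_id_t and re-added at the end, aliased to its ID in the symbols enum. Taking it out of the middle restores the 2.1.x integer values of every attribute that followed it (HEAD, SENT_START, SPACY, ...), which serialized models rely on. A hedged sketch of the invariant this buys, assuming spaCy 2.1.6:

    # Both modules should now agree on the integer ID of ENT_KB_ID.
    from spacy.attrs import ENT_KB_ID as attr_ent_kb_id
    from spacy.symbols import ENT_KB_ID as sym_ent_kb_id

    assert attr_ent_kb_id == sym_ent_kb_id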

spacy/lang/da/punctuation.py

@@ -14,10 +14,11 @@ _infixes = (
     + [
         r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
         r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
-        r'(?<=[{a}])[:<>=](?=[{a}])'.format(a=ALPHA),
+        r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
         r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
     ]
 )
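
The replaced line only normalizes quote style; the substantive change is the appended pattern, which also treats a slash as an infix when it sits between a letter or digit and a letter. A sketch of the effect against a blank Danish pipeline, assuming this release's Danish data:

    import spacy

    nlp = spacy.blank("da")
    # A slash between a digit and a letter now splits:
    print([t.text for t in nlp("Windows 8/Windows 10")])
    # ['Windows', '8', '/', 'Windows', '10']
    # Digits on both sides fail the lookahead, so dates stay whole:
    print([t.text for t in nlp("26/02/2019")])
    # ['26/02/2019']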

spacy/lang/da/tokenizer_exceptions.py

@@ -52,6 +52,7 @@ for exc_data in [
     {ORTH: "Ons.", LEMMA: "onsdag"},
     {ORTH: "Fre.", LEMMA: "fredag"},
     {ORTH: "Lør.", LEMMA: "lørdag"},
+    {ORTH: "og/eller", LEMMA: "og/eller", NORM: "og/eller", TAG: "CC"},
 ]:
     _exc[exc_data[ORTH]] = [exc_data]
@@ -64,6 +65,8 @@ for orth in [
     "mik.",
     "pers.",
     "A.D.",
+    "A/B",
+    "a/s",
     "A/S",
     "B.C.",
     "BK.",
@@ -79,7 +82,9 @@ for orth in [
     "Kprs.",
     "L.A.",
     "Ll.",
+    "m/k",
     "m/s",
+    "m/sek.",
     "M/S",
     "Mag.",
     "Mr.",
@@ -90,6 +95,7 @@ for orth in [
     "Sdr.",
     "Skt.",
     "Spl.",
+    "TCP/IP",
     "Vg.",
 ]:
     _exc[orth] = [{ORTH: orth}]
@@ -141,6 +147,7 @@ for orth in [
     "brolægn.",
     "bto.",
     "bygn.",
+    "c/o",
     "ca.",
     "cand.",
     "d.d.",
@@ -293,6 +300,7 @@ for orth in [
     "kgl.",
     "kl.",
     "kld.",
+    "km/t",
     "knsp.",
     "komm.",
     "kons.",

spacy/symbols.pxd

@@ -81,7 +81,6 @@ cdef enum symbol_t:
     DEP
     ENT_IOB
     ENT_TYPE
-    ENT_KB_ID
     HEAD
     SENT_START
     SPACY
@@ -461,3 +460,5 @@ cdef enum symbol_t:
     xcomp
     acl
+
+    ENT_KB_ID

spacy/tests/lang/da/test_exceptions.py

@@ -43,3 +43,27 @@ def test_da_tokenizer_handles_custom_base_exc(da_tokenizer):
 def test_da_tokenizer_norm_exceptions(da_tokenizer, text, norm):
     tokens = da_tokenizer(text)
     assert tokens[0].norm_ == norm
+
+
+@pytest.mark.parametrize(
+    "text,n_tokens",
+    [
+        ("Godt og/eller skidt", 3),
+        ("Kør 4 km/t på vejen", 5),
+        ("Det blæser 12 m/s.", 5),
+        ("Det blæser 12 m/sek. på havnen", 6),
+        ("Windows 8/Windows 10", 5),
+        ("Billeten virker til bus/tog/metro", 8),
+        ("26/02/2019", 1),
+        ("Kristiansen c/o Madsen", 3),
+        ("Sprogteknologi a/s", 2),
+        ("De boede i A/B Bellevue", 5),
+        ("Rotorhastigheden er 3400 o/m.", 5),
+        ("Jeg købte billet t/r.", 5),
+        ("Murerarbejdsmand m/k søges", 3),
+        ("Netværket kører over TCP/IP", 4),
+    ],
+)
+def test_da_tokenizer_slash(da_tokenizer, text, n_tokens):
+    tokens = da_tokenizer(text)
+    assert len(tokens) == n_tokens

spacy/tests/regression/test_issue3611.py

@@ -0,0 +1,51 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+import spacy
+from spacy.util import minibatch, compounding
+
+
+def test_issue3611():
+    """ Test whether adding n-grams in the textcat works even when n > token length of some docs """
+    unique_classes = ["offensive", "inoffensive"]
+    x_train = ["This is an offensive text",
+               "This is the second offensive text",
+               "inoff"]
+    y_train = ["offensive", "offensive", "inoffensive"]
+
+    # preparing the data
+    pos_cats = list()
+    for train_instance in y_train:
+        pos_cats.append({label: label == train_instance for label in unique_classes})
+    train_data = list(zip(x_train, [{'cats': cats} for cats in pos_cats]))
+
+    # set up the spacy model with a text categorizer component
+    nlp = spacy.blank('en')
+    textcat = nlp.create_pipe(
+        "textcat",
+        config={
+            "exclusive_classes": True,
+            "architecture": "bow",
+            "ngram_size": 2
+        }
+    )
+    for label in unique_classes:
+        textcat.add_label(label)
+    nlp.add_pipe(textcat, last=True)
+
+    # training the network
+    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
+    with nlp.disable_pipes(*other_pipes):
+        optimizer = nlp.begin_training()
+        for i in range(3):
+            losses = {}
+            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
+
+            for batch in batches:
+                texts, annotations = zip(*batch)
+                nlp.update(docs=texts, golds=annotations, sgd=optimizer, drop=0.1, losses=losses)
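
The test trains a bag-of-words text categorizer with ngram_size=2 on data where one text ("inoff") is a single token, shorter than the n-gram size, which used to crash. A hypothetical follow-up to the loop above, showing the trained pipeline scoring that short text:

    # Continuing from the test's `nlp` object (hypothetical usage):
    doc = nlp("inoff")
    print(doc.cats)  # e.g. {'offensive': ..., 'inoffensive': ...}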

website/src/pages/index.js

@@ -152,20 +152,21 @@ const Landing = ({ data }) => {
         <LandingBannerGrid>
             <LandingBanner
                 title="spaCy IRL 2019: Two days of NLP"
-                label="Join us in Berlin"
-                to="https://irl.spacy.io/2019"
-                button="Get tickets"
+                label="Watch the videos"
+                to="https://www.youtube.com/playlist?list=PLBmcuObd5An4UC6jvK_-eSl6jCvP1gwXc"
+                button="Watch the videos"
                 background="#ffc194"
                 backgroundImage={irlBackground}
                 color="#1a1e23"
                 small
             >
-                We're pleased to invite the spaCy community and other folks working on Natural
+                We were pleased to invite the spaCy community and other folks working on Natural
                 Language Processing to Berlin this summer for a small and intimate event{' '}
-                <strong>July 5-6, 2019</strong>. The event includes a hands-on training day for
-                teams using spaCy in production, followed by a one-track conference. We've
-                booked a beautiful venue, hand-picked an awesome lineup of speakers and
-                scheduled plenty of social time to get to know each other and exchange ideas.
+                <strong>July 6, 2019</strong>. We booked a beautiful venue, hand-picked an
+                awesome lineup of speakers and scheduled plenty of social time to get to know
+                each other and exchange ideas. The YouTube playlist includes 12 talks about NLP
+                research, development and applications, with keynotes by Sebastian Ruder
+                (DeepMind) and Yoav Goldberg (Allen AI).
             </LandingBanner>
             <LandingBanner