Merge branch 'master' of github.com:pmbaumgartner/spaCy

pmbaumgartner 2019-07-14 20:25:37 -04:00
commit 040bb061fd
11 changed files with 104 additions and 16 deletions

View File

@@ -5,6 +5,6 @@ requires = ["setuptools",
     "cymem>=2.0.2,<2.1.0",
     "preshed>=2.0.1,<2.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=7.0.6,<7.1.0",
+    "thinc>=7.0.8,<7.1.0",
 ]
 build-backend = "setuptools.build_meta"

View File

@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=2.0.1,<2.1.0
-thinc>=7.0.6,<7.1.0
+thinc>=7.0.8,<7.1.0
 blis>=0.2.2,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.2.0,<1.1.0

View File

@@ -228,7 +228,7 @@ def setup_package():
         "murmurhash>=0.28.0,<1.1.0",
         "cymem>=2.0.2,<2.1.0",
         "preshed>=2.0.1,<2.1.0",
-        "thinc>=7.0.6,<7.1.0",
+        "thinc>=7.0.8,<7.1.0",
         "blis>=0.2.2,<0.3.0",
         "plac<1.0.0,>=0.9.6",
         "requests>=2.13.0,<3.0.0",

View File

@@ -4,13 +4,13 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "2.1.5.dev0"
+__version__ = "2.1.6"
 __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython"
 __uri__ = "https://spacy.io"
 __author__ = "Explosion AI"
 __email__ = "contact@explosion.ai"
 __license__ = "MIT"
-__release__ = False
+__release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
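
The version bump and the flipped __release__ flag mark this build as the tagged 2.1.6 release; model downloads resolve compatibility against this version via the URLs above. A quick check of the installed build:

    from spacy import about

    print(about.__title__, about.__version__)  # spacy 2.1.6
    print(about.__release__)                   # True for a tagged release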

View File

@@ -1,4 +1,6 @@
 # Reserve 64 values for flag features
+from . cimport symbols
+
 cdef enum attr_id_t:
     NULL_ATTR
     IS_ALPHA
@@ -82,10 +84,10 @@ cdef enum attr_id_t:
     DEP
     ENT_IOB
     ENT_TYPE
-    ENT_KB_ID
     HEAD
     SENT_START
     SPACY
     PROB
     LANG
+    ENT_KB_ID = symbols.ENT_KB_ID
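
In 2.1.5.dev0, ENT_KB_ID sat between ENT_TYPE and HEAD, shifting the integer IDs of every attribute after it. Defining it as an alias of symbols.ENT_KB_ID instead restores the earlier numbering and gives the attribute and the symbol one shared ID. A minimal sketch of the invariant this preserves, assuming both modules expose the constant to Python as they do for the other attributes:

    from spacy.attrs import ENT_KB_ID as ATTR_ENT_KB_ID
    from spacy.symbols import ENT_KB_ID as SYM_ENT_KB_ID

    # the attribute ID and the symbol ID must be the same integer, or code
    # mixing spacy.attrs and spacy.symbols would disagree about its meaning
    assert ATTR_ENT_KB_ID == SYM_ENT_KB_ID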

View File

@@ -14,10 +14,11 @@ _infixes = (
     + [
         r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
         r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
-        r'(?<=[{a}])[:<>=](?=[{a}])'.format(a=ALPHA),
+        r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
         r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
     ]
 )
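
The new infix pattern splits the characters :<>=/ when they sit between a letter or digit and a letter, so ad-hoc slash compounds come apart, while the fixed forms added to the tokenizer exceptions below stay whole. A small demo on a blank Danish pipeline; the expected outputs follow the tests added later in this commit:

    import spacy

    nlp = spacy.blank("da")
    # "8/W" matches the new rule (digit before, letter after), so it splits
    print([t.text for t in nlp("Windows 8/Windows 10")])
    # expected: ['Windows', '8', '/', 'Windows', '10']
    # digits on both sides of the slashes: no infix match, one token
    print(len(nlp("26/02/2019")))  # expected: 1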

View File

@@ -52,6 +52,7 @@ for exc_data in [
     {ORTH: "Ons.", LEMMA: "onsdag"},
     {ORTH: "Fre.", LEMMA: "fredag"},
     {ORTH: "Lør.", LEMMA: "lørdag"},
+    {ORTH: "og/eller", LEMMA: "og/eller", NORM: "og/eller", TAG: "CC"},
 ]:
     _exc[exc_data[ORTH]] = [exc_data]
@@ -64,6 +65,8 @@ for orth in [
     "mik.",
     "pers.",
     "A.D.",
+    "A/B",
+    "a/s",
     "A/S",
     "B.C.",
     "BK.",
@@ -79,7 +82,9 @@ for orth in [
     "Kprs.",
     "L.A.",
     "Ll.",
+    "m/k",
     "m/s",
+    "m/sek.",
     "M/S",
     "Mag.",
     "Mr.",
@@ -90,6 +95,7 @@ for orth in [
     "Sdr.",
     "Skt.",
     "Spl.",
+    "TCP/IP",
     "Vg.",
 ]:
     _exc[orth] = [{ORTH: orth}]
@@ -141,6 +147,7 @@ for orth in [
     "brolægn.",
     "bto.",
     "bygn.",
+    "c/o",
     "ca.",
     "cand.",
     "d.d.",
@@ -293,6 +300,7 @@ for orth in [
     "kgl.",
     "kl.",
     "kld.",
+    "km/t",
     "knsp.",
     "komm.",
     "kons.",

View File

@@ -81,7 +81,6 @@ cdef enum symbol_t:
     DEP
     ENT_IOB
     ENT_TYPE
-    ENT_KB_ID
     HEAD
     SENT_START
     SPACY
@@ -461,3 +460,5 @@ cdef enum symbol_t:
     xcomp
 
     acl
+
+    ENT_KB_ID
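
Same reshuffle as in attrs.pxd: ENT_KB_ID moves from the middle of symbol_t to the end, so every existing symbol keeps its integer value and only the new symbol gets a fresh one, which matters wherever code or data refers to symbols by integer ID. A hypothetical IntEnum illustration of why mid-enum insertion is the problem (not spaCy code):

    from enum import IntEnum

    # inserting mid-enum renumbers every member after the insertion point
    class Inserted(IntEnum):
        ENT_TYPE = 0
        ENT_KB_ID = 1  # inserted here: HEAD silently becomes 2
        HEAD = 2

    # appending keeps all pre-existing values stable
    class Appended(IntEnum):
        ENT_TYPE = 0
        HEAD = 1       # unchanged from before the new symbol existed
        ENT_KB_ID = 2  # the new symbol takes the next free value

    assert Inserted.HEAD != Appended.HEAD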

View File

@@ -43,3 +43,27 @@ def test_da_tokenizer_handles_custom_base_exc(da_tokenizer):
 def test_da_tokenizer_norm_exceptions(da_tokenizer, text, norm):
     tokens = da_tokenizer(text)
     assert tokens[0].norm_ == norm
+
+
+@pytest.mark.parametrize(
+    "text,n_tokens",
+    [
+        ("Godt og/eller skidt", 3),
+        ("Kør 4 km/t på vejen", 5),
+        ("Det blæser 12 m/s.", 5),
+        ("Det blæser 12 m/sek. på havnen", 6),
+        ("Windows 8/Windows 10", 5),
+        ("Billeten virker til bus/tog/metro", 8),
+        ("26/02/2019", 1),
+        ("Kristiansen c/o Madsen", 3),
+        ("Sprogteknologi a/s", 2),
+        ("De boede i A/B Bellevue", 5),
+        ("Rotorhastigheden er 3400 o/m.", 5),
+        ("Jeg købte billet t/r.", 5),
+        ("Murerarbejdsmand m/k søges", 3),
+        ("Netværket kører over TCP/IP", 4),
+    ],
+)
+def test_da_tokenizer_slash(da_tokenizer, text, n_tokens):
+    tokens = da_tokenizer(text)
+    assert len(tokens) == n_tokens

View File

@@ -0,0 +1,51 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+import spacy
+from spacy.util import minibatch, compounding
+
+
+def test_issue3611():
+    """Test whether adding n-grams in the textcat works even when n > token length of some docs."""
+    unique_classes = ["offensive", "inoffensive"]
+    x_train = [
+        "This is an offensive text",
+        "This is the second offensive text",
+        "inoff",
+    ]
+    y_train = ["offensive", "offensive", "inoffensive"]
+
+    # prepare the training data
+    pos_cats = []
+    for train_instance in y_train:
+        pos_cats.append({label: label == train_instance for label in unique_classes})
+    train_data = list(zip(x_train, [{"cats": cats} for cats in pos_cats]))
+
+    # set up the spaCy model with a text categorizer component
+    nlp = spacy.blank("en")
+    textcat = nlp.create_pipe(
+        "textcat",
+        config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
+    )
+    for label in unique_classes:
+        textcat.add_label(label)
+    nlp.add_pipe(textcat, last=True)
+
+    # train the network
+    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
+    with nlp.disable_pipes(*other_pipes):
+        optimizer = nlp.begin_training()
+        for i in range(3):
+            losses = {}
+            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
+            for batch in batches:
+                texts, annotations = zip(*batch)
+                nlp.update(
+                    docs=texts,
+                    golds=annotations,
+                    sgd=optimizer,
+                    drop=0.1,
+                    losses=losses,
+                )
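
This regression test covers issue #3611: with the bag-of-words ("bow") textcat architecture and ngram_size=2, a document shorter than the n-gram size (the single token "inoff" above) used to break training. A minimal sketch of the same setup in isolation, using the 2.1-era API as in the test:

    import spacy

    nlp = spacy.blank("en")
    textcat = nlp.create_pipe(
        "textcat",
        config={"exclusive_classes": True, "architecture": "bow", "ngram_size": 2},
    )
    textcat.add_label("offensive")
    textcat.add_label("inoffensive")
    nlp.add_pipe(textcat)
    nlp.begin_training()

    # a one-token doc is shorter than ngram_size=2; docs like this used to
    # trip up the bow model
    doc = nlp("inoff")
    print(doc.cats)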

View File

@@ -152,20 +152,21 @@ const Landing = ({ data }) => {
             <LandingBannerGrid>
                 <LandingBanner
                     title="spaCy IRL 2019: Two days of NLP"
-                    label="Join us in Berlin"
-                    to="https://irl.spacy.io/2019"
-                    button="Get tickets"
+                    label="Watch the videos"
+                    to="https://www.youtube.com/playlist?list=PLBmcuObd5An4UC6jvK_-eSl6jCvP1gwXc"
+                    button="Watch the videos"
                     background="#ffc194"
                     backgroundImage={irlBackground}
                     color="#1a1e23"
                     small
                 >
-                    We're pleased to invite the spaCy community and other folks working on Natural
+                    We were pleased to invite the spaCy community and other folks working on Natural
                     Language Processing to Berlin this summer for a small and intimate event{' '}
-                    <strong>July 5-6, 2019</strong>. The event includes a hands-on training day for
-                    teams using spaCy in production, followed by a one-track conference. We've
-                    booked a beautiful venue, hand-picked an awesome lineup of speakers and
-                    scheduled plenty of social time to get to know each other and exchange ideas.
+                    <strong>July 6, 2019</strong>. We booked a beautiful venue, hand-picked an
+                    awesome lineup of speakers and scheduled plenty of social time to get to know
+                    each other and exchange ideas. The YouTube playlist includes 12 talks about NLP
+                    research, development and applications, with keynotes by Sebastian Ruder
+                    (DeepMind) and Yoav Goldberg (Allen AI).
                 </LandingBanner>
                 <LandingBanner