Merge branch 'master' of github.com:pmbaumgartner/spaCy

pmbaumgartner · 2019-07-14 20:25:37 -04:00 · commit 040bb061fd
11 changed files with 104 additions and 16 deletions

pyproject.toml

@@ -5,6 +5,6 @@ requires = ["setuptools",
     "cymem>=2.0.2,<2.1.0",
     "preshed>=2.0.1,<2.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=7.0.6,<7.1.0",
+    "thinc>=7.0.8,<7.1.0",
 ]
 build-backend = "setuptools.build_meta"

requirements.txt

@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=2.0.1,<2.1.0
-thinc>=7.0.6,<7.1.0
+thinc>=7.0.8,<7.1.0
 blis>=0.2.2,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.2.0,<1.1.0

setup.py

@@ -228,7 +228,7 @@ def setup_package():
         "murmurhash>=0.28.0,<1.1.0",
         "cymem>=2.0.2,<2.1.0",
         "preshed>=2.0.1,<2.1.0",
-        "thinc>=7.0.6,<7.1.0",
+        "thinc>=7.0.8,<7.1.0",
         "blis>=0.2.2,<0.3.0",
         "plac<1.0.0,>=0.9.6",
         "requests>=2.13.0,<3.0.0",

spacy/about.py

@@ -4,13 +4,13 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "2.1.5.dev0"
+__version__ = "2.1.6"
 __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython"
 __uri__ = "https://spacy.io"
 __author__ = "Explosion AI"
 __email__ = "contact@explosion.ai"
 __license__ = "MIT"
-__release__ = False
+__release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
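
Bumping __version__ to 2.1.6 and flipping __release__ to True marks this as a tagged release rather than a .dev build. A small sketch of how that metadata surfaces at runtime, assuming spaCy 2.1.6 is installed:

    import spacy
    from spacy import about

    print(spacy.__version__)  # "2.1.6", re-exported from spacy.about
    print(about.__release__)  # True for tagged releases, False for .dev builds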

spacy/attrs.pxd

@@ -1,4 +1,6 @@
 # Reserve 64 values for flag features
+from . cimport symbols
+
 cdef enum attr_id_t:
     NULL_ATTR
     IS_ALPHA
@@ -82,10 +84,10 @@ cdef enum attr_id_t:
     DEP
     ENT_IOB
     ENT_TYPE
-    ENT_KB_ID
     HEAD
     SENT_START
     SPACY
     PROB
     LANG
+    ENT_KB_ID = symbols.ENT_KB_ID
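
ENT_KB_ID is removed from the middle of attr_id_t and re-added at the end, aliased to its ID in the symbols enum. Taking it out of the middle restores the 2.1.x integer values of every attribute that followed it (HEAD, SENT_START, SPACY, ...), which serialized models rely on. A hedged sketch of the invariant this buys, assuming spaCy 2.1.6:

    # Both modules should now agree on the integer ID of ENT_KB_ID.
    from spacy.attrs import ENT_KB_ID as attr_ent_kb_id
    from spacy.symbols import ENT_KB_ID as sym_ent_kb_id

    assert attr_ent_kb_id == sym_ent_kb_id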

spacy/lang/da/punctuation.py

@@ -14,10 +14,11 @@ _infixes = (
     + [
         r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
         r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
-        r'(?<=[{a}])[:<>=](?=[{a}])'.format(a=ALPHA),
+        r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
         r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
     ]
 )
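
The replaced line only normalizes quote style; the substantive change is the appended pattern, which also treats a slash as an infix when it sits between a letter or digit and a letter. A sketch of the effect against a blank Danish pipeline, assuming this release's Danish data:

    import spacy

    nlp = spacy.blank("da")
    # A slash between a digit and a letter now splits:
    print([t.text for t in nlp("Windows 8/Windows 10")])
    # ['Windows', '8', '/', 'Windows', '10']
    # Digits on both sides fail the lookahead, so dates stay whole:
    print([t.text for t in nlp("26/02/2019")])
    # ['26/02/2019']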

spacy/lang/da/tokenizer_exceptions.py

@@ -52,6 +52,7 @@ for exc_data in [
     {ORTH: "Ons.", LEMMA: "onsdag"},
     {ORTH: "Fre.", LEMMA: "fredag"},
     {ORTH: "Lør.", LEMMA: "lørdag"},
+    {ORTH: "og/eller", LEMMA: "og/eller", NORM: "og/eller", TAG: "CC"},
 ]:
     _exc[exc_data[ORTH]] = [exc_data]
@@ -64,6 +65,8 @@ for orth in [
     "mik.",
     "pers.",
     "A.D.",
+    "A/B",
+    "a/s",
     "A/S",
     "B.C.",
     "BK.",
@@ -79,7 +82,9 @@ for orth in [
     "Kprs.",
     "L.A.",
     "Ll.",
+    "m/k",
     "m/s",
+    "m/sek.",
     "M/S",
     "Mag.",
     "Mr.",
@@ -90,6 +95,7 @@ for orth in [
     "Sdr.",
     "Skt.",
     "Spl.",
+    "TCP/IP",
     "Vg.",
 ]:
     _exc[orth] = [{ORTH: orth}]
@@ -141,6 +147,7 @@ for orth in [
     "brolægn.",
     "bto.",
     "bygn.",
+    "c/o",
     "ca.",
     "cand.",
     "d.d.",
@@ -293,6 +300,7 @@ for orth in [
     "kgl.",
     "kl.",
     "kld.",
+    "km/t",
     "knsp.",
     "komm.",
     "kons.",

spacy/symbols.pxd

@@ -81,7 +81,6 @@ cdef enum symbol_t:
     DEP
     ENT_IOB
     ENT_TYPE
-    ENT_KB_ID
     HEAD
     SENT_START
     SPACY
@@ -461,3 +460,5 @@ cdef enum symbol_t:
     xcomp
     acl
+
+    ENT_KB_ID

spacy/tests/lang/da/test_exceptions.py

@@ -43,3 +43,27 @@ def test_da_tokenizer_handles_custom_base_exc(da_tokenizer):
 def test_da_tokenizer_norm_exceptions(da_tokenizer, text, norm):
     tokens = da_tokenizer(text)
     assert tokens[0].norm_ == norm
+
+
+@pytest.mark.parametrize(
+    "text,n_tokens",
+    [
+        ("Godt og/eller skidt", 3),
+        ("Kør 4 km/t på vejen", 5),
+        ("Det blæser 12 m/s.", 5),
+        ("Det blæser 12 m/sek. på havnen", 6),
+        ("Windows 8/Windows 10", 5),
+        ("Billeten virker til bus/tog/metro", 8),
+        ("26/02/2019", 1),
+        ("Kristiansen c/o Madsen", 3),
+        ("Sprogteknologi a/s", 2),
+        ("De boede i A/B Bellevue", 5),
+        ("Rotorhastigheden er 3400 o/m.", 5),
+        ("Jeg købte billet t/r.", 5),
+        ("Murerarbejdsmand m/k søges", 3),
+        ("Netværket kører over TCP/IP", 4),
+    ],
+)
+def test_da_tokenizer_slash(da_tokenizer, text, n_tokens):
+    tokens = da_tokenizer(text)
+    assert len(tokens) == n_tokens

spacy/tests/regression/test_issue3611.py

@@ -0,0 +1,51 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+import spacy
+from spacy.util import minibatch, compounding
+
+
+def test_issue3611():
+    """ Test whether adding n-grams in the textcat works even when n > token length of some docs """
+    unique_classes = ["offensive", "inoffensive"]
+    x_train = ["This is an offensive text",
+               "This is the second offensive text",
+               "inoff"]
+    y_train = ["offensive", "offensive", "inoffensive"]
+
+    # preparing the data
+    pos_cats = list()
+    for train_instance in y_train:
+        pos_cats.append({label: label == train_instance for label in unique_classes})
+    train_data = list(zip(x_train, [{'cats': cats} for cats in pos_cats]))
+
+    # set up the spacy model with a text categorizer component
+    nlp = spacy.blank('en')
+    textcat = nlp.create_pipe(
+        "textcat",
+        config={
+            "exclusive_classes": True,
+            "architecture": "bow",
+            "ngram_size": 2
+        }
+    )
+    for label in unique_classes:
+        textcat.add_label(label)
+    nlp.add_pipe(textcat, last=True)
+
+    # training the network
+    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
+    with nlp.disable_pipes(*other_pipes):
+        optimizer = nlp.begin_training()
+        for i in range(3):
+            losses = {}
+            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
+
+            for batch in batches:
+                texts, annotations = zip(*batch)
+                nlp.update(docs=texts, golds=annotations, sgd=optimizer, drop=0.1, losses=losses)
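
The test trains a bag-of-words text categorizer with ngram_size=2 on data where one text ("inoff") is a single token, shorter than the n-gram size, which used to crash. A hypothetical follow-up to the loop above, showing the trained pipeline scoring that short text:

    # Continuing from the test's `nlp` object (hypothetical usage):
    doc = nlp("inoff")
    print(doc.cats)  # e.g. {'offensive': ..., 'inoffensive': ...}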

website/src/pages/index.js

@@ -152,20 +152,21 @@ const Landing = ({ data }) => {
         <LandingBannerGrid>
             <LandingBanner
                 title="spaCy IRL 2019: Two days of NLP"
-                label="Join us in Berlin"
-                to="https://irl.spacy.io/2019"
-                button="Get tickets"
+                label="Watch the videos"
+                to="https://www.youtube.com/playlist?list=PLBmcuObd5An4UC6jvK_-eSl6jCvP1gwXc"
+                button="Watch the videos"
                 background="#ffc194"
                 backgroundImage={irlBackground}
                 color="#1a1e23"
                 small
             >
-                We're pleased to invite the spaCy community and other folks working on Natural
+                We were pleased to invite the spaCy community and other folks working on Natural
                 Language Processing to Berlin this summer for a small and intimate event{' '}
-                <strong>July 5-6, 2019</strong>. The event includes a hands-on training day for
-                teams using spaCy in production, followed by a one-track conference. We've
-                booked a beautiful venue, hand-picked an awesome lineup of speakers and
-                scheduled plenty of social time to get to know each other and exchange ideas.
+                <strong>July 6, 2019</strong>. We booked a beautiful venue, hand-picked an
+                awesome lineup of speakers and scheduled plenty of social time to get to know
+                each other and exchange ideas. The YouTube playlist includes 12 talks about NLP
+                research, development and applications, with keynotes by Sebastian Ruder
+                (DeepMind) and Yoav Goldberg (Allen AI).
             </LandingBanner>
             <LandingBanner