Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-12 02:06:31 +03:00)

Merge branch 'master' of github.com:pmbaumgartner/spaCy

This commit is contained in: commit 040bb061fd
pyproject.toml

@@ -5,6 +5,6 @@ requires = ["setuptools",
     "cymem>=2.0.2,<2.1.0",
     "preshed>=2.0.1,<2.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=7.0.6,<7.1.0",
+    "thinc>=7.0.8,<7.1.0",
 ]
 build-backend = "setuptools.build_meta"
requirements.txt

@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=2.0.1,<2.1.0
-thinc>=7.0.6,<7.1.0
+thinc>=7.0.8,<7.1.0
 blis>=0.2.2,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.2.0,<1.1.0
setup.py

@@ -228,7 +228,7 @@ def setup_package():
     "murmurhash>=0.28.0,<1.1.0",
     "cymem>=2.0.2,<2.1.0",
     "preshed>=2.0.1,<2.1.0",
-    "thinc>=7.0.6,<7.1.0",
+    "thinc>=7.0.8,<7.1.0",
     "blis>=0.2.2,<0.3.0",
     "plac<1.0.0,>=0.9.6",
     "requests>=2.13.0,<3.0.0",
spacy/about.py

@@ -4,13 +4,13 @@
 # fmt: off

 __title__ = "spacy"
-__version__ = "2.1.5.dev0"
+__version__ = "2.1.6"
 __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython"
 __uri__ = "https://spacy.io"
 __author__ = "Explosion AI"
 __email__ = "contact@explosion.ai"
 __license__ = "MIT"
-__release__ = False
+__release__ = True

 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
spacy/attrs.pxd

@@ -1,4 +1,6 @@
 # Reserve 64 values for flag features
+from . cimport symbols
+
 cdef enum attr_id_t:
     NULL_ATTR
     IS_ALPHA

@@ -82,10 +84,10 @@ cdef enum attr_id_t:
     DEP
     ENT_IOB
     ENT_TYPE
-    ENT_KB_ID
     HEAD
     SENT_START
     SPACY
     PROB

     LANG
+    ENT_KB_ID = symbols.ENT_KB_ID
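The net effect here (and in symbols.pxd below) appears to be keeping the existing attribute IDs stable: instead of sitting in the middle of attr_id_t, where it shifted the values of HEAD, SENT_START and everything after it, ENT_KB_ID is now pinned to the value already reserved for it in the symbols enum. A small pure-Python sketch of why pinning matters (illustrative only, hypothetical values, not spaCy code):

from enum import IntEnum

# Inserting a member mid-enum shifts every later value...
class AttrsOld(IntEnum):
    ENT_TYPE = 0
    HEAD = 1
    SENT_START = 2

class AttrsShifted(IntEnum):      # ENT_KB_ID squeezed into the middle
    ENT_TYPE = 0
    ENT_KB_ID = 1
    HEAD = 2                      # HEAD's numeric ID silently changed
    SENT_START = 3

# ...whereas pinning the new member to an externally defined constant
# (standing in for symbols.ENT_KB_ID; the 452 is made up) leaves the
# existing IDs untouched.
SYMBOLS_ENT_KB_ID = 452

class AttrsPinned(IntEnum):
    ENT_TYPE = 0
    HEAD = 1
    SENT_START = 2
    ENT_KB_ID = SYMBOLS_ENT_KB_ID

assert AttrsOld.HEAD == AttrsPinned.HEAD       # stable
assert AttrsOld.HEAD != AttrsShifted.HEAD      # shifted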
spacy/lang/da/punctuation.py

@@ -14,10 +14,11 @@ _infixes = (
     + [
         r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
         r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
-        r'(?<=[{a}])[:<>=](?=[{a}])'.format(a=ALPHA),
+        r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
         r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
     ]
 )
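The pattern added at the end of the list splits on :, <, >, = and / when the character sits between a letter or digit and a letter, which is what lets ad-hoc slashed compounds like "Windows 8/Windows 10" come apart while the explicit exceptions added below keep forms such as "km/t" together. A rough standalone sketch of what the new pattern matches, using a simplified stand-in for spaCy's ALPHA class:

import re

# Simplified stand-in for spaCy's ALPHA character class (the real one
# covers far more than these Danish/ASCII letters).
ALPHA = "A-Za-zÆØÅæøå"

# The infix added in this diff: : < > = / between a letter-or-digit and a letter.
infix = re.compile(r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA))

for text in ["Windows 8/Windows 10", "bus/tog/metro", "26/02/2019", "km/t"]:
    print(text, "->", [m.start() for m in infix.finditer(text)])
# "26/02/2019" yields no split points (digits on both sides of the slashes),
# while "km/t" would split here and is instead kept whole by the tokenizer
# exception added in the next file.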
spacy/lang/da/tokenizer_exceptions.py

@@ -52,6 +52,7 @@ for exc_data in [
     {ORTH: "Ons.", LEMMA: "onsdag"},
     {ORTH: "Fre.", LEMMA: "fredag"},
     {ORTH: "Lør.", LEMMA: "lørdag"},
+    {ORTH: "og/eller", LEMMA: "og/eller", NORM: "og/eller", TAG: "CC"},
 ]:
     _exc[exc_data[ORTH]] = [exc_data]

@@ -64,6 +65,8 @@ for orth in [
     "mik.",
     "pers.",
     "A.D.",
+    "A/B",
+    "a/s",
     "A/S",
     "B.C.",
     "BK.",

@@ -79,7 +82,9 @@ for orth in [
     "Kprs.",
     "L.A.",
     "Ll.",
+    "m/k",
     "m/s",
+    "m/sek.",
     "M/S",
     "Mag.",
     "Mr.",

@@ -90,6 +95,7 @@ for orth in [
     "Sdr.",
     "Skt.",
     "Spl.",
+    "TCP/IP",
     "Vg.",
 ]:
     _exc[orth] = [{ORTH: orth}]

@@ -141,6 +147,7 @@ for orth in [
     "brolægn.",
     "bto.",
     "bygn.",
+    "c/o",
     "ca.",
     "cand.",
     "d.d.",

@@ -293,6 +300,7 @@ for orth in [
     "kgl.",
     "kl.",
     "kld.",
+    "km/t",
     "knsp.",
     "komm.",
     "kons.",
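Taken together, the new infix rule and these exceptions mean that the listed slashed abbreviations survive as single tokens while other slashed compounds still split. A quick sanity check of the intended behaviour (a sketch; assumes a spaCy install that includes these changes, e.g. 2.1.6):

import spacy

# A blank Danish pipeline is enough: the tokenizer exceptions and
# punctuation rules live in the language data, no model download needed.
nlp = spacy.blank("da")

print([t.text for t in nlp("Kør 4 km/t på vejen")])
# expected: ['Kør', '4', 'km/t', 'på', 'vejen']  -- "km/t" kept whole by the exception

print([t.text for t in nlp("Billeten virker til bus/tog/metro")])
# expected: ['Billeten', 'virker', 'til', 'bus', '/', 'tog', '/', 'metro']
# -- no exception, so the new infix rule splits on the slashes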
spacy/symbols.pxd

@@ -81,7 +81,6 @@ cdef enum symbol_t:
     DEP
     ENT_IOB
     ENT_TYPE
-    ENT_KB_ID
     HEAD
     SENT_START
     SPACY

@@ -461,3 +460,5 @@ cdef enum symbol_t:
     xcomp

     acl
+
+    ENT_KB_ID
spacy/tests/lang/da/test_exceptions.py

@@ -43,3 +43,27 @@ def test_da_tokenizer_handles_custom_base_exc(da_tokenizer):
 def test_da_tokenizer_norm_exceptions(da_tokenizer, text, norm):
     tokens = da_tokenizer(text)
     assert tokens[0].norm_ == norm
+
+
+@pytest.mark.parametrize(
+    "text,n_tokens",
+    [
+        ("Godt og/eller skidt", 3),
+        ("Kør 4 km/t på vejen", 5),
+        ("Det blæser 12 m/s.", 5),
+        ("Det blæser 12 m/sek. på havnen", 6),
+        ("Windows 8/Windows 10", 5),
+        ("Billeten virker til bus/tog/metro", 8),
+        ("26/02/2019", 1),
+        ("Kristiansen c/o Madsen", 3),
+        ("Sprogteknologi a/s", 2),
+        ("De boede i A/B Bellevue", 5),
+        ("Rotorhastigheden er 3400 o/m.", 5),
+        ("Jeg købte billet t/r.", 5),
+        ("Murerarbejdsmand m/k søges", 3),
+        ("Netværket kører over TCP/IP", 4),
+    ],
+)
+def test_da_tokenizer_slash(da_tokenizer, text, n_tokens):
+    tokens = da_tokenizer(text)
+    assert len(tokens) == n_tokens
spacy/tests/regression/test_issue3611.py (new file, 51 lines)

@@ -0,0 +1,51 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+import spacy
+from spacy.util import minibatch, compounding
+
+
+def test_issue3611():
+    """ Test whether adding n-grams in the textcat works even when n > token length of some docs """
+    unique_classes = ["offensive", "inoffensive"]
+    x_train = ["This is an offensive text",
+               "This is the second offensive text",
+               "inoff"]
+    y_train = ["offensive", "offensive", "inoffensive"]
+
+    # preparing the data
+    pos_cats = list()
+    for train_instance in y_train:
+        pos_cats.append({label: label == train_instance for label in unique_classes})
+    train_data = list(zip(x_train, [{'cats': cats} for cats in pos_cats]))
+
+    # set up the spacy model with a text categorizer component
+    nlp = spacy.blank('en')
+
+    textcat = nlp.create_pipe(
+        "textcat",
+        config={
+            "exclusive_classes": True,
+            "architecture": "bow",
+            "ngram_size": 2
+        }
+    )
+
+    for label in unique_classes:
+        textcat.add_label(label)
+    nlp.add_pipe(textcat, last=True)
+
+    # training the network
+    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
+    with nlp.disable_pipes(*other_pipes):
+        optimizer = nlp.begin_training()
+        for i in range(3):
+            losses = {}
+            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
+
+            for batch in batches:
+                texts, annotations = zip(*batch)
+                nlp.update(docs=texts, golds=annotations, sgd=optimizer, drop=0.1, losses=losses)
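The regression test only asserts that training a bow textcat with ngram_size=2 does not crash on docs shorter than the n-gram size; if you want to look at what the toy model actually predicts afterwards, a short follow-up along these lines works (a sketch reusing the nlp object built in the test, spaCy 2.1 API):

# Sketch: inspect predictions after the training loop above
# (assumes `nlp` is the pipeline trained in test_issue3611).
doc = nlp("This is an offensive text")
print(doc.cats)
# e.g. {'offensive': 0.8, 'inoffensive': 0.2} -- the exact scores will vary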
Website landing page (Landing component)

@@ -152,20 +152,21 @@ const Landing = ({ data }) => {
         <LandingBannerGrid>
             <LandingBanner
                 title="spaCy IRL 2019: Two days of NLP"
-                label="Join us in Berlin"
-                to="https://irl.spacy.io/2019"
-                button="Get tickets"
+                label="Watch the videos"
+                to="https://www.youtube.com/playlist?list=PLBmcuObd5An4UC6jvK_-eSl6jCvP1gwXc"
+                button="Watch the videos"
                 background="#ffc194"
                 backgroundImage={irlBackground}
                 color="#1a1e23"
                 small
             >
-                We're pleased to invite the spaCy community and other folks working on Natural
+                We were pleased to invite the spaCy community and other folks working on Natural
                 Language Processing to Berlin this summer for a small and intimate event{' '}
-                <strong>July 5-6, 2019</strong>. The event includes a hands-on training day for
-                teams using spaCy in production, followed by a one-track conference. We've
-                booked a beautiful venue, hand-picked an awesome lineup of speakers and
-                scheduled plenty of social time to get to know each other and exchange ideas.
+                <strong>July 6, 2019</strong>. We booked a beautiful venue, hand-picked an
+                awesome lineup of speakers and scheduled plenty of social time to get to know
+                each other and exchange ideas. The YouTube playlist includes 12 talks about NLP
+                research, development and applications, with keynotes by Sebastian Ruder
+                (DeepMind) and Yoav Goldberg (Allen AI).
             </LandingBanner>

             <LandingBanner