	Merge branch 'master' of github.com:pmbaumgartner/spaCy
commit 040bb061fd

pyproject.toml
@@ -5,6 +5,6 @@ requires = ["setuptools",
             "cymem>=2.0.2,<2.1.0",
             "preshed>=2.0.1,<2.1.0",
             "murmurhash>=0.28.0,<1.1.0",
-            "thinc>=7.0.6,<7.1.0",
+            "thinc>=7.0.8,<7.1.0",
             ]
 build-backend = "setuptools.build_meta"

requirements.txt
@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=2.0.1,<2.1.0
-thinc>=7.0.6,<7.1.0
+thinc>=7.0.8,<7.1.0
 blis>=0.2.2,<0.3.0
 murmurhash>=0.28.0,<1.1.0
 wasabi>=0.2.0,<1.1.0

setup.py
@@ -228,7 +228,7 @@ def setup_package():
                 "murmurhash>=0.28.0,<1.1.0",
                 "cymem>=2.0.2,<2.1.0",
                 "preshed>=2.0.1,<2.1.0",
-                "thinc>=7.0.6,<7.1.0",
+                "thinc>=7.0.8,<7.1.0",
                 "blis>=0.2.2,<0.3.0",
                 "plac<1.0.0,>=0.9.6",
                 "requests>=2.13.0,<3.0.0",
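
The thinc pin is bumped from 7.0.6 to 7.0.8 in all three places that declare it (pyproject.toml, requirements.txt and setup.py), keeping the build-time and install-time requirements in sync. A quick way to check that an environment satisfies the new pin is sketched below; the `packaging` dependency and the `thinc.about.__version__` attribute are assumptions of this sketch, not part of the commit:

    # Sketch: verify the installed thinc satisfies the new pin.
    from packaging.specifiers import SpecifierSet  # assumed available
    from thinc.about import __version__ as thinc_version  # assumed attribute

    assert thinc_version in SpecifierSet(">=7.0.8,<7.1.0"), thinc_version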

spacy/about.py
@@ -4,13 +4,13 @@
 # fmt: off

 __title__ = "spacy"
-__version__ = "2.1.5.dev0"
+__version__ = "2.1.6"
 __summary__ = "Industrial-strength Natural Language Processing (NLP) with Python and Cython"
 __uri__ = "https://spacy.io"
 __author__ = "Explosion AI"
 __email__ = "contact@explosion.ai"
 __license__ = "MIT"
-__release__ = False
+__release__ = True

 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
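
This flips the package metadata from the 2.1.5.dev0 prerelease to the final 2.1.6 release. A minimal check of what an installed copy reports (a sketch; run it against a build that includes this commit):

    from spacy import about

    print(about.__version__)  # expected: "2.1.6"
    print(about.__release__)  # expected: True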

spacy/attrs.pxd
@@ -1,4 +1,6 @@
 # Reserve 64 values for flag features
+from . cimport symbols
+
 cdef enum attr_id_t:
     NULL_ATTR
     IS_ALPHA
@@ -82,10 +84,10 @@ cdef enum attr_id_t:
     DEP
     ENT_IOB
     ENT_TYPE
-    ENT_KB_ID
     HEAD
     SENT_START
     SPACY
     PROB

     LANG
+    ENT_KB_ID = symbols.ENT_KB_ID

spacy/lang/da/punctuation.py
@@ -14,10 +14,11 @@ _infixes = (
     + [
         r"(?<=[{al}])\.(?=[{au}])".format(al=ALPHA_LOWER, au=ALPHA_UPPER),
         r"(?<=[{a}])[,!?](?=[{a}])".format(a=ALPHA),
-        r'(?<=[{a}])[:<>=](?=[{a}])'.format(a=ALPHA),
+        r"(?<=[{a}])[:<>=](?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA),
         r"(?<=[{a}])([{q}\)\]\(\[])(?=[{a}])".format(a=ALPHA, q=_quotes),
         r"(?<=[{a}])--(?=[{a}])".format(a=ALPHA),
+        r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA),
     ]
 )
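
The added infix pattern splits on a slash (or :, <, >, =) that is preceded by a letter or digit and followed by a letter; because the lookahead requires a letter, digit-only forms such as dates stay intact. A standalone illustration with re, using a simplified stand-in for spaCy's full ALPHA character class (an assumption of this sketch; the real class comes from spacy.lang.char_classes):

    import re

    ALPHA = "a-zA-ZæøåÆØÅ"  # simplified; spaCy's ALPHA covers far more
    infix = r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA)

    print(re.split(infix, "bus/tog/metro"))  # ['bus', 'tog', 'metro']
    print(re.split(infix, "26/02/2019"))     # ['26/02/2019'] (no letter after the slash)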

spacy/lang/da/tokenizer_exceptions.py
@@ -52,6 +52,7 @@ for exc_data in [
     {ORTH: "Ons.", LEMMA: "onsdag"},
     {ORTH: "Fre.", LEMMA: "fredag"},
     {ORTH: "Lør.", LEMMA: "lørdag"},
+    {ORTH: "og/eller", LEMMA: "og/eller", NORM: "og/eller", TAG: "CC"},
 ]:
     _exc[exc_data[ORTH]] = [exc_data]

@@ -64,6 +65,8 @@ for orth in [
     "mik.",
     "pers.",
     "A.D.",
+    "A/B",
+    "a/s",
     "A/S",
     "B.C.",
     "BK.",
@@ -79,7 +82,9 @@ for orth in [
     "Kprs.",
     "L.A.",
     "Ll.",
+    "m/k",
     "m/s",
+    "m/sek.",
     "M/S",
     "Mag.",
     "Mr.",
@@ -90,6 +95,7 @@ for orth in [
     "Sdr.",
     "Skt.",
     "Spl.",
+    "TCP/IP",
     "Vg.",
 ]:
     _exc[orth] = [{ORTH: orth}]
@@ -141,6 +147,7 @@ for orth in [
     "brolægn.",
     "bto.",
     "bygn.",
+    "c/o",
     "ca.",
     "cand.",
     "d.d.",
@@ -293,6 +300,7 @@ for orth in [
     "kgl.",
     "kl.",
     "kld.",
+    "km/t",
     "knsp.",
     "komm.",
     "kons.",
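
Tokenizer exceptions take precedence over infix splitting, so the fixed Danish forms added here ("og/eller", "m/s", "km/t", "c/o", "TCP/IP" and so on) survive as single tokens while free slash compounds still split. A quick check against a blank Danish pipeline (a sketch; it requires a spaCy build that includes these changes):

    import spacy

    nlp = spacy.blank("da")
    print([t.text for t in nlp("Det blæser 12 m/s.")])
    # expected: ['Det', 'blæser', '12', 'm/s', '.']
    print([t.text for t in nlp("Billeten virker til bus/tog/metro")])
    # expected: ['Billeten', 'virker', 'til', 'bus', '/', 'tog', '/', 'metro']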

spacy/symbols.pxd
@@ -81,7 +81,6 @@ cdef enum symbol_t:
     DEP
     ENT_IOB
     ENT_TYPE
-    ENT_KB_ID
     HEAD
     SENT_START
     SPACY
@@ -461,3 +460,5 @@ cdef enum symbol_t:
     xcomp

     acl
+
+    ENT_KB_ID
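
ENT_KB_ID moves from the middle of symbol_t to the end, and attr_id_t now aliases the symbol instead of declaring its own member. Enum members are numbered sequentially, so inserting a new entry mid-enum would shift the ID of every later member and silently change the meaning of already-serialized attribute values; appending keeps existing IDs stable. An illustration with Python's IntEnum (hypothetical values, for exposition only):

    from enum import IntEnum

    # Inserting mid-enum renumbers everything after the insertion point:
    class Inserted(IntEnum):
        ENT_TYPE = 0
        ENT_KB_ID = 1  # new member in the middle...
        HEAD = 2       # ...bumps HEAD from 1 to 2

    # Appending leaves previously assigned values untouched:
    class Appended(IntEnum):
        ENT_TYPE = 0
        HEAD = 1
        ENT_KB_ID = 2  # new member takes the next free ID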

spacy/tests/lang/da/test_exceptions.py
@@ -43,3 +43,27 @@ def test_da_tokenizer_handles_custom_base_exc(da_tokenizer):
 def test_da_tokenizer_norm_exceptions(da_tokenizer, text, norm):
     tokens = da_tokenizer(text)
     assert tokens[0].norm_ == norm
+
+
+@pytest.mark.parametrize(
+    "text,n_tokens",
+    [
+        ("Godt og/eller skidt", 3),
+        ("Kør 4 km/t på vejen", 5),
+        ("Det blæser 12 m/s.", 5),
+        ("Det blæser 12 m/sek. på havnen", 6),
+        ("Windows 8/Windows 10", 5),
+        ("Billeten virker til bus/tog/metro", 8),
+        ("26/02/2019", 1),
+        ("Kristiansen c/o Madsen", 3),
+        ("Sprogteknologi a/s", 2),
+        ("De boede i A/B Bellevue", 5),
+        ("Rotorhastigheden er 3400 o/m.", 5),
+        ("Jeg købte billet t/r.", 5),
+        ("Murerarbejdsmand m/k søges", 3),
+        ("Netværket kører over TCP/IP", 4),
+    ],
+)
+def test_da_tokenizer_slash(da_tokenizer, text, n_tokens):
+    tokens = da_tokenizer(text)
+    assert len(tokens) == n_tokens

spacy/tests/regression/test_issue3611.py (new file)
@@ -0,0 +1,51 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+import spacy
+from spacy.util import minibatch, compounding
+
+
+def test_issue3611():
+    """ Test whether adding n-grams in the textcat works even when n > token length of some docs """
+    unique_classes = ["offensive", "inoffensive"]
+    x_train = ["This is an offensive text",
+               "This is the second offensive text",
+               "inoff"]
+    y_train = ["offensive", "offensive", "inoffensive"]
+
+    # preparing the data
+    pos_cats = list()
+    for train_instance in y_train:
+        pos_cats.append({label: label == train_instance for label in unique_classes})
+    train_data = list(zip(x_train, [{'cats': cats} for cats in pos_cats]))
+
+    # set up the spacy model with a text categorizer component
+    nlp = spacy.blank('en')
+
+    textcat = nlp.create_pipe(
+        "textcat",
+        config={
+            "exclusive_classes": True,
+            "architecture": "bow",
+            "ngram_size": 2
+        }
+    )
+
+    for label in unique_classes:
+        textcat.add_label(label)
+    nlp.add_pipe(textcat, last=True)
+
+    # training the network
+    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
+    with nlp.disable_pipes(*other_pipes):
+        optimizer = nlp.begin_training()
+        for i in range(3):
+            losses = {}
+            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
+
+            for batch in batches:
+                texts, annotations = zip(*batch)
+                nlp.update(docs=texts, golds=annotations, sgd=optimizer, drop=0.1, losses=losses)
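
The new regression test covers issue #3611: building n-gram features for the "bow" textcat architecture failed when ngram_size was larger than the token length of a training doc (here the single-token text "inoff"). It can be run on its own with pytest:

    python -m pytest spacy/tests/regression/test_issue3611.py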

website/src/pages/index.js
@@ -152,20 +152,21 @@ const Landing = ({ data }) => {
             <LandingBannerGrid>
                 <LandingBanner
                     title="spaCy IRL 2019: Two days of NLP"
-                    label="Join us in Berlin"
-                    to="https://irl.spacy.io/2019"
-                    button="Get tickets"
+                    label="Watch the videos"
+                    to="https://www.youtube.com/playlist?list=PLBmcuObd5An4UC6jvK_-eSl6jCvP1gwXc"
+                    button="Watch the videos"
                     background="#ffc194"
                     backgroundImage={irlBackground}
                     color="#1a1e23"
                     small
                 >
-                    We're pleased to invite the spaCy community and other folks working on Natural
+                    We were pleased to invite the spaCy community and other folks working on Natural
                     Language Processing to Berlin this summer for a small and intimate event{' '}
-                    <strong>July 5-6, 2019</strong>. The event includes a hands-on training day for
-                    teams using spaCy in production, followed by a one-track conference. We've
-                    booked a beautiful venue, hand-picked an awesome lineup of speakers and
-                    scheduled plenty of social time to get to know each other and exchange ideas.
+                    <strong>July 6, 2019</strong>. We booked a beautiful venue, hand-picked an
+                    awesome lineup of speakers and scheduled plenty of social time to get to know
+                    each other and exchange ideas. The YouTube playlist includes 12 talks about NLP
+                    research, development and applications, with keynotes by Sebastian Ruder
+                    (DeepMind) and Yoav Goldberg (Allen AI).
                 </LandingBanner>

                 <LandingBanner