mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	test_gold_biluo_different_tokenization works
This commit is contained in:
		
							parent
							
								
									1c35b8efcd
								
							
						
					
					
						commit
						44a0f9c2c8
					
				| 
						 | 
				
			
			@ -85,17 +85,17 @@ cdef class Example:
 | 
			
		|||
 | 
			
		||||
        vocab = self.reference.vocab
 | 
			
		||||
        gold_values = self.reference.to_array([field])
 | 
			
		||||
        output = []
 | 
			
		||||
        output = [None] * len(self.predicted)
 | 
			
		||||
        for i, gold_i in enumerate(cand_to_gold):
 | 
			
		||||
            if self.predicted[i].text.isspace():
 | 
			
		||||
                output.append(None)
 | 
			
		||||
            elif gold_i is None:
 | 
			
		||||
                output[i] = None
 | 
			
		||||
            if gold_i is None:
 | 
			
		||||
                if i in i2j_multi:
 | 
			
		||||
                    output.append(gold_values[i2j_multi[i]])
 | 
			
		||||
                    output[i] = gold_values[i2j_multi[i]]
 | 
			
		||||
                else:
 | 
			
		||||
                    output.append(None)
 | 
			
		||||
                    output[i] = None
 | 
			
		||||
            else:
 | 
			
		||||
                output.append(gold_values[gold_i])
 | 
			
		||||
                output[i] = gold_values[gold_i]
 | 
			
		||||
 | 
			
		||||
        if field in ["ENT_IOB"]:
 | 
			
		||||
            # Fix many-to-one IOB codes
 | 
			
		||||
| 
						 | 
				
			
			@ -117,6 +117,7 @@ cdef class Example:
 | 
			
		|||
                if cand_j is None:
 | 
			
		||||
                    if j in j2i_multi:
 | 
			
		||||
                        i = j2i_multi[j]
 | 
			
		||||
                        if output[i] is None:
 | 
			
		||||
                            output[i] = gold_values[j]
 | 
			
		||||
 | 
			
		||||
        if as_string:
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -158,7 +158,7 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
 | 
			
		|||
    entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
 | 
			
		||||
    gold_words = ["I", "flew", "to", "San", "Francisco", "Valley", "."]
 | 
			
		||||
    example = Example.from_dict(doc, {"words": gold_words, "entities": entities})
 | 
			
		||||
    assert example.get_aligned("ENT_IOB") == [2, 2, 1, 2]
 | 
			
		||||
    assert example.get_aligned("ENT_IOB") == [2, 2, 3, 2]
 | 
			
		||||
    assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "LOC", ""]
 | 
			
		||||
 | 
			
		||||
    # many-to-one
 | 
			
		||||
| 
						 | 
				
			
			@ -195,25 +195,21 @@ def test_gold_biluo_different_tokenization(en_vocab, en_tokenizer):
 | 
			
		|||
    assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "", "", "LOC", "LOC", ""]
 | 
			
		||||
 | 
			
		||||
    # from issue #4791
 | 
			
		||||
    data = (
 | 
			
		||||
        "I'll return the ₹54 amount",
 | 
			
		||||
        {
 | 
			
		||||
            "words": ["I", "'ll", "return", "the", "₹", "54", "amount"],
 | 
			
		||||
            "entities": [(16, 19, "MONEY")],
 | 
			
		||||
        },
 | 
			
		||||
    )
 | 
			
		||||
    gp = GoldParse(en_tokenizer(data[0]), **data[1])
 | 
			
		||||
    assert gp.ner == ["O", "O", "O", "O", "U-MONEY", "O"]
 | 
			
		||||
    doc = en_tokenizer("I'll return the ₹54 amount")
 | 
			
		||||
    gold_words = ["I", "'ll", "return", "the", "₹", "54", "amount"]
 | 
			
		||||
    gold_spaces = [False, True, True, True, False, True, False]
 | 
			
		||||
    entities = [(16, 19, "MONEY")]
 | 
			
		||||
    example = Example.from_dict(doc, {"words": gold_words, "spaces": gold_spaces, "entities": entities})
 | 
			
		||||
    assert example.get_aligned("ENT_IOB") == [2, 2, 2, 2, 3, 2]
 | 
			
		||||
    assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "", "", "MONEY", ""]
 | 
			
		||||
 | 
			
		||||
    data = (
 | 
			
		||||
        "I'll return the $54 amount",
 | 
			
		||||
        {
 | 
			
		||||
            "words": ["I", "'ll", "return", "the", "$", "54", "amount"],
 | 
			
		||||
            "entities": [(16, 19, "MONEY")],
 | 
			
		||||
        },
 | 
			
		||||
    )
 | 
			
		||||
    gp = GoldParse(en_tokenizer(data[0]), **data[1])
 | 
			
		||||
    assert gp.ner == ["O", "O", "O", "O", "B-MONEY", "L-MONEY", "O"]
 | 
			
		||||
    doc = en_tokenizer("I'll return the $54 amount")
 | 
			
		||||
    gold_words = ["I", "'ll", "return", "the", "$", "54", "amount"]
 | 
			
		||||
    gold_spaces = [False, True, True, True, False, True, False]
 | 
			
		||||
    entities = [(16, 19, "MONEY")]
 | 
			
		||||
    example = Example.from_dict(doc, {"words": gold_words, "spaces": gold_spaces, "entities": entities})
 | 
			
		||||
    assert example.get_aligned("ENT_IOB") == [2, 2, 2, 2, 3, 1, 2]
 | 
			
		||||
    assert example.get_aligned("ENT_TYPE", as_string=True) == ["", "", "", "", "MONEY", "MONEY", ""]
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_roundtrip_offsets_biluo_conversion(en_tokenizer):
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue
	
	Block a user