mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	Cast to uint64 for all array-based doc representations (#11933)
* Convert all individual values explicitly to uint64 for array-based doc representations
* Temporarily test with latest numpy v1.24.0rc
* Remove unnecessary conversion from attr_t
* Reduce number of individual casts
* Convert specifically from int32 to uint64
* Revert "Temporarily test with latest numpy v1.24.0rc" (this reverts commit eb0e3c5006)
* Also use int32 in tests
			
			
This commit is contained in:
		
							parent
							
								
									3ac7230abd
								
							
						
					
					
						commit
						8cfc4c7325
					
				| 
						 | 
				
			
			@ -123,14 +123,14 @@ def test_doc_from_array_heads_in_bounds(en_vocab):
 | 
			
		|||
 | 
			
		||||
    # head before start
 | 
			
		||||
    arr = doc.to_array(["HEAD"])
 | 
			
		||||
    arr[0] = -1
 | 
			
		||||
    arr[0] = numpy.int32(-1).astype(numpy.uint64)
 | 
			
		||||
    doc_from_array = Doc(en_vocab, words=words)
 | 
			
		||||
    with pytest.raises(ValueError):
 | 
			
		||||
        doc_from_array.from_array(["HEAD"], arr)
 | 
			
		||||
 | 
			
		||||
    # head after end
 | 
			
		||||
    arr = doc.to_array(["HEAD"])
 | 
			
		||||
    arr[0] = 5
 | 
			
		||||
    arr[0] = numpy.int32(5).astype(numpy.uint64)
 | 
			
		||||
    doc_from_array = Doc(en_vocab, words=words)
 | 
			
		||||
    with pytest.raises(ValueError):
 | 
			
		||||
        doc_from_array.from_array(["HEAD"], arr)
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -356,6 +356,7 @@ cdef class Doc:
 | 
			
		|||
            for annot in annotations:
 | 
			
		||||
                if annot:
 | 
			
		||||
                    if annot is heads or annot is sent_starts or annot is ent_iobs:
 | 
			
		||||
                        annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
 | 
			
		||||
                        for i in range(len(words)):
 | 
			
		||||
                            if attrs.ndim == 1:
 | 
			
		||||
                                attrs[i] = annot[i]
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -305,7 +305,7 @@ cdef class Span:
 | 
			
		|||
                    for ancestor in ancestors:
 | 
			
		||||
                        ancestor_i = ancestor.i - self.c.start
 | 
			
		||||
                        if ancestor_i in range(length):
 | 
			
		||||
                            array[i, head_col] = ancestor_i - i
 | 
			
		||||
                            array[i, head_col] = numpy.int32(ancestor_i - i).astype(numpy.uint64)
 | 
			
		||||
 | 
			
		||||
                # if there is no appropriate ancestor, define a new artificial root
 | 
			
		||||
                value = array[i, head_col]
 | 
			
		||||
| 
						 | 
				
			
			@ -313,7 +313,7 @@ cdef class Span:
 | 
			
		|||
                    new_root = old_to_new_root.get(ancestor_i, None)
 | 
			
		||||
                    if new_root is not None:
 | 
			
		||||
                        # take the same artificial root as a previous token from the same sentence
 | 
			
		||||
                        array[i, head_col] = new_root - i
 | 
			
		||||
                        array[i, head_col] = numpy.int32(new_root - i).astype(numpy.uint64)
 | 
			
		||||
                    else:
 | 
			
		||||
                        # set this token as the new artificial root
 | 
			
		||||
                        array[i, head_col] = 0
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -333,26 +333,27 @@ def _annot2array(vocab, tok_annot, doc_annot):
 | 
			
		|||
        if key not in IDS:
 | 
			
		||||
            raise ValueError(Errors.E974.format(obj="token", key=key))
 | 
			
		||||
        elif key in ["ORTH", "SPACY"]:
 | 
			
		||||
            pass
 | 
			
		||||
            continue
 | 
			
		||||
        elif key == "HEAD":
 | 
			
		||||
            attrs.append(key)
 | 
			
		||||
            values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
 | 
			
		||||
            row = [h-i if h is not None else 0 for i, h in enumerate(value)]
 | 
			
		||||
        elif key == "DEP":
 | 
			
		||||
            attrs.append(key)
 | 
			
		||||
            values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
 | 
			
		||||
            row = [vocab.strings.add(h) if h is not None else MISSING_DEP for h in value]
 | 
			
		||||
        elif key == "SENT_START":
 | 
			
		||||
            attrs.append(key)
 | 
			
		||||
            values.append([to_ternary_int(v) for v in value])
 | 
			
		||||
            row = [to_ternary_int(v) for v in value]
 | 
			
		||||
        elif key == "MORPH":
 | 
			
		||||
            attrs.append(key)
 | 
			
		||||
            values.append([vocab.morphology.add(v) for v in value])
 | 
			
		||||
            row = [vocab.morphology.add(v) for v in value]
 | 
			
		||||
        else:
 | 
			
		||||
            attrs.append(key)
 | 
			
		||||
            if not all(isinstance(v, str) for v in value):
 | 
			
		||||
                types = set([type(v) for v in value])
 | 
			
		||||
                raise TypeError(Errors.E969.format(field=key, types=types)) from None
 | 
			
		||||
            values.append([vocab.strings.add(v) for v in value])
 | 
			
		||||
    array = numpy.asarray(values, dtype="uint64")
 | 
			
		||||
            row = [vocab.strings.add(v) for v in value]
 | 
			
		||||
        values.append([numpy.array(v, dtype=numpy.int32).astype(numpy.uint64) if v < 0 else v for v in row])
 | 
			
		||||
    array = numpy.array(values, dtype=numpy.uint64)
 | 
			
		||||
    return attrs, array.T
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue
	
	Block a user