diff --git "a/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch API\346\224\257\346\214\201\346\270\205\345\215\225 01/FrameworkPTAdapter 2.0.2 PyTorch API\346\224\257\346\214\201\346\270\205\345\215\225 01.md" "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch API\346\224\257\346\214\201\346\270\205\345\215\225 01/FrameworkPTAdapter 2.0.2 PyTorch API\346\224\257\346\214\201\346\270\205\345\215\225 01.md" new file mode 100644 index 0000000000000000000000000000000000000000..9c0580cd6872806e3c4488490076e94eac30d543 --- /dev/null +++ "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch API\346\224\257\346\214\201\346\270\205\345\215\225 01/FrameworkPTAdapter 2.0.2 PyTorch API\346\224\257\346\214\201\346\270\205\345\215\225 01.md" @@ -0,0 +1,8229 @@ +# PyTorch API支持清单 +- [Tensors](#Tensors.md) +- [Generators](#Generators.md) +- [Random sampling](#Random-sampling.md) +- [Serialization](#Serialization.md) +- [Math operations](#Math-operations.md) +- [Utilities](#Utilities.md) +- [Other](#Other.md) +- [torch.Tensor](#torch-Tensor.md) +- [Layers \(torch.nn\)](#Layers-(torch-nn).md) +- [Functions\(torch.nn.functional\)](#Functions(torch-nn-functional).md) +- [torch.distributed](#torch-distributed.md) +- [NPU和CUDA功能对齐](#NPU和CUDA功能对齐.md) +

Tensors

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

序号

+

API名称

+

是否支持

+

1

+

torch.is_tensor

+

+

2

+

torch.is_storage

+

+

3

+

torch.is_complex

+

+

4

+

torch.is_floating_point

+

+

5

+

torch.set_default_dtype

+

+

6

+

torch.get_default_dtype

+

+

7

+

torch.set_default_tensor_type

+

+

8

+

torch.numel

+

+

9

+

torch.set_printoptions

+

+

10

+

torch.set_flush_denormal

+

+

11

+

torch.tensor

+

+

12

+

torch.sparse_coo_tensor

+

+

13

+

torch.as_tensor

+

+

14

+

torch.as_strided

+

+

15

+

torch.from_numpy

+

+

16

+

torch.zeros

+

+

17

+

torch.zeros_like

+

+

18

+

torch.ones

+

+

19

+

torch.ones_like

+

+

20

+

torch.arange

+

+

21

+

torch.range

+

+

22

+

torch.linspace

+

+

23

+

torch.logspace

+

+

24

+

torch.eye

+

+

25

+

torch.empty

+

+

26

+

torch.empty_like

+

+

27

+

torch.empty_strided

+

+

28

+

torch.full

+

+

29

+

torch.full_like

+

+

30

+

torch.quantize_per_tensor

+

+

31

+

torch.quantize_per_channel

+

+

32

+

torch.cat

+

+

33

+

torch.chunk

+

+

34

+

torch.gather

+

+

35

+

torch.index_select

+

+

36

+

torch.masked_select

+

+

37

+

torch.narrow

+

+

38

+

torch.nonzero

+

+

39

+

torch.reshape

+

+

40

+

torch.split

+

+

41

+

torch.squeeze

+

+

42

+

torch.stack

+

+

43

+

torch.t

+

+

44

+

torch.take

+

+

45

+

torch.transpose

+

+

46

+

torch.unbind

+

+

47

+

torch.unsqueeze

+

+

48

+

torch.where

+

+
+ +

+<h2 id="Generators.md">Generators</h2>
+
+| No. | API Name | Supported |
+| --- | -------- | --------- |
+| 1 | torch._C.Generator | |
+| 2 | torch._C.Generator.device | |
+| 3 | torch._C.Generator.get_state | |
+| 4 | torch._C.Generator.initial_seed | |
+| 5 | torch._C.Generator.manual_seed | |
+| 6 | torch._C.Generator.seed | |
+| 7 | torch._C.Generator.set_state | |
+

+<h2 id="Random-sampling.md">Random sampling</h2>
+
+| No. | API Name | Supported |
+| --- | -------- | --------- |
+| 1 | torch.seed | |
+| 2 | torch.manual_seed | |
+| 3 | torch.initial_seed | |
+| 4 | torch.get_rng_state | |
+| 5 | torch.set_rng_state | |
+| 6 | torch.torch.default_generator | |
+| 7 | torch.bernoulli | |
+| 8 | torch.multinomial | |
+| 9 | torch.normal | |
+| 10 | torch.poisson | |
+| 11 | torch.rand | |
+| 12 | torch.rand_like | |
+| 13 | torch.randint | |
+| 14 | torch.randint_like | |
+| 15 | torch.randn | |
+| 16 | torch.randn_like | |
+| 17 | torch.randperm | |
+| 18 | torch.Tensor.bernoulli_() | |
+| 19 | torch.Tensor.bernoulli_() | |
+| 20 | torch.Tensor.exponential_() | |
+| 21 | torch.Tensor.geometric_() | |
+| 22 | torch.Tensor.log_normal_() | |
+| 23 | torch.Tensor.normal_() | |
+| 24 | torch.Tensor.random_() | |
+| 25 | torch.Tensor.uniform_() | |
+| 26 | torch.quasirandom.SobolEngine | |
+| 27 | torch.quasirandom.SobolEngine.draw | |
+| 28 | torch.quasirandom.SobolEngine.fast_forward | |
+| 29 | torch.quasirandom.SobolEngine.reset | |
+

+<h2 id="Serialization.md">Serialization</h2>
+
+| No. | API Name | Supported |
+| --- | -------- | --------- |
+| 1 | torch.save | |
+| 2 | torch.load | |
+

+<h2 id="Math-operations.md">Math operations</h2>
+
+| No. | API Name | Supported |
+| --- | -------- | --------- |
+| 1 | torch.abs | |
+| 2 | torch.acos | |
+| 3 | torch.add | |
+| 4 | torch.addcdiv | |
+| 5 | torch.addcmul | |
+| 6 | torch.angle | |
+| 7 | torch.asin | |
+| 8 | torch.atan | |
+| 9 | torch.atan2 | |
+| 10 | torch.bitwise_not | |
+| 11 | torch.bitwise_and | |
+| 12 | torch.bitwise_or | |
+| 13 | torch.bitwise_xor | |
+| 14 | torch.ceil | |
+| 15 | torch.clamp | |
+| 16 | torch.conj | |
+| 17 | torch.cos | |
+| 18 | torch.cosh | |
+| 19 | torch.div | |
+| 20 | torch.digamma | |
+| 21 | torch.erf | |
+| 22 | torch.erfc | |
+| 23 | torch.erfinv | |
+| 24 | torch.exp | |
+| 25 | torch.expm1 | |
+| 26 | torch.floor | |
+| 27 | torch.floor_divide | |
+| 28 | torch.fmod | |
+| 29 | torch.frac | |
+| 30 | torch.imag | |
+| 31 | torch.lerp | |
+| 32 | torch.lgamma | |
+| 33 | torch.log | |
+| 34 | torch.log10 | |
+| 35 | torch.log1p | |
+| 36 | torch.log2 | |
+| 37 | torch.logical_and | |
+| 38 | torch.logical_not | |
+| 39 | torch.logical_or | |
+| 40 | torch.logical_xor | |
+| 41 | torch.mul | |
+| 42 | torch.mvlgamma | |
+| 43 | torch.neg | |
+| 44 | torch.polygamma | |
+| 45 | torch.pow | |
+| 46 | torch.real | |
+| 47 | torch.reciprocal | |
+| 48 | torch.remainder | |
+| 49 | torch.round | |
+| 50 | torch.rsqrt | |
+| 51 | torch.sigmoid | |
+| 52 | torch.sign | |
+| 53 | torch.sin | |
+| 54 | torch.sinh | |
+| 55 | torch.sqrt | |
+| 56 | torch.square | |
+| 57 | torch.tan | |
+| 58 | torch.tanh | |
+| 59 | torch.true_divide | |
+| 60 | torch.trunc | |
+| 61 | torch.argmax | |
+| 62 | torch.argmin | |
+| 63 | torch.dist | |
+| 64 | torch.logsumexp | |
+| 65 | torch.mean | |
+| 66 | torch.median | |
+| 67 | torch.mode | |
+| 68 | torch.norm | |
+| 69 | torch.prod | |
+| 70 | torch.std | |
+| 71 | torch.std_mean | |
+| 72 | torch.sum | |
+| 73 | torch.unique | |
+| 74 | torch.unique_consecutive | |
+| 75 | torch.var | |
+| 76 | torch.var_mean | |
+| 77 | torch.allclose | |
+| 78 | torch.argsort | |
+| 79 | torch.eq | |
+| 80 | torch.equal | |
+| 81 | torch.ge | |
+| 82 | torch.gt | |
+| 83 | torch.isfinite | |
+| 84 | torch.isinf | |
+| 85 | torch.isnan | |
+| 86 | torch.kthvalue | |
+| 87 | torch.le | |
+| 88 | torch.lt | |
+| 89 | torch.max | |
+| 90 | torch.min | |
+| 91 | torch.ne | |
+| 92 | torch.sort | |
+| 93 | torch.topk | |
+| 94 | torch.fft | |
+| 95 | torch.ifft | |
+| 96 | torch.rfft | |
+| 97 | torch.irfft | |
+| 98 | torch.stft | |
+| 99 | torch.bartlett_window | |
+| 100 | torch.blackman_window | |
+| 101 | torch.hamming_window | |
+| 102 | torch.hann_window | |
+| 103 | torch.bincount | |
+| 104 | torch.broadcast_tensors | |
+| 105 | torch.cartesian_prod | |
+| 106 | torch.cdist | |
+| 107 | torch.combinations | |
+| 108 | torch.cross | |
+| 109 | torch.cummax | |
+| 110 | torch.cummin | |
+| 111 | torch.cumprod | |
+| 112 | torch.cumsum | |
+| 113 | torch.diag | |
+| 114 | torch.diag_embed | |
+| 115 | torch.diagflat | |
+| 116 | torch.diagonal | |
+| 117 | torch.einsum | |
+| 118 | torch.flatten | |
+| 119 | torch.flip | |
+| 120 | torch.rot90 | |
+| 121 | torch.histc | |
+| 122 | torch.meshgrid | |
+| 123 | torch.renorm | |
+| 124 | torch.repeat_interleave | |
+| 125 | torch.roll | |
+| 126 | torch.tensordot | |
+| 127 | torch.trace | |
+| 128 | torch.tril | |
+| 129 | torch.tril_indices | |
+| 130 | torch.triu | |
+| 131 | torch.triu_indices | |
+| 132 | torch.addbmm | |
+| 133 | torch.addmm | |
+| 134 | torch.addmv | |
+| 135 | torch.addr | |
+| 136 | torch.baddbmm | |
+| 137 | torch.bmm | |
+| 138 | torch.chain_matmul | |
+| 139 | torch.cholesky | |
+| 140 | torch.cholesky_inverse | |
+| 141 | torch.cholesky_solve | |
+| 142 | torch.dot | |
+| 143 | torch.eig | |
+| 144 | torch.geqrf | |
+| 145 | torch.ger | |
+| 146 | torch.inverse | |
+| 147 | torch.det | |
+| 148 | torch.logdet | |
+| 149 | torch.slogdet | |
+| 150 | torch.lstsq | |
+| 151 | torch.lu | |
+| 152 | torch.lu_solve | |
+| 153 | torch.lu_unpack | |
+| 154 | torch.matmul | |
+| 155 | torch.matrix_power | |
+| 156 | torch.matrix_rank | |
+| 157 | torch.mm | |
+| 158 | torch.mv | |
+| 159 | torch.orgqr | |
+| 160 | torch.ormqr | |
+| 161 | torch.pinverse | |
+| 162 | torch.qr | |
+| 163 | torch.solve | |
+| 164 | torch.svd | |
+| 165 | torch.svd_lowrank | |
+| 166 | torch.pca_lowrank | |
+| 167 | torch.symeig | |
+| 168 | torch.lobpcg | |
+| 169 | torch.trapz | |
+| 170 | torch.triangular_solve | |
+

+<h2 id="Utilities.md">Utilities</h2>
+
+| No. | API Name | Supported |
+| --- | -------- | --------- |
+| 1 | torch.compiled_with_cxx11_abi | |
+| 2 | torch.result_type | |
+| 3 | torch.can_cast | |
+| 4 | torch.promote_types | |
+

+<h2 id="Other.md">Other</h2>
+
+| No. | API Name | Supported |
+| --- | -------- | --------- |
+| 1 | torch.no_grad | |
+| 2 | torch.enable_grad | |
+| 3 | torch.set_grad_enabled | |
+| 4 | torch.get_num_threads | |
+| 5 | torch.set_num_threads | |
+| 6 | torch.get_num_interop_threads | |
+| 7 | torch.set_num_interop_threads | |
+

+<h2 id="torch-Tensor.md">torch.Tensor</h2>
+
+| No. | API Name | Supported |
+| --- | -------- | --------- |
+| 1 | torch.Tensor | |
+| 2 | torch.Tensor.new_tensor | |
+| 3 | torch.Tensor.new_full | |
+| 4 | torch.Tensor.new_empty | |
+| 5 | torch.Tensor.new_ones | |
+| 6 | torch.Tensor.new_zeros | |
+| 7 | torch.Tensor.is_cuda | |
+| 8 | torch.Tensor.is_quantized | |
+| 9 | torch.Tensor.device | |
+| 10 | torch.Tensor.ndim | |
+| 11 | torch.Tensor.T | |
+| 12 | torch.Tensor.abs | |
+| 13 | torch.Tensor.abs_ | |
+| 14 | torch.Tensor.acos | |
+| 15 | torch.Tensor.acos_ | |
+| 16 | torch.Tensor.add | |
+| 17 | torch.Tensor.add_ | |
+| 18 | torch.Tensor.addbmm | |
+| 19 | torch.Tensor.addbmm_ | |
+| 20 | torch.Tensor.addcdiv | |
+| 21 | torch.Tensor.addcdiv_ | |
+| 22 | torch.Tensor.addcmul | |
+| 23 | torch.Tensor.addcmul_ | |
+| 24 | torch.Tensor.addmm | |
+| 25 | torch.Tensor.addmm_ | |
+| 26 | torch.Tensor.addmv | |
+| 27 | torch.Tensor.addmv_ | |
+| 28 | torch.Tensor.addr | |
+| 29 | torch.Tensor.addr_ | |
+| 30 | torch.Tensor.allclose | |
+| 31 | torch.Tensor.angle | |
+| 32 | torch.Tensor.apply_ | |
+| 33 | torch.Tensor.argmax | |
+| 34 | torch.Tensor.argmin | |
+| 35 | torch.Tensor.argsort | |
+| 36 | torch.Tensor.asin | |
+| 37 | torch.Tensor.asin_ | |
+| 38 | torch.Tensor.as_strided | |
+| 39 | torch.Tensor.atan | |
+| 40 | torch.Tensor.atan2 | |
+| 41 | torch.Tensor.atan2_ | |
+| 42 | torch.Tensor.atan_ | |
+| 43 | torch.Tensor.baddbmm | |
+| 44 | torch.Tensor.baddbmm_ | |
+| 45 | torch.Tensor.bernoulli | |
+| 46 | torch.Tensor.bernoulli_ | |
+| 47 | torch.Tensor.bfloat16 | |
+| 48 | torch.Tensor.bincount | |
+| 49 | torch.Tensor.bitwise_not | |
+| 50 | torch.Tensor.bitwise_not_ | |
+| 51 | torch.Tensor.bitwise_and | |
+| 52 | torch.Tensor.bitwise_and_ | |
+| 53 | torch.Tensor.bitwise_or | |
+| 54 | torch.Tensor.bitwise_or_ | |
+| 55 | torch.Tensor.bitwise_xor | |
+| 56 | torch.Tensor.bitwise_xor_ | |
+| 57 | torch.Tensor.bmm | |
+| 58 | torch.Tensor.bool | |
+| 59 | torch.Tensor.byte | |
+| 60 | torch.Tensor.cauchy_ | |
+| 61 | torch.Tensor.ceil | |
+| 62 | torch.Tensor.ceil_ | |
+| 63 | torch.Tensor.char | |
+| 64 | torch.Tensor.cholesky | |
+| 65 | torch.Tensor.cholesky_inverse | |
+| 66 | torch.Tensor.cholesky_solve | |
+| 67 | torch.Tensor.chunk | |
+| 68 | torch.Tensor.clamp | |
+| 69 | torch.Tensor.clamp_ | |
+| 70 | torch.Tensor.clone | |
+| 71 | torch.Tensor.contiguous | |
+| 72 | torch.Tensor.copy_ | |
+| 73 | torch.Tensor.conj | |
+| 74 | torch.Tensor.cos | |
+| 75 | torch.Tensor.cos_ | |
+| 76 | torch.Tensor.cosh | |
+| 77 | torch.Tensor.cosh_ | |
+| 78 | torch.Tensor.cpu | |
+| 79 | torch.Tensor.cross | |
+| 80 | torch.Tensor.cuda | |
+| 81 | torch.Tensor.cummax | |
+| 82 | torch.Tensor.cummin | |
+| 83 | torch.Tensor.cumprod | |
+| 84 | torch.Tensor.cumsum | |
+| 85 | torch.Tensor.data_ptr | |
+| 86 | torch.Tensor.dequantize | |
+| 87 | torch.Tensor.det | |
+| 88 | torch.Tensor.dense_dim | |
+| 89 | torch.Tensor.diag | |
+| 90 | torch.Tensor.diag_embed | |
+| 91 | torch.Tensor.diagflat | |
+| 92 | torch.Tensor.diagonal | |
+| 93 | torch.Tensor.fill_diagonal_ | |
+| 94 | torch.Tensor.digamma | |
+| 95 | torch.Tensor.digamma_ | |
+| 96 | torch.Tensor.dim | |
+| 97 | torch.Tensor.dist | |
+| 98 | torch.Tensor.div | |
+| 99 | torch.Tensor.div_ | |
+| 100 | torch.Tensor.dot | |
+| 101 | torch.Tensor.double | |
+| 102 | torch.Tensor.eig | |
+| 103 | torch.Tensor.element_size | |
+| 104 | torch.Tensor.eq | |
+| 105 | torch.Tensor.eq_ | |
+| 106 | torch.Tensor.equal | |
+| 107 | torch.Tensor.erf | |
+| 108 | torch.Tensor.erf_ | |
+| 109 | torch.Tensor.erfc | |
+| 110 | torch.Tensor.erfc_ | |
+| 111 | torch.Tensor.erfinv | |
+| 112 | torch.Tensor.erfinv_ | |
+| 113 | torch.Tensor.exp | |
+| 114 | torch.Tensor.exp_ | |
+| 115 | torch.Tensor.expm1 | |
+| 116 | torch.Tensor.expm1_ | |
+| 117 | torch.Tensor.expand | |
+| 118 | torch.Tensor.expand_as | |
+| 119 | torch.Tensor.exponential_ | |
+| 120 | torch.Tensor.fft | |
+| 121 | torch.Tensor.fill_ | |
+| 122 | torch.Tensor.flatten | |
+| 123 | torch.Tensor.flip | |
+| 124 | torch.Tensor.float | |
+| 125 | torch.Tensor.floor | |
+| 126 | torch.Tensor.floor_ | |
+| 127 | torch.Tensor.floor_divide | |
+| 128 | torch.Tensor.floor_divide_ | |
+| 129 | torch.Tensor.fmod | |
+| 130 | torch.Tensor.fmod_ | |
+| 131 | torch.Tensor.frac | |
+| 132 | torch.Tensor.frac_ | |
+| 133 | torch.Tensor.gather | |
+| 134 | torch.Tensor.ge | |
+| 135 | torch.Tensor.ge_ | |
+| 136 | torch.Tensor.geometric_ | |
+| 137 | torch.Tensor.geqrf | |
+| 138 | torch.Tensor.ger | |
+| 139 | torch.Tensor.get_device | |
+| 140 | torch.Tensor.gt | |
+| 141 | torch.Tensor.gt_ | |
+| 142 | torch.Tensor.half | |
+| 143 | torch.Tensor.hardshrink | |
+| 144 | torch.Tensor.histc | |
+| 145 | torch.Tensor.ifft | |
+| 146 | torch.Tensor.index_add_ | |
+| 147 | torch.Tensor.index_add | |
+| 148 | torch.Tensor.index_copy_ | |
+| 149 | torch.Tensor.index_copy | |
+| 150 | torch.Tensor.index_fill_ | |
+| 151 | torch.Tensor.index_fill | |
+| 152 | torch.Tensor.index_put_ | |
+| 153 | torch.Tensor.index_put | |
+| 154 | torch.Tensor.index_select | |
+| 155 | torch.Tensor.indices | |
+| 156 | torch.Tensor.int | |
+| 157 | torch.Tensor.int_repr | |
+| 158 | torch.Tensor.inverse | |
+| 159 | torch.Tensor.irfft | |
+| 160 | torch.Tensor.is_contiguous | |
+| 161 | torch.Tensor.is_complex | |
+| 162 | torch.Tensor.is_floating_point | |
+| 163 | torch.Tensor.is_pinned | |
+| 164 | torch.Tensor.is_set_to | |
+| 165 | torch.Tensor.is_shared | |
+| 166 | torch.Tensor.is_signed | |
+| 167 | torch.Tensor.is_sparse | |
+| 168 | torch.Tensor.item | |
+| 169 | torch.Tensor.kthvalue | |
+| 170 | torch.Tensor.le | |
+| 171 | torch.Tensor.le_ | |
+| 172 | torch.Tensor.lerp | |
+| 173 | torch.Tensor.lerp_ | |
+| 174 | torch.Tensor.lgamma | |
+| 175 | torch.Tensor.lgamma_ | |
+| 176 | torch.Tensor.log | |
+| 177 | torch.Tensor.log_ | |
+| 178 | torch.Tensor.logdet | |
+| 179 | torch.Tensor.log10 | |
+| 180 | torch.Tensor.log10_ | |
+| 181 | torch.Tensor.log1p | |
+| 182 | torch.Tensor.log1p_ | |
+| 183 | torch.Tensor.log2 | |
+| 184 | torch.Tensor.log2_ | |
+| 185 | torch.Tensor.log_normal_ | |
+| 186 | torch.Tensor.logsumexp | |
+| 187 | torch.Tensor.logical_and | |
+| 188 | torch.Tensor.logical_and_ | |
+| 189 | torch.Tensor.logical_not | |
+| 190 | torch.Tensor.logical_not_ | |
+| 191 | torch.Tensor.logical_or | |
+| 192 | torch.Tensor.logical_or_ | |
+| 193 | torch.Tensor.logical_xor | |
+| 194 | torch.Tensor.logical_xor_ | |
+| 195 | torch.Tensor.long | |
+| 196 | torch.Tensor.lstsq | |
+| 197 | torch.Tensor.lt | |
+| 198 | torch.Tensor.lt_ | |
+| 199 | torch.Tensor.lu | |
+| 200 | torch.Tensor.lu_solve | |
+| 201 | torch.Tensor.map_ | |
+| 202 | torch.Tensor.masked_scatter_ | |
+| 203 | torch.Tensor.masked_scatter | |
+| 204 | torch.Tensor.masked_fill_ | |
+| 205 | torch.Tensor.masked_fill | |
+| 206 | torch.Tensor.masked_select | |
+| 207 | torch.Tensor.matmul | |
+| 208 | torch.Tensor.matrix_power | |
+| 209 | torch.Tensor.max | |
+| 210 | torch.Tensor.mean | |
+| 211 | torch.Tensor.median | |
+| 212 | torch.Tensor.min | |
+| 213 | torch.Tensor.mm | |
+| 214 | torch.Tensor.mode | |
+| 215 | torch.Tensor.mul | |
+| 216 | torch.Tensor.mul_ | |
+| 217 | torch.Tensor.multinomial | |
+| 218 | torch.Tensor.mv | |
+| 219 | torch.Tensor.mvlgamma | |
+| 220 | torch.Tensor.mvlgamma_ | |
+| 221 | torch.Tensor.narrow | |
+| 222 | torch.Tensor.narrow_copy | |
+| 223 | torch.Tensor.ndimension | |
+| 224 | torch.Tensor.ne | |
+| 225 | torch.Tensor.ne_ | |
+| 226 | torch.Tensor.neg | |
+| 227 | torch.Tensor.neg_ | |
+| 228 | torch.Tensor.nelement | |
+| 229 | torch.Tensor.nonzero | |
+| 230 | torch.Tensor.norm | |
+| 231 | torch.Tensor.normal_ | |
+| 232 | torch.Tensor.numel | |
+| 233 | torch.Tensor.numpy | |
+| 234 | torch.Tensor.orgqr | |
+| 235 | torch.Tensor.ormqr | |
+| 236 | torch.Tensor.permute | |
+| 237 | torch.Tensor.pin_memory | |
+| 238 | torch.Tensor.pinverse | |
+| 239 | torch.Tensor.polygamma | |
+| 240 | torch.Tensor.polygamma_ | |
+| 241 | torch.Tensor.pow | |
+| 242 | torch.Tensor.pow_ | |
+| 243 | torch.Tensor.prod | |
+| 244 | torch.Tensor.put_ | |
+| 245 | torch.Tensor.qr | |
+| 246 | torch.Tensor.qscheme | |
+| 247 | torch.Tensor.q_scale | |
+| 248 | torch.Tensor.q_zero_point | |
+| 249 | torch.Tensor.q_per_channel_scales | |
+| 250 | torch.Tensor.q_per_channel_zero_points | |
+| 251 | torch.Tensor.q_per_channel_axis | |
+| 252 | torch.Tensor.random_ | |
+| 253 | torch.Tensor.reciprocal | |
+| 254 | torch.Tensor.reciprocal_ | |
+| 255 | torch.Tensor.record_stream | |
+| 256 | torch.Tensor.remainder | |
+| 257 | torch.Tensor.remainder_ | |
+| 258 | torch.Tensor.renorm | |
+| 259 | torch.Tensor.renorm_ | |
+| 260 | torch.Tensor.repeat | |
+| 261 | torch.Tensor.repeat_interleave | |
+| 262 | torch.Tensor.requires_grad_ | |
+| 263 | torch.Tensor.reshape | |
+| 264 | torch.Tensor.reshape_as | |
+| 265 | torch.Tensor.resize_ | |
+| 266 | torch.Tensor.resize_as_ | |
+| 267 | torch.Tensor.rfft | |
+| 268 | torch.Tensor.roll | |
+| 269 | torch.Tensor.rot90 | |
+| 270 | torch.Tensor.round | |
+| 271 | torch.Tensor.round_ | |
+| 272 | torch.Tensor.rsqrt | |
+| 273 | torch.Tensor.rsqrt_ | |
+| 274 | torch.Tensor.scatter | |
+| 275 | torch.Tensor.scatter_ | |
+| 276 | torch.Tensor.scatter_add_ | |
+| 277 | torch.Tensor.scatter_add | |
+| 278 | torch.Tensor.select | |
+| 279 | torch.Tensor.set_ | |
+| 280 | torch.Tensor.share_memory_ | |
+| 281 | torch.Tensor.short | |
+| 282 | torch.Tensor.sigmoid | |
+| 283 | torch.Tensor.sigmoid_ | |
+| 284 | torch.Tensor.sign | |
+| 285 | torch.Tensor.sign_ | |
+| 286 | torch.Tensor.sin | |
+| 287 | torch.Tensor.sin_ | |
+| 288 | torch.Tensor.sinh | |
+| 289 | torch.Tensor.sinh_ | |
+| 290 | torch.Tensor.size | |
+| 291 | torch.Tensor.slogdet | |
+| 292 | torch.Tensor.solve | |
+| 293 | torch.Tensor.sort | |
+| 294 | torch.Tensor.split | |
+| 295 | torch.Tensor.sparse_mask | |
+| 296 | torch.Tensor.sparse_dim | |
+| 297 | torch.Tensor.sqrt | |
+| 298 | torch.Tensor.sqrt_ | |
+| 299 | torch.Tensor.square | |
+| 300 | torch.Tensor.square_ | |
+| 301 | torch.Tensor.squeeze | |
+| 302 | torch.Tensor.squeeze_ | |
+| 303 | torch.Tensor.std | |
+| 304 | torch.Tensor.stft | |
+| 305 | torch.Tensor.storage | |
+| 306 | torch.Tensor.storage_offset | |
+| 307 | torch.Tensor.storage_type | |
+| 308 | torch.Tensor.stride | |
+| 309 | torch.Tensor.sub | |
+| 310 | torch.Tensor.sub_ | |
+| 311 | torch.Tensor.sum | |
+| 312 | torch.Tensor.sum_to_size | |
+| 313 | torch.Tensor.svd | |
+| 314 | torch.Tensor.symeig | |
+| 315 | torch.Tensor.t | |
+| 316 | torch.Tensor.t_ | |
+| 317 | torch.Tensor.to | |
+| 318 | torch.Tensor.to_mkldnn | |
+| 319 | torch.Tensor.take | |
+| 320 | torch.Tensor.tan | |
+| 321 | torch.Tensor.tan_ | |
+| 322 | torch.Tensor.tanh | |
+| 323 | torch.Tensor.tanh_ | |
+| 324 | torch.Tensor.tolist | |
+| 325 | torch.Tensor.topk | |
+| 326 | torch.Tensor.to_sparse | |
+| 327 | torch.Tensor.trace | |
+| 328 | torch.Tensor.transpose | |
+| 329 | torch.Tensor.transpose_ | |
+| 330 | torch.Tensor.triangular_solve | |
+| 331 | torch.Tensor.tril | |
+| 332 | torch.Tensor.tril_ | |
+| 333 | torch.Tensor.triu | |
+| 334 | torch.Tensor.triu_ | |
+| 335 | torch.Tensor.true_divide | |
+| 336 | torch.Tensor.true_divide_ | |
+| 337 | torch.Tensor.trunc | |
+| 338 | torch.Tensor.trunc_ | |
+| 339 | torch.Tensor.type | |
+| 340 | torch.Tensor.type_as | |
+| 341 | torch.Tensor.unbind | |
+| 342 | torch.Tensor.unfold | |
+| 343 | torch.Tensor.uniform_ | |
+| 344 | torch.Tensor.unique | |
+| 345 | torch.Tensor.unique_consecutive | |
+| 346 | torch.Tensor.unsqueeze | |
+| 347 | torch.Tensor.unsqueeze_ | |
+| 348 | torch.Tensor.values | |
+| 349 | torch.Tensor.var | |
+| 350 | torch.Tensor.view | |
+| 351 | torch.Tensor.view_as | |
+| 352 | torch.Tensor.where | |
+| 353 | torch.Tensor.zero_ | |
+| 354 | torch.BoolTensor | |
+| 355 | torch.BoolTensor.all | |
+| 356 | torch.BoolTensor.any | |
+

+<h2 id="Layers-(torch-nn).md">Layers \(torch.nn\)</h2>
+
+| No. | API Name | Supported |
+| --- | -------- | --------- |
+| 1 | torch.nn.Parameter | |
+| 2 | torch.nn.Module | |
+| 3 | torch.nn.Module.add_module | |
+| 4 | torch.nn.Module.apply | |
+| 5 | torch.nn.Module.bfloat16 | |
+| 6 | torch.nn.Module.buffers | |
+| 7 | torch.nn.Module.children | |
+| 8 | torch.nn.Module.cpu | |
+| 9 | torch.nn.Module.cuda | |
+| 10 | torch.nn.Module.double | |
+| 11 | torch.nn.Module.dump_patches | |
+| 12 | torch.nn.Module.eval | |
+| 13 | torch.nn.Module.extra_repr | |
+| 14 | torch.nn.Module.float | |
+| 15 | torch.nn.Module.forward | |
+| 16 | torch.nn.Module.half | |
+| 17 | torch.nn.Module.load_state_dict | |
+| 18 | torch.nn.Module.modules | |
+| 19 | torch.nn.Module.named_buffers | |
+| 20 | torch.nn.Module.named_children | |
+| 21 | torch.nn.Module.named_modules | |
+| 22 | torch.nn.Module.named_parameters | |
+| 23 | torch.nn.Module.parameters | |
+| 24 | torch.nn.Module.register_backward_hook | |
+| 25 | torch.nn.Module.register_buffer | |
+| 26 | torch.nn.Module.register_forward_hook | |
+| 27 | torch.nn.Module.register_forward_pre_hook | |
+| 28 | torch.nn.Module.register_parameter | |
+| 29 | torch.nn.Module.requires_grad_ | |
+| 30 | torch.nn.Module.state_dict | |
+| 31 | torch.nn.Module.to | |
+| 32 | torch.nn.Module.train | |
+| 33 | torch.nn.Module.type | |
+| 34 | torch.nn.Module.zero_grad | |
+| 35 | torch.nn.Sequential | |
+| 36 | torch.nn.ModuleList | |
+| 37 | torch.nn.ModuleList.append | |
+| 38 | torch.nn.ModuleList.extend | |
+| 39 | torch.nn.ModuleList.insert | |
+| 40 | torch.nn.ModuleDict | |
+| 41 | torch.nn.ModuleDict.clear | |
+| 42 | torch.nn.ModuleDict.items | |
+| 43 | torch.nn.ModuleDict.keys | |
+| 44 | torch.nn.ModuleDict.pop | |
+| 45 | torch.nn.ModuleDict.update | |
+| 46 | torch.nn.ModuleDict.values | |
+| 47 | torch.nn.ParameterList | |
+| 48 | torch.nn.ParameterList.append | |
+| 49 | torch.nn.ParameterList.extend | |
+| 50 | torch.nn.ParameterDict | |
+| 51 | torch.nn.ParameterDict.clear | |
+| 52 | torch.nn.ParameterDict.items | |
+| 53 | torch.nn.ParameterDict.keys | |
+| 54 | torch.nn.ParameterDict.pop | |
+| 55 | torch.nn.ParameterDict.update | |
+| 56 | torch.nn.ParameterDict.values | |
+| 57 | torch.nn.Conv1d | |
+| 58 | torch.nn.Conv2d | |
+| 59 | torch.nn.Conv3d | |
+| 60 | torch.nn.ConvTranspose1d | |
+| 61 | torch.nn.ConvTranspose2d | |
+| 62 | torch.nn.ConvTranspose3d | |
+| 63 | torch.nn.Unfold | |
+| 64 | torch.nn.Fold | |
+| 65 | torch.nn.MaxPool1d | |
+| 66 | torch.nn.MaxPool2d | |
+| 67 | torch.nn.MaxPool3d | |
+| 68 | torch.nn.MaxUnpool1d | |
+| 69 | torch.nn.MaxUnpool2d | |
+| 70 | torch.nn.MaxUnpool3d | |
+| 71 | torch.nn.AvgPool1d | |
+| 72 | torch.nn.AvgPool2d | |
+| 73 | torch.nn.AvgPool3d | |
+| 74 | torch.nn.FractionalMaxPool2d | |
+| 75 | torch.nn.LPPool1d | |
+| 76 | torch.nn.LPPool2d | |
+| 77 | torch.nn.AdaptiveMaxPool1d | |
+| 78 | torch.nn.AdaptiveMaxPool2d | |
+| 79 | torch.nn.AdaptiveMaxPool3d | |
+| 80 | torch.nn.AdaptiveAvgPool1d | |
+| 81 | torch.nn.AdaptiveAvgPool2d | |
+| 82 | torch.nn.AdaptiveAvgPool3d | |
+| 83 | torch.nn.ReflectionPad1d | |
+| 84 | torch.nn.ReflectionPad2d | |
+| 85 | torch.nn.ReplicationPad1d | |
+| 86 | torch.nn.ReplicationPad2d | |
+| 87 | torch.nn.ReplicationPad3d | |
+| 88 | torch.nn.ZeroPad2d | |
+| 89 | torch.nn.ConstantPad1d | |
+| 90 | torch.nn.ConstantPad2d | |
+| 91 | torch.nn.ConstantPad3d | |
+| 92 | torch.nn.ELU | |
+| 93 | torch.nn.Hardshrink | |
+| 94 | torch.nn.Hardtanh | |
+| 95 | torch.nn.LeakyReLU | |
+| 96 | torch.nn.LogSigmoid | |
+| 97 | torch.nn.MultiheadAttention | |
+| 98 | torch.nn.PReLU | |
+| 99 | torch.nn.ReLU | |
+| 100 | torch.nn.ReLU6 | |
+| 101 | torch.nn.RReLU | |
+| 102 | torch.nn.SELU | |
+| 103 | torch.nn.CELU | |
+| 104 | torch.nn.GELU | |
+| 105 | torch.nn.Sigmoid | |
+| 106 | torch.nn.Softplus | |
+| 107 | torch.nn.Softshrink | Yes; the SoftShrink scenario is not yet supported |
+| 108 | torch.nn.Softsign | |
+| 109 | torch.nn.Tanh | |
+| 110 | torch.nn.Tanhshrink | |
+| 111 | torch.nn.Threshold | |
+| 112 | torch.nn.Softmin | |
+| 113 | torch.nn.Softmax | |
+| 114 | torch.nn.Softmax2d | |
+| 115 | torch.nn.LogSoftmax | |
+| 116 | torch.nn.AdaptiveLogSoftmaxWithLoss | |
+| 117 | torch.nn.AdaptiveLogSoftmaxWithLoss.log_prob | |
+| 118 | torch.nn.AdaptiveLogSoftmaxWithLoss.predict | |
+| 119 | torch.nn.BatchNorm1d | |
+| 120 | torch.nn.BatchNorm2d | |
+| 121 | torch.nn.BatchNorm3d | |
+| 122 | torch.nn.GroupNorm | |
+| 123 | torch.nn.SyncBatchNorm | |
+| 124 | torch.nn.SyncBatchNorm.convert_sync_batchnorm | |
+| 125 | torch.nn.InstanceNorm1d | |
+| 126 | torch.nn.InstanceNorm2d | |
+| 127 | torch.nn.InstanceNorm3d | |
+| 128 | torch.nn.LayerNorm | |
+| 129 | torch.nn.LocalResponseNorm | |
+| 130 | torch.nn.RNNBase | |
+| 131 | torch.nn.RNNBase.flatten_parameters | |
+| 132 | torch.nn.RNN | |
+| 133 | torch.nn.LSTM | Yes; the DynamicRNN scenario is not yet supported |
+| 134 | torch.nn.GRU | Yes; the DynamicGRUV2 scenario is not yet supported |
+| 135 | torch.nn.RNNCell | |
+| 136 | torch.nn.LSTMCell | |
+| 137 | torch.nn.GRUCell | |
+| 138 | torch.nn.Transformer | |
+| 139 | torch.nn.Transformer.forward | |
+| 140 | torch.nn.Transformer.generate_square_subsequent_mask | |
+| 141 | torch.nn.TransformerEncoder | |
+| 142 | torch.nn.TransformerEncoder.forward | |
+| 143 | torch.nn.TransformerDecoder | |
+| 144 | torch.nn.TransformerDecoder.forward | |
+| 145 | torch.nn.TransformerEncoderLayer | |
+| 146 | torch.nn.TransformerEncoderLayer.forward | |
+| 147 | torch.nn.TransformerDecoderLayer | |
+| 148 | torch.nn.TransformerDecoderLayer.forward | |
+| 149 | torch.nn.Identity | |
+| 150 | torch.nn.Linear | |
+| 151 | torch.nn.Bilinear | |
+| 152 | torch.nn.Dropout | |
+| 153 | torch.nn.Dropout2d | |
+| 154 | torch.nn.Dropout3d | |
+| 155 | torch.nn.AlphaDropout | |
+| 156 | torch.nn.Embedding | |
+| 157 | torch.nn.Embedding.from_pretrained | |
+| 158 | torch.nn.EmbeddingBag | |
+| 159 | torch.nn.EmbeddingBag.from_pretrained | |
+| 160 | torch.nn.CosineSimilarity | |
+| 161 | torch.nn.PairwiseDistance | |
+| 162 | torch.nn.L1Loss | |
+| 163 | torch.nn.MSELoss | |
+| 164 | torch.nn.CrossEntropyLoss | |
+| 165 | torch.nn.CTCLoss | |
+| 166 | torch.nn.NLLLoss | |
+| 167 | torch.nn.PoissonNLLLoss | |
+| 168 | torch.nn.KLDivLoss | |
+| 169 | torch.nn.BCELoss | |
+| 170 | torch.nn.BCEWithLogitsLoss | |
+| 171 | torch.nn.MarginRankingLoss | |
+| 172 | torch.nn.HingeEmbeddingLoss | |
+| 173 | torch.nn.MultiLabelMarginLoss | |
+| 174 | torch.nn.SmoothL1Loss | |
+| 175 | torch.nn.SoftMarginLoss | |
+| 176 | torch.nn.MultiLabelSoftMarginLoss | |
+| 177 | torch.nn.CosineEmbeddingLoss | |
+| 178 | torch.nn.MultiMarginLoss | |
+| 179 | torch.nn.TripletMarginLoss | |
+| 180 | torch.nn.PixelShuffle | |
+| 181 | torch.nn.Upsample | |
+| 182 | torch.nn.UpsamplingNearest2d | |
+| 183 | torch.nn.UpsamplingBilinear2d | |
+| 184 | torch.nn.DataParallel | |
+| 185 | torch.nn.parallel.DistributedDataParallel | |
+| 186 | torch.nn.parallel.DistributedDataParallel.no_sync | |
+| 187 | torch.nn.utils.clip_grad_norm_ | |
+| 188 | torch.nn.utils.clip_grad_value_ | |
+| 189 | torch.nn.utils.parameters_to_vector | |
+| 190 | torch.nn.utils.vector_to_parameters | |
+| 197 | torch.nn.utils.prune.PruningContainer | |
+| 198 | torch.nn.utils.prune.PruningContainer.add_pruning_method | |
+| 199 | torch.nn.utils.prune.PruningContainer.apply | |
+| 200 | torch.nn.utils.prune.PruningContainer.apply_mask | |
+| 201 | torch.nn.utils.prune.PruningContainer.compute_mask | |
+| 202 | torch.nn.utils.prune.PruningContainer.prune | |
+| 203 | torch.nn.utils.prune.PruningContainer.remove | |
+| 204 | torch.nn.utils.prune.Identity | |
+| 205 | torch.nn.utils.prune.Identity.apply | |
+| 206 | torch.nn.utils.prune.Identity.apply_mask | |
+| 207 | torch.nn.utils.prune.Identity.prune | |
+| 208 | torch.nn.utils.prune.Identity.remove | |
+| 209 | torch.nn.utils.prune.RandomUnstructured | |
+| 210 | torch.nn.utils.prune.RandomUnstructured.apply | |
+| 211 | torch.nn.utils.prune.RandomUnstructured.apply_mask | |
+| 212 | torch.nn.utils.prune.RandomUnstructured.prune | |
+| 213 | torch.nn.utils.prune.RandomUnstructured.remove | |
+| 214 | torch.nn.utils.prune.L1Unstructured | |
+| 215 | torch.nn.utils.prune.L1Unstructured.apply | |
+| 216 | torch.nn.utils.prune.L1Unstructured.apply_mask | |
+| 217 | torch.nn.utils.prune.L1Unstructured.prune | |
+| 218 | torch.nn.utils.prune.L1Unstructured.remove | |
+| 219 | torch.nn.utils.prune.RandomStructured | |
+| 220 | torch.nn.utils.prune.RandomStructured.apply | |
+| 221 | torch.nn.utils.prune.RandomStructured.apply_mask | |
+| 222 | torch.nn.utils.prune.RandomStructured.compute_mask | |
+| 223 | torch.nn.utils.prune.RandomStructured.prune | |
+| 224 | torch.nn.utils.prune.RandomStructured.remove | |
+| 225 | torch.nn.utils.prune.LnStructured | |
+| 226 | torch.nn.utils.prune.LnStructured.apply | |
+| 227 | torch.nn.utils.prune.LnStructured.apply_mask | |
+| 228 | torch.nn.utils.prune.LnStructured.compute_mask | |
+| 229 | torch.nn.utils.prune.LnStructured.prune | |
+| 230 | torch.nn.utils.prune.LnStructured.remove | |
+| 231 | torch.nn.utils.prune.CustomFromMask | |
+| 232 | torch.nn.utils.prune.CustomFromMask.apply | |
+| 233 | torch.nn.utils.prune.CustomFromMask.apply_mask | |
+| 234 | torch.nn.utils.prune.CustomFromMask.prune | |
+| 235 | torch.nn.utils.prune.CustomFromMask.remove | |
+| 236 | torch.nn.utils.prune.identity | |
+| 237 | torch.nn.utils.prune.random_unstructured | |
+| 238 | torch.nn.utils.prune.l1_unstructured | |
+| 239 | torch.nn.utils.prune.random_structured | |
+| 240 | torch.nn.utils.prune.ln_structured | |
+| 241 | torch.nn.utils.prune.global_unstructured | |
+| 242 | torch.nn.utils.prune.custom_from_mask | |
+| 243 | torch.nn.utils.prune.remove | |
+| 244 | torch.nn.utils.prune.is_pruned | |
+| 245 | torch.nn.utils.weight_norm | |
+| 246 | torch.nn.utils.remove_weight_norm | |
+| 247 | torch.nn.utils.spectral_norm | |
+| 248 | torch.nn.utils.remove_spectral_norm | |
+| 249 | torch.nn.utils.rnn.PackedSequence | |
+| 250 | torch.nn.utils.rnn.pack_padded_sequence | |
+| 251 | torch.nn.utils.rnn.pad_packed_sequence | |
+| 252 | torch.nn.utils.rnn.pad_sequence | |
+| 253 | torch.nn.utils.rnn.pack_sequence | |
+| 254 | torch.nn.Flatten | |
+| 255 | torch.quantization.quantize | |
+| 256 | torch.quantization.quantize_dynamic | |
+| 257 | torch.quantization.quantize_qat | |
+| 258 | torch.quantization.prepare | |
+| 259 | torch.quantization.prepare_qat | |
+| 260 | torch.quantization.convert | |
+| 261 | torch.quantization.QConfig | |
+| 262 | torch.quantization.QConfigDynamic | |
+| 263 | torch.quantization.fuse_modules | |
+| 264 | torch.quantization.QuantStub | |
+| 265 | torch.quantization.DeQuantStub | |
+| 266 | torch.quantization.QuantWrapper | |
+| 267 | torch.quantization.add_quant_dequant | |
+| 268 | torch.quantization.add_observer_ | |
+| 269 | torch.quantization.swap_module | |
+| 270 | torch.quantization.propagate_qconfig_ | |
+| 271 | torch.quantization.default_eval_fn | |
+| 272 | torch.quantization.MinMaxObserver | |
+| 273 | torch.quantization.MovingAverageMinMaxObserver | |
+| 274 | torch.quantization.PerChannelMinMaxObserver | |
+| 275 | torch.quantization.MovingAveragePerChannelMinMaxObserver | |
+| 276 | torch.quantization.HistogramObserver | |
+| 277 | torch.quantization.FakeQuantize | |
+| 278 | torch.quantization.NoopObserver | |
+| 279 | torch.quantization.get_observer_dict | |
+| 280 | torch.quantization.RecordingObserver | |
+| 281 | torch.nn.intrinsic.ConvBn2d | |
+| 282 | torch.nn.intrinsic.ConvBnReLU2d | |
+| 283 | torch.nn.intrinsic.ConvReLU2d | |
+| 284 | torch.nn.intrinsic.ConvReLU3d | |
+| 285 | torch.nn.intrinsic.LinearReLU | |
+| 286 | torch.nn.intrinsic.qat.ConvBn2d | |
+| 287 | torch.nn.intrinsic.qat.ConvBnReLU2d | |
+| 288 | torch.nn.intrinsic.qat.ConvReLU2d | |
+| 289 | torch.nn.intrinsic.qat.LinearReLU | |
+| 290 | torch.nn.intrinsic.quantized.ConvReLU2d | |
+| 291 | torch.nn.intrinsic.quantized.ConvReLU3d | |
+| 292 | torch.nn.intrinsic.quantized.LinearReLU | |
+| 293 | torch.nn.qat.Conv2d | |
+| 294 | torch.nn.qat.Conv2d.from_float | |
+| 295 | torch.nn.qat.Linear | |
+| 296 | torch.nn.qat.Linear.from_float | |
+| 297 | torch.nn.quantized.functional.relu | |
+| 298 | torch.nn.quantized.functional.linear | |
+| 299 | torch.nn.quantized.functional.conv2d | |
+| 300 | torch.nn.quantized.functional.conv3d | |
+| 301 | torch.nn.quantized.functional.max_pool2d | |
+| 302 | torch.nn.quantized.functional.adaptive_avg_pool2d | |
+| 303 | torch.nn.quantized.functional.avg_pool2d | |
+| 304 | torch.nn.quantized.functional.interpolate | |
+| 305 | torch.nn.quantized.functional.upsample | |
+| 306 | torch.nn.quantized.functional.upsample_bilinear | |
+| 307 | torch.nn.quantized.functional.upsample_nearest | |
+| 308 | torch.nn.quantized.ReLU | |
+| 309 | torch.nn.quantized.ReLU6 | |
+| 310 | torch.nn.quantized.Conv2d | |
+| 311 | torch.nn.quantized.Conv2d.from_float | |
+| 312 | torch.nn.quantized.Conv3d | |
+| 313 | torch.nn.quantized.Conv3d.from_float | |
+| 314 | torch.nn.quantized.FloatFunctional | |
+| 315 | torch.nn.quantized.QFunctional | |
+| 316 | torch.nn.quantized.Quantize | |
+| 317 | torch.nn.quantized.DeQuantize | |
+| 318 | torch.nn.quantized.Linear | |
+| 319 | torch.nn.quantized.Linear.from_float | |
+| 320 | torch.nn.quantized.dynamic.Linear | |
+| 321 | torch.nn.quantized.dynamic.Linear.from_float | |
+| 322 | torch.nn.quantized.dynamic.LSTM | |
+

+<h2 id="Functions(torch-nn-functional).md">Functions\(torch.nn.functional\)</h2>
+
+| No. | API Name | Supported |
+| --- | -------- | --------- |
+| 1 | torch.nn.functional.conv1d | |
+| 2 | torch.nn.functional.conv2d | |
+| 3 | torch.nn.functional.conv3d | |
+| 4 | torch.nn.functional.conv_transpose1d | |
+| 5 | torch.nn.functional.conv_transpose2d | |
+| 6 | torch.nn.functional.conv_transpose3d | |
+| 7 | torch.nn.functional.unfold | |
+| 8 | torch.nn.functional.fold | |
+| 9 | torch.nn.functional.avg_pool1d | |
+| 10 | torch.nn.functional.avg_pool2d | |
+| 11 | torch.nn.functional.avg_pool3d | |
+| 12 | torch.nn.functional.max_pool1d | |
+| 13 | torch.nn.functional.max_pool2d | |
+| 14 | torch.nn.functional.max_pool3d | |
+| 15 | torch.nn.functional.max_unpool1d | |
+| 16 | torch.nn.functional.max_unpool2d | |
+| 17 | torch.nn.functional.max_unpool3d | |
+| 18 | torch.nn.functional.lp_pool1d | |
+| 19 | torch.nn.functional.lp_pool2d | |
+| 20 | torch.nn.functional.adaptive_max_pool1d | |
+| 21 | torch.nn.functional.adaptive_max_pool2d | |
+| 22 | torch.nn.functional.adaptive_max_pool3d | |
+| 23 | torch.nn.functional.adaptive_avg_pool1d | |
+| 24 | torch.nn.functional.adaptive_avg_pool2d | |
+| 25 | torch.nn.functional.adaptive_avg_pool3d | |
+| 26 | torch.nn.functional.threshold | |
+| 27 | torch.nn.functional.threshold_ | |
+| 28 | torch.nn.functional.relu | |
+| 29 | torch.nn.functional.relu_ | |
+| 30 | torch.nn.functional.hardtanh | |
+| 31 | torch.nn.functional.hardtanh_ | |
+| 32 | torch.nn.functional.relu6 | |
+| 33 | torch.nn.functional.elu | |
+| 34 | torch.nn.functional.elu_ | |
+| 35 | torch.nn.functional.selu | |
+| 36 | torch.nn.functional.celu | |
+| 37 | torch.nn.functional.leaky_relu | |
+| 38 | torch.nn.functional.leaky_relu_ | |
+| 39 | torch.nn.functional.prelu | |
+| 40 | torch.nn.functional.rrelu | |
+| 41 | torch.nn.functional.rrelu_ | |
+| 42 | torch.nn.functional.glu | |
+| 43 | torch.nn.functional.gelu | |
+| 44 | torch.nn.functional.logsigmoid | |
+| 45 | torch.nn.functional.hardshrink | |
+| 46 | torch.nn.functional.tanhshrink | |
+| 47 | torch.nn.functional.softsign | |
+| 48 | torch.nn.functional.softplus | |
+| 49 | torch.nn.functional.softmin | |
+| 50 | torch.nn.functional.softmax | |
+| 51 | torch.nn.functional.softshrink | |
+| 52 | torch.nn.functional.gumbel_softmax | |
+| 53 | torch.nn.functional.log_softmax | |
+| 54 | torch.nn.functional.tanh | |
+| 55 | torch.nn.functional.sigmoid | |
+| 56 | torch.nn.functional.batch_norm | |
+| 57 | torch.nn.functional.instance_norm | |
+| 58 | torch.nn.functional.layer_norm | |
+| 59 | torch.nn.functional.local_response_norm | |
+| 60 | torch.nn.functional.normalize | |
+| 61 | torch.nn.functional.linear | |
+| 62 | torch.nn.functional.bilinear | |
+| 63 | torch.nn.functional.dropout | |
+| 64 | torch.nn.functional.alpha_dropout | |
+| 65 | torch.nn.functional.dropout2d | |
+| 66 | torch.nn.functional.dropout3d | |
+| 67 | torch.nn.functional.embedding | |
+| 68 | torch.nn.functional.embedding_bag | |
+| 69 | torch.nn.functional.one_hot | |
+| 70 | torch.nn.functional.pairwise_distance | |
+| 71 | torch.nn.functional.cosine_similarity | |
+| 72 | torch.nn.functional.pdist | |
+| 73 | torch.nn.functional.binary_cross_entropy | |
+| 74 | torch.nn.functional.binary_cross_entropy_with_logits | |
+| 75 | torch.nn.functional.poisson_nll_loss | |
+| 76 | torch.nn.functional.cosine_embedding_loss | |
+| 77 | torch.nn.functional.cross_entropy | |
+| 78 | torch.nn.functional.ctc_loss | |
+| 79 | torch.nn.functional.hinge_embedding_loss | |
+| 80 | torch.nn.functional.kl_div | |
+| 81 | torch.nn.functional.l1_loss | |
+| 82 | torch.nn.functional.mse_loss | |
+| 83 | torch.nn.functional.margin_ranking_loss | |
+| 84 | torch.nn.functional.multilabel_margin_loss | |
+| 85 | torch.nn.functional.multilabel_soft_margin_loss | |
+| 86 | torch.nn.functional.multi_margin_loss | |
+| 87 | torch.nn.functional.nll_loss | |
+| 88 | torch.nn.functional.smooth_l1_loss | |
+| 89 | torch.nn.functional.soft_margin_loss | |
+| 90 | torch.nn.functional.triplet_margin_loss | |
+| 91 | torch.nn.functional.pixel_shuffle | |
+| 92 | torch.nn.functional.pad | |
+| 93 | torch.nn.functional.interpolate | |
+| 94 | torch.nn.functional.upsample | |
+| 95 | torch.nn.functional.upsample_nearest | |
+| 96 | torch.nn.functional.upsample_bilinear | |
+| 97 | torch.nn.functional.grid_sample | |
+| 98 | torch.nn.functional.affine_grid | |
+| 99 | torch.nn.parallel.data_parallel | |
+

+<h2 id="torch-distributed.md">torch.distributed</h2>
+
+| No. | API Name | Supported |
+| --- | -------- | --------- |
+| 1 | torch.distributed.init_process_group | |
+| 2 | torch.distributed.Backend | |
+| 3 | torch.distributed.get_backend | |
+| 4 | torch.distributed.get_rank | |
+| 5 | torch.distributed.get_world_size | |
+| 6 | torch.distributed.is_initialized | |
+| 7 | torch.distributed.is_mpi_available | |
+| 8 | torch.distributed.is_nccl_available | |
+| 9 | torch.distributed.new_group | |
+| 10 | torch.distributed.send | |
+| 11 | torch.distributed.recv | |
+| 12 | torch.distributed.isend | |
+| 13 | torch.distributed.irecv | |
+| 14 | is_completed | |
+| 15 | wait | |
+| 16 | torch.distributed.broadcast | |
+| 17 | torch.distributed.all_reduce | |
+| 18 | torch.distributed.reduce | |
+| 19 | torch.distributed.all_gather | |
+| 20 | torch.distributed.gather | |
+| 21 | torch.distributed.scatter | |
+| 22 | torch.distributed.barrier | |
+| 23 | torch.distributed.ReduceOp | |
+| 24 | torch.distributed.reduce_op | |
+| 25 | torch.distributed.broadcast_multigpu | |
+| 26 | torch.distributed.all_reduce_multigpu | |
+| 27 | torch.distributed.reduce_multigpu | |
+| 28 | torch.distributed.all_gather_multigpu | |
+| 29 | torch.distributed.launch | |
+| 30 | torch.multiprocessing.spawn | |
+

+<h2 id="NPU和CUDA功能对齐.md">NPU and CUDA Feature Alignment</h2>
+
+| No. | API Name | Corresponding NPU API Name | Supported |
+| --- | -------- | -------------------------- | --------- |
+| 1 | torch.cuda.current_blas_handle | torch.npu.current_blas_handle | |
+| 2 | torch.cuda.current_device | torch.npu.current_device | |
+| 3 | torch.cuda.current_stream | torch.npu.current_stream | |
+| 4 | torch.cuda.default_stream | torch.npu.default_stream | |
+| 5 | torch.cuda.device | torch.npu.device | |
+| 6 | torch.cuda.device_count | torch.npu.device_count | |
+| 7 | torch.cuda.device_of | torch.npu.device_of | |
+| 8 | torch.cuda.get_device_capability | torch.npu.get_device_capability | |
+| 9 | torch.cuda.get_device_name | torch.npu.get_device_name | |
+| 10 | torch.cuda.init | torch.npu.init | |
+| 11 | torch.cuda.ipc_collect | torch.npu.ipc_collect | |
+| 12 | torch.cuda.is_available | torch.npu.is_available | |
+| 13 | torch.cuda.is_initialized | torch.npu.is_initialized | |
+| 14 | torch.cuda.set_device | torch.npu.set_device | Partially supported |
+| 15 | torch.cuda.stream | torch.npu.stream | |
+| 16 | torch.cuda.synchronize | torch.npu.synchronize | |
+| 17 | torch.cuda.get_rng_state | torch.npu.get_rng_state | |
+| 18 | torch.cuda.get_rng_state_all | torch.npu.get_rng_state_all | |
+| 19 | torch.cuda.set_rng_state | torch.npu.set_rng_state | |
+| 20 | torch.cuda.set_rng_state_all | torch.npu.set_rng_state_all | |
+| 21 | torch.cuda.manual_seed | torch.npu.manual_seed | |
+| 22 | torch.cuda.manual_seed_all | torch.npu.manual_seed_all | |
+| 23 | torch.cuda.seed | torch.npu.seed | |
+| 24 | torch.cuda.seed_all | torch.npu.seed_all | |
+| 25 | torch.cuda.initial_seed | torch.npu.initial_seed | |
+| 26 | torch.cuda.comm.broadcast | torch.npu.comm.broadcast | |
+| 27 | torch.cuda.comm.broadcast_coalesced | torch.npu.comm.broadcast_coalesced | |
+| 28 | torch.cuda.comm.reduce_add | torch.npu.comm.reduce_add | |
+| 29 | torch.cuda.comm.scatter | torch.npu.comm.scatter | |
+| 30 | torch.cuda.comm.gather | torch.npu.comm.gather | |
+| 31 | torch.cuda.Stream | torch.npu.Stream | |
+| 32 | torch.cuda.Stream.query | torch.npu.Stream.query | |
+| 33 | torch.cuda.Stream.record_event | torch.npu.Stream.record_event | |
+| 34 | torch.cuda.Stream.synchronize | torch.npu.Stream.synchronize | |
+| 35 | torch.cuda.Stream.wait_event | torch.npu.Stream.wait_event | |
+| 36 | torch.cuda.Stream.wait_stream | torch.npu.Stream.wait_stream | |
+| 37 | torch.cuda.Event | torch.npu.Event | |
+| 38 | torch.cuda.Event.elapsed_time | torch.npu.Event.elapsed_time | |
+| 39 | torch.cuda.Event.from_ipc_handle | torch.npu.Event.from_ipc_handle | |
+| 40 | torch.cuda.Event.ipc_handle | torch.npu.Event.ipc_handle | |
+| 41 | torch.cuda.Event.query | torch.npu.Event.query | |
+| 42 | torch.cuda.Event.record | torch.npu.Event.record | |
+| 43 | torch.cuda.Event.synchronize | torch.npu.Event.synchronize | |
+| 44 | torch.cuda.Event.wait | torch.npu.Event.wait | |
+| 45 | torch.cuda.empty_cache | torch.npu.empty_cache | |
+| 46 | torch.cuda.memory_stats | torch.npu.memory_stats | |
+| 47 | torch.cuda.memory_summary | torch.npu.memory_summary | |
+| 48 | torch.cuda.memory_snapshot | torch.npu.memory_snapshot | |
+| 49 | torch.cuda.memory_allocated | torch.npu.memory_allocated | |
+| 50 | torch.cuda.max_memory_allocated | torch.npu.max_memory_allocated | |
+| 51 | torch.cuda.reset_max_memory_allocated | torch.npu.reset_max_memory_allocated | |
+| 52 | torch.cuda.memory_reserved | torch.npu.memory_reserved | |
+| 53 | torch.cuda.max_memory_reserved | torch.npu.max_memory_reserved | |
+| 54 | torch.cuda.memory_cached | torch.npu.memory_cached | |
+| 55 | torch.cuda.max_memory_cached | torch.npu.max_memory_cached | |
+| 56 | torch.cuda.reset_max_memory_cached | torch.npu.reset_max_memory_cached | |
+| 57 | torch.cuda.nvtx.mark | torch.npu.nvtx.mark | |
+| 58 | torch.cuda.nvtx.range_push | torch.npu.nvtx.range_push | |
+| 59 | torch.cuda.nvtx.range_pop | torch.npu.nvtx.range_pop | |
+| 60 | torch.cuda._sleep | torch.npu._sleep | |
+| 61 | torch.cuda.Stream.priority_range | torch.npu.Stream.priority_range | |
+| 62 | torch.cuda.get_device_properties | torch.npu.get_device_properties | |
+| 63 | torch.cuda.amp.GradScaler | torch.npu.amp.GradScaler | |
+
+>![](public_sys-resources/icon-note.gif) **NOTE:**
+>The torch.npu.set\_device\(\) interface only supports selecting the device once, via set\_device at the very beginning of the program; selecting it multiple times or switching devices with the "with torch.npu.device\(id\)" form is not supported.
+
diff --git "a/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\345\234\250\347\272\277\346\216\250\347\220\206\346\214\207\345\215\227 01/FrameworkPTAdapter 2.0.2 PyTorch\345\234\250\347\272\277\346\216\250\347\220\206\346\214\207\345\215\227 01.md" "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\345\234\250\347\272\277\346\216\250\347\220\206\346\214\207\345\215\227 01/FrameworkPTAdapter 2.0.2 PyTorch\345\234\250\347\272\277\346\216\250\347\220\206\346\214\207\345\215\227 01.md"
new file mode 100644
index 0000000000000000000000000000000000000000..7bf4e1dc551fd767e8565cfa211a1766477a6319
--- /dev/null
+++ "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\345\234\250\347\272\277\346\216\250\347\220\206\346\214\207\345\215\227 01/FrameworkPTAdapter 2.0.2 PyTorch\345\234\250\347\272\277\346\216\250\347\220\206\346\214\207\345\215\227 01.md"
@@ -0,0 +1,685 @@
+# PyTorch Online Inference Guide
+- [Application Scenarios](#使用场景.md)
+- [Basic Workflow](#基本使用流程.md)
+  - [Prerequisites](#前提条件.md)
+  - [Online Inference Workflow](#在线推理流程.md)
+  - [Configuring Environment Variables](#环境变量配置.md)
+  - [Sample Reference](#样例参考.md)
+- [Special Topics](#专题.md)
+  - [Mixed Precision](#混合精度.md)
+  - [Weight Update](#权重更新.md)
+- [FAQ](#FAQ.md)
+  - [pip3.7 install Pillow==5.3.0 fails](#pip3-7-install-Pillow-5-3-0安装失败.md)
+  - [Installing "torch-\*.whl" reports that "torch 1.5.0xxxx" does not match the version required by "torchvision"](#安装-torch--whl-提示-torch-1-5-0xxxx-与-torchvision-所依赖的版本不匹配.md)
+- [Installing GCC 7.3.0](#安装7-3-0版本gcc.md)

使用场景

+
+在线推理是在AI框架内执行推理的场景,例如在PyTorch框架上,加载模型后,通过model.eval\(\)执行推理。
+
+相比于离线推理场景,使用在线推理可以方便地将原来基于PyTorch框架做推理的应用快速迁移到昇腾AI处理器,适用于数据中心推理场景。
+
+## 支持的芯片型号
+
+昇腾910 AI处理器
+
+昇腾710 AI处理器
+

基本使用流程

+ +- **[前提条件](#前提条件.md)** + +- **[在线推理流程](#在线推理流程.md)** + +- **[环境变量配置](#环境变量配置.md)** + +- **[样例参考](#样例参考.md)** + + +

前提条件

+ +已完成PyTorch框架及混合精度模块的安装,详情请参考《PyTorch网络模型移植&训练指南》的"环境准备"章节。 + +

在线推理流程

+ +在线推理流程如[图1](#fig13802941161818)所示: + +**图 1** 在线推理流程图 +![](figures/在线推理流程图.png "在线推理流程图") + +
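+
+结合上图流程,下面给出一个极简的在线推理代码骨架以便理解(仅为示意:其中build_model、preprocess为假设的用户自定义函数,完整实现请参见[样例参考](#样例参考.md)):
+
+```
+import torch
+import torch.npu
+
+torch.npu.set_device("npu:0")          # 指定推理使用的昇腾AI处理器
+model = build_model().npu()            # build_model为假设的模型构建函数
+model.eval()                           # 切换到推理模式
+with torch.no_grad():                  # 推理过程不构建反向图
+    input_tensor = preprocess().npu()  # preprocess为假设的数据预处理函数
+    output = model(input_tensor)
+```
+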

环境变量配置

+ +Pytorch在线推理进程启动所依赖的环境变量: + +根据环境实际安装的软件包(toolkit或nnae),在下列场景中选择一个并运行对应的环境变量配置脚本。 + +``` +# 场景一:昇腾设备安装部署开发套件包Ascend-cann-toolkit(此时开发环境可进行推理任务),可根据不同安装用户执行相应脚本。 + # 以root用户安装toolkit包 + . /usr/local/Ascend/ascend-toolkit/set_env.sh + # 以非root用户安装toolkit包 + . ${HOME}/Ascend/ascend-toolkit/set_env.sh +# 场景二:昇腾设备安装部署软件包Ascend-cann-nnae,可根据不同安装用户执行相应脚本。 + # 以root用户安装nnae包 + . /usr/local/Ascend/nnae/set_env.sh + # 以非root用户安装nnae包 + . ${HOME}/Ascend/nnae/set_env.sh +``` + +其他环境变量配置。 + +``` +# 若参见《PyTorch网络模型移植&训练指南》的"环境准备"章节进行环境搭建时安装python3.7.5,或存在多个python3版本时,需要在环境变量中配置python3.7.5的安装路径。 +export PATH=/usr/local/python3.7.5/bin:$PATH +export LD_LIBRARY_PATH=/usr/local/python3.7.5/lib:$LD_LIBRARY_PATH + +# 指定芯片的逻辑ID +export ASCEND_DEVICE_ID=0 + +# 输出日志信息,可根据实际修改 +export ASCEND_SLOG_PRINT_TO_STDOUT=1 +export ASCEND_GLOBAL_LOG_LEVEL=0 +export TASK_QUEUE_ENABLE=0 +``` + +**表 1** + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

配置项

+

说明

+

必选/可选

+

LD_LIBRARY_PATH

+

动态库的查找路径,参考上述举例配置。

+
说明:

若系统环境升级了gcc版本(例如CentOS、Debian和BClinux系统),需要配置gcc相关环境变量,详情请参见“安装7.3.0版本gcc”章节的步骤5。

+
+

必选

+

PATH

+

可执行程序的查找路径,参考上述举例配置。

+

必选

+

ASCEND_DEVICE_ID

+

指定芯片的逻辑ID。

+

取值范围[0,N-1],默认为0。其中N为当前物理机/虚拟机/容器内的设备总数。

+

可选

+

ASCEND_SLOG_PRINT_TO_STDOUT

+

是否开启日志打屏。取值:

+
  • 0或不配置:关闭日志打屏
  • 1:开启日志打屏
+

可选

+

ASCEND_GLOBAL_LOG_LEVEL

+

设置日志的全局日志级别。取值:

+
  • 0:对应DEBUG级别。
  • 1:对应INFO级别。
  • 2:对应WARNING级别。
  • 3:对应ERROR级别。
  • 4:对应NULL级别,不输出日志。
  • 其他值为非法值。
+

可选

+

TASK_QUEUE_ENABLE

+

是否开启TASK多线程下发,绝大多数情况下,打开该功能会进一步提升整网训练性能。取值:

+
  • 0或不配置:关闭TASK多线程下发。
  • 1:开启TASK多线程下发。
+

可选

+
+ +>![](public_sys-resources/icon-note.gif) **说明:** +>更多日志信息,请参见《CANN 日志参考》。 + +
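+
+作为补充,下面的示意片段展示了如何在Python脚本中读取上表的ASCEND_DEVICE_ID环境变量并据此指定芯片(仅为示意写法,非固定用法):
+
+```
+import os
+import torch
+import torch.npu
+
+# 读取环境变量中配置的逻辑ID,未配置时默认为0
+device_id = int(os.getenv("ASCEND_DEVICE_ID", "0"))
+torch.npu.set_device("npu:{}".format(device_id))
+```
+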

样例参考

+ +## 样例代码 + +在进行推理应用时,应尽量保证应用在生命周期内不频繁初始化。推理模式通过模型model.eval\(\)进行设置,并且推理过程要在“with torch.no\_grad\(\):”代码分支下运行。本例中,我们将使用Resnet50网络的python代码进行说明。 + +样例代码 resnet50\_infer\_for\_pytorch.py: + +``` +import argparse +import os +import time +import torch +import torch.nn.parallel +import torch.optim +import torch.utils.data +import torch.utils.data.distributed +import torchvision.transforms as transforms +import torchvision.datasets as datasets +import torchvision.models as models + +import torch.npu +from apex import amp + +model_names = sorted(name for name in models.__dict__ + if name.islower() and not name.startswith("__") + and callable(models.__dict__[name])) + + +def parse_args(): + """ 用户自定义数据集路径、模型路径 """ + parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') + parser.add_argument('--data', metavar='DIR', default="/data/imagenet", + help='path to dataset') + parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet50', + choices=model_names, + help='model architecture: ' + + ' | '.join(model_names) + + ' (default: resnet18)') + + parser.add_argument('--epochs', default=100, type=int, metavar='N', + help='number of total epochs to run') + + parser.add_argument('-b', '--batch_size', default=512, type=int, + metavar='N', + help='mini-batch size (default: 256), this is the total ' + 'batch size of all GPUs on the current node when ' + 'using Data Parallel or Distributed Data Parallel') + + parser.add_argument('--resume', default='', type=str, metavar='PATH', + help='path to latest checkpoint (default: none)') + + parser.add_argument('--pretrained', dest='pretrained', action='store_true', + help='use pre-trained model') + + parser.add_argument('--npu', default=None, type=int, + help='NPU id to use.') + + parser.add_argument('-j', '--workers', default=32, type=int, metavar='N', + help='number of data loading workers (default: 8)') + + parser.add_argument('--lr', '--learning_rate', default=0.1, type=float, + metavar='LR', help='initial learning rate', dest='lr') + parser.add_argument('--wd', '--weight_decay', default=1e-4, type=float, + metavar='W', help='weight decay (default: 1e-4)', + dest='weight_decay') + + args, unknown_args = parser.parse_known_args() + if len(unknown_args) > 0: + for bad_arg in unknown_args: + print("ERROR: Unknown command line arg: %s" % bad_arg) + raise ValueError("Invalid command line arg(s)") + + return args + + +# ========================================================================= +# 主函数入口 +# ========================================================================= +def main(): + args = parse_args() + if args.npu is None: + args.npu = 0 + global CALCULATE_DEVICE + CALCULATE_DEVICE = "npu:{}".format(args.npu) + torch.npu.set_device(CALCULATE_DEVICE) + print("use ", CALCULATE_DEVICE) + main_worker(args.npu, args) + + +def main_worker(npu, args): + global best_acc1 + args.npu = npu + + # ========================================================================= + # 创建模型 + # ========================================================================= + print("=> creating model '{}'".format(args.arch)) + model = models.__dict__[args.arch](zero_init_residual=True) + + # 将模型数据置于昇腾AI处理器中 + model = model.to(CALCULATE_DEVICE) + + optimizer = torch.optim.SGD([ + {'params': [param for name, param in model.named_parameters() if name[-4:] == 'bias'], 'weight_decay': 0.0}, + {'params': [param for name, param in model.named_parameters() if name[-4:] != 'bias'], + 'weight_decay': args.weight_decay}], + args.lr) + + # 
========================================================================= + # 初始化混合精度模型,使用后可加速运算,但推理结果的准确率可能会轻微降低。可根据实际场景选择使用 + # ========================================================================= + model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale=1024, verbosity=1) + + # ========================================================================= + # 加载训练好的模型参数:通过命令行参数“--resume checkpoint文件” + # ========================================================================= + # 从模型文件中恢复模型参数 + if os.path.isfile(args.resume): + print("=> loading checkpoint '{}'".format(args.resume)) + checkpoint = torch.load(args.resume) + + best_acc1 = checkpoint['best_acc1'] + best_acc1 = best_acc1.to("npu:{}".format(args.npu)) + + model.load_state_dict(checkpoint['state_dict']) + print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch'])) + + else: + print("=> no checkpoint found at '{}'".format(args.resume)) + + # ========================================================================= + # 初始化推理数据集 + # ========================================================================= + # 图像数据加载与预处理 + valdir = os.path.join(args.data, 'val') + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + + val_loader = torch.utils.data.DataLoader( + datasets.ImageFolder(valdir, transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ])), + batch_size=args.batch_size, shuffle=True, + num_workers=args.workers, pin_memory=True) + + # ========================================================================= + # 进入在线推理模式 + # ========================================================================= + validate(val_loader, model, args) + + +# ========================================================================= +# 在线推理样例接口实现 +# ========================================================================= +def validate(val_loader, model, args): + batch_time = AverageMeter('Time', ':6.3f') + top1 = AverageMeter('Acc@1', ':6.2f') + top5 = AverageMeter('Acc@5', ':6.2f') + progress = ProgressMeter( + len(val_loader), + [batch_time, top1, top5], + prefix='Test: ') + + # ========================================================================= + # 切换到推理模式 + # ========================================================================= + model.eval() + + # ========================================================================= + # 在 torch.no_grad():分支下执行模型正向计算 + # ========================================================================= + with torch.no_grad(): + end = time.time() + for i, (images, target) in enumerate(val_loader): + + # 将图像数据置于NPU中 + images = images.to(CALCULATE_DEVICE, non_blocking=True) + target = target.to(torch.int32).to(CALCULATE_DEVICE, non_blocking=True) + + # 计算输出 + output = model(images) + + # 测量结果精度 + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + # 测量运行时间 + batch_time.update(time.time() - end) + end = time.time() + + # 打印推理运算过程日志 + progress.display(i) + + # TODO: this should also be done with the ProgressMeter + print(' * Acc@1 {top1.avg:.3f} Acc@5 {top5.avg:.3f}'.format(top1=top1, top5=top5)) + + return top1.avg + + +class AverageMeter(object): + """计算并存储平均值和当前值""" + def __init__(self, name, fmt=':f'): + self.name = name + self.fmt = fmt + self.reset() + self.start_count_index = 10 + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def 
update(self, val, n=1):
+        if self.count == 0:
+            self.batchsize = n
+
+        self.val = val
+        self.count += n
+        if self.count > (self.start_count_index * self.batchsize):
+            self.sum += val * n
+            self.avg = self.sum / (self.count - self.start_count_index * self.batchsize)
+
+    def __str__(self):
+        fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
+        return fmtstr.format(**self.__dict__)
+
+
+class ProgressMeter(object):
+    """记录模型运算过程"""
+    def __init__(self, num_batches, meters, prefix=""):
+        self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
+        self.meters = meters
+        self.prefix = prefix
+
+    def display(self, batch):
+        entries = [self.prefix + self.batch_fmtstr.format(batch)]
+        entries += [str(meter) for meter in self.meters]
+        print('\t'.join(entries))
+
+    def _get_batch_fmtstr(self, num_batches):
+        num_digits = len(str(num_batches // 1))
+        fmt = '{:' + str(num_digits) + 'd}'
+        return '[' + fmt + '/' + fmt.format(num_batches) + ']'
+
+
+def accuracy(output, target, topk=(1,)):
+    """根据指定值k,计算k个顶部预测的精度"""
+    with torch.no_grad():
+        maxk = max(topk)
+        batch_size = target.size(0)
+
+        _, pred = output.topk(maxk, 1, True, True)
+        pred = pred.t()
+        correct = pred.eq(target.view(1, -1).expand_as(pred))
+
+        res = []
+        for k in topk:
+            correct_k = correct[:k].view(-1).float().sum(0, keepdim=True)
+            res.append(correct_k.mul_(100.0 / batch_size))
+        return res
+
+
+if __name__ == '__main__':
+    main()
+```
+
+## 样例执行
+
+以ResNet50模型为例,执行在线推理样例。
+
+1.  下载预训练模型。
+
+    打开ModelZoo中[ResNet50详情页](https://ascend.huawei.com/zh/#/software/modelzoo/detail/C/cf20ab8b8bea4032a6b056ab503112e4),点击该页面的“下载模型“下载已训练好的模型文件。
+
+2.  编辑推理脚本。
+
+    创建“resnet50\_infer\_for\_pytorch.py“模型脚本文件,并参考[样例代码]()写入相关代码。
+
+3.  执行推理。
+
+    参考[环境变量配置](#环境变量配置.md)设置环境变量,并执行命令:
+
+    ```
+    python3.7 resnet50_infer_for_pytorch.py --data /data/imagenet --npu 7 --epochs 90 --resume checkpoint_npu7_epoch53.pth.tar
+    # 参数'--resume'加载训练好的权重参数文件,用户可根据实际模型名称修改
+    ```
+
+    >![](public_sys-resources/icon-note.gif) **说明:**
+    >上述为样例输入,用户可根据实际修改传入参数。
+
+

专题

+ +- **[混合精度](#混合精度.md)** + +- **[权重更新](#权重更新.md)** + + +

混合精度

+ +## 概述 + +基于NPU芯片的架构特性,模型运算会涉及到混合精度,即混合使用float16和float32数据类型的应用场景。使用float16代替float32有如下好处: + +●对于中间变量的内存占用更少,节省内存的使用。 + +●因内存使用会减少,所以数据传出的时间也会减半。 + +●float16的计算单元可以提供更快的计算性能。 + +但是,混合精度训练受限于float16表达的精度范围,单纯将float32转换成float16会影响训练收敛情况,为了保证部分计算使用float16来进行加速的同时能保证训练收敛,这里采用混合精度模块Apex来达到以上效果。混合精度模块Apex是一个集优化性能、精度收敛于一身的综合优化库。 + +## 特性支持 + +混合精度模块功能和优化描述如[表1](#zh-cn_topic_0278765773_table10717173813332)所示: + +**表 1** 混合精度模块功能 + + + + + + + + + + + + + + + + + + + +

功能

+

描述

+

O1配置模式

+

Conv, Matmul等使用float16计算,其他如Softmax、BN使用float32

+

O2配置模式

+

除了BN使用float32外,其他绝大部分使用float16

+

静态Loss Scale功能

+

静态设置参数确保混合精度训练收敛。

+

动态Loss Scale功能

+

动态计算Loss Scale值并判读是否溢出。

+
+ +>![](public_sys-resources/icon-note.gif) **说明:** +>当前版本的实现方式主要为python实现,不支持AscendCL或者CUDA优化。 + +## 混合精度模型初始化 + +1. 使用apex混合精度模块需要首先从apex库中导入amp,代码如下: + + ``` + from apex import amp + ``` + +2. 导入amp模块后,需要初始化amp,使其能对模型、优化器以及PyTorch内部函数进行必要的改动,初始化代码如下: + + ``` + model, optimizer = amp.initialize(model, optimizer) + ``` + + - 可参考[样例代码](#样例参考.md)中的“初始化混合精度模型”: + + ``` + model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale=1024, verbosity=1) + ``` + + + +## 混合精度推理 + +按混合精度模型初始化后,正常执行模型正向计算即可。 + +参考代码:可参考[样例代码](#样例参考.md)实现。 + +
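+
+下面给出一个混合精度推理的最小示意(仅为示意骨架:其中的nn.Linear小模型与随机输入均为演示用的假设对象,实际请替换为自己的网络与数据):
+
+```
+import torch
+import torch.nn as nn
+import torch.npu
+from apex import amp
+
+model = nn.Linear(16, 4).npu()                            # 演示用的小模型
+optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
+
+# 按上文方式初始化混合精度模型
+model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale=1024, verbosity=1)
+
+# 初始化完成后,正常执行模型正向计算即可
+model.eval()
+with torch.no_grad():
+    output = model(torch.randn(8, 16).npu())
+```
+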

权重更新

+ +## 背景 + +推理进行的同时,训练服务器不断训练得到新的权重。 + +如果希望使用最新的权重进行推理,可以采用在线推理方式,直接更新权重。 + +## 整体流程 + +**图 1** 权重更新流程示意图 +![](figures/权重更新流程示意图.png "权重更新流程示意图") + +如[图1](#fig6243201383)所示,支持循环地更新权重与执行推理。主要流程: + +1. 模型初始化; +2. 权重加载/更新:获取在线推理模型和权重信息,例如从ckpt文件中加载,实际更新用的权重则来自于外部的key-value; +3. 加载数据; +4. 执行推理图模型。 + +## 样例参考 + +请参考[样例参考](#样例参考.md)。 + +
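+
+下面是权重加载/更新环节的一个示意骨架(仅为示意:假设新权重以checkpoint文件形式落盘,且其中以state_dict键保存权重):
+
+```
+import torch
+
+def refresh_weights(model, ckpt_path):
+    """将训练侧产出的最新权重更新到已初始化的推理模型中。"""
+    checkpoint = torch.load(ckpt_path, map_location="cpu")
+    model.load_state_dict(checkpoint["state_dict"])
+    model.eval()  # 更新权重后仍需保持推理模式
+    return model
+```
+
+每当训练侧产出新的权重文件时,调用一次refresh_weights即可,无需重新初始化模型,之后继续执行推理。
+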

FAQ

+ +- **[pip3.7 install Pillow==5.3.0安装失败](#pip3-7-install-Pillow-5-3-0安装失败.md)** + +- **[安装“torch-\*.whl ”提示“torch 1.5.0xxxx”与“torchvision”所依赖的版本不匹配](#安装-torch--whl-提示-torch-1-5-0xxxx-与-torchvision-所依赖的版本不匹配.md)** + + +

pip3.7 install Pillow==5.3.0安装失败

+ +## 现象描述 + +pip3.7 install pillow==5.3.0安装失败。 + +## 可能原因 + +缺少必要的依赖,如:libjpeg、python-devel、 zlib-devel 、libjpeg-turbo-devel等等。 + +## 处理方法 + +安装相关依赖,通过如下命令安装: + +- CentOS/EulerOS/Tlinux/BClinux/Suse + + **yum install libjpeg python-devel zlib-devel libjpeg-turbo-devel** + +- Ubuntu/Debian/UOS + + **apt-get install libjpeg python-devel zlib-devel libjpeg-turbo-devel** + + +

安装“torch-\*.whl ”提示“torch 1.5.0xxxx”与“torchvision”所依赖的版本不匹配

+ +## 现象描述 + +安装“torch-\*.whl”时,提示"ERROR:torchvision 0.6.0 has requirement torch==1.5.0, but you'll have torch 1.5.0a0+1977093 which is incompatible"。 + +![](figures/zh-cn_image_0000001152776305.png) + +## 可能原因 + +安装torch时,会自动触发torchvision进行依赖版本检查,环境中安装的torchvision版本为0.6.0,检查时发现我们安装的torch-\*.whl的版本号与要求的1.5.0不一致,所以提示报错,但实际安装成功。 + +## 处理方法 + +对实际结果无影响,无需处理。 + +

安装7.3.0版本gcc

+ +以下步骤请在root用户下执行。 + +1. 下载gcc-7.3.0.tar.gz,下载地址为[https://mirrors.tuna.tsinghua.edu.cn/gnu/gcc/gcc-7.3.0/gcc-7.3.0.tar.gz](https://mirrors.tuna.tsinghua.edu.cn/gnu/gcc/gcc-7.3.0/gcc-7.3.0.tar.gz)。 +2. 安装gcc时候会占用大量临时空间,所以先执行下面的命令清空/tmp目录: + + ``` + sudo rm -rf /tmp/* + ``` + +3. 安装依赖。 + + centos/bclinux执行如下命令安装。 + + ``` + yum install bzip2 + ``` + + ubuntu/debian执行如下命令安装。 + + ``` + apt-get install bzip2 + ``` + +4. 编译安装gcc。 + 1. 进入gcc-7.3.0.tar.gz源码包所在目录,解压源码包,命令为: + + ``` + tar -zxvf gcc-7.3.0.tar.gz + ``` + + 2. 进入解压后的文件夹,执行如下命令下载gcc依赖包: + + ``` + cd gcc-7.3.0 + ./contrib/download_prerequisites + ``` + + 如果执行上述命令报错,需要执行如下命令在“gcc-7.3.0/“文件夹下下载依赖包: + + ``` + wget http://gcc.gnu.org/pub/gcc/infrastructure/gmp-6.1.0.tar.bz2 + wget http://gcc.gnu.org/pub/gcc/infrastructure/mpfr-3.1.4.tar.bz2 + wget http://gcc.gnu.org/pub/gcc/infrastructure/mpc-1.0.3.tar.gz + wget http://gcc.gnu.org/pub/gcc/infrastructure/isl-0.16.1.tar.bz2 + ``` + + 下载好上述依赖包后,重新执行以下命令: + + ``` + ./contrib/download_prerequisites + ``` + + 如果上述命令校验失败,需要确保依赖包为一次性下载成功,无重复下载现象。 + + 3. 执行配置、编译和安装命令: + + ``` + ./configure --enable-languages=c,c++ --disable-multilib --with-system-zlib --prefix=/usr/local/linux_gcc7.3.0 + make -j15 # 通过grep -w processor /proc/cpuinfo|wc -l查看cpu数,示例为15,用户可自行设置相应参数。 + make install + ``` + + >![](public_sys-resources/icon-caution.gif) **注意:** + >其中“--prefix“参数用于指定linux\_gcc7.3.0安装路径,用户可自行配置,但注意不要配置为“/usr/local“及“/usr“,因为会与系统使用软件源默认安装的gcc相冲突,导致系统原始gcc编译环境被破坏。示例指定为“/usr/local/linux\_gcc7.3.0“。 + + +5. 配置环境变量。 + + 当用户执行训练时,需要用到gcc升级后的编译环境,因此要在训练脚本中配置环境变量,通过如下命令配置。 + + ``` + export LD_LIBRARY_PATH=${install_path}/lib64:${LD_LIBRARY_PATH} + ``` + + 其中$\{install\_path\}为[3.](#zh-cn_topic_0000001146754749_zh-cn_topic_0000001072593337_l75d31a2874534a2092e80a5f865b46f0)中配置的gcc7.3.0安装路径,本示例为“/usr/local/gcc7.3.0/“。 + + >![](public_sys-resources/icon-note.gif) **说明:** + >本步骤为用户在需要用到gcc升级后的编译环境时才配置环境变量。 + + diff --git "a/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\345\234\250\347\272\277\346\216\250\347\220\206\346\214\207\345\215\227 01/figures/zh-cn_image_0000001152776305.png" "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\345\234\250\347\272\277\346\216\250\347\220\206\346\214\207\345\215\227 01/figures/zh-cn_image_0000001152776305.png" new file mode 100644 index 0000000000000000000000000000000000000000..ede83f4bc1b0ed21a9c746c358c45681d5ffb49a Binary files /dev/null and "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\345\234\250\347\272\277\346\216\250\347\220\206\346\214\207\345\215\227 01/figures/zh-cn_image_0000001152776305.png" differ diff --git "a/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\345\234\250\347\272\277\346\216\250\347\220\206\346\214\207\345\215\227 01/figures/\345\234\250\347\272\277\346\216\250\347\220\206\346\265\201\347\250\213\345\233\276.png" "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\345\234\250\347\272\277\346\216\250\347\220\206\346\214\207\345\215\227 01/figures/\345\234\250\347\272\277\346\216\250\347\220\206\346\265\201\347\250\213\345\233\276.png" new file mode 100644 index 0000000000000000000000000000000000000000..9ddcc9d912b91bcb3b47dc6fcb1a8b1d86398c90 Binary files /dev/null and "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\345\234\250\347\272\277\346\216\250\347\220\206\346\214\207\345\215\227 01/figures/\345\234\250\347\272\277\346\216\250\347\220\206\346\265\201\347\250\213\345\233\276.png" differ diff --git "a/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\345\234\250\347\272\277\346\216\250\347\220\206\346\214\207\345\215\227 
01/figures/\346\235\203\351\207\215\346\233\264\346\226\260\346\265\201\347\250\213\347\244\272\346\204\217\345\233\276.png" "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\345\234\250\347\272\277\346\216\250\347\220\206\346\214\207\345\215\227 01/figures/\346\235\203\351\207\215\346\233\264\346\226\260\346\265\201\347\250\213\347\244\272\346\204\217\345\233\276.png" new file mode 100644 index 0000000000000000000000000000000000000000..87b61f83d611b35ea15d0f069b71245d72ee7a0a Binary files /dev/null and "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\345\234\250\347\272\277\346\216\250\347\220\206\346\214\207\345\215\227 01/figures/\346\235\203\351\207\215\346\233\264\346\226\260\346\265\201\347\250\213\347\244\272\346\204\217\345\233\276.png" differ diff --git "a/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01.md" "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01.md" new file mode 100644 index 0000000000000000000000000000000000000000..82c6aca110f215d87eb75aeb0f26d42cdb4afd66 --- /dev/null +++ "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01.md" @@ -0,0 +1,3934 @@ +# PyTorch网络模型移植&训练指南 +- [概述](#概述.md) +- [约束与限制](#约束与限制.md) +- [迁移流程](#迁移流程.md) +- [模型移植评估](#模型移植评估.md) +- [环境准备](#环境准备.md) + - [简介](#简介.md) + - [手动编译安装](#手动编译安装.md) + - [前提条件](#前提条件.md) + - [安装PyTorch框架](#安装PyTorch框架.md) + - [配置环境变量](#配置环境变量.md) + - [安装混合精度模块](#安装混合精度模块.md) + - [使用Ascend Hub镜像](#使用Ascend-Hub镜像.md) + - [Ascend Hub获取PyTorch镜像](#Ascend-Hub获取PyTorch镜像.md) + - [配置环境变量](#配置环境变量-0.md) +- [模型迁移](#模型迁移.md) + - [工具迁移](#工具迁移.md) + - [功能介绍](#功能介绍.md) + - [操作指南](#操作指南.md) + - [结果解析](#结果解析.md) + - [手工迁移](#手工迁移.md) + - [单P训练模型迁移](#单P训练模型迁移.md) + - [多P训练模型迁移](#多P训练模型迁移.md) + - [PyTorch接口替换](#PyTorch接口替换.md) + - [混合精度](#混合精度.md) + - [性能优化](#性能优化.md) + - [概述](#概述-1.md) + - [修改CPU性能模式(X86服务器)](#修改CPU性能模式(X86服务器).md) + - [修改CPU性能模式(ARM服务器)](#修改CPU性能模式(ARM服务器).md) + - [安装高性能pillow库(X86服务器)](#安装高性能pillow库(X86服务器).md) + - [(可选)安装指定版本OpenCV库](#(可选)安装指定版本OpenCV库.md) +- [模型训练](#模型训练.md) +- [性能调优和分析](#性能调优和分析.md) + - [前提条件](#前提条件-2.md) + - [调测过程](#调测过程.md) + - [总体思路](#总体思路.md) + - [采集训练过程相关数据](#采集训练过程相关数据.md) + - [性能优化](#性能优化-3.md) + - [亲和库](#亲和库.md) + - [来源介绍](#来源介绍.md) + - [功能介绍](#功能介绍-4.md) +- [精度调测](#精度调测.md) + - [前提条件](#前提条件-5.md) + - [调测过程](#调测过程-6.md) + - [总体思路](#总体思路-7.md) + - [精度调优方法](#精度调优方法.md) +- [模型保存与转换](#模型保存与转换.md) + - [简介](#简介-8.md) + - [模型保存](#模型保存.md) + - [导出ONNX模型](#导出ONNX模型.md) +- [样例说明](#样例说明.md) + - [ResNet50模型迁移示例](#ResNet50模型迁移示例.md) + - [样例获取](#样例获取.md) + - [训练脚本迁移](#训练脚本迁移.md) + - [单P训练修改](#单P训练修改.md) + - [分布式训练修改](#分布式训练修改.md) + - [脚本执行](#脚本执行.md) + - [ShuffleNet模型调优示例](#ShuffleNet模型调优示例.md) + - [样例获取](#样例获取-9.md) + - [模型评估](#模型评估.md) + - [网络迁移](#网络迁移.md) + - [网络调测](#网络调测.md) +- [参考信息](#参考信息.md) + - 
[单算子样例编写说明](#单算子样例编写说明.md) + - [单算子dump方法](#单算子dump方法.md) + - [常用环境变量说明](#常用环境变量说明.md) + - [dump op方法](#dump-op方法.md) + - [CMake安装方法](#CMake安装方法.md) +- [FAQ](#FAQ.md) + - [软件安装常见问题](#软件安装常见问题.md) + - [pip3.7 install Pillow==5.3.0安装失败](#pip3-7-install-Pillow-5-3-0安装失败.md) + - [安装“torch-\*.whl ”提示“torch 1.5.0xxxx”与“torchvision”所依赖的版本不匹配](#安装-torch--whl-提示-torch-1-5-0xxxx-与-torchvision-所依赖的版本不匹配.md) + - [模型和算子运行常见问题](#模型和算子运行常见问题.md) + - [在模型运行或者算子运行时遇到报错“RuntimeError: ExchangeDevice:”](#在模型运行或者算子运行时遇到报错-RuntimeError-ExchangeDevice.md) + - [在模型运行或者算子运行时遇到报错“Error in atexit.\_run\_exitfuncs:”](#在模型运行或者算子运行时遇到报错-Error-in-atexit-_run_exitfuncs.md) + - [在模型运行时遇到报错“terminate called after throwing an instance of 'c10::Error' what\(\): HelpACLExecute:”](#在模型运行时遇到报错-terminate-called-after-throwing-an-instance-of-c10-Error-what()-HelpACLExecute.md) + - [在模型运行时遇到报错“ImportError: libhccl.so.”](#在模型运行时遇到报错-ImportError-libhccl-so.md) + - [在模型运行时遇到报错“RuntimeError: Initialize.”](#在模型运行时遇到报错-RuntimeError-Initialize.md) + - [在模型运行时遇到报错“TVM/te/cce error.”](#在模型运行时遇到报错-TVM-te-cce-error.md) + - [在模型运行时遇到报错“MemCopySync:drvMemcpy failed.”](#在模型运行时遇到报错-MemCopySync-drvMemcpy-failed.md) + - [在模型运行时将多任务下发关闭\(export TASK\_QUEUE\_ENABLE=0\)后仍然遇到报错“HelpACLExecute.”](#在模型运行时将多任务下发关闭(export-TASK_QUEUE_ENABLE-0)后仍然遇到报错-HelpACLExecute.md) + - [模型调测常见问题](#模型调测常见问题.md) + - [在模型调测时遇到报错“RuntimeError: malloc:/..../pytorch/c10/npu/NPUCachingAllocator.cpp:293 NPU error, error code is 500000.”](#在模型调测时遇到报错-RuntimeError-malloc-pytorch-c10-npu-NPUCachingAllocator-cpp-293-NPU-error-error-code-is-5.md) + - [在模型调测时遇到报错“RuntimeError: Could not run 'aten::trunc.out' with arguments from the 'NPUTensorId' backend.”](#在模型调测时遇到报错-RuntimeError-Could-not-run-aten-trunc-out-with-arguments-from-the-NPUTensorId-backend.md) + - [在模型调测时遇到如MaxPoolGradWithArgmaxV1算子和max算子报错](#在模型调测时遇到如MaxPoolGradWithArgmaxV1算子和max算子报错.md) + - [在调用torch时遇到报错“ModuleNotFoundError: No module named 'torch.\_C'”](#在调用torch时遇到报错-ModuleNotFoundError-No-module-named-torch-_C.md) + - [其他操作相关问题](#其他操作相关问题.md) + - [cuda流同步操作报错](#cuda流同步操作报错.md) + - [aicpu\_kernels/libpt\_kernels.so不存在](#aicpu_kernels-libpt_kernels-so不存在.md) + - [使用npu-smi info查看显存时发现python进程残留](#使用npu-smi-info查看显存时发现python进程残留.md) + - [动态shape报错“match op inputs failed”](#动态shape报错-match-op-inputs-failed.md) + - [Op type SigmoidCrossEntropyWithLogitsV2 of ops kernel AIcoreEngine is unsupported](#Op-type-SigmoidCrossEntropyWithLogitsV2-of-ops-kernel-AIcoreEngine-is-unsupported.md) + - [Hook失败](#Hook失败.md) + - [加载权重时遇到报错“load state\_dict error.”](#加载权重时遇到报错-load-state_dict-error.md) + - [模型分布式训练常见问题](#模型分布式训练常见问题.md) + - [在进行模型分布式训练时遇到报错“host not found.”](#在进行模型分布式训练时遇到报错-host-not-found.md) + - [在进行模型分布式训练时遇到报错“RuntimeError:connect\(\) timed out.”](#在进行模型分布式训练时遇到报错-RuntimeError-connect()-timed-out.md) +

概述

+ +当前阶段针对PyTorch框架实现的对接适配昇腾AI处理器的方案为在线对接方案。 + +## 方案特性及优点 + +昇腾AI处理器的加速实现方式是以各种算子为粒度进行调用(OP-based),即通过AscendCL调用一个或几个D亲和算子组合的形式,代替原有GPU的实现方式。其逻辑模型如[图1](#fig2267112413239)所示。 + +**图 1** 逻辑模型 + + +![](figures/pytorch适配逻辑结构图-优化.png) + +当前选择在线对接适配方案的主要原因有以下几点: + +1. 最大限度的继承PyTorch框架动态图的特性。 +2. 最大限度的继承GPU在PyTorch上的使用方式,可以使用户在将模型移植到昇腾AI处理器设备进行训练时,在开发方式和代码重用方面做到最小的改动。 +3. 最大限度的继承PyTorch原生的体系结构,保留框架本身出色的特性,比如自动微分、动态分发、Debug、Profiling、Storage共享机制以及设备侧的动态内存管理等。 +4. 扩展性好。在打通流程的通路之上,对于新增的网络类型或结构,只需涉及相关计算类算子的开发和实现。框架类算子,反向图建立和实现机制等结构可保持复用。 +5. 与GPU的使用方式和风格保持一致。用户在使用在线对接方案时,只需在Python侧和Device相关操作中,指定device为昇腾AI处理器,即可完成用昇腾AI处理器在PyTorch对网络的开发、训练以及调试,用户无需额外进一步关注昇腾AI处理器具体的底层细节。这样可以确保用户的最小化修改及完成平台迁移,迁移成本较低。 + +

约束与限制

+
+1. infershape阶段算子不支持unknown shape的推导。
+2. cube计算的算子只支持fp16。
+3. 不支持inf/nan类型的输入。
+4. 出现4D以上的format时不能降维。
+5. Apex当前版本的实现方式为python实现,不支持APEX中的自定义优化CUDA Kernel。
+6. Apex当前版本只支持适配昇腾AI处理器的混合精度计算和多种融合优化器功能,其他功能暂未支持。
+7. 集合通信约束:
+    - 数据并行模式中不同device上执行的图相同。
+    - 只支持1/2/4/8P粒度的分配。
+    - 只支持int8,int32,float16和float32数据类型。
+    - 服务器网卡名称要求以eth开头。
+

迁移流程

+ +模型迁移主要指将开源社区中实现过的模型迁移到昇腾AI处理器上,主要流程如[图1](#fig759451810422)所示。 + +**图 1** 迁移流程 +![](figures/迁移流程.png "迁移流程") + +**表 1** 迁移流程说明 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

场景

+

说明

+

模型选取

+

详情请参见模型选取

+

模型移植评估

+

详情请参见模型移植评估

+

算子开发

+

详情请参见《PyTorch算子开发指南》

+

环境准备

+

详情请参见环境准备

+

模型迁移

+

详情请参见模型迁移

+

模型训练

+

详情请参见模型训练

+

错误分析

+

详情请参见《CANN 日志参考》及《CANN 开发辅助工具指南 (训练)》中“AI Core Error分析工具使用指南”章节。

+

性能调优和分析

+

详情请参见性能调优和分析

+

精度调测

+

详情请参见精度调测

+

模型保存与转换

+

详情请参见模型保存与转换及《CANN 开发辅助工具指南 (推理)》中“ATC工具使用指南”章节。

+

应用软件开发

+

详情请参见《CANN 应用软件开发指南(C&C++, 推理)》

+

FAQ

+

主要涉及环境准备、模型迁移、模型调测和其他常见问题的解决方法。详情请参见FAQ

+
+ +

模型移植评估

+ +1. 在选取模型时,尽可能选取权威Pytorch模型实现仓作为标杆,包括但不限于Pytorch\([example](https://github.com/pytorch/examples/tree/master/imagenet)/[vision](https://github.com/pytorch/vision)等\)、facebookresearch\([Detectron](https://github.com/facebookresearch/Detectron)/[detectron2](https://github.com/facebookresearch/detectron2)等\)和open-mmlab\([mmdetection](https://github.com/open-mmlab/mmdetection)/[mmpose](https://github.com/open-mmlab/mmpose)等\)。 +2. 查看算子适配情况。将原始模型及训练脚本迁移到昇腾AI处理器上之前,可以将原始模型及训练脚本在CPU上进行训练,使用dump op方法获取算子信息,与《PyTorch适配算子清单》算子进行比较,查看是否支持。dump op方法参见[dump op方法](#dump-op方法.md),当有不支持算子时参见《PyTorch算子开发指南》进行算子开发。 + + >![](public_sys-resources/icon-note.gif) **说明:** + >查看算子适配情况也可以先将模型及训练脚本迁移到昇腾AI处理器进行训练来查看报错信息,迁移方法参见下文。一般会提示不能在昇腾AI处理器的backend下运行某个算子。 + + +

环境准备

+ +- **[简介](#简介.md)** + +- **[手动编译安装](#手动编译安装.md)** + +- **[使用Ascend Hub镜像](#使用Ascend-Hub镜像.md)** + + +

简介

+ +用户在准备相关环境进行PyTorch模型的移植及训练时,可以选择在训练服务器中手动编译安装PyTorch框架相关模块,也可使用Ascend Hub镜像中心提供的基础镜像(镜像中已安装PyTorch模块和混合精度模块),进行模型的移植与训练。 + +**图 1** 环境准备流程图 +![](figures/环境准备流程图.png "环境准备流程图") + +

手动编译安装

+ +- **[前提条件](#前提条件.md)** + +- **[安装PyTorch框架](#安装PyTorch框架.md)** + +- **[配置环境变量](#配置环境变量.md)** + +- **[安装混合精度模块](#安装混合精度模块.md)** + + +

前提条件

+ +## 前提条件 + +- 已完成CANN开发或运行环境的安装,具体操作请参考《CANN 软件安装指南》。 +- 需安装3.12.0以上版本的CMake,安装方法请参考[CMake安装方法](#CMake安装方法.md)。 +- 需确保已安装7.3.0以上版本的gcc,7.3.0版本gcc具体安装及使用方式请参见《CANN 软件安装指南》中的“安装7.3.0版本gcc”章节。 +- 需确保环境中已安装patch、git工具,以Ubuntu和CentOS系统为例,命令如下: + - Ubuntu系统 + + **apt-get install patch** + + **apt-get install git** + + - CentOS系统 + + **yum install patch** + + **yum install git** + + + +

安装PyTorch框架

+ +## 安装流程 + +1. 以root或非root用户登录服务器。 +2. 依次执行如下命令安装PyTorch依赖环境。 + + 如果使用非root用户安装Python及其依赖,用户需要在本步骤中的每句命令结尾加上**--user**,命令示例为:**pip3.7 install pyyaml --user** + + ``` + pip3.7 install pyyaml + pip3.7 install wheel + ``` + + 若以上过程报错,请参考[FAQ](#FAQ.md)尝试解决问题。 + +3. 获取PyTorch源代码。 + + 1. 运行如下命令,获取适配昇腾AI处理器的PyTorch源代码。 + + ``` + git clone https://gitee.com/ascend/pytorch.git + ``` + + 下载的源码主要目录结构如下所示: + + ``` + pytorch + │ ├─patch # 昇腾AI处理器适配补丁目录 + │ ├─npu.patch + │ ├─scripts # 编译构建目录 + │ ├─gen.sh + │ ├─src # 源码目录 + │ ├─test # 测试用例存放目录 + │ ├─README.md + ``` + + 2. 运行如下命令,进入“pytorch“目录,并获取原生PyTorch源代码。 + + ``` + cd pytorch + git clone -b v1.5.0 --depth=1 https://github.com/pytorch/pytorch.git + ``` + + 下载原生pytorch源码后,代码主要目录结构如下所示: + + ``` + pytorch + │ ├─patch # 昇腾AI处理器适配补丁目录 + │ ├─npu.patch + │ ├─pytorch # 原生pytorch代码目录 + │ ├─scripts # 编译构建目录 + │ ├─gen.sh + │ ├─src # 源码目录 + │ ├─test # 测试用例存放目录 + │ ├─README.md + ``` + + 3. 运行如下命令,进入原生pytorch代码目录“pytorch“,并获取PyTorch被动依赖代码。 + + ``` + cd pytorch + git submodule sync + git submodule update --init --recursive + ``` + + + >![](public_sys-resources/icon-note.gif) **说明:** + >受网络波动影响,源码获取时间可能较长,下载过程中请耐心等待。 下载完成之后若没有报错,即生成了PyTorch及其依赖的第三方代码。 + +4. 编译生成适配昇腾AI处理器的PyTorch安装包。 + 1. 进入“pytorch/scripts“文件夹,执行转换脚本,生成适配昇腾AI处理器的全量代码。 + + ``` + cd ../scripts + bash gen.sh + ``` + + 将在"pytorch/pytorch"目录中生成适配昇腾AI处理器的全量代码。 + + 2. 进入适配后的全量代码目录,即“pytorch/pytorch“目录,编译生成pytorch的二进制安装包。 + + ``` + cd ../pytorch + bash build.sh + ``` + + 生成的二进制包在当前的dist目录下,即“pytorch/pytorch/dist”文件夹目录下。 + + +5. 安装PyTorch。 + + 进入“pytorch/pytorch/dist“文件夹目录,执行如下命令安装。 + + ``` + pip3 install --upgrade torch-1.5.0+ascend-cp37-cp37m-linux_{arch}.whl + ``` + + _**\{arch\}**_表示架构信息,为aarch64或x86\_64。 + + >![](public_sys-resources/icon-note.gif) **说明:** + >若环境中已安装了PyTorch或需要对PyTorch进行升级时,需要先卸载环境中已安装的PyTorch软件包再执行[5. 安装PyTorch。](#li49671667141)。 + + +

配置环境变量

+ +安装完软件包后,需要配置环境变量才能正常使用昇腾PyTorch。建议构建启动脚本,例如构建set\_env.sh脚本,使用source set\_env.sh配置当前窗口的环境变量。set\_env.sh脚本内容如下。 + +``` +cpu_type=$(echo $HOSTTYPE) + +if [ x"${cpu_type}" == x"x86_64" ];then + cpu_type=x86_64-linux +else + cpu_type=arm64-linux +fi +if [ -d /usr/local/Ascend/nnae/latest ];then + export LD_LIBRARY_PATH=/usr/local/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/Ascend/nnae/latest/fwkacllib/lib64/:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/lib/aarch64_64-linux-gnu:$LD_LIBRARY_PATH + export PATH=$PATH:/usr/local/Ascend/nnae/latest/fwkacllib/ccec_compiler/bin/:/usr/local/Ascend/nnae/latest/toolkit/tools/ide_daemon/bin/ + export ASCEND_OPP_PATH=/usr/local/Ascend/nnae/latest/opp/ + export OPTION_EXEC_EXTERN_PLUGIN_PATH=/usr/local/Ascend/nnae/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:/usr/local/Ascend/nnae/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:/usr/local/Ascend/nnae/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so + export PYTHONPATH=/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:/usr/local/Ascend/nnae/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH + export ASCEND_AICPU_PATH=/usr/local/Ascend/nnae/latest/ +else + export LD_LIBRARY_PATH=/usr/local/:/usr/local/lib/:/usr/lib64/:/usr/lib/:/usr/local/python3.7.5/lib/:/usr/local/openblas/lib:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/:/usr/local/Ascend/driver/lib64/common/:/usr/local/Ascend/driver/lib64/driver/:/usr/local/Ascend/add-ons/:/usr/lib/aarch64-linux-gnu:$LD_LIBRARY_PATH + export PATH=$PATH:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin/:/usr/local/Ascend/ascend-toolkit/latest/toolkit/tools/ide_daemon/bin/ + export ASCEND_OPP_PATH=/usr/local/Ascend/ascend-toolkit/latest/opp/ + export OPTION_EXEC_EXTERN_PLUGIN_PATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libfe.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libaicpu_engine.so:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/plugin/opskernel/libge_local_engine.so + export PYTHONPATH=/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/auto_tune.egg/auto_tune:/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/python/site-packages/schedule_search.egg:$PYTHONPATH + export ASCEND_AICPU_PATH=/usr/local/Ascend/ascend-toolkit/latest/${cpu_type} +fi +path_lib=$(python3.7 -c """ +import sys +import re +result='' +for index in range(len(sys.path)): + match_sit = re.search('-packages', sys.path[index]) + if match_sit is not None: + match_lib = re.search('lib', sys.path[index]) + if match_lib is not None: + end=match_lib.span()[1] + result += sys.path[index][0:end] + ':' + result+=sys.path[index] + '/torch/lib:' +print(result)""" +) +export LD_LIBRARY_PATH=/usr/local/python3.7.5/lib/:${path_lib}:$LD_LIBRARY_PATH +export TASK_QUEUE_ENABLE=1 + +# (可选)当系统为openeuler时,需设置此命令,取消CPU绑核。 +# unset GOMP_CPU_AFFINITY + +# 请依据实际,在下列场景中选择合适的HCCL初始化方式,并配置相应环境变量。具体如下: +# 场景一:单机场景 +export HCCL_WHITELIST_DISABLE=1 # 关闭HCCL通信白名单 +# 场景二:多机场景。 +export HCCL_WHITELIST_DISABLE=1 # 关闭HCCL通信白名单 +export HCCL_IF_IP="1.1.1.1" # “1.1.1.1”为示例使用的host网卡IP,请根据实际修改。需要保证使用的网卡IP在集群内是互通的。 +``` + +相关参数介绍参见下[表1](#table42017516135)。 + +**表 1** 环境变量说明 + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

配置项

+

说明

+

LD_LIBRARY_PATH

+

动态库的查找路径,参考上述举例配置。

+

PYTHONPATH

+

Python搜索路径,参考上述举例配置。

+

PATH

+

可执行程序的查找路径,参考上述举例配置。

+

ASCEND_OPP_PATH

+

算子根目录,参考上述举例配置。

+

OPTION_EXEC_EXTERN_PLUGIN_PATH

+

算子信息库路径。

+

ASCEND_AICPU_PATH

+

aicpu算子包路径。

+

TASK_QUEUE_ENABLE

+

使用异步任务下发,异步调用acl接口。建议开启,开启设置为1。

+

HCCL_WHITELIST_DISABLE

+

配置在使用HCCL时是否开启通信白名单。

+

1:关闭白名单,无需校验HCCL通信白名单。

+

0:开启白名单,需校验HCCL通信白名单。

+

缺省值为0,默认开启白名单。

+

HCCL_IF_IP

+

配置HCCL的初始化通信网卡IP。

+

- ip格式为点分十进制。

+

- 暂只支持host网卡。

+

缺省时,按照以下优先级选定host通信网卡名:docker/local以外网卡(网卡名字字典序升序排列) >docker 网卡 > local网卡

+

unset GOMP_CPU_AFFINITY

+

(可选)当系统为openeuler时,需设置此命令,取消CPU绑核。

+
+ +

安装混合精度模块

+ +## 前提条件 + +1. 请确保运行环境中适配昇腾AI处理器的PyTorch框架能正常使用。 +2. 编译安装Apex前,需参见[配置环境变量](#配置环境变量.md)配置好编译过程依赖的环境变量。 + +## 安装流程 + +1. 以root或非root用户登录服务器。 +2. 获取apex源代码。 + + 1. 运行如下命令,获取适配昇腾AI处理器的apex源代码。 + + ``` + git clone https://gitee.com/ascend/apex.git + ``` + + 下载的源码主要目录结构如下所示: + + ``` + apex + │ ├─patch # 昇腾AI处理器适配补丁目录 + │ ├─npu.patch + │ ├─scripts # 编译构建目录 + │ ├─gen.sh + │ ├─src # 源码目录 + │ ├─tests # 测试用例存放目录 + │ ├─README.md + ``` + + 2. 运行如下命令,进入“apex“目录,并获取原生apex源代码。 + + ``` + cd apex + git clone https://github.com/NVIDIA/apex.git + ``` + + 下载原生apex源码后,代码主要目录结构如下所示: + + ``` + apex + │ ├─apex # 原生apex代码目录 + │ ├─patch # 昇腾AI处理器适配补丁目录 + │ ├─npu.patch + │ ├─scripts # 编译构建目录 + │ ├─gen.sh + │ ├─src # 源码目录 + │ ├─tests # 测试用例存放目录 + │ ├─README.md + ``` + + 3. 进入原生pytorch代码目录,即“apex/apex“目录。切换至commitid为4ef930c1c884fdca5f472ab2ce7cb9b505d26c1a的代码分支。 + + ``` + cd apex + git checkout 4ef930c1c884fdca5f472ab2ce7cb9b505d26c1a + cd .. + ``` + + + >![](public_sys-resources/icon-note.gif) **说明:** + >受网络波动影响,源码获取时间可能较长,下载过程中请耐心等待。 + +3. 编译生成适配昇腾AI处理器的apex安装包。 + 1. 进入“apex/scripts“文件夹,执行转换脚本,生成适配昇腾AI处理器的全量代码。 + + ``` + cd ../scripts + bash gen.sh + ``` + + 将在"apex/apex"目录中生成适配昇腾AI处理器的全量代码。 + + 2. 进入适配后的全量代码目录,即“apex/apex“目录,编译生成apex的二进制安装包。 + + ``` + cd ../apex + python3 setup.py --cpp_ext --npu_float_status bdist_wheel + ``` + + 生成的二进制包在当前的dist目录下,即“apex/apex/dist”文件夹目录下。 + + +4. 安装apex。 + + 进入“apex/apex/dist“文件夹目录,执行如下命令安装。 + + ``` + pip3.7 install --upgrade apex-0.1+ascend-cp37-cp37m-linux_{arch}.whl + ``` + + _**\{arch\}**_表示架构信息,为aarch64或x86\_64。 + + >![](public_sys-resources/icon-note.gif) **说明:** + >若环境中已安装了Apex或需要对Apex进行升级时,需要先卸载环境中已安装的Apex软件包再执行[4](#li425495374416)。 + + +

使用Ascend Hub镜像

+ +- **[Ascend Hub获取PyTorch镜像](#Ascend-Hub获取PyTorch镜像.md)** + +- **[配置环境变量](#配置环境变量-0.md)** + + +

Ascend Hub获取PyTorch镜像

+ +## 前提条件 + +- 已完成CANN开发或运行环境的安装,具体操作请参考《CANN 软件安装指南》。 +- 宿主机上已安装Docker。 + +## 获取并使用镜像 + +用户可登录[Ascend Hub](https://ascendhub.huawei.com/#/home)获取相应镜像(首次申请需要激活账号)。 + +当前支持的镜像列表如[表1](#zh-cn_topic_0000001074498056_table1519011227314)所示。用户可根据实际选择所需的镜像进行下载并使用。 + +**表 1** 镜像列表 + + + + + + + + + + + + +

镜像名称

+

镜像版本

+

配套CANN版本

+
+

21.0.2

+

5.0.2

+
+ +

配置环境变量

+ +启动并进入镜像容器后,请参见[配置环境变量](#配置环境变量.md)配置模型训练依赖的环境变量。 + +

模型迁移

+ +- **[工具迁移](#工具迁移.md)** + +- **[手工迁移](#手工迁移.md)** + +- **[混合精度](#混合精度.md)** + +- **[性能优化](#性能优化.md)** + + +

工具迁移

+ +Ascend平台提供了脚本转换工具使用户能通过命令行方式将训练脚本迁移到昇腾AI处理器上进行训练,命令行方式工具详细使用说明参见下文。除命令行方式外,用户也可通过MindStudio中集成的PyTorch GPU2Ascend功能进行迁移,详情请参见《MindStudio 用户指南》。 + +- **[功能介绍](#功能介绍.md)** + +- **[操作指南](#操作指南.md)** + +- **[结果解析](#结果解析.md)** + + +

功能介绍

+ +## 简介 + +昇腾NPU是AI算力的后起之秀,但目前训练和在线推理脚本大多是基于GPU的。由于NPU与GPU的架构差异,基于GPU的训练和在线推理脚本不能直接在NPU上使用,脚本转换工具提供了将基于GPU的脚本转换为基于NPU的脚本的自动化方法,节省了人工手动进行脚本迁移的学习成本与工作量,大幅提升了迁移效率。 + +>![](public_sys-resources/icon-note.gif) **说明:** +>- 脚本转换工具根据适配规则,对用户脚本给出修改建议并提供转换功能,大幅度提高了脚本迁移速度,降低了开发者的工作量。但转换结果仅供参考,仍需用户根据实际情况做少量适配。 +>- 脚本转换工具当前仅支持PyTorch训练脚本转换。 + +## 系统要求 + +脚本转换工具支持Ubuntu 18.04、Centos 7.6或EulerOS 2.8。 + +## 环境准备 + +详情请参考《CANN 软件安装指南》安装开发环境。 + +

操作指南

+ +## 参数说明 + +**表 1** 参数说明 + + + + + + + + + + + + + + + + + + + + + + + + +

参数

+

参数说明

+

取值示例

+

-i

+

--input

+
  • 要进行转换的原始脚本文件所在文件夹路径或文件路径。
  • 必选。
+
  • /home/username/fmktransplt
  • /home/username/fmktransplt.py
+

-o

+

--output

+
  • 脚本转换结果文件输出路径。会在该路径下输出带有msft后缀的文件夹。
  • 必选。
+

/home/username/fmktransplt_output

+

-r

+

--rule

+
  • 用户自定义通用转换规则的json文件路径,主要分为:函数参数修改、函数名称修改和模块名称修改三部分。
  • 可选。
+

/home/username/fmktransplt_rule.json

+

-h

+

--help

+

显示帮助信息。

+

-

+
+ +## 自定义规则文件 + +自定义转换规则样例如下: + +``` +{ + "rules": { + "ArgsModifyRule": [ + { + "func_name": "name1", + "arg_idx": 0, + "arg_new": "agrs0" + }, + { + "func_name": "name2", + "arg_idx": 0, + "arg_new": "agrs0" + } + ], + "FuncNameModifyRule": [ + { + "old_name": "func", + "new_name": "new_func" + } + ], + "ModuleNameModifyRule": [ + { + "old_name": "module", + "new_name": "new_module" + } + ] + } +} +``` + +**表 2** 参数说明 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

参数

+

说明

+

ArgsModifyRule

+

函数参数修改

+

func_name

+

函数名称

+

arg_idx

+

参数的位置

+

arg_new

+

新的参数

+

FuncNameModifyRule

+

函数名称修改

+

ModuleNameModifyRule

+

模块名称修改

+

old_name

+

旧名称

+

new_name

+

新名称

+
+ +## 执行转换 + +1. 进入脚本转换工具所在路径。 + + ``` + cd Ascend-cann-toolkit安装目录/ascend-toolkit/{version}/{arch}-linux/toolkit/tools/ms_fmk_transplt + ``` + +2. 执行脚本转换工具。 + + ``` + python3.7.5 ms_fmk_transplt.py -i 原始脚本路径 -o 脚本转换结果输出路径 [-r 自定义规则json文件路径] + ``` + +3. 完成脚本转换。 + +

结果解析

+ +脚本转换完成后,进入脚本转换结果输出路径查看结果文件。 + +``` +├── xxx_msft // 脚本转换结果输出目录,默认为原始脚本路径。xxx为原始脚本所在文件夹名称。 +│ ├── 生成脚本文件 // 与转换前的脚本文件目录结构一致 +│ ├── msFmkTranspltlog.txt // 脚本转换过程日志文件。 +│ ├── unsupported_op.xlsx // 不支持算子列表文件。 +``` + +

手工迁移

+ +- **[单P训练模型迁移](#单P训练模型迁移.md)** + +- **[多P训练模型迁移](#多P训练模型迁移.md)** + +- **[PyTorch接口替换](#PyTorch接口替换.md)** + + +

单P训练模型迁移

+
+当前在线对接方案的优点在于,保证在昇腾AI处理器上训练时与GPU的使用方式和风格保持一致。用户在使用在线对接方案时,**只需在Python侧和Device相关操作中,指定device为昇腾AI处理器**,即可完成用昇腾AI处理器在PyTorch对网络的开发、训练以及调试。针对单P模型训练,主要迁移改动如下:
+
+迁移前GPU代码:
+
+```
+    CALCULATE_DEVICE = "cuda:0"
+    torch.cuda.set_device(CALCULATE_DEVICE)
+    # 放到device的两种方法
+    model = model.cuda() # 写法1
+    model = model.to(CALCULATE_DEVICE) # 写法2
+    # 将输入也从host放到device
+    images = images.to(CALCULATE_DEVICE)
+    target = target.to(CALCULATE_DEVICE)
+```
+
+迁移到昇腾AI处理器上代码为:
+
+```
+    CALCULATE_DEVICE = "npu:0"
+    torch.npu.set_device(CALCULATE_DEVICE)
+    # 放到device的两种方法
+    model = model.npu() # 写法1
+    model = model.to(CALCULATE_DEVICE) # 写法2
+    # 将输入也从host放到device
+    images = images.to(CALCULATE_DEVICE)
+    target = target.to(CALCULATE_DEVICE)
+```
+
+更多迁移细节请参见[单P训练修改](#单P训练修改.md)。
+

多P训练模型迁移

+ +多P训练模型迁移除了需在**Python侧和Device相关操作中,指定device为昇腾AI处理器**外,依然通过PyTorch的DistributedDataParallel方式来进行分布式训练,即在模型初始化阶段执行init\_process\_group,再将模型初始化为DistributedDataParallel模型。但须注意的是在初始化init\_process\_group时需要将**backend**配置为**hccl**并屏蔽掉初始化方式。 + +PyTorch分布式训练代码示例(部分代码省略): + +``` +import torch +import torch.distributed as dist +import torch.nn.parallel +def main(): + args = parser.parse_args() + # 需屏蔽掉初始化方式 + dist.init_process_group(backend='hccl',# init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) + train_loader = torch.utils.data.DataLoader( + train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), + num_workers=args.workers, pin_memory=True, sampler=train_sampler) + for epoch in range(args.start_epoch, args.epochs): + acc1 = train(train_loader, model, criterion, optimizer, epoch, args,ngpus_per_node, + lr_scheduler) +``` + +更多迁移细节请参见[分布式训练修改](#分布式训练修改.md)。 + +

PyTorch接口替换

+ +1. 为了使昇腾AI处理器使用PyTorch框架的能力,需要对原生的PyTorch框架进行一定Device层面的适配,对外呈现是需要将跟cpu和cuda相关的接口进行切换;在进行网络迁移时,需要将某些设备相关的接口转换成跟昇腾AI处理器相关的接口,当前适配的设备相关接口参见: + + **表 1** 设备接口替换 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

PyTorch原始接口

+

适配昇腾AI处理器后的接口

+

说明

+

torch.cuda.is_available()

+

torch.npu.is_available()

+

判断当前环境上设备是否可用(不代表最后结果)。

+

torch.cuda.current_device()

+

torch.npu.current_device()

+

获取当前正在使用的device。

+

torch.cuda.device_count()

+

torch.npu.device_count()

+

获取当前环境上的设备数量。

+

torch.cuda.set_device()

+

torch.npu.set_device()

+

设置当前正在使用的device。

+

torch.tensor([1,2,3]).is_cuda

+

torch.tensor([1,2,3]).is_npu

+

判断某个tensor是否是cuda/npu设备上的格式。

+

torch.tensor([1,2,3]).cuda()

+

torch.tensor([1,2,3]).npu()

+

将某个tensor转换成cuda/npu格式。

+

torch.tensor([1,2,3]).to("cuda")

+

torch.tensor([1,2,3]).to('npu')

+

将某个tensor转换成cuda/npu格式。

+

torch.cuda.synchronize()

+

torch.npu.synchronize()

+

同步等待事件完成。

+

torch.cuda.device

+

torch.npu.device

+

生成一个device类,可以执行device相关操作。

+

torch.cuda.Stream(device)

+

torch.npu.Stream(device)

+

生成一个stream对象。

+

torch.cuda.stream(Stream)

+

torch.npu.stream(Stream)

+

多用于作用域限定。

+

torch.cuda.current_stream()

+

torch.npu.current_stream()

+

获取当前stream。

+

torch.cuda.default_stream()

+

torch.npu.default_stream()

+

获取默认stream。

+

device = torch.device("cuda:0")

+

device = torch.device("npu:0")

+

指定一个设备。

+

torch.autograd.profiler.profile

+

(use_cuda=True)

+

torch.autograd.profiler.profile

+

(use_npu=True)

+

指定执行profiler过程中使用cuda/npu。

+

torch.cuda.Event()

+

torch.npu.Event()

+

返回某个设备上的事件。

+
+ +2. 用户在构建网络或进行网络迁移时,需要创建指定数据类型的tensor,在昇腾AI处理器上创建的tensor如下。 + + **表 2** tensor创建接口替换 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

GPU tensor

+

适配昇腾AI处理器后的接口

+

torch.tensor([1,2,3],dtype=torch.long,device='cuda')

+

torch.tensor([1,2,3],dtype=torch.long,device='npu')

+

torch.tensor([1,2,3],dtype=torch.int,device='cuda')

+

torch.tensor([1,2,3],dtype=torch.int,device='npu')

+

torch.tensor([1,2,3],dtype=torch.half,device='cuda')

+

torch.tensor([1,2,3],dtype=torch.half,device='npu')

+

torch.tensor([1,2,3],dtype=torch.float,device='cuda')

+

torch.tensor([1,2,3],dtype=torch.float,device='npu')

+

torch.tensor([1,2,3],dtype=torch.bool,device='cuda')

+

torch.tensor([1,2,3],dtype=torch.bool,device='npu')

+

torch.cuda.BoolTensor([1,2,3])

+

torch.npu.BoolTensor([1,2,3])

+

torch.cuda.FloatTensor([1,2,3])

+

torch.npu.FloatTensor([1,2,3])

+

torch.cuda.IntTensor([1,2,3])

+

torch.npu.IntTensor([1,2,3])

+

torch.cuda.LongTensor([1,2,3])

+

torch.npu.LongTensor([1,2,3])

+

torch.cuda.HalfTensor([1,2,3])

+

torch.npu.HalfTensor([1,2,3])

+
+ + +更多接口请参见《PyTorch API支持清单》。 + +
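+
+在实际迁移中,可以把device字符串集中定义在一处,按上表完成接口替换后,其余代码基本不需改动。下面是一个示意写法(仅为示意,"npu:0"可根据实际设备修改):
+
+```
+import torch
+import torch.npu
+
+torch.npu.set_device("npu:0")    # 迁移前此处通常为torch.cuda.set_device("cuda:0")
+device = torch.device("npu:0")
+
+tensor = torch.tensor([1, 2, 3], dtype=torch.float, device=device)
+print(tensor.is_npu)             # 对应迁移前的is_cuda判断
+```
+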

混合精度

+ +## 概述 + +基于NPU芯片的架构特性,会涉及到混合精度训练,即混合使用float16和float32数据类型的应用场景。使用float16代替float32有如下一些好处: + +- 对于中间变量的内存占用更少,节省内存的使用。 +- 因内存使用会减少,所以数据传出的时间也会减半。 +- float16的计算单元可以提供更快的计算性能。 + +但是,混合精度训练受限于float16表达的精度范围,单纯将float32转换成float16会影响训练收敛情况,为了保证部分计算使用float16来进行加速的同时能保证训练收敛,这里采用混合精度模块Apex来达到以上效果。混合精度模块Apex是一个集优化性能、精度收敛于一身的综合优化库。 + +适配昇腾AI处理器的混合精度模块Apex除了上述优点外,还能提升运算性能。具体如下: + +- Apex在进行混合精度运算时,会对模型的grad进行运算,开启combine\_grad开关,可以加速这些运算。具体为将amp.initialize\(\)接口参数combine\_grad设置为True; +- 适配后的Apex针对adadelta/adam/sgd/lamb做了昇腾AI处理器亲和性优化,得到的NPU融合优化器与原生算法保持一致,但运算速度更快。使用时只需将原有优化器替换为apex.optimizers.\*(“\*”为优化器名称,例如NpuFusedSGD)。 + +## 特性支持 + +混合精度模块功能和优化描述如[表1](#table10717173813332)所示: + +**表 1** 混合精度模块功能 + + + + + + + + + + + + + + + + + + + +

功能

+

描述

+

O1配置模式

+

Conv, Matmul等使用float16计算,其他如Softmax、BN使用float32

+

O2配置模式

+

除了BN使用float32外,其他绝大部分使用float16

+

静态Loss Scale功能

+

静态设置参数确保混合精度训练收敛。

+

动态Loss Scale功能

+

动态计算loss Scale值并判断是否溢出。

+
+ +>![](public_sys-resources/icon-note.gif) **说明:** +>- 当前版本的实现方式主要为python实现,不支持AscendCL或者CUDA优化。 +>- 当前昇腾AI设备暂不支持原始Apex的FusedLayerNorm接口模块,如果模型原始脚本文件使用了FusedLayerNorm接口模块,需要在模型迁移过程中将脚本头文件“from apex.normalization import FusedLayerNorm“替换为“from torch.nn import LayerNorm“。 + +## 将混合精度模块集成到PyTorch模型中 + +1. 使用apex混合精度模块需要首先从apex库中导入amp,代码如下: + + ``` + from apex import amp + ``` + +2. 导入amp模块后,需要初始化amp,使其能对模型、优化器以及PyTorch内部函数进行必要的改动,初始化代码如下: + + ``` + model, optimizer = amp.initialize(model, optimizer) + ``` + +3. 标记反向传播.backward\(\)发生的位置,这样Amp就可以进行Loss Scaling并清除每次迭代的状态,代码如下: + + 原始代码: + + ``` + loss = criterion(…) + loss.backward() + optimizer.step() + ``` + + 修改以支持loss scaling后的代码: + + ``` + loss = criterion(…) + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + optimizer.step() + ``` + + +
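+
+结合上文的亲和性优化点,下面给出一个开启combine\_grad开关并使用NPU融合优化器的示意(仅为示意:其中nn.Linear小模型、lr与loss_scale取值均为演示用的假设值):
+
+```
+import torch
+import torch.nn as nn
+import apex
+from apex import amp
+
+model = nn.Linear(16, 4).npu()
+# 将原生优化器替换为NPU亲和的融合优化器
+optimizer = apex.optimizers.NpuFusedSGD(model.parameters(), lr=0.1)
+# 初始化时开启combine_grad开关,加速混合精度下对grad的运算
+model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale=128.0, combine_grad=True)
+```
+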

性能优化

+ +- **[概述](#概述-1.md)** + +- **[修改CPU性能模式(X86服务器)](#修改CPU性能模式(X86服务器).md)** + +- **[修改CPU性能模式(ARM服务器)](#修改CPU性能模式(ARM服务器).md)** + +- **[安装高性能pillow库(X86服务器)](#安装高性能pillow库(X86服务器).md)** + +- **[(可选)安装指定版本OpenCV库](#(可选)安装指定版本OpenCV库.md)** + + +

概述

+ +在进行PyTorch模型迁移训练时,部分网络模型会出现1秒内识别的图像数(fps)较低、性能不达标的情况。此时需要针对服务器进行以下优化。 + +- 修改CPU性能模式。 +- 安装高性能pillow库。 + +

修改CPU性能模式(X86服务器)

+ +## 设置电源策略为高性能模式 + +提升网络性能需要在X86服务器BIOS设置中将电源策略设为高性能模式,具体操作如下。 + +1. 登录ibmc界面,启动虚拟控制台,远程控制选择HTML5集成远程控制台,如[图1](#fig15869135420288)。 + + **图 1** 远程登录控制台 + ![](figures/远程登录控制台.png "远程登录控制台") + +2. 在虚拟界面工具栏中,单击启动项工具![](figures/zh-cn_image_0000001106016350.png),弹出启动项配置界面,如[图2](#fig744814574243)。 + + **图 2** 启动项工具 + ![](figures/启动项工具.png "启动项工具") + +3. 在启动项配置界面选择,选择“BIOS设置”,然后在虚拟界面工具栏中单击重启工具![](figures/zh-cn_image_0000001152616281.png),重启服务器。 +4. 系统重启后进入BIOS配置界面,依次选择“Advanced”\>“Socket Configuration”,如[图3](#fig4546303814)所示。 + + **图 3** Socket Configuration + ![](figures/Socket-Configuration.png "Socket-Configuration") + +5. 进入Advanced Power Mgmt. Configuration,设置Power Policy为Performance。如[图4](#fig15501111014442)。 + + **图 4** 设置电源策略 + ![](figures/设置电源策略.png "设置电源策略") + +6. 按下“F10”保存配置并重启服务器。 + +## 将CPU设置为performance模式 + +请使用root用户执行如下操作。 + +1. 使用如下命令查看当前CPU模式。 + + ``` + cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor + ``` + + 执行以上命令会输出当前CPU模式,参见[表1](#table354392019384)。 + + **表 1** CPU模式 + + + + + + + + + + + + + + + + + + + + + + + + + +

调速器

+

描述

+

performance

+

运行于最大频率。

+

powersave

+

运行于最小频率。

+

userspace

+

运行于用户指定的频率。

+

ondemand

+

按需快速动态调整CPU频率, 一有cpu计算量的任务,就会立即达到最大频率运行,空闲时间增加就降低频率。

+

conservative

+

按需快速动态调整CPU频率, 比ondemand的调整更保守。

+

schedutil

+

基于调度程序调整 CPU 频率。

+
+ +2. 安装工具,使用如下命令安装。 + + ubuntu/debian: + + ``` + apt-get install linux-tools-$(uname -r) + ``` + + centos/bclinux/euler: + + ``` + yum install kernel-tools -y + systemctl daemon-reload + systemctl enable cpupower + systemctl start cpupower + ``` + +3. 设置CPU为performance模式。 + + ``` + cpupower frequency-set -g performance + ``` + +4. 再次执行[步骤1](#li158435131344)查看是否已修改。 + +

修改CPU性能模式(ARM服务器)

+ +## 设置电源策略为高性能模式 + +在某些对Host侧CPU要求较高的模型中,例如目标检测类模型,需要进行较为复杂的图像预处理,开启电源高性能模式能一定程度上提高性能和稳定性。ARM服务器提升网络性能需要在BIOS设置中将电源策略设为高性能模式,具体操作如下。 + +1. 登录ibmc界面,启动虚拟控制台,远程控制选择HTML5集成远程控制台,如[图1](#fig15869135420288)。 + + **图 1** 远程登录控制台 + ![](figures/远程登录控制台-0.png "远程登录控制台-0") + +2. 在虚拟界面工具栏中,单击启动项工具![](figures/zh-cn_image_0000001152616289.png),弹出启动项配置界面,如[图2](#fig744814574243)。 + + **图 2** 启动项工具 + ![](figures/启动项工具-1.png "启动项工具-1") + +3. 在启动项配置界面选择,选择“BIOS设置”,然后在虚拟界面工具栏中单击重启工具![](figures/zh-cn_image_0000001152736233.png),重启服务器。 +4. 系统重启后进入BIOS配置界面,依次选择“Advanced”\>“ Performance Config”,如[图3](#fig4546303814)所示。 + + **图 3** Performance Config + ![](figures/Performance-Config.png "Performance-Config") + +5. 进入“Performance Config”,设置Power Policy为Performance。如[图4](#fig15501111014442)。 + + **图 4** 设置电源策略 + ![](figures/设置电源策略-2.png "设置电源策略-2") + +6. 按下“F10”保存配置并重启服务器。 + +

安装高性能pillow库(X86服务器)

+ +1. 安装高性能pillow库相关依赖,命令如下。 + + ubuntu/debian: + + ``` + apt-get install libtiff5-dev libjpeg8-dev libopenjp2-7-dev zlib1g-dev libfreetype6-dev liblcms2-dev libwebp-dev tcl8.6-dev tk8.6-dev python3-tk libharfbuzz-dev libfribidi-dev libxcb1-dev + ``` + + centos/bclinux/euler: + + ``` + yum install libtiff-devel libjpeg-devel openjpeg2-devel zlib-devel freetype-devel lcms2-devel libwebp-devel tcl-devel tk-devel harfbuzz-devel fribidi-devel libraqm-devel libimagequant-devel libxcb-devel + ``` + +2. 安装高性能pillow库。 + 1. 执行如下命令卸载原生pillow。 + + ``` + pip3.7 uninstall -y pillow + ``` + + 2. 安装SSE4版本pillow-simd。 + + 使用root用户安装,执行如下命令,若使用非root用户安装,需在命令结尾加上--user。 + + ``` + pip3.7 install pillow-simd + ``` + + >![](public_sys-resources/icon-note.gif) **说明:** + >如果CPU支持AVX2指令集,可安装AVX2版本pillow-simd,命令如下: + >``` + >CC="cc -mavx2" pip3.7 install -U --force-reinstall pillow-simd + >``` + + +3. 修改torchvision代码解决pillow-simd缺少PILLOW\_VERSION问题。torchvision安装参见[样例获取](#样例获取.md)。 + + 将/usr/local/python3.7.5/lib/python3.7/site-packages/torchvision/transforms/functional.py第5行代码修改如下: + + ``` + try: + from PIL import Image, ImageOps, ImageEnhance,PILLOW_VERSION + except: + from PIL import Image, ImageOps, ImageEnhance + PILLOW_VERSION="7.0.0" + ``` + + +

(可选)安装指定版本OpenCV库

+ +如模型依赖OpenCV,基于训练性能考虑,建议安装OpenCV-3.4.10版本。 + +1. 获取源码:_[获取地址](https://opencv.org/releases/)_。 +2. 安装指导:_[获取地址](https://docs.opencv.org/3.4.10/d7/d9f/tutorial_linux_install.html)_。 + +

模型训练

+ +训练脚本迁移完成后,需要参见[配置环境变量](#配置环境变量.md)设置环境变量,然后执行**python3.7** _xxx_进行模型训练。具体样例请参考[脚本执行](#脚本执行.md)。 + +

性能调优和分析

+ +- **[前提条件](#前提条件-2.md)** + +- **[调测过程](#调测过程.md)** + +- **[亲和库](#亲和库.md)** + + +

前提条件

+ +1. 参见[样例说明](#样例说明.md)改造开源代码,使模型能够正常运行,包括数据预处理,前向计算,loss计算,混合精度,反向计算,参数更新等。 +2. 模型迁移阶段优先关注模型是否能跑通,现有算子是否能满足,如果遇到不满足的算子需参见《PyTorch算子开发指南》进行算子适配开发。 +3. 优先打通单卡功能,再打通多卡功能。 + +

调测过程

+ +- **[总体思路](#总体思路.md)** + +- **[采集训练过程相关数据](#采集训练过程相关数据.md)** + +- **[性能优化](#性能优化-3.md)** + + +

总体思路

+ +1. 通过训练执行结果,判断吞吐量指标是否达到预期要求。 +2. 当吞吐量指标不达标时,需要找出制约性能瓶颈的原因,主要为以下几个方面: + - 算子瓶颈,在某个算子上执行过慢。 + - copy瓶颈,非连续转连续时进行copy带来的瓶颈。 + - 框架瓶颈,由于算子格式转换带来了额外操作。 + - 编译瓶颈,由于shape或属性来回变化造成反复编译。 + +3. 针对以上制约性能瓶颈的原因进行分析与优化。 + +

采集训练过程相关数据

+ +## Profiling数据采集 + +当吞吐量指标不达标时,需要通过采集训练过程中的profiling数据,分析哪个环节、哪个算子导致的性能消耗。请参见以下步骤进行profiling数据的获取。 + +1. 获取chrome\_trace文件。使用profile接口对原始代码的loss计算和优化过程进行改造。 + + ``` + # 使用ascend-pytorch适配的profile接口,即可获得,推荐只运行一个step + with torch.autograd.profiler.profile(use_npu=True) as prof: + out = model(input_tensor) + loss=loss_func(out) + loss.backward() + optimizer.zero_grad() + optimizer.step() + # 导出chrome_trace文件到指定路径 + prof.export_chrome_trace(output_path) + ``` + +2. chrome\_trace文件可以通过以下方式打开查看:在Chrome浏览器中输入“chrome://tracing“地址,然后将落盘文件拖到空白处即可打开文件内容,通过键盘W、A、S、D键,可以对profiler的结果进行缩放和移动。 + +## 获取算子信息OP\_INFO + +网络模型最终是以OP执行的,通过OPInfo日志,我们可以获取实际执行时的算子及其属性。通过get\_ascend\_op\_info.py脚本获取。 + +1. 编写get\_ascend\_op\_info.py脚本获取算子信息,脚本内容如下。 + + ``` + # -*- coding: utf-8 -*- + """用于导出OPINFO + """ + import os + import sys + import argparse + + def func(host_log_folder): + """ + :param host_log_folder: where host_log_folder addr is. + :return: + """ + host_log_files = os.listdir(host_log_folder) + result = {} + + for host_log in host_log_files: + if not host_log.endswith('.log') or host_log.endswith('.out'): + continue + with open(os.path.join(host_log_folder, host_log), 'r')as f: + host_log_lines = f.readlines() + for line in host_log_lines: + if line.startswith('[INFO] ASCENDCL') and "aclopCompile::aclOp" in line: + op_info = line.split('OpType: ')[1][:-2] + op_type = op_info.split(',')[0] + op_param = op_info[len(op_type) + 2:] + if op_type not in result.keys(): + result[op_type] = [op_param] + else: + result[op_type].append(op_param) + + with open('ascend_op_info_summary.txt', 'w')as f: + for k, v in result.items(): + v_set = set(v) + for info in v_set: + f.write(k + " " + info + "\n") + + if __name__ == "__main__": + parser = argparse.ArgumentParser(description='trans the log') + parser.add_argument('--host_log_folder', default="./", + help="input the dir name, trans the current dir with default") + ags = parser.parse_args() + func(ags.host_log_folder) + ``` + +2. 设置环境变量,将host日志打屏。 + + ``` + export ASCEND_SLOG_PRINT_TO_STDOUT=1 + ``` + +3. 设置日志级别为info,参考《CANN 日志参考》设置日志级别。 +4. 执行训练脚本,进行模型训练,训练完成后获取host侧日志,默认位置为$HOME/ascend/log/plog目录下,$HOME表示Host侧用户根目录。 +5. 解析host侧日志会在当前目录下得到OPInfo信息ascend\_op\_info\_summary.txt。 + + ``` + python3.7 get_ascend_op_info.py --host_log_folder $HOME/ascend/log/plog + ``` + +6. 分析TaskInfo中额外的task,尤其关注transdata。 + +

性能优化

+
+## 算子瓶颈优化
+
+1.  获取训练过程中的Profiling数据,参见[Profiling数据采集](#采集训练过程相关数据.md)。
+2.  分析Profiling数据,得到耗时较大的算子。
+3.  参见[单算子样例编写说明](#单算子样例编写说明.md)构建耗时较大算子的单算子样例,通过与CPU或GPU运行单算子样例时间进行对比,若发现性能不足,则有以下两种方案解决。
+    -   规避方案:使用同等语义其他高效算子替代。
+    -   解决方案:改进算子性能。
+
+## copy瓶颈优化
+
+1.  获取训练过程中的Profiling数据,参见[Profiling数据采集](#采集训练过程相关数据.md)。
+2.  分析Profiling数据,统计整网中D2DCopywithStreamSynchronize/PTCopy/format\_contiguous的耗时。
+3.  若发现耗时较大,则需参照以下两种方案解决。
+    -   规避方案:PyTorch中View类型框架类算子会导致非连续转连续操作。优化思路为尽量使用计算类算子代替View类框架算子,常见的View类框架算子如View、Permute、Transpose等。更多View类框架算子可参考[https://pytorch.org/docs/stable/tensor\_view.html](https://pytorch.org/docs/stable/tensor_view.html)。
+    -   解决方案:加速转连续操作。
+
+## 框架瓶颈优化
+
+1.  获取训练过程中算子信息OP\_INFO,参见[获取算子信息OP\_INFO](#采集训练过程相关数据.md)。
+2.  分析OP\_INFO中算子的规格和调用关系,定位是否插入了多余的算子,重点关注transdata是否合理。
+3.  优化方案:通过指定部分算子初始化格式,对多余的格式转换算子进行消除。
+4.  在pytorch/torch/nn/modules/module.py中,在cast\_weight中指定算子初始化格式,如下图。
+
+    ![](figures/指定算子初始化方式.png)
+
+    格式设置原则可参考如下规则:
+
+    -   Conv2D相关:Weight可设置为FZ格式,如第424行。
+    -   Linear相关的参数,可设置为NZ格式,如第409行。
+
+## 编译瓶颈优化
+
+1.  获取训练过程中算子信息OP\_INFO,参见[获取算子信息OP\_INFO](#采集训练过程相关数据.md)。
+2.  查看INFO日志,观察第一个step以后的aclopCompile::aclOp关键字,如果后续接了Match op inputs/type failed或To compile op,则说明该算子存在动态编译,需要优化。
+3.  需参照以下两种方案解决。
+    -   规避方案:在理解模型语义和相关API基础上,使用固定Shape的方式代替动态Shape。
+    -   解决方案:减少编译次数或使该算子无需重复编译。
+

亲和库

+ +- **[来源介绍](#来源介绍.md)** + +- **[功能介绍](#功能介绍-4.md)** + + +

来源介绍

+ +针对公版模型中常见的网络结构和函数,我们针对性地对其进行了优化,使得运算性能大幅度提升,同时,将其集成到Pytorch框架中,便于模型性能调优中使用。 + +

功能介绍

+ + + + + + + + + + + + + + + + + + + + + + + + +

函数名

+

位置

+

功能说明

+

pairwise_iou

+

torch.contrib.npu.optimized_lib

+

计算两个目标框的IOU。

+

fast_rcnn_inference_single_image

+

torch.contrib.npu.optimized_lib

+

Maskrcnn和Fasterrcnn模型的推理接口。

+

ChannelShuffle

+

torch.contrib.npu.optimized_lib

+

提供NPU亲和的channelshuffle操作,适用于shufflenetv2等模型。

+

PreLoader

+

torch.contrib.npu.optimized_lib

+

提供针对昇腾AI处理器加速的数据加载方法。

+
+ +>![](public_sys-resources/icon-note.gif) **说明:** +>该部分调优内容会随着版本不断增强和更新,请以实际PyTorch版本中对应路径下的内容为准。 + +

精度调测

+ +- **[前提条件](#前提条件-5.md)** + +- **[调测过程](#调测过程-6.md)** + + +

前提条件

+ +优先在同等语义和超参下,跑一定的epoch(推荐完整epoch数的20%),使精度,loss等对齐GPU相应水平,完成后再对齐最终精度。 + +

调测过程

+ +- **[总体思路](#总体思路-7.md)** + +- **[精度调优方法](#精度调优方法.md)** + + +

总体思路

+ +精度问题排查需要找出是哪一步出现的问题,主要以下几个方面: + +1. 模型网络计算错误。 + - 定位思路:在网络中加入hook进行排查判断是哪个地方有较大嫌疑,然后构建[单算子用例](#单算子样例编写说明.md)逐渐缩小错误范围,证明该算子在当前网络场景下计算有误,可以对比CPU或GPU结果证明。 + + - 规避方案:使用同等语义其他算子替代。 + + - 解决方案:改进算子精度或功能问题。 + + +2. loss计算错误。 + - 定位思路:由于Loss的特殊性和可以自定义,在判断Loss计算错误后建议dump网络中的loss的输入来测试而非随机同shape tensor,这样才能更好地复现证明。 + + - 规避方案:使用同等语义其他算子替代。 + + - 解决方案:改进算子精度或功能问题(loss也是由算子构成)。 + + +3. 参数更新错误。 + + - 定位思路:在每个optim.step\(\)前对网络中的参数逐个打印其grad进行排查判断是哪个地方有较大嫌疑,然后构建单算子用例逐渐缩小错误范围,证明该算子在当前网络场景下梯度计算有误,可以对比CPU或GPU结果证明。该项优先级应低于[1.](#li17755175510322)与[2.](#li25281726103316),因为1与2的错误同样可以造成grad异常。 + + - 规避方案:使用同等语义其他算子替代。 + + - 解决方案:改进计算grad的算子精度或功能问题。 + + +4. 多卡计算错误。 + + - 定位思路:在保证单卡精度OK的前提下,稳定复现多卡不收敛。 + + - 解决方案:建议联系华为方支撑人员,提供稳定复现的单P和多P脚本。 + + + +

精度调优方法

+
+1.  通过对比CPU和昇腾AI处理器的结果,判断在昇腾AI处理器上计算是否正确。
+
+    代码样例(本样例只体现基本方法,禁止直接复制)如下:
+
+    ```
+    # 固定入参,保证模型与输入数据在CPU和昇腾AI处理器上相同
+    input_tensor_cpu = torch.Tensor()
+    model_cpu = build_model()
+    # 将输入数据迁移到昇腾AI处理器上
+    input_tensor_npu = input_tensor_cpu.npu()
+    # 将模型迁移到昇腾AI处理器上
+    model_npu = model_cpu.npu()
+
+    # 运算结果对比,NPU侧输出需先迁回CPU再参与运算
+    output_cpu = model_cpu(input_tensor_cpu)
+    output_npu = model_npu(input_tensor_npu)
+    compute_result = (output_cpu - output_npu.cpu()).abs().mean()
+    print(compute_result)
+    ```
+
+    因昇腾AI处理器硬件架构与cpu不同,计算结果会略有不同。若运算结果较为接近\(一般不高于1e-4\),则认为运算结果正常。
+
+2.  通过Pytorch的hook机制来打印正向反向传播中module的输入和输出来分析。
+
+    代码样例(本样例只体现基本方法,禁止直接复制)如下:
+
+    ```
+    # 设置hook func
+    def hook_func(name, module):
+        def hook_function(module, inputs, outputs):
+            print(name+' inputs', inputs)
+            print(name+' outputs', outputs)
+        return hook_function
+
+    # 注册正反向hook
+    for name, module in model.named_modules():
+        module.register_forward_hook(hook_func('[forward]: '+name, module))
+        module.register_backward_hook(hook_func('[backward]: '+name, module))
+
+    # 运行
+    model(input_tensor)
+    ```
+
+    通过分析打印正向反向传播中的inputs, outputs来确定。
+
+3.  通过直接获取module的grad, running\_mean, running\_var等参数来分析更新量。
+
+    代码样例(本样例只体现基本方法,禁止直接复制)如下:
+
+    ```
+    # 例如通过获取梯度和BN的均值、方差来排查
+    for name, module in model.named_modules():
+        if isinstance(module, nn.modules.batchnorm._BatchNorm):
+            print("[BN_buffer]: "+name, module.running_mean, module.running_var)
+            print("[grad]: "+name, module.weight.grad)
+    ```
+

模型保存与转换

+ +- **[简介](#简介-8.md)** + +- **[模型保存](#模型保存.md)** + +- **[导出ONNX模型](#导出ONNX模型.md)** + + +

简介

+ +模型训练完成后,通过Pytorch提供的接口保存模型文件并导出ONNX模型,然后通过ATC工具将其转换为适配昇腾AI处理器的.om文件用于离线推理。 + +本章主要介绍如何将训练好的pth文件pth.tar文件转换为ONNX模型,将ONNX模型转换为适配昇腾AI处理器的.om文件流程请参考《CANN 开发辅助工具指南 \(推理\)》手册中“ATC工具使用指南”章节。 + +如果想使用Auto Tune优化功能,请参考《CANN 开发辅助工具指南 \(推理\)》手册中“Auto Tune工具使用指导”章节。 + +离线推理应用构建请参考《CANN 应用软件开发指南\(C&C++, 推理\)》。整体流程如下: + +![](figures/zh-cn_image_0000001106176222.png) + +

模型保存

PyTorch在训练过程中,通常使用torch.save\(\)来保存Checkpoint文件,根据模型文件的后续用途会保存为两种格式的模型文件:

-   .pth或.pt扩展名的文件:用于在线推理或导出ONNX格式模型,仅保存模型参数,不保存模型结构,以便压缩文件的体积,可以用Netron等可视化工具打开,一般如[图1](#fig315704722610)所示。

    **图 1**  Pth文件
    ![](figures/Pth文件.jpg "Pth文件")

    通过**state\_dict**来保存和加载模型,示例如下:

    1.  保存模型。

        ```
        # 创建保存路径
        PATH = "state_dict_model.pt"
        # 保存模型
        torch.save(net.state_dict(), PATH)
        ```

    2.  加载模型以用于在线推理,示例如下,详情请参见《PyTorch在线推理指南》。

        ```
        # 模型文件保存路径
        PATH = "state_dict_model.pt"
        model = TheModelClass(*args, **kwargs)
        # 加载模型
        model.load_state_dict(torch.load(PATH))
        model.eval()
        ```

    >![](public_sys-resources/icon-caution.gif) **注意:** 
    >保存.pth或.pt文件扩展名的文件时要提供模型定义文件,否则无法部署。

-   .pth.tar扩展名的文件:可用于在线推理或重新加载后继续训练。以字典形式保存多个组件,常见的组件包括模型和优化器的state\_dict、停止时的epoch、最新记录的训练损失以及外部的torch.nn.Embedding层等。如果仅用于部署推理模型,推荐只在.pth.tar扩展名的文件中保存权重信息即模型的state\_dict。

    保存和加载模型示例如下:

    1.  保存模型。

        ```
        PATH = "checkpoint.pth.tar"
        torch.save({
                    'epoch': epoch,
                    'loss': loss,
                    'state_dict': model.state_dict(),
                    'optimizer' : optimizer.state_dict(),
                    ...
                    }, PATH)
        ```

    2.  加载模型用于推理或恢复训练。注意加载时使用的key需与保存时的字典key一致(本例中为state\_dict和optimizer)。

        ```
        model = TheModelClass(*args, **kwargs)
        optimizer = TheOptimizerClass(*args, **kwargs)

        checkpoint = torch.load(PATH)
        # key值与保存时的字典key保持一致
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        epoch = checkpoint['epoch']
        loss = checkpoint['loss']

        model.eval()
        # - or -
        model.train()
        ```

>![](public_sys-resources/icon-caution.gif) **注意:** 
>通常情况下,训练图和推理图中对同一个算子的处理方式不同(例如BatchNorm和dropout等算子),在输入格式上也有差别,因此在运行推理或导出ONNX模型之前,必须调用model.eval\(\)来将dropout和batch normalization层设置为推理模式。

导出ONNX模型

## 简介

昇腾AI处理器PyTorch模型的部署策略是基于PyTorch官方支持的ONNX模块实现的。ONNX是业内目前比较主流的模型格式,广泛用于模型交流及部署。本节主要介绍如何将Checkpoint文件通过torch.onnx.export\(\)接口导出为ONNX模型。

## .pth或.pt文件导出ONNX模型

保存的.pth或.pt文件可以通过PyTorch构建模型再加载权重的方法恢复,然后导出ONNX模型,样例如下。

```
import torch
import torch.onnx
import torchvision.models as models
# 设置使用CPU导出模型
device = torch.device("cpu")

def convert():
    # 模型定义来自于torchvision,样例生成的模型文件是基于resnet50模型
    model = models.resnet50(pretrained = False)
    resnet50_model = torch.load('resnet50.pth', map_location='cpu')
    model.load_state_dict(resnet50_model)

    batch_size = 1  # 批处理大小
    input_shape = (3, 224, 224)  # 输入数据,改成自己的输入shape

    # 模型设置为推理模式
    model.eval()

    dummy_input = torch.randn(batch_size, *input_shape)  # 定义输入shape
    torch.onnx.export(model,
                      dummy_input,
                      "resnet50_official.onnx",
                      input_names = ["input"],    # 构造输入名
                      output_names = ["output"],  # 构造输出名
                      opset_version=11,           # ATC工具目前仅支持opset_version=11
                      dynamic_axes={"input":{0:"batch_size"}, "output":{0:"batch_size"}})  # 支持输出动态轴

if __name__ == "__main__":
    convert()
```

>![](public_sys-resources/icon-note.gif) **说明:** 
>-   在导出ONNX模型之前,必须调用model.eval\(\)来将dropout和batch normalization层设置为推理模式。
>-   样例脚本中的model来自于torchvision模块中的定义,用户使用自己的模型时需自行指定。
>-   构造输入输出需要对应训练时的输入输出,否则无法正常推理。

## .pth.tar文件导出ONNX模型

.pth.tar在导出ONNX模型时需要先确定保存时的信息,有时保存的节点名称和模型定义中的节点名称会有差异,例如会多出前缀和后缀。在进行转换的时候,可以对节点名称进行修改。转换代码样例如下。

```
import torch
import torch.onnx
from collections import OrderedDict
import mobilenet

# 本样例中的pth.tar文件保存时节点名加了前缀module,通过遍历删除
def proc_nodes_module(checkpoint, AttrName):
    new_state_dict = OrderedDict()
    for key, value in checkpoint[AttrName].items():
        if key == "module.features.0.0.weight":
            print(value)
        if(key[0:7] == "module."):
            name = key[7:]
        else:
            name = key[0:]

        new_state_dict[name] = value
    return new_state_dict

def convert():
    checkpoint = torch.load("./mobilenet_cpu.pth.tar", map_location=torch.device('cpu'))
    checkpoint['state_dict'] = proc_nodes_module(checkpoint, 'state_dict')
    model = mobilenet.mobilenet_v2(pretrained = False)
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()
    input_names = ["actual_input_1"]
    output_names = ["output1"]
    dummy_input = torch.randn(1, 3, 224, 224)
    torch.onnx.export(model, dummy_input, "mobilenetV2_npu.onnx", input_names = input_names, output_names = output_names, opset_version=11)

if __name__ == "__main__":
    convert()
```
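导出完成后,可以对ONNX文件做一次基本校验。以下为示意代码,假设环境中已通过pip安装onnx包:

```
# 用onnx自带的checker校验导出的模型,校验不通过时会抛出异常
import onnx

onnx_model = onnx.load("resnet50_official.onnx")
onnx.checker.check_model(onnx_model)
# 打印图结构,便于核对输入输出名
print(onnx.helper.printable_graph(onnx_model.graph))
```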

样例说明

+ +- **[ResNet50模型迁移示例](#ResNet50模型迁移示例.md)** + +- **[ShuffleNet模型调优示例](#ShuffleNet模型调优示例.md)** + + +

ResNet50模型迁移示例

+ +- **[样例获取](#样例获取.md)** + +- **[训练脚本迁移](#训练脚本迁移.md)** + +- **[脚本执行](#脚本执行.md)** + + +

样例获取

+ +## 样例获取 + +1. 本样例基于PyTorch官网提供的Imagenet数据集训练模型进行适配昇腾910 AI处理器的迁移改造,样例获取路径为[https://github.com/pytorch/examples/tree/master/imagenet](https://github.com/pytorch/examples/tree/master/imagenet)。 +2. 本样例依赖torchvision,需要安装torchvision依赖,如果使用非root用户安装, 则需在命令末尾加上**--user**。 + + 当服务器运行环境为X86架构时,安装命令如下: + + ``` + pip3.7 install torchvision==0.6.0 --no-deps + ``` + + 当服务器运行环境为ARM架构时,安装命令如下: + + ``` + pip3.7 install torchvision==0.2.2.post3 --no-deps + ``` + +3. Resnet50模型参考PyTorch官网模型[https://pytorch.org/hub/pytorch\_vision\_resnet/](https://pytorch.org/hub/pytorch_vision_resnet/),实际使用有如下两种方式。 + 1. 直接调用对应接口,例如: + + ``` + import torchvision.models as models + model = models.resnet50() + ``` + + >![](public_sys-resources/icon-note.gif) **说明:** + >Resnet50为PyTorch内置模型,了解更多内置模型请前往[Pytorch官网](https://pytorch.org/)。 + + 2. 在脚本执行中直接指定参数arch为resnet50,内容如下,本样例迁移采用该种方式,请参见[脚本执行](#脚本执行.md)。 + + ``` + --arch resnet50 + ``` + + + +## 目录结构 + +主要文件目录结构如下所示: + +``` +├──main.py +``` + +

训练脚本迁移

+ +- **[单P训练修改](#单P训练修改.md)** + +- **[分布式训练修改](#分布式训练修改.md)** + + +

单P训练修改

+ +1. main.py增加头文件以支持基于PyTorch框架的模型在昇腾910 AI处理器上训练: + + ``` + import torch.npu + ``` + +2. 在main.py文件中头文件后添加参数以指定使用昇腾910 AI处理器进行训练: + + ``` + CALCULATE_DEVICE = "npu:1" + ``` + +3. 修改参数以及判断选项,使其只在昇腾910 AI处理器上进行训练。 + + 代码位置:main.py文件中的main\_worker\(\)函数(修改部分为字体加粗部分): + + ``` + def main_worker(gpu, ngpus_per_node, args): + global best_acc1 + # 原代码为使用GPU进行训练,原代码如下: + # args.gpu = gpu + ############## npu modify begin ############# + args.gpu = None + ############## npu modify end ############# + if args.gpu is not None: + print("Use GPU: {} for training".format(args.gpu)) + + if args.distributed: + if args.dist_url == "env://" and args.rank == -1: + args.rank = int(os.environ["RANK"]) + if args.multiprocessing_distributed: + # For multiprocessing distributed training, rank needs to be the + # global rank among all the processes + args.rank = args.rank * ngpus_per_node + gpu + dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + # create model + if args.pretrained: + print("=> using pre-trained model '{}'".format(args.arch)) + model = models.__dict__[args.arch](pretrained=True) + else: + print("=> creating model '{}'".format(args.arch)) + model = models.__dict__[args.arch]() + # 原代码中需要判断是否在GPU上进行训练,原代码如下: + # if not torch.cuda.is_available(): + # print('using CPU, this will be slow') + # elif args.distributed: + ############## npu modify begin ############# + # 迁移后为直接判断是否进行分布式训练,去掉判断是否在GPU上进行训练 + if args.distributed: + ############## npu modify end ############# + # For multiprocessing distributed, DistributedDataParallel constructor + # should always set the single device scope, otherwise, + # DistributedDataParallel will use all available devices. + if args.gpu is not None: + ...... + ``` + +4. 将模型以及损失函数迁移到昇腾910 AI处理器上进行计算。 + + 代码位置:main.py文件中的main\_worker\(\)函数(修改部分为字体加粗部分): + + ``` + elif args.gpu is not None: + torch.cuda.set_device(args.gpu) + model = model.cuda(args.gpu) + else: + # DataParallel will divide and allocate batch_size to all available GPUs + if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): + model.features = torch.nn.DataParallel(model.features) + model.cuda() + else: + # 原代码使用torch.nn.DataParallel()类来用多个GPU加速训练 + # model = torch.nn.DataParallel(model).cuda() + ############## npu modify begin ############# + # 将模型迁移到NPU上进行训练。 + model = model.to(CALCULATE_DEVICE) + ############## npu modify end ############# + # 原代码中损失函数是在GPU上进行计算 + # # define loss function (criterion) and optimizer + # criterion = nn.CrossEntropyLoss().cuda(args.gpu) + ############## npu modify begin ############# + # 将损失函数迁移到NPU上进行计算。 + criterion = nn.CrossEntropyLoss().to(CALCULATE_DEVICE) + ############## npu modify end ############# + ``` + +5. 
将数据集目标结果target修改成int32类型解决算子报错问题;将数据集迁移到昇腾910 AI处理器上进行计算。
    -   代码位置:main.py文件中的train\(\)函数(修改部分为字体加粗部分):

        ```
        for i, (images, target) in enumerate(train_loader):
            # measure data loading time
            data_time.update(time.time() - end)

            if args.gpu is not None:
                images = images.cuda(args.gpu, non_blocking=True)
            # 原代码中训练数据集在GPU上进行加载计算,原代码如下:
            # if torch.cuda.is_available():
            #     target = target.cuda(args.gpu, non_blocking=True)
            ############## npu modify begin #############
            # 将数据集迁移到NPU上进行计算并修改target数据类型
            if 'npu' in CALCULATE_DEVICE:
                target = target.to(torch.int32)
            images, target = images.to(CALCULATE_DEVICE, non_blocking=True), target.to(CALCULATE_DEVICE, non_blocking=True)
            ############## npu modify end #############
        ```

    -   代码位置:main.py文件中的validate\(\)函数(修改部分为字体加粗部分):

        ```
        with torch.no_grad():
            end = time.time()
            for i, (images, target) in enumerate(val_loader):
                if args.gpu is not None:
                    images = images.cuda(args.gpu, non_blocking=True)
                # 原代码中训练数据集在GPU上进行加载计算,原代码如下:
                # if torch.cuda.is_available():
                #     target = target.cuda(args.gpu, non_blocking=True)
                ############## npu modify begin #############
                # 将数据集迁移到NPU上进行计算并修改target数据类型
                if 'npu' in CALCULATE_DEVICE:
                    target = target.to(torch.int32)
                images, target = images.to(CALCULATE_DEVICE, non_blocking=True), target.to(CALCULATE_DEVICE, non_blocking=True)
                ############## npu modify end #############
        ```

6.  设置当前正在使用的device。

    代码位置:main.py文件中的主函数入口(修改部分为字体加粗部分):

    ```
    if __name__ == '__main__':
        ############## npu modify begin #############
        if 'npu' in CALCULATE_DEVICE:
            torch.npu.set_device(CALCULATE_DEVICE)
        ############## npu modify end #############
        main()
    ```

分布式训练修改

+ +1. main.py增加头文件以支持基于PyTorch框架的模型在昇腾910 AI处理器上训练及进行混合精度训练。 + + ``` + import torch.npu + from apex import amp + ``` + +2. 参数设置增加以下参数,包括指定参与训练的昇腾910 AI处理器以及进行混合精度训练需要的参数。 + + ``` + parser.add_argument('--device', default='npu', type=str, help='npu or gpu') + parser.add_argument('--addr', default='10.136.181.115', type=str, help='master addr') + parser.add_argument('--device-list', default='0,1,2,3,4,5,6,7', type=str, help='device id list') + parser.add_argument('--amp', default=False, action='store_true', help='use amp to train the model') + parser.add_argument('--loss-scale', default=1024., type=float, + help='loss scale using in amp, default -1 means dynamic') + parser.add_argument('--opt-level', default='O2', type=str, + help='loss scale using in amp, default -1 means dynamic') + ``` + +3. 创建由device\_id到process\_id的映射函数,指定device进行训练。在main.py函数中增加以下接口。 + + ``` + def device_id_to_process_device_map(device_list): + devices = device_list.split(",") + devices = [int(x) for x in devices] + devices.sort() + + process_device_map = dict() + for process_id, device_id in enumerate(devices): + process_device_map[process_id] = device_id + + return process_device_map + ``` + +4. 指定训练服务器的ip和端口。 + + 代码位置:main.py文件中的主函数main\(\)(修改部分为字体加粗部分)。 + + ``` + def main(): + args = parser.parse_args() + ############## npu modify begin ############# + os.environ['MASTER_ADDR'] = args.addr + os.environ['MASTER_PORT'] = '29688' + ############## npu modify end ############# + ``` + +5. 创建由device\_id到process\_id的映射参数,获取单节点昇腾910 AI处理器数量。 + + 代码位置:main.py文件中的主函数main\(\)(修改部分为字体加粗部分)。 + + ``` + args.distributed = args.world_size > 1 or args.multiprocessing_distributed + ############## npu modify begin ############# + args.process_device_map = device_id_to_process_device_map(args.device_list) + if args.device == 'npu': + ngpus_per_node = len(args.process_device_map) + else: + ngpus_per_node = torch.cuda.device_count() + ############## npu modify end ############# + # 原代码如下: + # ngpus_per_node = torch.cuda.device_count() + ``` + +6. 获取进程process\_id对应的昇腾910 AI处理器编号,指定在对应的昇腾910 AI处理器上进行训练。 + + 代码位置:main.py文件中的main\_worker\(\)(修改部分为字体加粗部分)。 + + ``` + def main_worker(gpu, ngpus_per_node, args): + global best_acc1 + ############## npu modify begin ############# + args.gpu = args.process_device_map[gpu] + ############## npu modify end ############# + # 原代码如下: + # args.gpu = gpu + ``` + +7. 初始化进程组,屏蔽掉初始化方式。 + + 代码位置:main.py文件中的main\_worker\(\)(修改部分为字体加粗部分)。 + + ``` + ############## npu modify begin ############# + if args.device == 'npu': + dist.init_process_group(backend=args.dist_backend, #init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + else: + dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + ############## npu modify begin ############# + # 原代码如下: + # dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + ``` + +8. 
要进行分布式训练且需要引入混合精度模块,并且需要将模型迁移到昇腾AI处理器上,因此需要屏蔽掉原始代码中判断是否为分布式训练以及模型是否在GPU上进行训练的代码部分。 + + 代码位置:main.py文件中的main\_worker\(\)(修改部分为字体加粗部分)。 + + ``` + # create model + if args.pretrained: + print("=> using pre-trained model '{}'".format(args.arch)) + model = models.__dict__[args.arch](pretrained=True) + else: + print("=> creating model '{}'".format(args.arch)) + model = models.__dict__[args.arch]() + ############## npu modify begin ############# + # 代码中添加如下内容 + # 指定训练设备为昇腾AI处理器 + loc = 'npu:{}'.format(args.gpu) + torch.npu.set_device(loc) + # 计算用于训练的batch_size和workers + args.batch_size = int(args.batch_size / ngpus_per_node) + args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node) + ############## npu modify end ############# + # 原始代码如下,需屏蔽掉,已注释 + # if not torch.cuda.is_available(): + # print('using CPU, this will be slow') + # elif args.distributed: + # # For multiprocessing distributed, DistributedDataParallel constructor + # # should always set the single device scope, otherwise, + # # DistributedDataParallel will use all available devices. + # if args.gpu is not None: + # torch.cuda.set_device(args.gpu) + # model.cuda(args.gpu) + # # When using a single GPU per process and per + # # DistributedDataParallel, we need to divide the batch size + # # ourselves based on the total number of GPUs we have + # args.batch_size = int(args.batch_size / ngpus_per_node) + # args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node) + # model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) + # else: + # model.cuda() + # # DistributedDataParallel will divide and allocate batch_size to all + # # available GPUs if device_ids are not set + # model = torch.nn.parallel.DistributedDataParallel(model) + # elif args.gpu is not None: + # torch.cuda.set_device(args.gpu) + # model = model.cuda(args.gpu) + # else: + # # DataParallel will divide and allocate batch_size to all available GPUs + # if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): + # model.features = torch.nn.DataParallel(model.features) + # model.cuda() + # else: + # model = torch.nn.DataParallel(model).cuda() + ``` + +9. 屏蔽掉损失函数、优化器和断点训练部分,将这部分在后面与混合精度训练结合起来。 + + 代码位置:main.py文件中的main\_worker\(\)(修改部分为字体加粗部分)。 + + ``` + # 屏蔽掉原始代码,已注释 + # # define loss function (criterion) and optimizer + # criterion = nn.CrossEntropyLoss().cuda(args.gpu) + # + # optimizer = torch.optim.SGD(model.parameters(), args.lr, + # momentum=args.momentum, + # weight_decay=args.weight_decay) + # + # # optionally resume from a checkpoint + # if args.resume: + # if os.path.isfile(args.resume): + # print("=> loading checkpoint '{}'".format(args.resume)) + # if args.gpu is None: + # checkpoint = torch.load(args.resume) + # else: + # # Map model to be loaded to specified single gpu. + # loc = 'cuda:{}'.format(args.gpu) + # checkpoint = torch.load(args.resume, map_location=loc) + # args.start_epoch = checkpoint['epoch'] + # best_acc1 = checkpoint['best_acc1'] + # if args.gpu is not None: + # # best_acc1 may be from a checkpoint from a different GPU + # best_acc1 = best_acc1.to(args.gpu) + # model.load_state_dict(checkpoint['state_dict']) + # optimizer.load_state_dict(checkpoint['optimizer']) + # print("=> loaded checkpoint '{}' (epoch {})" + # .format(args.resume, checkpoint['epoch'])) + # else: + # print("=> no checkpoint found at '{}'".format(args.resume)) + # + # cudnn.benchmark = True + ``` + +10. 
数据加载器,结合了数据集和取样器,并且可以提供多个线程处理数据集。由于是使用昇腾AI处理器进行训练,因此需要将**pin\_memory**设置为**False**;由于当前仅支持固定shape下的训练,数据流中剩余的样本数可能小于batch大小,因此需要将**drop\_last**设置为**True**;另外需要将验证部分数据集**shuffle**设置为**True**。 + + 代码位置:main.py文件中的main\_worker\(\)(修改部分为字体加粗部分)。 + + ``` + ############## npu modify begin ############# + train_loader = torch.utils.data.DataLoader( + train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), + num_workers=args.workers, pin_memory=False, sampler=train_sampler, drop_last=True) + + val_loader = torch.utils.data.DataLoader( + datasets.ImageFolder(valdir, transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ])), + batch_size=args.batch_size, shuffle=True, + num_workers=args.workers, pin_memory=False, drop_last=True) + ############## npu modify end ############# + ``` + +11. 进行损失函数及优化器构建,将模型、损失函数迁移到昇腾AI处理器上;将优化器、模型与混合精度模块进行结合以支持混合精度训练;将断点训练部分与混合精度模块结合以支持混合精度训练。 + + 代码位置:main.py文件中的main\_worker\(\)中验证数据加载**后**(修改部分为字体加粗部分)。 + + ``` + val_loader = torch.utils.data.DataLoader( + datasets.ImageFolder(valdir, transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ])), + batch_size=args.batch_size, shuffle=True, + num_workers=args.workers, pin_memory=False, drop_last=True) + + ############## npu modify begin ############# + model = model.to(loc) + # define loss function (criterion) and optimizer + criterion = nn.CrossEntropyLoss().to(loc) + optimizer = torch.optim.SGD(model.parameters(), args.lr, + momentum=args.momentum, + weight_decay=args.weight_decay) + + if args.amp: + model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level, loss_scale=args.loss_scale) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu], broadcast_buffers=False) + + # optionally resume from a checkpoint + if args.resume: + if os.path.isfile(args.resume): + print("=> loading checkpoint '{}'".format(args.resume)) + checkpoint = torch.load(args.resume, map_location=loc) + args.start_epoch = checkpoint['epoch'] + best_acc1 = checkpoint['best_acc1'] + model.load_state_dict(checkpoint['state_dict']) + optimizer.load_state_dict(checkpoint['optimizer']) + if args.amp: + amp.load_state_dict(checkpoint['amp']) + print("=> loaded checkpoint '{}' (epoch {})" + .format(args.resume, checkpoint['epoch'])) + else: + print("=> no checkpoint found at '{}'".format(args.resume)) + + cudnn.benchmark = True + ############## npu modify end ############# + ``` + +12. 断点checkpoint保存需要与混合精度训练结合,修改如下。 + + 代码位置:main.py文件中的main\_worker\(\)(修改部分为字体加粗部分)。 + + ``` + # remember best acc@1 and save checkpoint + is_best = acc1 > best_acc1 + best_acc1 = max(acc1, best_acc1) + + if not args.multiprocessing_distributed or (args.multiprocessing_distributed + and args.rank % ngpus_per_node == 0): + ############## npu modify begin ############# + if args.amp: + save_checkpoint({ + 'epoch': epoch + 1, + 'arch': args.arch, + 'state_dict': model.state_dict(), + 'best_acc1': best_acc1, + 'optimizer' : optimizer.state_dict(), + 'amp': amp.state_dict(), + }, is_best) + else: + save_checkpoint({ + 'epoch': epoch + 1, + 'arch': args.arch, + 'state_dict': model.state_dict(), + 'best_acc1': best_acc1, + 'optimizer' : optimizer.state_dict(), + }, is_best) + ############## npu modify end ############# + ``` + +13. 
训练时,需要将数据集迁移到昇腾AI处理器上,修改如下: + + 代码位置:main.py文件中的train\(\)(修改部分为字体加粗部分)。 + + ``` + for i, (images, target) in enumerate(train_loader): + # measure data loading time + data_time.update(time.time() - end) + ############## npu modify begin ############# + loc = 'npu:{}'.format(args.gpu) + target = target.to(torch.int32) + images, target = images.to(loc, non_blocking=False), target.to(loc, non_blocking=False) + ############## npu modify end ############# + # 原模型代码如下: + # if args.gpu is not None: + # images = images.cuda(args.gpu, non_blocking=True) + # if torch.cuda.is_available(): + # target = target.cuda(args.gpu, non_blocking=True) + ``` + +14. 标记反向传播.backward\(\)发生的位置,这样混合精度模块就可以进行Loss Scaling并清除每次迭代的状态,代码如下: + + 代码位置:main.py文件中的train\(\)(修改部分为字体加粗部分)。 + + ``` + optimizer.zero_grad() + ############## npu modify begin ############# + if args.amp: + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + else: + loss.backward() + # 原代码如下注释部分: + # loss.backward() + ############## npu modify end ############# + optimizer.step() + ``` + +15. 验证时,需要将验证数据集迁移到昇腾AI处理器上,修改如下: + + 代码位置:main.py文件中的validate\(\)(修改部分为字体加粗部分)。 + + ``` + with torch.no_grad(): + end = time.time() + for i, (images, target) in enumerate(val_loader): + ############## npu modify begin ############# + loc = 'npu:{}'.format(args.gpu) + target = target.to(torch.int32) + images, target = images.to(loc, non_blocking=False), target.to(loc, non_blocking=False) + ############## npu modify end ############# + # 原模型代码如下注释部分: + # if args.gpu is not None: + # images = images.cuda(args.gpu, non_blocking=True) + # if torch.cuda.is_available(): + # target = target.cuda(args.gpu, non_blocking=True) + ``` + + +
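补充说明:前文步骤3中定义的device\_id\_to\_process\_device\_map函数,其行为可用如下小例验证:

```
# process_id按升序依次映射到device_id
process_device_map = device_id_to_process_device_map("0,2,5,7")
print(process_device_map)   # 输出:{0: 0, 1: 2, 2: 5, 3: 7}
```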

脚本执行

+ +## 准备数据集 + +准备数据集并上传到运行环境的目录下,例如:/home/data/resnet50/imagenet + +## 配置环境变量 + +请参考[配置环境变量](#配置环境变量.md)配置环境变量。 + +## 执行命令 + +例如: + +单卡: + +``` +python3.7 main.py /home/data/resnet50/imagenet --batch-size 128 --lr 0.1 --epochs 90 --arch resnet50 --world-size 1 --rank 0 --workers 40 --momentum 0.9 --weight-decay 1e-4 +``` + +分布式: + +``` +python3.7 main.py /home/data/resnet50/imagenet --addr='10.174.216.194' --seed 49 --workers 160 --lr 0.8 --print-freq 1 --arch resnet50 --dist-url 'tcp://127.0.0.1:50000' --dist-backend 'hccl' --multiprocessing-distributed --world-size 1 --batch-size 2048 --epochs 90 --rank 0 --device-list '0,1,2,3,4,5,6,7' --amp +``` + +>![](public_sys-resources/icon-note.gif) **说明:** +>dist-backend需配置成hccl以支持在昇腾AI设备上进行分布式训练。 + +

ShuffleNet模型调优示例

+ +- **[样例获取](#样例获取-9.md)** + +- **[模型评估](#模型评估.md)** + +- **[网络迁移](#网络迁移.md)** + +- **[网络调测](#网络调测.md)** + + +

样例获取

+ +## 样例获取 + +1. 本样例基于PyTorch官网提供的Imagenet数据集训练模型进行适配昇腾910 AI处理器的迁移改造,样例获取路径为[https://github.com/pytorch/examples/tree/master/imagenet](https://github.com/pytorch/examples/tree/master/imagenet)。 +2. ShuffleNet模型参考PyTorch官网模型[ShuffleNet V2](https://pytorch.org/hub/pytorch_vision_shufflenet_v2/),实际使用在脚本执行中直接指定参数arch为shufflenet\_v2\_x1\_0。 + + ``` + --arch shufflenet_v2_x1_0 + ``` + + >![](public_sys-resources/icon-note.gif) **说明:** + >ShuffleNet为PyTorch内置模型,了解更多内置模型请前往[Pytorch官网](https://pytorch.org/)。 + + +## 目录结构 + +主要文件目录结构如下所示: + +``` +├──main.py +``` + +

模型评估

模型评估主要关注算子适配情况。使用dump op方法获取ShuffleNet网络的算子信息,与《PyTorch适配算子清单》进行对比:若发现某个算子当前暂不支持,对于简单场景,可以考虑先暂时替换成语义相近的算子,或者把该算子单独放到CPU上执行,通过这两种方式规避;对于复杂场景下不支持的算子,则需要参见《PyTorch算子开发指南》进行算子开发。
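下面给出“把算子单独放到CPU上执行”这一规避方式的最小示意(unsupported\_op为假设的占位名,请替换为实际暂不支持的算子):

```
# 将输入搬到CPU上执行暂不支持的算子,再把结果搬回NPU继续后续计算
out = unsupported_op(x.cpu()).npu()
```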

网络迁移

+ +训练脚本迁移请参见[单P训练修改](#单P训练修改.md)和[分布式训练修改](#分布式训练修改.md)。脚本执行时注意选择参数--arch shufflenet\_v2\_x1\_0。 + +

网络调测

+ +网络调测具体方法请参见[调测过程](#调测过程.md)。经排查ShuffleNet运行时相关算子耗时过大,以下给出耗时数据及解决方法。 + +## 前向排查 + +前向排查记录表如下: + +**表 1** 前向排查 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

序号

+

time(ms)

+

batch_size

+

detail

+

1

+

1100

+

512

+

channel_shuffle操作使用channel_shuffle_index_select替代。

+

2

+

600

+

512

+

使用两个channel_shuffle_index_select操作消减chunk带来的不连续。

+

3

+

300

+

512

+

通过框架层,指定concat输出格式为NCHW以消除过多的transdata。

+

4

+

285

+

512

+

修复了未初始化weight格式。

+

5

+

275

+

512

+

修复了DWCONV没有指定输出格式为5HD的问题

+
+ +详细说明如下: + +1. 由于原生实现的torch.transpose\(x, 1, 2\).contiguous\(\)是使用了View类框架算子transpose,造成了非连续场景,如[copy瓶颈优化](#性能优化-3.md)所描述Copy瓶颈,使用channel\_shuffle\_index\_select,在语义相同的情况下使用计算类算子替换框架类算子,从而减少耗时。 + +2. 由于shufflenetv2中含有大量的chunk操作,而chunk操作在Pytorch中为框架类算子,其结果会将一个tensor分割为几个等长的非连续的tensor,而非连续转连续这个操作目前耗时较长,故使用计算类算子消除非连续,如[copy瓶颈优化](#性能优化-3.md)所描述Copy瓶颈。 + +3. 适配层在适配算子时默认指定输出格式为输入格式,但是concat不支持C轴非16整数倍的5HD的格式,会转为4D进行处理,又由于concat后面接的是gatherv2算子,也是仅支持4D格式的算子,所以导致数据格式转换过程为5HD-\>4D-\>concat-\>5HD-\>4D-\>gatherv2-\>5HD,解决方法是修改concat输出格式,当非16整数倍时指定输出格式为4D,优化后数据格式转换过程为5HD-\>4D-\>concat-\>gatherv2-\>5HD,当前针对ShuffleNet的做法具体可参考pytorch/aten/src/ATen/native/npu/CatKernelNpu.cpp 第121行。 + +4. 设置weight初始化格式避免计算过程中反复的transdata,如[copy瓶颈优化](#性能优化-3.md)所描述框架瓶颈。 + +5. 修复了DWCONV weight输出格式指定,避免一些不必要5HD-\>4D。 + + +## 整网排查 + +整网排查记录表如下: + +**表 2** 整网排查 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +

序号

+

time(ms)

+

batch_size

+

detail

+

1

+

5500

+

512

+

通过框架层,将index to cpu的操作改为index_add操作

+

2

+

4000

+

512

+

通过自定义算子,预生成index,不再tocpu

+

3

+

1800

+

512

+

通过自定义算子,融合index_add和chunk

+

4

+

885

+

512

+

添加contiguous_with_gatherv2

+

5

+

3480

+

1024

+

修改batchsize

+

6

+

1650

+

1024

+

修改batchsize + contiguous_with_gatherv2

+

7

+

1424

+

1024

+

通过自定义算子,融合cat+shuffle+chunk,消除不连续

+

8

+

1360

+

1024

+

通过框架层,修改relugrad传入的grad格式

+

9

+

1300

+

1024

+

修改IndexSelectFullImplementation的bp传入格式

+

10

+

920

+

1024

+

修改amp O1

+

11

+

860

+

1024

+

修改amp O2

+

12

+

830

+

1024

+

消除BN参数更新时AXPY引入的过多的transdata

+

13

+

800

+

1024

+

消除所有fp、bp、param_update间的流同步

+

14

+

461

+

1024

+

针对非32对齐场景,改进GatherV2算子

+

15

+

429

+

1024

+

针对ShufflenetV2场景再次优化GatherV2算子-->GatherV3

+
+ +详细说明如下: + +1. 使用计算类算子替换框架类算子。 + +2. 使用buffer记录index信息到npu,消除index.to\('npu'\) 的操作。 + +3. 使用计算类算子消除非连续。 + +4. contiguous\_with\_gatherv2是使用aicore算子GatherV2来完成非连续转连续操作。 + +5. 修改batchsize。 + +6. 修改batchsize + contiguous\_with\_gatherv2。 + +7. 由于concat算子的反向是chunk,会引起非连续问题,故自定义concat算子反向,使用Gatherv2替代chunk,将其融合成cat+shuffle+chunk,消除不连续。 + +8. ReluGrad算子有两个输入:grad\_output(反向的输入),self(正向的输出),在shufflenet中有时会出现4D + 5HD的场景,而FE的格式对齐往往对齐第一个tensor的format,结果就会导致\(4D, 5HD\)-\>\(4D, 4D\)-\>ReluGrad-\>4D-\>5HD。由于正向的输出格式基本就是输入格式,而relu往往是配合在Conv+BN+Relu这样使用,所以可以认为,在这个场景下,输出5HD是更合适的选择。于是手动插入npu\_format\_cast,\(4D, 5HD\)-\>\(5HD, 5HD\)-\>ReluGrad-\>5HD。 + +9. IndexSelectFullImplementation中涉及到了对一个5HD的tensor做两次gatherv2操作,这个时候会导致两次的5HD-\>4D,可以手动先做一次5HD-\>4D,这样就可以在gatherv2时不做transdata,从而消减一次transdata操作。 + +10. 加入混合精度O1。 + +11. 加入混合精度O2。 +12. 由于Axpy算子的参数校验,所有网络在参数更新时,如C不整除16则会transdata为4D进行Axpy运算,引入了大量的transdata算子,通过增加一个函数,当Axpy的input的shape一致时结束校验,从而避免了格式转换,增加了运行效率。 + +13. 删除所有的流同步操作,原因是容易导致不收敛,没有采纳。 + +14. 使用针对非对齐优化后的Gatherv2算子后,整体性能提速至交付水平。 + +15. 使用针对ShufflenetV2场景再次优化后的Gatherv3算子后,整体性能还能继续提升。 + + +## python侧优化细节 + +Python侧优化主要是通过一些同等语义的修改,使网络在NPU上边的更加亲和。当前非连续转连续容易成为性能瓶颈,而ShufflenetV2中的channel\_shuffle操作就涉及了permute后转连续的操作,导致整网性能在NPU上较差。通过对channel\_shuffle操作进行同等语义的修改,加上和concat操作的融合,使得整网性能得到飞升。采用的是torchvision版本参见[开源链接](https://github.com/pytorch/vision/blob/master/torchvision/models/shufflenetv2.py)。 + +- 框架原始channel\_shuffle操作。 + + ``` + def channel_shuffle(x, groups): + # type: (torch.Tensor, int) -> torch.Tensor + batchsize, num_channels, height, width = x.data.size() + channels_per_group = num_channels // groups + # reshape + x = x.view(batchsize, groups, + channels_per_group, height, width) + x = torch.transpose(x, 1, 2).contiguous() + # flatten + x = x.view(batchsize, -1, height, width) + return x + + class InvertedResidual(nn.Module): + def __init__(self, inp, oup, stride): + super(InvertedResidual, self).__init__() + if not (1 <= stride <= 3): + raise ValueError('illegal stride value') + self.stride = stride + branch_features = oup // 2 + assert (self.stride != 1) or (inp == branch_features << 1) + if self.stride > 1: + self.branch1 = nn.Sequential( + self.depthwise_conv(inp, inp, kernel_size=3, stride=self.stride, padding=1), + nn.BatchNorm2d(inp), + nn.Conv2d(inp, branch_features, kernel_size=1, stride=1, padding=0, bias=False), + nn.BatchNorm2d(branch_features), + nn.ReLU(inplace=True), + ) + else: + self.branch1 = nn.Sequential() + + self.branch2 = nn.Sequential( + nn.Conv2d(inp if (self.stride > 1) else branch_features, + branch_features, kernel_size=1, stride=1, padding=0, bias=False), + nn.BatchNorm2d(branch_features), + nn.ReLU(inplace=True), + self.depthwise_conv(branch_features, branch_features, kernel_size=3, stride=self.stride, padding=1), + nn.BatchNorm2d(branch_features), + nn.Conv2d(branch_features, branch_features, kernel_size=1, stride=1, padding=0, bias=False), + nn.BatchNorm2d(branch_features), + nn.ReLU(inplace=True), + ) + + @staticmethod + def depthwise_conv(i, o, kernel_size, stride=1, padding=0, bias=False): + return nn.Conv2d(i, o, kernel_size, stride, padding, bias=bias, groups=i) + + def forward(self, x): + if self.stride == 1: + x1, x2 = x.chunk(2, dim=1) + out = torch.cat((x1, self.branch2(x2)), dim=1) + else: + out = torch.cat((self.branch1(x), self.branch2(x)), dim=1) + + out = channel_shuffle(out, 2) + + return out + ``` + +- 同等语义改写。 + +``` +def channel_shuffle_index_select(x, groups=2): + N, C, H, W = x.shape + inp = C + # channel_shuffle操作是对C维按一定规则的重排的工作,可以被表达为一次简单的重排 + 
group_len = inp // groups + index = torch.from_numpy(np.array(list(range(inp))).reshape(groups, group_len).transpose(1, 0).flatten()).long() + + x = x.index_select(1, index) + return x + +# 对两个操作进行结果对比,可以看到语义是相等的 +x = torch.randn(2, 232, 14, 14) +for group in [2, 4, 8]: + out1 = channel_shuffle(x, group) + out2 = channel_shuffle_index_select(x, group) + print((out1 - out2).sum()) +``` + +- 昇腾AI处理器亲和写法。 + + ``` + # 对应 out = channel_shuffle(torch.cat((self.branch1(x), self.branch2(x)), dim=1)) 的情形 + # 使用channel_shuffle_index_select替代channel_shuffle + # 自定义OP,融合channel_shuffle_index_select和cat,使用计算类算子来消减非连续 + class IndexSelectFullImplementation(torch.autograd.Function): + @staticmethod + def forward(ctx, x1, x2, fp_index, bp_index1, bp_index2): + # 强制流同步,仅稳定训练作用 + stream = torch.npu.current_stream() + stream.synchronize() + + # 对ctx注册bp_index1, bp_index2使反向时可以使用 + ctx.bp_index1 = bp_index1 + ctx.bp_index2 = bp_index2 + + x = torch.cat([x1, x2], dim=1) + + # 使用index_select替代channel_shuffle操作,这里是后面不接chunk算子的场景 + result = x.index_select(1, fp_index) + + return result + + @staticmethod + def backward(ctx, grad_output): + # 强制流同步,仅稳定训练作用 + stream = torch.npu.current_stream() + stream.synchronize() + + # 由于index_select不支持5HD格式,将格式转换为NCHW来减少额外的transdata + grad_output.data = grad_output.data.npu_format_cast(0) + + # 依据正向推导得到的反向的表达式,使用index_select同时完成对index_select和cat的反向 + out1 = grad_output.index_select(1, ctx.bp_index1) + out2 = grad_output.index_select(1, ctx.bp_index2) + return out1, out2, None, None, None, None + + + class IndexSelectHalfImplementation(torch.autograd.Function): + @staticmethod + def forward(ctx, x1, x2, fp_index1, fp_index2, bp_index1, bp_index2): + ctx.bp_index1 = bp_index1 + ctx.bp_index2 = bp_index2 + x = torch.cat([x1, x2], dim=1) + + # 使用index_select替代channel_shuffle操作,这里是后面接chunk算子的场景 + return x.index_select(1, fp_index1), x.index_select(1, fp_index2) + + @staticmethod + def backward(ctx, grad_output1, grad_output2): + grad_output = torch.cat([grad_output1, grad_output2], 1) + + out1 = grad_output.index_select(1, ctx.bp_index1) + out2 = grad_output.index_select(1, ctx.bp_index2) + return out1, out2, None, None, None, None + + + class Channel_Shuffle(nn.Module): + def __init__(self, inp, groups=2, split_shuffle=True): + super(Channel_Shuffle, self).__init__() + + self.split_shuffle = split_shuffle + self.group_len = inp // groups + + # 初始化channel_shuffle_index_select中需要使用的fp_index + self.out = np.array(list(range(inp))).reshape(groups, self.group_len).transpose(1, 0).flatten().tolist() + + # 将初始化的fp_index按需注册为module的buffer,在to.device的时候顺路带到设备,减少h2dcopy的耗时 + # 此处仅展示常用的group=2的场景下的使用方式,其他情形请自行拓展 + if self.split_shuffle: + self.register_buffer('fp_index1', torch.tensor(self.out[:self.group_len], dtype=torch.int32)) + self.register_buffer('fp_index2', torch.tensor(self.out[self.group_len:], dtype=torch.int32)) + else: + self.register_buffer('fp_index', torch.tensor(self.out, dtype=torch.int32)) + + # 将对应的bp_index按需注册为module的buffer,在to.device的时候顺路带到设备,减少h2dcopy的耗时 + self.register_buffer('bp_index1', torch.tensor(list(range(0, inp, 2)), dtype=torch.int32)) + self.register_buffer('bp_index2', torch.tensor(list(range(1, inp, 2)), dtype=torch.int32)) + + def forward(self, x1, x2): + if self.split_shuffle: + return IndexSelectHalfImplementation.apply(x1, x2, self.fp_index1, self.fp_index2, self.bp_index1, + self.bp_index2) + else: + return IndexSelectFullImplementation.apply(x1, x2, self.fp_index, self.bp_index1, self.bp_index2) + + + class InvertedResidual(nn.Module): + def 
__init__(self, inp, oup, stride, split_shuffle=True): + super(InvertedResidual, self).__init__() + + if not (1 <= stride <= 3): + raise ValueError('illegal stride value') + self.stride = stride + + branch_features = oup // 2 + assert (self.stride != 1) or (inp == branch_features << 1) + + if self.stride > 1: + self.branch1 = nn.Sequential( + self.depthwise_conv(inp, inp, kernel_size=3, stride=self.stride, padding=1), + nn.BatchNorm2d(inp), + nn.Conv2d(inp, branch_features, kernel_size=1, stride=1, padding=0, bias=False), + nn.BatchNorm2d(branch_features), + nn.ReLU(inplace=True), + ) + else: + self.branch1 = nn.Sequential() + + self.branch2 = nn.Sequential( + nn.Conv2d(inp if (self.stride > 1) else branch_features, + branch_features, kernel_size=1, stride=1, padding=0, bias=False), + nn.BatchNorm2d(branch_features), + nn.ReLU(inplace=True), + self.depthwise_conv(branch_features, branch_features, kernel_size=3, stride=self.stride, padding=1), + nn.BatchNorm2d(branch_features), + nn.Conv2d(branch_features, branch_features, kernel_size=1, stride=1, padding=0, bias=False), + nn.BatchNorm2d(branch_features), + nn.ReLU(inplace=True), + ) + + if self.stride > 1: + self.channel_shuffle = Channel_Shuffle(inp=branch_features + branch_features, groups=2, + split_shuffle=split_shuffle) + else: + self.channel_shuffle = Channel_Shuffle(inp=inp, groups=2, split_shuffle=split_shuffle) + + @staticmethod + def depthwise_conv(i, o, kernel_size, stride=1, padding=0, bias=False): + return nn.Conv2d(i, o, kernel_size, stride, padding, bias=bias, groups=i) + + def forward(self, x): + + # 删除concat和chunk操作,融合进self.channel_shuffle内处理 + if self.stride == 1: + x1, x2 = x + x2 = self.branch2(x2) + else: + x1 = self.branch1(x) + x2 = self.branch2(x) + + out = self.channel_shuffle(x1, x2) + + return out + ``` + + +

参考信息

+ +- **[单算子样例编写说明](#单算子样例编写说明.md)** + +- **[单算子dump方法](#单算子dump方法.md)** + +- **[常用环境变量说明](#常用环境变量说明.md)** + +- **[dump op方法](#dump-op方法.md)** + +- **[CMake安装方法](#CMake安装方法.md)** + + +

单算子样例编写说明

在模型中遇到问题时,使用整网复现问题的成本较大,可以构建测试用例来复现精度或性能问题,便于定位解决。构建测试用例一般有如下两种方式。单算子dump方法请参见[单算子dump方法](#单算子dump方法.md)。

1.  单算子测试用例构建,直接调用该算子即可复现错误场景。

    例如构建max算子的单算子样例如下:

    ```
    import torch
    import copy
    from torch.testing._internal.common_utils import TestCase, run_tests

    class TestMax(TestCase):
        def cpu_op_exec(self, input1):
            # 调用算子
            output = torch.max(input1)
            output = output.numpy()
            return output

        def npu_op_exec(self, input1):
            # 调用对应npu算子
            output = torch.max(input1)
            # 将结果搬回CPU并转为numpy,便于与CPU结果对比
            output = output.to("cpu").numpy()
            return output

        def test_max(self):
            input = torch.randn(10, 20)
            input = input.to(torch.int64)   # 数据dtype转换
            input_cpu = copy.deepcopy(input)
            input_npu = copy.deepcopy(input).npu()

            output_cpu = self.cpu_op_exec(input_cpu)
            output_npu = self.npu_op_exec(input_npu)

            # 比较cpu和npu的计算结果,prec为允许误差
            self.assertEqual(output_cpu, output_npu, prec = 1e-4)

    if __name__ == '__main__':
        run_tests()
    ```

    >![](public_sys-resources/icon-note.gif) **说明:** 
    >-   运行上述代码,如果发现报错信息与模型中的max算子报错信息相同,则说明单算子测试用例构建成功。
    >-   假设注释掉输入数据dtype转换代码,发现测试用例无报错,则可以说明在输入参数为torch.int64时,max算子在npu上报错。

2.  基于上下文的单算子测试用例构建。

    这里虽然是单算子样例,但有时候不仅仅为一个操作,而是带有上下文的场景,还有时候是一个带参数的Module,Module的方式是更通用的方法。此处构建一个包含两个OP的Module,构建样例如下:

    ```
    import torch
    import torch.nn as nn
    import copy
    from torch.testing._internal.common_utils import TestCase, run_tests

    class Model(nn.Module):
        def __init__(self, in_channels=1, hooks=False):
            super(Model, self).__init__()
            self.conv = nn.Conv2d(in_channels, in_channels*2, kernel_size=64)
            if hooks:
                self.conv.weight.register_hook(lambda grad: print(grad))
        def forward(self, x):
            out = self.conv(x)
            return out

    class TestConv2d(TestCase):
        def test_conv2d(self):

            model = Model(in_channels=16)

            # 若需要获取反向计算结果,则加入hooks获取反向即可
            # model = Model(in_channels=16, hooks=True)
            # 创建输入tensor
            input_tensor = torch.randn(4, 16, 64, 64)

            input_tensor_cpu = copy.deepcopy(input_tensor)
            out = model(input_tensor_cpu)
            loss = out.sum()
            loss.backward()
            cpuout = out

            # 在NPU上运行,将model和input_tensor放到NPU上
            torch.npu.set_device("npu:0")            # 一般先set_device设定运行卡
            model_npu = copy.deepcopy(model).npu()   # 深拷贝CPU模型,保证两边权重一致
            input_tensor_npu = copy.deepcopy(input_tensor).npu()
            out = model_npu(input_tensor_npu)
            loss = out.sum()
            loss.backward()
            npuout = out
            # 根据结果,确定是否为报错场景
            self.assertEqual(cpuout, npuout.cpu(), prec = 1e-4)

    if __name__ == '__main__':
        run_tests()
    ```

单算子dump方法

## 采集Dump数据

当前适配昇腾AI处理器的PyTorch通过torch.npu中的init\_dump\(\)、set\_dump\(\)和finalize\_dump\(\)接口来进行算子dump数据的采集。首先init\_dump\(\)会初始化dump配置,然后通过set\_dump\(\)接口传入配置文件来配置dump参数,最后通过finalize\_dump\(\)来结束dump。以下以add\_算子为例,介绍算子dump数据采集方法。

```
import torch
torch.npu.set_device("npu:0")
torch.npu.init_dump()
torch.npu.set_dump("/home/HwHiAiUser/dump.json")   # "/home/HwHiAiUser/dump.json"为配置文件路径,用户自行配置
a = torch.tensor([2, 2]).to("npu:0")
a.add_(1)
torch.npu.finalize_dump()
```

其中**dump.json**配置方法如下。

```
{
 "dump":
 {
        "dump_list":[],
        "dump_path":"/home/HwHiAiUser/dump/output",
        "dump_mode":"all",
        "dump_op_switch":"on"
 }
}
```

**dump.json**字段解释如下。

字段名

+

说明

+

dump_list

+

待dump数据的算子模型。为空,无需配置。

+

dump_path

+

dump数据文件存储到运行环境的目录,支持配置绝对路径或相对路径:

+
  • 绝对路径配置以“/”开头,例如:/home/HwHiAiUser/output。
  • 相对路径配置直接以目录名开始,例如:output。
+

例如:dump_path配置为/home/HwHiAiUser/output,则dump数据文件存储到运行环境的/home/HwHiAiUser/output目录下。

+

dump_mode

+

dump数据模式,配置如下。

+
  • input:dump算子的输入数据。
  • output:dump算子的输出数据,默认取值output。
  • all:dump算子的输入、输出数据
+

dump_op_switch

+

单算子模型dump数据开关,配置如下。

+
  • on:开启单算子模型dump。
  • off:关闭单算子模型dump,默认取值off
+
## 查看溢出数据

采集的dump数据会在\{dump\_path\}/\{time\}/\{deviceid\}/\{model\_id\}/\{data\_index\}目录下生成,例如:“/home/HwHiAiUser/output/20200808163566/0/0”目录。

存放路径及文件命名规则:

-   dump\_path:用户配置的溢出数据存放路径,例如/home/HwHiAiUser/output。

-   time:时间戳,例如20200808163566。
-   deviceid:Device设备ID号。
-   model\_id:子图ID。
-   dump文件:命名规则如\{op\_type\}.\{op\_name\}.\{taskid\}.\{stream\_id\}.\{timestamp\},如果op\_type、op\_name出现了“.”、“/”、“\\”、空格时,会转换为下划线表示。

## 解析溢出算子的dump文件

1.  请根据实际情况,将\{op\_type\}.\{op\_name\}.\{taskid\}.\{stream\_id\}.\{timestamp\}上传到安装有Toolkit软件包的环境。
2.  进入解析脚本所在路径,假设Toolkit软件包安装目录为:/home/HwHiAiUser/Ascend/ascend-toolkit/latest。

    **cd  /home/HwHiAiUser/Ascend/ascend-toolkit/latest/toolkit/tools/operator\_cmp/compare**

3.  执行msaccucmp.pyc脚本,转换dump文件为numpy文件。举例:

    **python3.7.5 msaccucmp.pyc convert -d /home/HwHiAiUser/dump -out /home/HwHiAiUser/dumptonumpy -v 2**

    >![](public_sys-resources/icon-note.gif) **说明:** 
    >-d参数支持传入单个文件,对单个dump文件进行转换,也支持传入目录,对整个path下所有的dump文件进行转换。

4.  调用Python,转换numpy文件为txt文件。举例:

    **$ python3.7.5**

    **\>\>\> import numpy as np**

    **\>\>\> a = np.load\("/home/HwHiAiUser/dumptonumpy/Pooling.pool1.1147.1589195081588018.output.0.npy"\)**

    **\>\>\> b = a.flatten\(\)**

    **\>\>\> np.savetxt\("/home/HwHiAiUser/dumptonumpy/Pooling.pool1.1147.1589195081588018.output.0.txt", b\)**

    转换为.txt格式文件后,维度信息、Dtype均不存在。详细的使用方法请参考numpy官网介绍。

常用环境变量说明

1.  开启TASK多线程下发,绝大多数情况下,打开该功能会进一步提升整网训练性能。

    **export TASK\_QUEUE\_ENABLE=1**

2.  开启日志重定向到stdout,用于将host日志导出到屏幕。

    **export ASCEND\_SLOG\_PRINT\_TO\_STDOUT=1**

3.  设置日志级别。日志信息从多到少分别是debug --\> info --\> warning --\> error --\> null,一般设置为error,调试时使用info。请参考《CANN 日志参考》设置日志级别。
4.  dump图,主要用于查看图结构。

    **export DUMP\_GE\_GRAPH=2**

    **export DUMP\_GRAPH\_LEVEL=3**
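调试时可将上述环境变量集中写入shell脚本或当前会话,按需取舍。示意如下(仅包含本节列出的变量):

```
# 常用调试环境变量示意,按需开启
export TASK_QUEUE_ENABLE=1             # 开启TASK多线程下发
export ASCEND_SLOG_PRINT_TO_STDOUT=1   # host日志重定向到stdout
export DUMP_GE_GRAPH=2                 # dump图
export DUMP_GRAPH_LEVEL=3
```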

dump op方法

+ +1. 使用profile接口对原始代码训练脚本的loss计算和优化过程进行改造,打印算子信息。代码样例如下: + + ``` + with torch.autograd.profiler.profile() as prof: + out = model(input_tensor) + loss = out.sum() + loss.backward() + # 也可导出文件 + print(prof.key_averages().table(sort_by="self_cpu_time_total")) + ``` + +2. 将改造后的训练脚本在CPU上进行训练,屏幕会打印相关算子信息。 + +
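上述样例注释中提到可将profile结果导出文件,一个示意如下(export\_chrome\_trace为PyTorch profiler自带接口):

```
# 将profile结果导出为Chrome trace文件,可在chrome://tracing页面加载查看
prof.export_chrome_trace("output.prof")
```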

CMake安装方法

+ +CMake版本升级为3.12.1的方法 + +1. 获取Cmake软件包。 + + ``` + wget https://cmake.org/files/v3.12/cmake-3.12.1.tar.gz --no-check-certificate + ``` + +2. 解压并进入软件包目录。 + + ``` + tar -xf cmake-3.12.1.tar.gz + cd cmake-3.12.1/ + ``` + +3. 执行配置、编译和安装命令。 + + ``` + ./configure --prefix=/usr/local/cmake + make && make install + ``` + +4. 设置软连接。 + + ``` + ln -s /usr/local/cmake/bin/cmake /usr/bin/cmake + ``` + +5. 执行如下命令验证是否安装成功。 + + ``` + cmake --version + ``` + + 如显示“cmake version 3.12.1”则表示安装成功。 + + +

FAQ

+ +- **[软件安装常见问题](#软件安装常见问题.md)** + +- **[模型和算子运行常见问题](#模型和算子运行常见问题.md)** + +- **[模型调测常见问题](#模型调测常见问题.md)** + +- **[其他操作相关问题](#其他操作相关问题.md)** + +- **[模型分布式训练常见问题](#模型分布式训练常见问题.md)** + + +

软件安装常见问题

+ +- **[pip3.7 install Pillow==5.3.0安装失败](#pip3-7-install-Pillow-5-3-0安装失败.md)** + +- **[安装“torch-\*.whl ”提示“torch 1.5.0xxxx”与“torchvision”所依赖的版本不匹配](#安装-torch--whl-提示-torch-1-5-0xxxx-与-torchvision-所依赖的版本不匹配.md)** + + +

pip3.7 install Pillow==5.3.0安装失败

## 现象描述

pip3.7 install pillow==5.3.0安装失败。

## 可能原因

缺少必要的依赖,如:libjpeg、python-devel、zlib-devel、libjpeg-turbo-devel等。

## 处理方法

安装相关依赖,通过如下命令安装:

-   CentOS/EulerOS/Tlinux/BClinux/Suse

    **yum install libjpeg python-devel zlib-devel libjpeg-turbo-devel**

-   Ubuntu/Debian/UOS(注意apt源中的包名与yum不同)

    **apt-get install libjpeg-dev python3-dev zlib1g-dev**

安装“torch-\*.whl ”提示“torch 1.5.0xxxx”与“torchvision”所依赖的版本不匹配

+ +## 现象描述 + +安装“torch-\*.whl”时,提示"ERROR:torchvision 0.6.0 has requirement torch==1.5.0, but you'll have torch 1.5.0a0+1977093 which is incompatible"。 + +![](figures/zh-cn_image_0000001106176216.png) + +## 可能原因 + +安装torch时,会自动触发torchvision进行依赖版本检查,环境中安装的torchvision版本为0.6.0,检查时发现我们安装的torch-\*.whl的版本号与要求的1.5.0不一致,所以提示报错,但实际安装成功 。 + +## 处理方法 + +对实际结果无影响,无需处理。 + +

模型和算子运行常见问题

+ +- **[在模型运行或者算子运行时遇到报错“RuntimeError: ExchangeDevice:”](#在模型运行或者算子运行时遇到报错-RuntimeError-ExchangeDevice.md)** + +- **[在模型运行或者算子运行时遇到报错“Error in atexit.\_run\_exitfuncs:”](#在模型运行或者算子运行时遇到报错-Error-in-atexit-_run_exitfuncs.md)** + +- **[在模型运行时遇到报错“terminate called after throwing an instance of 'c10::Error' what\(\): HelpACLExecute:”](#在模型运行时遇到报错-terminate-called-after-throwing-an-instance-of-c10-Error-what()-HelpACLExecute.md)** + +- **[在模型运行时遇到报错“ImportError: libhccl.so.”](#在模型运行时遇到报错-ImportError-libhccl-so.md)** + +- **[在模型运行时遇到报错“RuntimeError: Initialize.”](#在模型运行时遇到报错-RuntimeError-Initialize.md)** + +- **[在模型运行时遇到报错“TVM/te/cce error.”](#在模型运行时遇到报错-TVM-te-cce-error.md)** + +- **[在模型运行时遇到报错“MemCopySync:drvMemcpy failed.”](#在模型运行时遇到报错-MemCopySync-drvMemcpy-failed.md)** + +- **[在模型运行时将多任务下发关闭\(export TASK\_QUEUE\_ENABLE=0\)后仍然遇到报错“HelpACLExecute.”](#在模型运行时将多任务下发关闭(export-TASK_QUEUE_ENABLE-0)后仍然遇到报错-HelpACLExecute.md)** + + +

在模型运行或者算子运行时遇到报错“RuntimeError: ExchangeDevice:”

## 现象描述

![](figures/FAQ1.png)

## 可能原因

目前在一个线程内,只能调用一个npu设备,当切换不同的npu device时,会出现上述错误。

## 处理方法

检查代码中调用torch.npu.set\_device\(device\)、tensor.to\(device\)或者model.to\(device\)时,同一个线程内前后调用的device名称是否一致。对于多线程情况(如多卡训练),每个线程同样只能调用固定的npu device。

在模型运行或者算子运行时遇到报错“Error in atexit.\_run\_exitfuncs:”

+ +## 现象描述 + +![](figures/FAQ2.png) + +## 可能原因 + +在torch初始化时,若未通过torch.npu.device\(id\)指定npu设备,则默认使用device 0设备。若直接使用其他npu设备,如指定在device 1上创建tensor,那么在运行时会出现上述错误。 + +## 处理方法 + +在调用npu设备之前,通过torch.npu.set\_device\(device\)指定需要使用的npu设备即可。 + +
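规避该问题的最小示意如下(以device 1为例):

```
# 在创建NPU tensor之前,先显式指定本线程要使用的npu设备
import torch
import torch.npu
torch.npu.set_device("npu:1")
x = torch.randn(2, 2).to("npu:1")   # 此后在device 1上创建tensor即可正常运行
```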

在模型运行时遇到报错“terminate called after throwing an instance of 'c10::Error' what\(\): HelpACLExecute:”

+ +## 现象描述 + +![](figures/FAQ3.png) + +## 可能原因 + +目前HelpACLExecute的报错信息无法直接找到报错位置,此处在task任务下发时报错,是由于开启了TASK多线程下发(export TASK\_QUEUE\_ENABLE=1),上层封装了报错信息,导致无法获取更加详细的报错日志。 + +## 处理方法 + +可通过如下两种方式处理: + +- 查看具体的host报错日志信息。日志默认路径为/var/log/npu/slog/host-0/,根据时间标识查找以host-0为前缀的日志文件,打开日志文件,搜索“ERROR”,查询具体的报错信息。 +- 关闭多线程下发\(export TASK\_QUEUE\_ENABLE=0\),再次运行代码,一般可根据终端报错信息定位错误原因。 + +
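其中“查看具体的host报错日志信息”可借助文本检索完成,示意如下:

```
# 在host日志目录中检索ERROR关键字,定位具体报错信息
grep -rn "ERROR" /var/log/npu/slog/host-0/
```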

在模型运行时遇到报错“ImportError: libhccl.so.”

+ +## 现象描述 + +![](figures/FAQ7.png) + +## 可能原因 + +目前对外发布的pytorch安装包,默认使用NPU和HCCL功能,因此在调用时需要将HCCL模块路径添加到环境变量中。根据报错信息“can not find libhccl.so”,出现上述错误原因为缺少hccl库文件。 + +## 处理方法 + +将hccl模块的路径添加到环境变量中即可,一般hccl库文件路径为安装包下的.../fwkacllib/python/site-packages/hccl。 + +

在模型运行时遇到报错“RuntimeError: Initialize.”

+ +## 现象描述 + +![](figures/FAQ9.png) + +## 可能原因 + +根据报错信息,初步判断为npu设备初始化错误。进一步查找host日志报错信息如下: + +![](figures/FAQ9-1.png) + +根据日志信息定位报错原因为系统在拉起npu设备时报错。 + +## 处理方法 + +一般可通过重启服务器和所有npu device解决该问题;若重启后仍然存在该问题,检查安装的driver和firmware版本是否匹配,更换正确版本的driver和firmware或者向华为工程师报告求助解决。 + +

在模型运行时遇到报错“TVM/te/cce error.”

## 现象描述

![](figures/FAQ10.png)

## 可能原因

PyTorch内调用npu类型算子时,强依赖于te、cce、tvm组件,PyTorch、toolkit/nnae和te版本需要一致。在更新toolkit/nnae后,te等组件不会自动更新,当版本不匹配时,则会出现该报错。

## 处理方法

更新te等组件版本,具体需要更新te-\*.whl和topi-\*.whl安装包。安装包位于toolkit或者nnae的fwkacllib子目录下\(默认安装路径为/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64目录\)。在该目录下有安装包topi-0.4.0-py3-none-any.whl和te-0.4.0-py3-none-any.whl,分别执行**pip install --upgrade topi-0.4.0-py3-none-any.whl**和**pip install --upgrade te-0.4.0-py3-none-any.whl**即可。

![](figures/FAQ10-1.png)

在模型运行时遇到报错“MemCopySync:drvMemcpy failed.”

## 现象描述

脚本:

```
    import torch

    def test_sum():
        xs_shape = [22400, 8]
        ys_shape = [22400, 8]
        gt_bboxes_shape = [22400, 8, 4]
        xs = torch.rand(xs_shape).npu()
        ys = torch.rand(ys_shape).npu()
        gt_bboxes = torch.rand(gt_bboxes_shape).npu().half()
        left = xs - gt_bboxes[..., 0]
        right = gt_bboxes[..., 2] - xs
        top = ys - gt_bboxes[..., 1]
        bottom = gt_bboxes[..., 3] - ys
        # stream = torch.npu.current_stream()
        # stream.synchronize()
        # left, top 结果是fp32,  right, bottom 结果是fp16
        # print(left.dtype, top.dtype, right.dtype, bottom.dtype)
        bbox_targets = torch.stack((left, top, right, bottom), -1)  # 报错位置在这里
        # stream.synchronize()

        bbox_targets = torch.sum(bbox_targets)
```

shell报错信息:

```
    RuntimeError: Run:/usr1/workspace/PyTorch_Apex_Daily_c20tr5/CODE/aten/src/ATen/native/npu/utils/OpParamMaker.h:280 NPU error,NPU error code is:500002
    [ERROR] RUNTIME(160809)kernel task happen error, retCode=0x28, [aicpu timeout].
    [ERROR] RUNTIME(160809)aicpu kernel execute failed, device_id=0, stream_id=512, task_id=24, fault so_name=, fault kernel_name=, extend_info=.
    Error in atexit._run_exitfuncs:
    Traceback (most recent call last):
    File "/usr/local/python3.7.5/lib/python3.7/site-packages/torch/__init__.py", line 429, in _npu_shutdown
        torch._C._npu_shutdown()
    RuntimeError: npuSynchronizeDevice:/usr1/workspace/PyTorch_Apex_Daily_c20tr5/CODE/c10/npu/NPUStream.cpp:806 NPU error, error code is 0
```

日志信息:

```
    [ERROR] RUNTIME(12731,python3.7):2021-02-02-22:23:56.475.679 [../../../../../../runtime/feature/src/npu_driver.cc:1408]12828 MemCopySync:drvMemcpy failed: dst=0x108040288000, destMax=1240, src=0x7fe7649556d0, size=1240, kind=1, drvRetCode=17!
    [ERROR] RUNTIME(12731,python3.7):2021-02-02-22:23:56.475.698 [../../../../../../runtime/feature/src/logger.cc:113]12828 KernelLaunch:launch kernel failed, kernel=140631803535760/ArgMinWithValue_tvmbin, dim=32, stream=0x55b22b3def50
    [ERROR] RUNTIME(12731,python3.7):2021-02-02-22:23:56.475.717 [../../../../../../runtime/feature/src/api_c.cc:224]12828 rtKernelLaunch:ErrCode=207001, desc=[module new memory error], InnerCode=0x70a0002
```

## 可能原因

对比shell和日志报错信息,发现两者并不匹配:shell报错发生在同步操作和AI CPU算子上,而日志报错却指向min算子(内部调用ArgMinWithValue\_tvmbin),二者报错信息不对应。一般这类问题出现的原因是日志生成的报错信息滞后。

报错信息滞后可能是由于AI CPU算子的异步执行导致的。

## 处理方法

对于该报错需要根据实际的错误来定位,可参考如下步骤进行处理:

1.  通过关闭多任务算子下发后发现结果不变,推断在shell脚本报错位置和日志报错算子之前就已出现错误。
2.  根据报错加上stream同步操作,缩小错误范围,定位错误算子。stream同步操作的作用在于其要求代码所运行到的位置之前的所有计算必须为完成状态,从而定位错误位置。
3.  通过在代码中加上stream同步操作,确定报错算子为stack。
4.  打印stack所有参数的shape、dtype、npu\_format,通过构造单算子用例复现问题。定位到问题原因为减法计算输入参数数据类型不同,导致a-b和b-a结果的数据类型不一致,最终在stack算子中报错。
5.  将stack入参数据类型转换为一致即可临时规避问题。

在模型运行时将多任务下发关闭\(export TASK\_QUEUE\_ENABLE=0\)后仍然遇到报错“HelpACLExecute.”

## 现象描述

![](figures/FAQ8.png)

## 可能原因

pytorch算子在npu上运行,通过ACL接口调用底层经过优化的算子。上层报错信息显示为HelpACLExecute.时,由于内部报错信息与日志仍在完善中,部分算子发生错误时无法直接获取更详细的报错信息。

## 处理方法

查看host日志,确定报错算子和位置,日志默认路径为/var/log/npu/slog/host-0。查找对应时间的log文件,搜索ERROR字段,查找错误信息。如对上述的错误,查询日志中的ERROR字段为:

![](figures/FAQ8-1.png)

从日志信息ERROR部分可以发现,报错算子为topKD,报错原因为“The number of attrs in op desc and op store does not match.”,定位到错误原因为topk算子参数不匹配。

在模型代码中查找topk算子调用位置,确定该算子是否可由其他算子替代:若可替代,则暂时使用替代方案,并将算子报错信息报告华为工程师;若无替代算子,请将算子报错信息通知华为工程师解决。

模型调测常见问题

+ +- **[在模型调测时遇到报错“RuntimeError: malloc:/..../pytorch/c10/npu/NPUCachingAllocator.cpp:293 NPU error, error code is 500000.”](#在模型调测时遇到报错-RuntimeError-malloc-pytorch-c10-npu-NPUCachingAllocator-cpp-293-NPU-error-error-code-is-5.md)** + +- **[在模型调测时遇到报错“RuntimeError: Could not run 'aten::trunc.out' with arguments from the 'NPUTensorId' backend.”](#在模型调测时遇到报错-RuntimeError-Could-not-run-aten-trunc-out-with-arguments-from-the-NPUTensorId-backend.md)** + +- **[在模型调测时遇到如MaxPoolGradWithArgmaxV1算子和max算子报错](#在模型调测时遇到如MaxPoolGradWithArgmaxV1算子和max算子报错.md)** + +- **[在调用torch时遇到报错“ModuleNotFoundError: No module named 'torch.\_C'”](#在调用torch时遇到报错-ModuleNotFoundError-No-module-named-torch-_C.md)** + + +

在模型调测时遇到报错“RuntimeError: malloc:/..../pytorch/c10/npu/NPUCachingAllocator.cpp:293 NPU error, error code is 500000.”

## 现象描述

![](figures/FAQ4.png)

## 可能原因

对于NPUCachingAllocator中malloc类型的错误,原因一般为npu显存不足,所需显存大于npu上的可用显存。

## 处理方法

在模型调测中,可以通过减小batch size参数来减少NPU显存的分配,解决该问题。

在模型调测时遇到报错“RuntimeError: Could not run 'aten::trunc.out' with arguments from the 'NPUTensorId' backend.”

## 现象描述

![](figures/FAQ5.png)

## 可能原因

目前npu设备仅支持pytorch部分算子,对于不支持的算子在使用时均会报上述错误,算子正在不断开发中。算子支持情况可参考[PyTorch原生算子](https://support.huaweicloud.com/opl-pytorch/atlasptol_09_0001.html),持续更新。

## 处理方法

在模型调测中,可尝试使用语义相近的受支持算子替代该算子,或将该算子单独放到CPU上执行进行规避;也可关注算子支持情况的更新,待该算子支持后再使用。

在模型调测时遇到如MaxPoolGradWithArgmaxV1算子和max算子报错

+ +## 现象描述 + +![](figures/FAQ6.png) + +![](figures/FAQ6-1.png) + +## 可能原因 + +在模型搭建中,算子输入参数是多样的。某些算子(如MaxPoolGradWithArgmaxV1算子和max算子)在特定参数下,计算报错或者不支持,根据报错信息可以定位到具体算子。 + +## 处理方法 + +根据报错信息定位到具体算子,解决步骤如下: + +1. 排查模型中对该算子的调用方式和参数是否正确; +2. 根据报错算子构建单算子用例,构建报错场景; +3. 一般算子错误无法在python侧解决,构建出报错场景。在论坛中发帖附上报错场景,求助华为工程师即可。 + + >![](public_sys-resources/icon-note.gif) **说明:** + >输入参数shape和dtype需要重点关注,一般是导致算子报错的主要原因。 + + +前述图中,根据报错信息,定位到是MaxPoolGradWithArgmaxV1算子和max算子报错。MaxPoolGradWithArgmaxV1是在反向计算过程中报错,那么构建测试用例时需要构建对应的反向场景;而对于max算子,是正向计算时报错,构建正向场景即可。 + +在模型中遇到算子报错,首选是仅构建单算子测试用例,确定报错场景和原因即可;若无法在单算子中构建单算子用例,则需要构建基于上下文的单算子场景, 可以参考[单算子样例编写说明](#单算子样例编写说明.md)编写用例。 + +

在调用torch时遇到报错“ModuleNotFoundError: No module named 'torch.\_C'”

+ +## 现象描述 + +![](figures/FAQ11.png) + +## 可能原因 + +首先确定报错位置,上述报错路径为.../code/pytorch/torch/init.py,而当前运行路径在.../code/pytorch下,在执行import torch时,默认首先在当前目录下查找torch文件夹,因此报错。此处应是调用在系统目录下安装的torch包,而不是当前目录下的torch。 + +## 处理方法 + +切换到其他目录执行脚本即可。 + +

其他操作相关问题

+ +- **[cuda流同步操作报错](#cuda流同步操作报错.md)** + +- **[aicpu\_kernels/libpt\_kernels.so不存在](#aicpu_kernels-libpt_kernels-so不存在.md)** + +- **[使用npu-smi info查看显存时发现python进程残留](#使用npu-smi-info查看显存时发现python进程残留.md)** + +- **[动态shape报错“match op inputs failed”](#动态shape报错-match-op-inputs-failed.md)** + +- **[Op type SigmoidCrossEntropyWithLogitsV2 of ops kernel AIcoreEngine is unsupported](#Op-type-SigmoidCrossEntropyWithLogitsV2-of-ops-kernel-AIcoreEngine-is-unsupported.md)** + +- **[Hook失败](#Hook失败.md)** + +- **[加载权重时遇到报错“load state\_dict error.”](#加载权重时遇到报错-load-state_dict-error.md)** + + +

cuda流同步操作报错

## 现象描述

![](figures/FAQ12.png)

## 可能原因

代码中使用了cuda的流同步方法,未使用npu的流同步方法。

## 处理方法

使用npu的流同步方法:

```
stream = torch.npu.current_stream()
stream.synchronize()
```

aicpu\_kernels/libpt\_kernels.so不存在

## 现象描述

![](figures/FAQ13.png)

## 可能原因

未设置AICPU的安装路径环境变量,导致无法找到AICPU相关库文件。

## 处理方法

设置环境变量,导入AICPU路径:

```
export ASCEND_AICPU_PATH=/usr/local/Ascend/ascend-toolkit/latest
```

使用npu-smi info查看显存时发现python进程残留

+ +## 现象描述 + +![](figures/FAQ14.png) + +## 可能原因 + +python进程残留,需要kill。 + +## 处理方法 + +杀死python进程: + +``` +pkill -9 python +``` + +

动态shape报错“match op inputs failed”

+ +## 现象描述 + +![](figures/FAQ15.png) + +## 可能原因 + +PTIndexPut编译的算子和输入的shape对不上, 并有acl\_dynamic\_shape\_op打头的日志字样,确定为动态shape报错。 + +## 处理方法 + +PTIndexPut对应tensor\[indices\] = value,需要在代码中找到对应的地方将动态shape修改为固定shape。 + +

Op type SigmoidCrossEntropyWithLogitsV2 of ops kernel AIcoreEngine is unsupported

+ +## 现象描述 + +``` +[ERROR] GE(24836,python3.7):2021-01-27-18:27:51.562.111 [../../../../../../graphengine/ge/engine_manager/dnnengine_manager.cc:266]25155 GetDNNEngineName: ErrorNo: 1343242282(assign engine failed) GetDNNEngineName:Op type SigmoidCrossEntropyWithLogitsV2 of ops kernel AIcoreEngine is unsupported, reason:Op SigmoidCrossEntropyWithLogitsV2 not supported reason: The type of this op is not found in op store, check whether the op store has this type of op. Op store name is tbe-custom. +The dtype, format or shape of input in op desc is not supported in op store, check the dtype, format or shape of input between the op store and the graph. Op store name is tbe-builtin. +``` + +## 可能原因 + +SigmoidCrossEntropyWithLogitsV2算子输入了不支持的数据类型,可能是输入int64类型导致的错误。 + +## 处理方法 + +检查对应python代码中输入的数据类型,并修改。 + +

Hook失败

## 现象描述

```
Traceback (most recent call last):
  File "tools/train.py", line 227, in <module>
    main()
  File "tools/train.py", line 221, in main
    meta=meta)
  File "/root/YoloV3/mmdetection/mmdet/apis/train.py", line 192, in train_detector
    runner.run(data_loaders, cfg.workflow, cfg.total_epochs)
  File "/usr/local/python3.7.5/lib/python3.7/site-packages/mmcv/runner/epoch_based_runner.py", line 166, in run
    epoch_runner(data_loaders[i], **kwargs)
  File "/usr/local/python3.7.5/lib/python3.7/site-packages/mmcv/runner/epoch_based_runner.py", line 50, in train
    self.run_iter(data_batch, train_mode=True)
  File "/usr/local/python3.7.5/lib/python3.7/site-packages/mmcv/runner/epoch_based_runner.py", line 30, in run_iter
    outputs = self.model.train_step(data_batch, self.optimizer, **kwargs)
  File "/usr/local/python3.7.5/lib/python3.7/site-packages/mmcv/parallel/data_parallel.py", line 100, in train_step
    return self.module.train_step(*inputs[0], **kwargs[0])
  File "/root/YoloV3/mmdetection/mmdet/models/detectors/base.py", line 251, in train_step
    losses = self(**data)
  File "/usr/local/python3.7.5/lib/python3.7/site-packages/torch/nn/modules/module.py", line 660, in __call__
    var = next((v for v in var.values() if isinstance(v, torch.Tensor)))
StopIteration
```

## 可能原因

mmdet的loss部分结构触发了pytorch原生hook的bug,导致死循环。

## 处理方法

解决方案是在/usr/local/python3.7.5/lib/python3.7/site-packages/torch/nn/modules/module.py这个文件的第658行加上try跳过:

```
if len(self._backward_hooks) > 0:
    var = result
    try:
        while not isinstance(var, torch.Tensor):
            if isinstance(var, dict):
                var = next((v for v in var.values() if isinstance(v, torch.Tensor)))
            else:
                var = var[0]
        grad_fn = var.grad_fn
        if grad_fn is not None:
            for hook in self._backward_hooks.values():
                wrapper = functools.partial(hook, self)
                functools.update_wrapper(wrapper, hook)
                grad_fn.register_hook(wrapper)
    except Exception as e:
        print('hook failed..')
        print(str(e))
return result
```

加载权重时遇到报错“load state\_dict error.”

+ +## 现象描述 + +![](figures/FAQ18.png) + +![](figures/FAQ18-1.png) + +## 可能原因 + +模型训练后保存的state\_dict的key值与加载时state\_dict的key值不一致,保存时会在每个key的最前面多一个module前缀。 + +## 处理方法 + +加载权重时先遍历state\_dict字典,修改key值,并使用新建的字典,具体用例参考demo.py。 + +脚本: + +``` + ckpt = torch.load("checkpoint.pth", map_location=loc) + # model.load_state_dict(ckpt['state_dict']) + state_dict_old = ckpt['state_dict'] + state_dict = {} + for key, value in state_dict_old.items(): + key = key[7:] + state_dict[key] = value + model.load_state_dict(state_dict) +``` + +

模型分布式训练常见问题

+ +- **[在进行模型分布式训练时遇到报错“host not found.”](#在进行模型分布式训练时遇到报错-host-not-found.md)** + +- **[在进行模型分布式训练时遇到报错“RuntimeError:connect\(\) timed out.”](#在进行模型分布式训练时遇到报错-RuntimeError-connect()-timed-out.md)** + + +

在进行模型分布式训练时遇到报错“host not found.”

+ +## 现象描述 + +![](figures/FAQ19.png) + +## 可能原因 + +对模型进行分布式训练时,会调用集合通信模块HCCL,需要根据实际情况设置IP和端口信息。根据报错信息,确定是IP地址设置错误。 + +## 处理方法 + +在运行脚本中设置正确的IP地址,对于单机情况,设置为本机的IP即可;对于多机情况,每个服务器上脚本中的IP需要设置为master节点的IP。 + +

在进行模型分布式训练时遇到报错“RuntimeError:connect\(\) timed out.”

## 现象描述

![](figures/1234.png)

## 可能原因

模型进行分布式训练时,系统防火墙可能会阻截HCCL集合通信端口的通信。需要根据报错信息,排查通信端口的开放情况,并进行相应设置。

## 处理方法

查询出被系统防火墙阻截的集合通信端口,并开放相应端口。
"b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ11.png" differ diff --git "a/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ12.png" "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ12.png" new file mode 100644 index 0000000000000000000000000000000000000000..dfe1d90a7e99b19c64039c771b8f0f3ee095489d Binary files /dev/null and "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ12.png" differ diff --git "a/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ13.png" "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ13.png" new file mode 100644 index 0000000000000000000000000000000000000000..773f83071183fb63c410d94b9f658ba901049a3f Binary files /dev/null and "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ13.png" differ diff --git "a/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ14.png" "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ14.png" new file mode 100644 index 0000000000000000000000000000000000000000..c1201b3f572aa01e2c91ed7959d4466d768d5723 Binary files /dev/null and "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ14.png" differ diff --git "a/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ15.png" "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ15.png" new file mode 100644 index 0000000000000000000000000000000000000000..b2ea57e76acbe91b86c910ddb44a27890fe94ff0 Binary files /dev/null and "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ15.png" differ diff --git "a/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ18-1.png" "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ18-1.png" new file mode 100644 index 
0000000000000000000000000000000000000000..9634db02b1c76d601aba391180b3d84ab502c901 Binary files /dev/null and "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ18-1.png" differ diff --git "a/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ18.png" "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ18.png" new file mode 100644 index 0000000000000000000000000000000000000000..253af857e9ff65a4f67ca154d323ca27769fcaef Binary files /dev/null and "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ18.png" differ diff --git "a/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ19.png" "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ19.png" new file mode 100644 index 0000000000000000000000000000000000000000..ffc2a914b6ca1f99b27e52d7fd6d33de7475e566 Binary files /dev/null and "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ19.png" differ diff --git "a/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ2.png" "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ2.png" new file mode 100644 index 0000000000000000000000000000000000000000..ab0a9f7e0aae085338f2324aeb9464a3c25d5090 Binary files /dev/null and "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ2.png" differ diff --git "a/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ3.png" "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ3.png" new file mode 100644 index 0000000000000000000000000000000000000000..970b050c2c46f29e9d09ff401e243ea6fb06804b Binary files /dev/null and "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ3.png" differ diff --git "a/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ4.png" "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ4.png" new 
file mode 100644 index 0000000000000000000000000000000000000000..1a813e3ddbcabd36646defdcc63b9a0fdbb7e1a9 Binary files /dev/null and "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ4.png" differ diff --git "a/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ5.png" "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ5.png" new file mode 100644 index 0000000000000000000000000000000000000000..5c2019b795b4a165e7b150395739922ccacb8253 Binary files /dev/null and "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ5.png" differ diff --git "a/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ6-1.png" "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ6-1.png" new file mode 100644 index 0000000000000000000000000000000000000000..134ed666fb21e075885226fa4039d84ff4e6642c Binary files /dev/null and "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ6-1.png" differ diff --git "a/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ6.png" "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ6.png" new file mode 100644 index 0000000000000000000000000000000000000000..5f0ab093a0c1c35b1c948e4ac2555a890bf73a05 Binary files /dev/null and "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ6.png" differ diff --git "a/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ7.png" "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ7.png" new file mode 100644 index 0000000000000000000000000000000000000000..d7871f3d6ba9b3c2e37c79f886a7e6cb93147c5a Binary files /dev/null and "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ7.png" differ diff --git "a/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ8-1.png" "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 
01/figures/FAQ8-1.png" new file mode 100644 index 0000000000000000000000000000000000000000..0316905729ff0cd82806961565b947cb7655acb1 Binary files /dev/null and "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ8-1.png" differ diff --git "a/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ8.png" "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ8.png" new file mode 100644 index 0000000000000000000000000000000000000000..c1950311e49af3ea74b28c02afe3e77938788396 Binary files /dev/null and "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ8.png" differ diff --git "a/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ9-1.png" "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ9-1.png" new file mode 100644 index 0000000000000000000000000000000000000000..ec178d5e25a2e60ef4a1a25b80bd24271e25bb02 Binary files /dev/null and "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ9-1.png" differ diff --git "a/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ9.png" "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ9.png" new file mode 100644 index 0000000000000000000000000000000000000000..70e4bc5824c836a894f3e3e3c6c87c276efd15ec Binary files /dev/null and "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/FAQ9.png" differ diff --git "a/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/Performance-Config.png" "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/Performance-Config.png" new file mode 100644 index 0000000000000000000000000000000000000000..d75a5dbc4e684169cfd311d0b4c94d5283c1e762 Binary files /dev/null and "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/Performance-Config.png" differ diff --git "a/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/Pth\346\226\207\344\273\266.jpg" "b/docs/zh/FrameworkPTAdapter 2.0.2 
PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/Pth\346\226\207\344\273\266.jpg" new file mode 100644 index 0000000000000000000000000000000000000000..7a64f87cec889e72b09d352522ae96b06c7694b6 Binary files /dev/null and "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/Pth\346\226\207\344\273\266.jpg" differ diff --git "a/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/Socket-Configuration.png" "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/Socket-Configuration.png" new file mode 100644 index 0000000000000000000000000000000000000000..025284eeae4bcc43c74979d1e142a4bb77d63096 Binary files /dev/null and "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/Socket-Configuration.png" differ diff --git "a/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/pytorch\351\200\202\351\205\215\351\200\273\350\276\221\347\273\223\346\236\204\345\233\276-\344\274\230\345\214\226.png" "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/pytorch\351\200\202\351\205\215\351\200\273\350\276\221\347\273\223\346\236\204\345\233\276-\344\274\230\345\214\226.png" new file mode 100644 index 0000000000000000000000000000000000000000..207410d4779ad9a94bbb8c92c0a60bc384af83bf Binary files /dev/null and "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/pytorch\351\200\202\351\205\215\351\200\273\350\276\221\347\273\223\346\236\204\345\233\276-\344\274\230\345\214\226.png" differ diff --git "a/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/zh-cn_image_0000001106016350.png" "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/zh-cn_image_0000001106016350.png" new file mode 100644 index 0000000000000000000000000000000000000000..e95a0361e813dc685d49f524991b80acc490f988 Binary files /dev/null and "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/zh-cn_image_0000001106016350.png" differ diff --git "a/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/zh-cn_image_0000001106176216.png" "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/zh-cn_image_0000001106176216.png" 
new file mode 100644 index 0000000000000000000000000000000000000000..ede83f4bc1b0ed21a9c746c358c45681d5ffb49a Binary files /dev/null and "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/zh-cn_image_0000001106176216.png" differ diff --git "a/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/zh-cn_image_0000001106176222.png" "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/zh-cn_image_0000001106176222.png" new file mode 100644 index 0000000000000000000000000000000000000000..24ad984e3ca1b8850526626259c25de8b93c4388 Binary files /dev/null and "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/zh-cn_image_0000001106176222.png" differ diff --git "a/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/zh-cn_image_0000001152616281.png" "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/zh-cn_image_0000001152616281.png" new file mode 100644 index 0000000000000000000000000000000000000000..eff3d25890d212eb91ed0cfb9f2157fa490d9983 Binary files /dev/null and "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/zh-cn_image_0000001152616281.png" differ diff --git "a/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/zh-cn_image_0000001152616289.png" "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/zh-cn_image_0000001152616289.png" new file mode 100644 index 0000000000000000000000000000000000000000..e95a0361e813dc685d49f524991b80acc490f988 Binary files /dev/null and "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/zh-cn_image_0000001152616289.png" differ diff --git "a/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/zh-cn_image_0000001152736233.png" "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/zh-cn_image_0000001152736233.png" new file mode 100644 index 0000000000000000000000000000000000000000..eff3d25890d212eb91ed0cfb9f2157fa490d9983 Binary files /dev/null and "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/zh-cn_image_0000001152736233.png" differ diff --git "a/docs/zh/FrameworkPTAdapter 2.0.2 
PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/\345\220\257\345\212\250\351\241\271\345\267\245\345\205\267-1.png" "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/\345\220\257\345\212\250\351\241\271\345\267\245\345\205\267-1.png" new file mode 100644 index 0000000000000000000000000000000000000000..c4f81fc908960959f07aeaaf6e4aa47f1c9aac83 Binary files /dev/null and "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/\345\220\257\345\212\250\351\241\271\345\267\245\345\205\267-1.png" differ diff --git "a/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/\345\220\257\345\212\250\351\241\271\345\267\245\345\205\267.png" "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/\345\220\257\345\212\250\351\241\271\345\267\245\345\205\267.png" new file mode 100644 index 0000000000000000000000000000000000000000..c773cbd1c37dee1d8a5f755bd8229238dbb107cf Binary files /dev/null and "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/\345\220\257\345\212\250\351\241\271\345\267\245\345\205\267.png" differ diff --git "a/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/\346\214\207\345\256\232\347\256\227\345\255\220\345\210\235\345\247\213\345\214\226\346\226\271\345\274\217.png" "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/\346\214\207\345\256\232\347\256\227\345\255\220\345\210\235\345\247\213\345\214\226\346\226\271\345\274\217.png" new file mode 100644 index 0000000000000000000000000000000000000000..35584844f7a44aa3c0076d7e1bdf7259f3479bcc Binary files /dev/null and "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/\346\214\207\345\256\232\347\256\227\345\255\220\345\210\235\345\247\213\345\214\226\346\226\271\345\274\217.png" differ diff --git "a/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/\347\216\257\345\242\203\345\207\206\345\244\207\346\265\201\347\250\213\345\233\276.png" "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/\347\216\257\345\242\203\345\207\206\345\244\207\346\265\201\347\250\213\345\233\276.png" new file mode 100644 index 0000000000000000000000000000000000000000..cdda4fab2365a81d54807e9118cc617a25b8f4f2 Binary files /dev/null and "b/docs/zh/FrameworkPTAdapter 2.0.2 
PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/\347\216\257\345\242\203\345\207\206\345\244\207\346\265\201\347\250\213\345\233\276.png" differ diff --git "a/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/\350\256\276\347\275\256\347\224\265\346\272\220\347\255\226\347\225\245-2.png" "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/\350\256\276\347\275\256\347\224\265\346\272\220\347\255\226\347\225\245-2.png" new file mode 100644 index 0000000000000000000000000000000000000000..93e4f5aa984a7f2e6c2e5298a8d59e4d3d0e9aab Binary files /dev/null and "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/\350\256\276\347\275\256\347\224\265\346\272\220\347\255\226\347\225\245-2.png" differ diff --git "a/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/\350\256\276\347\275\256\347\224\265\346\272\220\347\255\226\347\225\245.png" "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/\350\256\276\347\275\256\347\224\265\346\272\220\347\255\226\347\225\245.png" new file mode 100644 index 0000000000000000000000000000000000000000..3c31df7fe027517b9eec9ec717d06e3d75b2f3c7 Binary files /dev/null and "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/\350\256\276\347\275\256\347\224\265\346\272\220\347\255\226\347\225\245.png" differ diff --git "a/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/\350\277\201\347\247\273\346\265\201\347\250\213.png" "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/\350\277\201\347\247\273\346\265\201\347\250\213.png" new file mode 100644 index 0000000000000000000000000000000000000000..4f5a0edf328f897103e2d79f0281125258ac218a Binary files /dev/null and "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/\350\277\201\347\247\273\346\265\201\347\250\213.png" differ diff --git "a/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/\350\277\234\347\250\213\347\231\273\345\275\225\346\216\247\345\210\266\345\217\260-0.png" "b/docs/zh/FrameworkPTAdapter 2.0.2 PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227 01/figures/\350\277\234\347\250\213\347\231\273\345\275\225\346\216\247\345\210\266\345\217\260-0.png" new file mode 100644 index 
diff --git "a/docs/zh/PyTorch API支持清单_1.5.0.md" "b/docs/zh/PyTorch API支持清单_1.5.0.md"
index ec95cc153c0f391ca7b3e085990fd6fb48796651..31c03490c4743e1cc9d7aefd4f6734f5884b5bba 100644
--- "a/docs/zh/PyTorch API支持清单_1.5.0.md"
+++ "b/docs/zh/PyTorch API支持清单_1.5.0.md"
@@ -787,8 +787,20 @@
| 120 | torch.nn.BatchNorm2d | Yes |
| 121 | torch.nn.BatchNorm3d | Yes |
| 122 | torch.nn.GroupNorm | Yes |
-| 123 | torch.nn.SyncBatchNorm | No |
+| 123 | torch.nn.SyncBatchNorm | Yes |
-| 124 | torch.nn.SyncBatchNorm.convert_sync_batchnorm | No |
+| 124 | torch.nn.SyncBatchNorm.convert_sync_batchnorm | Yes |
| 125 | torch.nn.InstanceNorm1d | Yes |
| 126 | torch.nn.InstanceNorm2d | Yes |
| 127 | torch.nn.InstanceNorm3d | Yes |
@@ -1289,9 +1301,7 @@
The torch.npu.set_device() interface only supports specifying the device via set_device at the beginning of the program.

Detailed operator API description:

-> ```
> npu_apply_adam(beta1_power, beta2_power, lr, beta1, beta2, epsilon, grad, use_locking, use_nesterov, out = (var, m, v))
-> ```

Computes the Adam optimization result.
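Since this hunk shows only the signature, here is a minimal hedged sketch of a single optimizer step. The tensor shapes and hyper-parameter values are invented, and it assumes torch.npu_apply_adam is exposed exactly as in the signature above, writing its results into the tensors passed via out.

```python
import torch

# A single hypothetical Adam step; every value below is invented, and the
# out=(var, m, v) tuple is expected to receive the updated variable and
# moment estimates.
var = torch.rand(2, 3).npu()    # variables to be optimized
m = torch.zeros_like(var)       # first-moment estimate
v = torch.zeros_like(var)       # second-moment estimate
grad = torch.rand(2, 3).npu()   # gradient of the current step

step = 1
beta1, beta2 = 0.9, 0.999
torch.npu_apply_adam(beta1 ** step, beta2 ** step, 1e-3, beta1, beta2,
                     1e-8, grad, False, False, out=(var, m, v))
```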
@@ -1317,58 +1327,1969 @@
-> npu_bert_apply_adam(lr, beta1, beta2, epsilon, grad, max_grad_norm, global_grad_norm, weight_decay, out = (var, m, v))
-
-count adam result in bert.
-
-- Parameters:
-  - **lr** (Number) - learning rate.
-  - **beta1** (Number) - exponential decay rate for the 1st moment estimates.
-  - **beta2** (Number) - exponential decay rate for the 2nd moment estimates.
-  - **epsilon** (Number) - term added to the denominator to improve numerical stability.
-  - **grad** (Tensor) - the gradient.
-  - **max_grad_norm** (Number) - maximum norm for the gradients.
-  - **global_grad_norm** (Number) - L2 norm for the gradients.
-  - **weight_decay** (Number) - weight decay.
-  - **var** (Tensor) - variables to be optimized.
-  - **m** (Tensor) - mean value of variables.
-  - **v** (Tensor) - variance of variables.

> npu_convolution_transpose(input, weight, bias, padding, output_padding, stride, dilation, groups) -> Tensor

Applies a 2D or 3D transposed convolution operator over an input image composed of several input planes, sometimes also called "deconvolution".

- Parameters:
  - **input** (Tensor) - input tensor of shape (minibatch, in_channels, iH, iW) or (minibatch, in_channels, iT, iH, iW)
  - **weight** (Tensor) - filters of shape (in_channels, out_channels/groups, kH, kW) or (in_channels, out_channels/groups, kT, kH, kW)
  - **bias** (Tensor, optional) - optional bias of shape (out_channels)
  - **padding** (ListInt) - (dilation * (kernel_size - 1) - padding) zero-padding will be added to both sides of each dimension in the input
  - **output_padding** (ListInt) - additional size added to one side of each dimension in the output shape
  - **stride** (ListInt) - the stride of the convolving kernel
  - **dilation** (ListInt) - the spacing between kernel elements
  - **groups** (Number) - split input into groups; in_channels should be divisible by the number of groups

- constraints:

  None

- Examples:

  None

> npu_conv_transpose2d(input, weight, bias, padding, output_padding, stride, dilation, groups) -> Tensor

Applies a 2D transposed convolution operator over an input image composed of several input planes, sometimes also called "deconvolution".

- Parameters:
  - **input** (Tensor) - input tensor of shape (minibatch, in_channels, iH, iW)
  - **weight** (Tensor) - filters of shape (in_channels, out_channels/groups, kH, kW)
  - **bias** (Tensor, optional) - optional bias of shape (out_channels)
  - **padding** (ListInt) - (dilation * (kernel_size - 1) - padding) zero-padding will be added to both sides of each dimension in the input
  - **output_padding** (ListInt) - additional size added to one side of each dimension in the output shape
  - **stride** (ListInt) - the stride of the convolving kernel
  - **dilation** (ListInt) - the spacing between kernel elements
  - **groups** (Number) - split input into groups; in_channels should be divisible by the number of groups

- constraints:

  None

- Examples:

  None

> npu_convolution(input, weight, bias, stride, padding, dilation, groups) -> Tensor

Applies a 2D or 3D convolution over an input image composed of several input planes.

- Parameters:
  - **input** (Tensor) - input tensor of shape (minibatch, in_channels, iH, iW) or (minibatch, in_channels, iT, iH, iW)
  - **weight** (Tensor) - filters of shape (out_channels, in_channels/groups, kH, kW) or (out_channels, in_channels/groups, kT, kH, kW)
  - **bias** (Tensor, optional) - optional bias of shape (out_channels)
  - **stride** (ListInt) - the stride of the convolving kernel
  - **padding** (ListInt) - implicit paddings on both sides of the input
  - **dilation** (ListInt) - the spacing between kernel elements
  - **groups** (Number) - split input into groups; in_channels should be divisible by the number of groups

- constraints:

  None

- Examples:
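  None of the convolution entries above ships with an example, so here is one minimal hedged sketch covering the 2D case. The shapes are invented, and it assumes torch.npu_convolution and torch.npu_convolution_transpose are callable on an NPU build with exactly the parameter order documented above.

  ```python
  import torch

  # Forward convolution: (1, 4, 8, 8) -> (1, 8, 8, 8) with 3x3 kernels,
  # stride 1, padding 1 (shapes invented for illustration).
  x = torch.rand(1, 4, 8, 8).npu()   # (minibatch, in_channels, iH, iW)
  w = torch.rand(8, 4, 3, 3).npu()   # (out_channels, in_channels/groups, kH, kW)
  y = torch.npu_convolution(x, w, None, [1, 1], [1, 1], [1, 1], 1)
  print(y.shape)   # expected: torch.Size([1, 8, 8, 8])

  # The transposed convolution maps y back to the input's spatial size;
  # note the mirrored weight layout (in_channels, out_channels/groups, kH, kW).
  wt = torch.rand(8, 4, 3, 3).npu()
  x_rec = torch.npu_convolution_transpose(y, wt, None, [1, 1], [0, 0],
                                          [1, 1], [1, 1], 1)
  print(x_rec.shape)   # expected: torch.Size([1, 4, 8, 8])
  ```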
> npu_conv2d(input, weight, bias, stride, padding, dilation, groups) -> Tensor

Applies a 2D convolution over an input image composed of several input planes.

- Parameters:
  - **input** (Tensor) - input tensor of shape (minibatch, in_channels, iH, iW)
  - **weight** (Tensor) - filters of shape (out_channels, in_channels/groups, kH, kW)
  - **bias** (Tensor, optional) - optional bias of shape (out_channels)
  - **stride** (ListInt) - the stride of the convolving kernel
  - **padding** (ListInt) - implicit paddings on both sides of the input
  - **dilation** (ListInt) - the spacing between kernel elements
  - **groups** (Number) - split input into groups; in_channels should be divisible by the number of groups

- constraints:

  None

- Examples:

  None

> npu_conv3d(input, weight, bias, stride, padding, dilation, groups) -> Tensor

Applies a 3D convolution over an input image composed of several input planes.

- Parameters:
  - **input** (Tensor) - input tensor of shape (minibatch, in_channels, iT, iH, iW)
  - **weight** (Tensor) - filters of shape (out_channels, in_channels/groups, kT, kH, kW)
  - **bias** (Tensor, optional) - optional bias of shape (out_channels)
  - **stride** (ListInt) - the stride of the convolving kernel
  - **padding** (ListInt) - implicit paddings on both sides of the input
  - **dilation** (ListInt) - the spacing between kernel elements
  - **groups** (Number) - split input into groups; in_channels should be divisible by the number of groups

- constraints:

  None

- Examples:
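  As above, no example is listed for the 3D case. The following hedged sketch uses invented shapes and assumes torch.npu_conv3d accepts float16 NPU tensors with the documented parameter order.

  ```python
  import torch

  # 3D convolution: (1, 2, 4, 8, 8) -> (1, 4, 4, 8, 8) with a 1x3x3 kernel,
  # stride 1 and padding (0, 1, 1); shapes invented for illustration.
  x = torch.rand(1, 2, 4, 8, 8).half().npu()  # (minibatch, in_channels, iT, iH, iW)
  w = torch.rand(4, 2, 1, 3, 3).half().npu()  # (out_channels, in_channels/groups, kT, kH, kW)
  y = torch.npu_conv3d(x, w, None, [1, 1, 1], [0, 1, 1], [1, 1, 1], 1)
  print(y.shape)  # expected: torch.Size([1, 4, 4, 8, 8])
  ```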
> one_(self) -> Tensor

Fills the self tensor with ones.

- Parameters:
  - **self** (Tensor) - the input tensor

- constraints:

  None

- Examples:

  ```python
  >>> x = torch.rand(2, 3).npu()
  >>> x
  tensor([[0.6072, 0.9726, 0.3475],
          [0.3717, 0.6135, 0.6788]], device='npu:0')
  >>> x.one_()
  tensor([[1., 1., 1.],
          [1., 1., 1.]], device='npu:0')
  ```

> npu_sort_v2(self, dim=-1, descending=False, out=None) -> Tensor

Sorts the elements of the input tensor along a given dimension in ascending order by value, without returning indices. If dim is not given, the last dimension of the input is chosen. If descending is True, the elements are sorted in descending order by value.

- Parameters:
  - **self** (Tensor) - the input tensor
  - **dim** (int, optional) - the dimension to sort along
  - **descending** (bool, optional) - controls the sorting order (ascending or descending)
  - **out** (Tensor, optional) - an output tensor that can optionally be given to be used as an output buffer

- constraints:

  At present, only the last dim (-1) is supported.

- Examples:

  ```python
  >>> x = torch.randn(3, 4).npu()
  >>> x
  tensor([[-0.0067, 1.7790, 0.5031, -1.7217],
          [ 1.1685, -1.0486, -0.2938, 1.3241],
          [ 0.1880, -2.7447, 1.3976, 0.7380]], device='npu:0')
  >>> sorted_x = torch.npu_sort_v2(x)
  >>> sorted_x
  tensor([[-1.7217, -0.0067, 0.5029, 1.7793],
          [-1.0488, -0.2937, 1.1689, 1.3242],
          [-2.7441, 0.1880, 0.7378, 1.3975]], device='npu:0')
  ```

> npu_format_cast(self, acl_format) -> Tensor

Changes the format of an NPU tensor.

- Parameters:
  - **self** (Tensor) - the input tensor
  - **acl_format** (int) - the target format to transform to

- constraints:

  None

- Examples:

  ```python
  >>> x = torch.rand(2, 3, 4, 5).npu()
  >>> x.storage().npu_format()
  0
  >>> x1 = x.npu_format_cast(29)
  >>> x1.storage().npu_format()
  29
  ```

> npu_format_cast_

> npu_format_cast_.acl_format(self, acl_format) -> Tensor

  In-place version of npu_format_cast().

> npu_format_cast_.src(self, src) -> Tensor

  In-place version that changes the format of self to the format of src.

  - Parameters:
    - **self** (Tensor) - the input tensor
    - **src** (Tensor) - the tensor providing the target format

  - constraints:

    None

  - Examples:

    ```python
    >>> x = torch.rand(2, 3, 4, 5).npu()
    >>> x.storage().npu_format()
    0
    >>> x.npu_format_cast_(29).storage().npu_format()
    29
    ```

> npu_transpose(self, perm) -> Tensor

Returns a view of the original tensor with its dimensions permuted, and makes the result contiguous.

- Parameters:
  - **self** (Tensor) - the input tensor
  - **perm** (ListInt) - the desired ordering of dimensions

- constraints:

  None

- Examples:

  ```python
  >>> x = torch.randn(2, 3, 5).npu()
  >>> x.shape
  torch.Size([2, 3, 5])
  >>> x1 = torch.npu_transpose(x, (2, 0, 1))
  >>> x1.shape
  torch.Size([5, 2, 3])
  >>> x2 = x.npu_transpose(2, 0, 1)
  >>> x2.shape
  torch.Size([5, 2, 3])
  ```

> npu_broadcast(self, perm) -> Tensor

Returns a new view of the self tensor with singleton dimensions expanded to a larger size, and makes the result contiguous. The tensor can also be expanded to a larger number of dimensions, and the new ones will be appended at the front.

- Parameters:
  - **self** (Tensor) - the input tensor
  - **perm** (ListInt) - the desired expanded size

- constraints:

  None

- Examples:

  ```python
  >>> x = torch.tensor([[1], [2], [3]]).npu()
  >>> x.shape
  torch.Size([3, 1])
  >>> x.npu_broadcast(3, 4)
  tensor([[1, 1, 1, 1],
          [2, 2, 2, 2],
          [3, 3, 3, 3]], device='npu:0')
  ```

> npu_dtype_cast(input, dtype) -> Tensor

Performs tensor dtype conversion.

- Parameters:
  - **input** (Tensor) - the input tensor.
  - **dtype** (torch.dtype) - the desired data type of the returned tensor.

- constraints:

  None

- Examples:

  ```python
  >>> torch.npu_dtype_cast(torch.tensor([0, 0.5, -1.]).npu(), dtype=torch.int)
  tensor([ 0, 0, -1], device='npu:0', dtype=torch.int32)
  ```

> empty_with_format(size, dtype, layout, device, pin_memory, acl_format) -> Tensor

Returns a tensor filled with uninitialized data. The shape of the tensor is defined by the variable argument size; the format of the tensor is defined by the variable argument acl_format.

- Parameters:
  - **size** (int...) - a sequence of integers defining the shape of the output tensor. Can be a variable number of arguments or a collection like a list or tuple.
  - **dtype** (torch.dtype, optional) - the desired data type of the returned tensor. Default: if None, uses a global default (see torch.set_default_tensor_type()).
  - **layout** (torch.layout, optional) - the desired layout of the returned tensor. Default: None.
  - **device** (torch.device, optional) - the desired device of the returned tensor. Default: None.
  - **pin_memory** (bool, optional) - if set, the returned tensor is allocated in pinned memory. Default: None.
  - **acl_format** (Number) - the desired memory format of the returned tensor. Default: 2.

- constraints:

  None

- Examples:

  ```python
  >>> torch.empty_with_format((2, 3), dtype=torch.float32, device="npu")
  tensor([[1., 1., 1.],
          [1., 1., 1.]], device='npu:0')
  ```
> copy_memory_(dst, src, non_blocking=False) -> Tensor

Copies the elements from src into the self tensor and returns self.

- Parameters:
  - **dst** (Tensor) - the destination tensor to copy into.
  - **src** (Tensor) - the source tensor to copy from.
  - **non_blocking** (bool) - if True and this copy is between CPU and NPU, the copy may occur asynchronously with respect to the host. For other cases, this argument has no effect.

- constraints:

  copy_memory_ only supports NPU tensors. The input tensors of copy_memory_ must have the same dtype and the same device index.

- Examples:

  ```python
  >>> a=torch.IntTensor([0, 0, -1]).npu()
  >>> b=torch.IntTensor([1, 1, 1]).npu()
  >>> a.copy_memory_(b)
  tensor([1, 1, 1], device='npu:0', dtype=torch.int32)
  ```

> npu_one_hot(input, num_classes=-1, depth=1, on_value=1, off_value=0) -> Tensor

Returns a one-hot tensor. The locations represented by the indices in "x" take the value "on_value", while all other locations take the value "off_value".

- Parameters:
  - **input** (Tensor) - class values of any shape.
  - **num_classes** (Number) - the axis to fill. Defaults to "-1".
  - **depth** (Number) - the depth of the one-hot dimension.
  - **on_value** (Number) - the value to fill in the output when indices[j] == i.
  - **off_value** (Number) - the value to fill in the output when indices[j] != i.

- constraints:

  None

- Examples:

  ```python
  >>> a=torch.IntTensor([5, 3, 2, 1]).npu()
  >>> b=torch.npu_one_hot(a, depth=5)
  >>> b
  tensor([[0., 0., 0., 0., 0.],
          [0., 0., 0., 1., 0.],
          [0., 0., 1., 0., 0.],
          [0., 1., 0., 0., 0.]], device='npu:0')
  ```

> npu_stride_add(x1, x2, offset1, offset2, c1_len) -> Tensor

Adds the partial values of two tensors in format NC1HWC0.

- Parameters:
  - **x1** (Tensor) - a tensor in 5HD format.
  - **x2** (Tensor) - a tensor of the same type as "x1", with the same shape as "x1" except for the C1 value.
  - **offset1** (Number) - a required int. Offset value of C1 in "x1".
  - **offset2** (Number) - a required int. Offset value of C1 in "x2".
  - **c1_len** (Number) - a required int. C1 length of "y". The value must be less than the difference between C1 and the offset in "x1" and "x2".

- constraints:

  None

- Examples:

  ```python
  >>> a=torch.tensor([[[[[1.]]]]]).npu()
  >>> b=torch.npu_stride_add(a, a, 0, 0, 1)
  >>> b
  tensor([[[[[2.]]],
           [[[0.]]],
           [[[0.]]],
           [[[0.]]],
           [[[0.]]],
           [[[0.]]],
           [[[0.]]],
           [[[0.]]],
           [[[0.]]],
           [[[0.]]],
           [[[0.]]],
           [[[0.]]],
           [[[0.]]],
           [[[0.]]],
           [[[0.]]],
           [[[0.]]]]], device='npu:0')
  ```

> npu_softmax_cross_entropy_with_logits(features, labels) -> Tensor

Computes the softmax cross-entropy cost.

- Parameters:
  - **features** (Tensor) - a tensor; a "batch_size * num_classes" matrix.
  - **labels** (Tensor) - a tensor of the same type as "features"; a "batch_size * num_classes" matrix.

- constraints:

  None

- Examples:
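  A hedged illustration: the shape of the returned loss is an assumption based on the TensorFlow operator of the same name, one value per batch row.

  ```python
  import torch

  # Logits and one-hot labels for a batch of 4 samples over 10 classes;
  # shapes invented for illustration.
  features = torch.rand(4, 10).npu()
  labels = torch.nn.functional.one_hot(torch.arange(4), num_classes=10).float().npu()

  loss = torch.npu_softmax_cross_entropy_with_logits(features, labels)
  print(loss.shape)  # expected: torch.Size([4]), one loss value per sample
  ```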
> npu_ps_roi_pooling(x, rois, spatial_scale, group_size, output_dim) -> Tensor

Performs position-sensitive (PS) ROI pooling.

- Parameters:
  - **x** (Tensor) - an NC1HWC0 tensor describing the feature map; dimension C1 must equal (int(output_dim+15)/C0)) * group_size * group_size.
  - **rois** (Tensor) - a tensor with shape [batch, 5, rois_num] describing the ROIs. Each ROI consists of five elements, "batch_id", "x1", "y1", "x2", and "y2", where "batch_id" indicates the index of the input feature map and "x1", "y1", "x2", and "y2" must be greater than or equal to "0.0".
  - **spatial_scale** (Number) - a required float32, the scaling factor for mapping the input coordinates to the ROI coordinates.
  - **group_size** (Number) - a required int32, specifying the number of groups used to encode position-sensitive score maps; must be within the range (0, 128).
  - **output_dim** (Number) - a required int32, specifying the number of output channels; must be greater than 0.

- constraints:

  None

- Examples:

  ```python
  >>> roi = torch.tensor([[[1], [2], [3], [4], [5]],
                          [[6], [7], [8], [9], [10]]], dtype = torch.float16).npu()
  >>> x = torch.tensor([[[[ 1]], [[ 2]], [[ 3]], [[ 4]],
                         [[ 5]], [[ 6]], [[ 7]], [[ 8]]],
                        [[[ 9]], [[10]], [[11]], [[12]],
                         [[13]], [[14]], [[15]], [[16]]]], dtype = torch.float16).npu()
  >>> out = torch.npu_ps_roi_pooling(x, roi, 0.5, 2, 2)
  >>> out
  tensor([[[[0., 0.],
            [0., 0.]],
           [[0., 0.],
            [0., 0.]]],
          [[[0., 0.],
            [0., 0.]],
           [[0., 0.],
            [0., 0.]]]], device='npu:0', dtype=torch.float16)
  ```

> npu_roi_align(features, rois, spatial_scale, pooled_height, pooled_width, sample_num, roi_end_mode) -> Tensor

Obtains the ROI feature matrix from the feature map. It is a customized FasterRcnn operator.

- Parameters:
  - **features** (Tensor) - a tensor in 5HD format.
  - **rois** (Tensor) - the ROI positions: a 2D tensor with shape (N, 5). "N" indicates the number of ROIs, and the value "5" corresponds to the index of the image where the ROI is located plus "x0", "y0", "x1", and "y1".
  - **spatial_scale** (Number) - a required attribute of type float32, specifying the scaling ratio of "features" to the original image.
  - **pooled_height** (Number) - a required attribute of type int32, specifying the H dimension.
  - **pooled_width** (Number) - a required attribute of type int32, specifying the W dimension.
  - **sample_num** (Number) - an optional attribute of type int32, specifying the horizontal and vertical sampling frequency of each output. If this attribute is set to "0", the sampling frequency is equal to the rounded-up value of "rois", which is a floating-point number. Defaults to "2".
  - **roi_end_mode** (Number) - an optional attribute of type int32. Defaults to "1".

- constraints:

  None

- Examples:

  ```python
  >>> x = torch.FloatTensor([[[[1, 2, 3 , 4, 5, 6],
                               [7, 8, 9, 10, 11, 12],
                               [13, 14, 15, 16, 17, 18],
                               [19, 20, 21, 22, 23, 24],
                               [25, 26, 27, 28, 29, 30],
                               [31, 32, 33, 34, 35, 36]]]]).npu()
  >>> rois = torch.tensor([[0, -2.0, -2.0, 22.0, 22.0]]).npu()
  >>> out = torch.npu_roi_align(x, rois, 0.25, 3, 3, 2, 0)
  >>> out
  tensor([[[[ 4.5000, 6.5000, 8.5000],
            [16.5000, 18.5000, 20.5000],
            [28.5000, 30.5000, 32.5000]]]], device='npu:0')
  ```

> npu_nms_v4(boxes, scores, max_output_size, iou_threshold, scores_threshold, pad_to_max_output_size=False) -> (Tensor, Tensor)

Greedily selects a subset of bounding boxes in descending order of score.

- Parameters:
  - **boxes** (Tensor) - a 2-D float tensor of shape [num_boxes, 4].
  - **scores** (Tensor) - a 1-D float tensor of shape [num_boxes] representing a single score corresponding to each box (each row of boxes).
  - **max_output_size** (Number) - a scalar representing the maximum number of boxes to be selected by non-max suppression.
  - **iou_threshold** (Tensor) - a 0-D float tensor representing the threshold for deciding whether boxes overlap too much with respect to IoU.
  - **scores_threshold** (Tensor) - a 0-D float tensor representing the threshold for deciding when to remove boxes based on score.
  - **pad_to_max_output_size** (bool) - if true, the output selected_indices is padded to length max_output_size. Defaults to false.

- Returns:
  - **selected_indices** - a 1-D integer tensor of shape [M] representing the selected indices from the boxes tensor, where M <= max_output_size.
  - **valid_outputs** - a 0-D integer tensor representing the number of valid elements in selected_indices, with the valid elements appearing first.

- constraints:

  None

- Examples:

  ```python
  >>> boxes=torch.randn(100,4).npu()
  >>> scores=torch.randn(100).npu()
  >>> boxes.uniform_(0,100)
  >>> scores.uniform_(0,1)
  >>> max_output_size = 20
  >>> iou_threshold = torch.tensor(0.5).npu()
  >>> scores_threshold = torch.tensor(0.3).npu()
  >>> npu_output = torch.npu_nms_v4(boxes, scores, max_output_size, iou_threshold, scores_threshold)
  >>> npu_output
  (tensor([57, 65, 25, 45, 43, 12, 52, 91, 23, 78, 53, 11, 24, 62, 22, 67,  9, 94,
          54, 92], device='npu:0', dtype=torch.int32), tensor(20, device='npu:0', dtype=torch.int32))
  ```

> npu_nms_rotated(dets, scores, iou_threshold, scores_threshold=0, max_output_size=-1, mode=0) -> (Tensor, Tensor)

Greedily selects a subset of rotated bounding boxes in descending order of score.

- Parameters:
  - **dets** (Tensor) - a 2-D float tensor of shape [num_boxes, 5].
  - **scores** (Tensor) - a 1-D float tensor of shape [num_boxes] representing a single score corresponding to each box (each row of boxes).
  - **iou_threshold** (Number) - a scalar representing the threshold for deciding whether boxes overlap too much with respect to IoU.
  - **scores_threshold** (Number) - a scalar representing the threshold for deciding when to remove boxes based on score. Defaults to "0".
  - **max_output_size** (Number) - a scalar integer representing the maximum number of boxes to be selected by non-max suppression. Defaults to "-1", that is, no constraint is imposed.
  - **mode** (Number) - specifies the layout type of dets. If mode is set to 0, the input values of dets are x, y, w, h, and angle; if mode is set to 1, the input values of dets are x1, y1, x2, y2, and angle. Defaults to "0".

- Returns:
  - **selected_index** - a 1-D integer tensor of shape [M] representing the selected indices from the dets tensor, where M <= max_output_size.
  - **selected_num** - a 0-D integer tensor representing the number of valid elements in selected_indices.

- constraints:

  None

- Examples:

  ```python
  >>> dets=torch.randn(100,5).npu()
  >>> scores=torch.randn(100).npu()
  >>> dets.uniform_(0,100)
  >>> scores.uniform_(0,1)
  >>> output1, output2 = torch.npu_nms_rotated(dets, scores, 0.2, 0, -1, 1)
  >>> output1
  tensor([76, 48, 15, 65, 91, 82, 21, 96, 62, 90, 13, 59,  0, 18, 47, 23,  8, 56,
          55, 63, 72, 39, 97, 81, 16, 38, 17, 25, 74, 33, 79, 44, 36, 88, 83, 37,
          64, 45, 54, 41, 22, 28, 98, 40, 30, 20,  1, 86, 69, 57, 43,  9, 42, 27,
          71, 46, 19, 26, 78, 66,  3, 52], device='npu:0', dtype=torch.int32)
  >>> output2
  tensor([62], device='npu:0', dtype=torch.int32)
  ```

> npu_lstm(x, weight, bias, seq_len, h, c, has_biases, num_layers, dropout, train, bidirectional, batch_first, flag_seq, direction)

DynamicRNN calculation.

- Parameters:
  - **x** (Tensor) - a required 4D tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
  - **weight** (Tensor) - a required 4D tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_ZN_LSTM.
  - **bias** (Tensor) - a required 1D tensor. Must be one of the following types: float16, float32. The format must be ND.
  - **seq_len** (Tensor) - an optional tensor. Only float16 in FRACTAL_NZ and int32 in ND are supported.
  - **h** (Tensor) - an optional 4D tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
  - **c** (Tensor) - an optional 4D tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
  - **has_biases** (bool) - if the value is true, bias exists.
  - **num_layers** (Number) - number of recurrent layers. Only a single layer is supported currently.
  - **dropout** (Number) - if non-zero, introduces a dropout layer on the outputs of each LSTM layer except the last layer, with dropout probability equal to dropout. Not supported currently.
  - **train** (bool) - a bool indicating whether the op is in training mode. Defaults to true.
  - **bidirectional** (bool) - if True, becomes a bidirectional LSTM. Not supported currently.
  - **batch_first** (bool) - if True, the input and output tensors are provided as (batch, seq, feature). Not supported currently.
  - **flag_seq** (bool) - if True, the input is a PackedSequence. Not supported currently.
  - **direction** (bool) - if True, the direction is "REDIRECTIONAL"; otherwise, it is "UNIDIRECTIONAL".

- Returns:
  - **y** - a 4D tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
  - **output_h** - a 4D tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
  - **output_c** - a 4D tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
  - **i** - a 4D tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
  - **j** - a 4D tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
  - **f** - a 4D tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
  - **o** - a 4D tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
  - **tanhct** - a 4D tensor. Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.

- constraints:

  None

- Examples:

  None

> npu_iou(bboxes, gtboxes, mode=0) -> Tensor
> npu_ptiou(bboxes, gtboxes, mode=0) -> Tensor

Computes the intersection over union (iou) or the intersection over foreground (iof) based on the ground-truth and predicted regions.

- Parameters:
  - **bboxes** (Tensor) - the input tensor.
  - **gtboxes** (Tensor) - the input tensor.
  - **mode** (Number) - 0 for iou, 1 for iof.

- constraints:

  None

- Examples:

  ```python
  >>> bboxes = torch.tensor([[0, 0, 10, 10],
                             [10, 10, 20, 20],
                             [32, 32, 38, 42]], dtype=torch.float16).to("npu")
  >>> gtboxes = torch.tensor([[0, 0, 10, 20],
                              [0, 10, 10, 10],
                              [10, 10, 20, 20]], dtype=torch.float16).to("npu")
  >>> output_iou = torch.npu_iou(bboxes, gtboxes, 0)
  >>> output_iou
  tensor([[0.4985, 0.0000, 0.0000],
          [0.0000, 0.0000, 0.0000],
          [0.0000, 0.9961, 0.0000]], device='npu:0', dtype=torch.float16)
  ```

> npu_pad(input, paddings) -> Tensor

Pads a tensor.

- Parameters:
  - **input** (Tensor) - the input tensor.
  - **paddings** (ListInt) - of type int32 or int64.
- constraints:

  None

- Examples:

  ```python
  >>> input = torch.tensor([[20, 20, 10, 10]], dtype=torch.float16).to("npu")
  >>> paddings = [1, 1, 1, 1]
  >>> output = torch.npu_pad(input, paddings)
  >>> output
  tensor([[ 0.,  0.,  0.,  0.,  0.,  0.],
          [ 0., 20., 20., 10., 10.,  0.],
          [ 0.,  0.,  0.,  0.,  0.,  0.]], device='npu:0', dtype=torch.float16)
  ```

> npu_nms_with_mask(input, iou_threshold) -> (Tensor, Tensor, Tensor)

Generates 0/1 values for the NMS operator to mark which boxes are valid.

- Parameters:
  - **input** (Tensor) - the input tensor.
  - **iou_threshold** (Number) - the threshold. If the IoU exceeds this threshold, the mask value is 1; otherwise, it is 0.

- Returns:
  - **selected_boxes** - a 2-D tensor with shape [N, 5], representing the filtered boxes, including proposal boxes and corresponding confidence scores.
  - **selected_idx** - a 1-D tensor with shape [N], representing the indices of the input proposal boxes.
  - **selected_mask** - a 1-D tensor with shape [N], indicating whether the output proposal box is valid.

- constraints:

  The 2nd dim of the input box_scores must be equal to 8.

- Examples:

  ```python
  >>> input = torch.tensor([[0.0, 1.0, 2.0, 3.0, 0.6], [6.0, 7.0, 8.0, 9.0, 0.4]], dtype=torch.float16).to("npu")
  >>> iou_threshold = 0.5
  >>> output1, output2, output3, = torch.npu_nms_with_mask(input, iou_threshold)
  >>> output1
  tensor([[0.0000, 1.0000, 2.0000, 3.0000, 0.6001],
          [6.0000, 7.0000, 8.0000, 9.0000, 0.3999]], device='npu:0',
         dtype=torch.float16)
  >>> output2
  tensor([0, 1], device='npu:0', dtype=torch.int32)
  >>> output3
  tensor([1, 1], device='npu:0', dtype=torch.uint8)
  ```

> npu_bounding_box_encode(anchor_box, ground_truth_box, means0, means1, means2, means3, stds0, stds1, stds2, stds3) -> Tensor

Computes the coordinate variations between bboxes and ground-truth boxes. It is a customized FasterRcnn operator.

- Parameters:
  - **anchor_box** (Tensor) - the input tensor. Anchor boxes: a 2D tensor of float32 with shape (N, 4). "N" indicates the number of bounding boxes, and the value "4" refers to "x0", "x1", "y0", and "y1".
  - **ground_truth_box** (Tensor) - the input tensor. Ground-truth boxes: a 2D tensor of float32 with shape (N, 4). "N" indicates the number of bounding boxes, and the value "4" refers to "x0", "x1", "y0", and "y1".
  - **means0** (Number) - an index of type int.
  - **means1** (Number) - an index of type int.
  - **means2** (Number) - an index of type int.
  - **means3** (Number) - an index of type int. Defaults to [0,0,0,0]. "deltas" = "deltas" x "stds" + "means".
  - **stds0** (Number) - an index of type int.
  - **stds1** (Number) - an index of type int.
  - **stds2** (Number) - an index of type int.
  - **stds3** (Number) - an index of type int. Defaults to [1.0,1.0,1.0,1.0]. "deltas" = "deltas" x "stds" + "means".
+
+- constraints:
+
+  None
+
+- Examples:
+
+  ```python
+  >>> anchor_box = torch.tensor([[1., 2., 3., 4.], [3., 4., 5., 6.]], dtype = torch.float32).to("npu")
+  >>> ground_truth_box = torch.tensor([[5., 6., 7., 8.], [7., 8., 9., 6.]], dtype = torch.float32).to("npu")
+  >>> output = torch.npu_bounding_box_encode(anchor_box, ground_truth_box, 0, 0, 0, 0, 0.1, 0.1, 0.2, 0.2)
+  >>> output
+  tensor([[13.3281, 13.3281,  0.0000,  0.0000],
+          [13.3281,  6.6641,  0.0000, -5.4922]], device='npu:0')
+  ```
+
+>npu_bounding_box_decode(rois, deltas, means0, means1, means2, means3, stds0, stds1, stds2, stds3, max_shape, wh_ratio_clip) -> Tensor
+
+Generates bounding boxes based on "rois" and "deltas". It is a customized FasterRcnn operator.
+
+- Parameters:
+  - **rois** (Tensor) - Region of interests (ROIs) generated by the region proposal network (RPN). A 2D Tensor of type float32 or float16 with shape (N, 4). "N" indicates the number of ROIs, and the value "4" refers to "x0", "x1", "y0", and "y1".
+  - **deltas** (Tensor) - Absolute variation between the ROIs generated by the RPN and ground truth boxes. A 2D Tensor of type float32 or float16 with shape (N, 4). "N" indicates the number of errors, and 4 indicates "dx", "dy", "dw", and "dh".
+  - **means0** (Number) - An index of type int.
+  - **means1** (Number) - An index of type int.
+  - **means2** (Number) - An index of type int.
+  - **means3** (Number) - An index of type int. Defaults to [0,0,0,0]. "deltas" = "deltas" x "stds" + "means".
+  - **stds0** (Number) - An index of type int.
+  - **stds1** (Number) - An index of type int.
+  - **stds2** (Number) - An index of type int.
+  - **stds3** (Number) - An index of type int. Defaults to [1.0,1.0,1.0,1.0]. "deltas" = "deltas" x "stds" + "means".
+  - **max_shape** (ListInt) - Shape [h, w], specifying the size of the image transferred to the network. Used to ensure that the bbox shape after conversion does not exceed "max_shape".
+  - **wh_ratio_clip** (Number) - Defaults to "16/1000". The values of "dw" and "dh" fall within (-wh_ratio_clip, wh_ratio_clip).
+
+- constraints:
+
+  None
+
+- Examples:
+
+  ```python
+  >>> rois = torch.tensor([[1., 2., 3., 4.], [3., 4., 5., 6.]], dtype = torch.float32).to("npu")
+  >>> deltas = torch.tensor([[5., 6., 7., 8.], [7., 8., 9., 6.]], dtype = torch.float32).to("npu")
+  >>> output = torch.npu_bounding_box_decode(rois, deltas, 0, 0, 0, 0, 1, 1, 1, 1, (10, 10), 0.1)
+  >>> output
+  tensor([[2.5000, 6.5000, 9.0000, 9.0000],
+          [9.0000, 9.0000, 9.0000, 9.0000]], device='npu:0')
+  ```
+
+>npu_gru(input, hx, weight_input, weight_hidden, bias_input, bias_hidden, seq_length, has_biases, num_layers, dropout, train, bidirectional, batch_first) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor)
+
+DynamicGRUV2 calculation.
+
+- Parameters:
+  - **input** (Tensor) - Must be one of the following types: float16. The format must be FRACTAL_NZ.
+  - **hx** (Tensor) - Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
+  - **weight_input** (Tensor) - Must be one of the following types: float16. The format must be FRACTAL_Z.
+  - **weight_hidden** (Tensor) - Must be one of the following types: float16. The format must be FRACTAL_Z.
+  - **bias_input** (Tensor) - Must be one of the following types: float16, float32. The format must be ND.
+  - **bias_hidden** (Tensor) - Must be one of the following types: float16, float32. The format must be ND.
+  - **seq_length** (Tensor) - Must be one of the following types: int32. The format must be ND.
+  - **has_biases** (bool) - Defaults to true.
+  - **num_layers** (Number)
+  - **dropout** (Number)
+  - **train** (bool) - A bool identifying whether the op is in training mode. Defaults to true.
+  - **bidirectional** (bool) - Defaults to true.
+  - **batch_first** (bool) - Defaults to true.
+
+- Returns:
+
+  - **y** (Tensor) - Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
+  - **output_h** (Tensor) - Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
+  - **update** (Tensor) - Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
+  - **reset** (Tensor) - Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
+  - **new** (Tensor) - Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
+  - **hidden_new** (Tensor) - Must be one of the following types: float16, float32. The format must be FRACTAL_NZ.
+
+- constraints:
+
+  None
+
+- Examples:
+
+  None provided in the reference; a hedged sketch follows.
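+
+  The sketch below only illustrates the call shape of the signature above; it is not from the original reference. The tensor sizes, the initial-hidden-state layout, and the three-gates-per-cell weight convention are assumptions made for illustration, and on a real NPU the inputs must additionally satisfy the dtype and format requirements (FRACTAL_NZ/FRACTAL_Z/ND) listed above.
+
+  ```python
+  >>> # assumed sizes: 5 time steps, batch 8, input_size 32, hidden_size 32
+  >>> t, b, d_in, d_hid = 5, 8, 32, 32
+  >>> input = torch.rand(t, b, d_in).half().npu()
+  >>> hx = torch.rand(b, d_hid).half().npu()                    # assumed initial hidden state layout
+  >>> weight_input = torch.rand(d_in, 3 * d_hid).half().npu()   # assumed: three gates per GRU cell
+  >>> weight_hidden = torch.rand(d_hid, 3 * d_hid).half().npu()
+  >>> bias_input = torch.rand(3 * d_hid).half().npu()
+  >>> bias_hidden = torch.rand(3 * d_hid).half().npu()
+  >>> seq_length = torch.full((b,), t, dtype=torch.int32).npu()
+  >>> y, output_h, update, reset, new, hidden_new = torch.npu_gru(
+  ...     input, hx, weight_input, weight_hidden, bias_input, bias_hidden,
+  ...     seq_length, True, 1, 0.0, False, False, False)
+  ```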
+>npu_random_choice_with_mask(x, count=256, seed=0, seed2=0) -> (Tensor, Tensor)
+
+Shuffles the indices of non-zero elements.
+
+- Parameters:
+  - **x** (Tensor) - the input tensor.
+  - **count** (Number) - the number of outputs; if 0, all non-zero elements are output.
+  - **seed** (Number) - type int32 or int64.
+  - **seed2** (Number) - type int32 or int64.
+
+- Returns:
+
+  - **y** - 2-D tensor of non-zero element indices.
+  - **mask** - 1-D tensor indicating whether the corresponding index is valid.
+
+- constraints:
+
+  None
+
+- Examples:
+
+  ```python
+  >>> x = torch.tensor([1, 0, 1, 0], dtype=torch.bool).to("npu")
+  >>> result, mask = torch.npu_random_choice_with_mask(x, 2, 1, 0)
+  >>> result
+  tensor([[0],
+          [2]], device='npu:0', dtype=torch.int32)
+  >>> mask
+  tensor([True, True], device='npu:0')
+  ```
+
+>npu_batch_nms(self, scores, score_threshold, iou_threshold, max_size_per_class, max_total_size, change_coordinate_frame=False, transpose_box=False) -> (Tensor, Tensor, Tensor, Tensor)
+
+Computes nms for input boxes and scores, supporting multiple batches and classes. Performs clip-to-window, score filtering, top_k, and nms.
+
+- Parameters:
+  - **self** (Tensor) - the input tensor.
+  - **scores** (Tensor) - the input tensor.
+  - **score_threshold** (Number) - A required attribute of type float32, specifying the score filter threshold.
+  - **iou_threshold** (Number) - A required attribute of type float32, specifying the nms iou threshold.
+  - **max_size_per_class** (Number) - A required attribute of type int, specifying the nms output num per class.
+  - **max_total_size** (Number) - A required attribute of type int, specifying the nms output num per batch.
+  - **change_coordinate_frame** (bool) - An optional attribute of type bool, whether to normalize coordinates after clipping.
+  - **transpose_box** (bool) - An optional attribute of type bool, whether a transpose is inserted before this op. Must be "false".
+
+- Returns:
+
+  - **nmsed_boxes** (Tensor) - A 3D Tensor of type float16 with shape (batch, max_total_size, 4), specifying the output nms boxes per batch.
+  - **nmsed_scores** (Tensor) - A 2D Tensor of type float16 with shape (batch, max_total_size), specifying the output nms scores per batch.
+  - **nmsed_classes** (Tensor) - A 2D Tensor of type float16 with shape (batch, max_total_size), specifying the output nms classes per batch.
+  - **nmsed_num** (Tensor) - A 1D Tensor of type int32 with shape (batch,), specifying the valid number of nmsed_boxes.
+
+- constraints:
+
+  None
+
+- Examples:
+
+  ```python
+  >>> boxes = torch.randn(8, 2, 4, 4, dtype = torch.float32).to("npu")
+  >>> scores = torch.randn(8, 2, 4, dtype = torch.float32).to("npu")
+  >>> nmsed_boxes, nmsed_scores, nmsed_classes, nmsed_num = torch.npu_batch_nms(boxes, scores, 0.3, 0.5, 3, 4)
+  >>> nmsed_boxes
+  >>> nmsed_scores
+  >>> nmsed_classes
+  >>> nmsed_num
+  ```
+
+>npu_slice(self, offsets, size) -> Tensor
+
+Extracts a slice from a tensor.
+
+- Parameters:
+  - **self** (Tensor) - the input tensor.
+  - **offsets** (ListInt) - type int32 or int64.
+  - **size** (ListInt) - type int32 or int64.
+
+- constraints:
+
+  None
+
+- Examples:
+
+  ```python
+  >>> input = torch.tensor([[1,2,3,4,5], [6,7,8,9,10]], dtype=torch.float16).to("npu")
+  >>> offsets = [0, 0]
+  >>> size = [2, 2]
+  >>> output = torch.npu_slice(input, offsets, size)
+  >>> output
+  tensor([[1., 2.],
+          [6., 7.]], device='npu:0', dtype=torch.float16)
+  ```
+
+>npu_dropoutV2(self, seed, p) -> (Tensor, Tensor, Tensor(a!))
+
+Computes the dropout result with seed.
+
+- Parameters:
+  - **self** (Tensor) - The input Tensor.
+  - **seed** (Tensor) - The input Tensor.
+  - **p** (Float) - Dropout probability.
+
+- Returns:
+
+  - **y** - A tensor with the same shape and type as "x".
+  - **mask** - A tensor with the same shape and type as "x".
+  - **new_seed** - A tensor with the same shape and type as "seed".
+
+- constraints:
+
+  None
+
+- Examples:
+
+  ```python
+  >>> input = torch.tensor([1.,2.,3.,4.]).npu()
+  >>> input
+  tensor([1., 2., 3., 4.], device='npu:0')
+  >>> seed = torch.rand((32,),dtype=torch.float32).npu()
+  >>> seed
+  tensor([0.4368, 0.7351, 0.8459, 0.4657, 0.6783, 0.8914, 0.8995, 0.4401, 0.4408,
+          0.4453, 0.2404, 0.9680, 0.0999, 0.8665, 0.2993, 0.5787, 0.0251, 0.6783,
+          0.7411, 0.0670, 0.9430, 0.9165, 0.3983, 0.5849, 0.7722, 0.4659, 0.0486,
+          0.2693, 0.6451, 0.2734, 0.3176, 0.0176], device='npu:0')
+  >>> prob = 0.3
+  >>> output, mask, out_seed = torch.npu_dropoutV2(input, seed, prob)
+  >>> output
+  tensor([0.4408, 0.4453, 0.2404, 0.9680], device='npu:0')
+  >>> mask
+  tensor([0., 0., 0., 0.], device='npu:0')
+  >>> out_seed
+  tensor([0.4408, 0.4453, 0.2404, 0.9680, 0.0999, 0.8665, 0.2993, 0.5787, 0.0251,
+          0.6783, 0.7411, 0.0670, 0.9430, 0.9165, 0.3983, 0.5849, 0.7722, 0.4659,
+          0.0486, 0.2693, 0.6451, 0.2734, 0.3176, 0.0176, 0.0000, 0.0000, 0.0000,
+          0.0000, 0.0000, 0.0000, 0.0000, 0.0000], device='npu:0')
+  ```
+
+>_npu_dropout(self, p) -> (Tensor, Tensor)
+
+Computes the dropout result without seed.
+
+- Parameters:
+  Similar to `torch.dropout`, with an implementation optimized for the npu device.
+  - **self** (Tensor) - The input Tensor.
+  - **p** (Float) - Dropout probability.
+
+- constraints:
+
+  None
+
+- Examples:
+
+  ```python
+  >>> input = torch.tensor([1.,2.,3.,4.]).npu()
+  >>> input
+  tensor([1., 2., 3., 4.], device='npu:0')
+  >>> prob = 0.3
+  >>> output, mask = torch._npu_dropout(input, prob)
+  >>> output
+  tensor([0.0000, 2.8571, 0.0000, 0.0000], device='npu:0')
+  >>> mask
+  tensor([ 98, 255, 188, 186, 120, 157, 175, 159,  77, 223, 127,  79, 247, 151,
+          253, 255], device='npu:0', dtype=torch.uint8)
+  ```
+
+>_npu_dropout_inplace(result, p) -> (Tensor(a!), Tensor)
+
+Computes the dropout result inplace.
+
+- Parameters:
+  Similar to `torch.dropout_`, with an implementation optimized for the npu device.
+  - **result** (Tensor) - The Tensor to drop out inplace.
+  - **p** (Float) - Dropout probability.
+
+- constraints:
+
+  None
+
+- Examples:
+
+  ```python
+  >>> input = torch.tensor([1.,2.,3.,4.]).npu()
+  >>> input
+  tensor([1., 2., 3., 4.], device='npu:0')
+  >>> prob = 0.3
+  >>> output, mask = torch._npu_dropout_inplace(input, prob)
+  >>> output
+  tensor([0.0000, 2.8571, 0.0000, 0.0000], device='npu:0')
+  >>> input
+  tensor([0.0000, 2.8571, 4.2857, 5.7143], device='npu:0')
+  >>> mask
+  tensor([ 98, 255, 188, 186, 120, 157, 175, 159,  77, 223, 127,  79, 247, 151,
+          253, 255], device='npu:0', dtype=torch.uint8)
+  ```
+
+>npu_indexing(self, begin, end, strides, begin_mask=0, end_mask=0, ellipsis_mask=0, new_axis_mask=0, shrink_axis_mask=0) -> Tensor
+
+Computes the indexing result from the begin, end, and strides arrays.
+
+- Parameters:
+  - **self** (Tensor) - An input Tensor.
+  - **begin** (ListInt) - The index of the first value to select.
+  - **end** (ListInt) - The index of the last value to select.
+  - **strides** (ListInt) - The index increment.
+  - **begin_mask** (Number) - A bitmask where a bit "i" being "1" means to ignore the begin value and instead use the largest interval possible.
+  - **end_mask** (Number) - Analogous to "begin_mask".
+  - **ellipsis_mask** (Number) - A bitmask where bit "i" being "1" means the "i"th position is actually an ellipsis.
+  - **new_axis_mask** (Number) - A bitmask where bit "i" being "1" means the "i"th specification creates a new shape-1 dimension.
+  - **shrink_axis_mask** (Number) - A bitmask where bit "i" implies that the "i"th specification should shrink the dimensionality.
+
+- constraints:
+
+  None
+
+- Examples:
+
+  ```python
+  >>> input = torch.tensor([[1, 2, 3, 4],[5, 6, 7, 8]], dtype=torch.int32).to("npu")
+  >>> input
+  tensor([[1, 2, 3, 4],
+          [5, 6, 7, 8]], device='npu:0', dtype=torch.int32)
+  >>> output = torch.npu_indexing(input, [0, 0], [2, 2], [1, 1])
+  >>> output
+  tensor([[1, 2],
+          [5, 6]], device='npu:0', dtype=torch.int32)
+  ```
+
+>npu_ifmr(Tensor data, Tensor data_min, Tensor data_max, Tensor cumsum, float min_percentile, float max_percentile, float search_start, float search_end, float search_step, bool with_offset) -> (Tensor, Tensor)
+
+Computes the IFMR (Input Feature Map Reconstruction) result.
+
+- Parameters:
+  - **data** (Tensor) - A Tensor of the feature map.
+  - **data_min** (Tensor) - A Tensor of the min value of the feature map.
+  - **data_max** (Tensor) - A Tensor of the max value of the feature map.
+  - **cumsum** (Tensor) - A Tensor of the cumsum bin of data.
+  - **min_percentile** (Float) - The initial minimum percentile.
+  - **max_percentile** (Float) - The initial maximum percentile.
+  - **search_start** (Float) - The search start.
+  - **search_end** (Float) - The search end.
+  - **search_step** (Float) - The step size of the search.
+  - **with_offset** (bool) - Whether to use an offset.
+
+- Returns:
+
+  - **scale** - the optimal scale.
+  - **offset** - the optimal offset.
+
+- constraints:
+
+  None
+
+- Examples:
+
+  The example below builds a 128-bin histogram of the input and its cumulative sum before invoking the operator.
+
+  ```python
+  >>> input = torch.rand((2,2,3,4),dtype=torch.float32).npu()
+  >>> input
+  tensor([[[[0.4508, 0.6513, 0.4734, 0.1924],
+            [0.0402, 0.5502, 0.0694, 0.9032],
+            [0.4844, 0.5361, 0.9369, 0.7874]],
+
+           [[0.5157, 0.1863, 0.4574, 0.8033],
+            [0.5986, 0.8090, 0.7605, 0.8252],
+            [0.4264, 0.8952, 0.2279, 0.9746]]],
+
+          [[[0.0803, 0.7114, 0.8773, 0.2341],
+            [0.6497, 0.0423, 0.8407, 0.9515],
+            [0.1821, 0.5931, 0.7160, 0.4968]],
+
+           [[0.7977, 0.0899, 0.9572, 0.0146],
+            [0.2804, 0.8569, 0.2292, 0.1118],
+            [0.5747, 0.4064, 0.8370, 0.1611]]]], device='npu:0')
+  >>> min_value = torch.min(input)
+  >>> min_value
+  tensor(0.0146, device='npu:0')
+  >>> max_value = torch.max(input)
+  >>> max_value
+  tensor(0.9746, device='npu:0')
+  >>> hist = torch.histc(input.to('cpu'),
+                         bins=128,
+                         min=min_value.to('cpu'),
+                         max=max_value.to('cpu'))
+  >>> hist
+  tensor([1., 0., 0., 2., 0., 0., 0., 1., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0.,
+          0., 1., 0., 0., 2., 1., 0., 0., 0., 0., 2., 1., 0., 0., 0., 0., 0., 1.,
+          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
+          1., 0., 0., 0., 1., 1., 0., 1., 1., 0., 1., 0., 1., 0., 0., 1., 0., 1.,
+          0., 0., 1., 0., 0., 2., 0., 0., 0., 0., 0., 0., 2., 0., 0., 0., 0., 0.,
+          0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 2., 0., 0.,
+          1., 1., 1., 0., 1., 0., 0., 1., 0., 1., 1., 0., 0., 0., 1., 0., 1., 1.,
+          0., 1.])
+  >>> cdf = torch.cumsum(hist,dim=0).int().npu()
+  >>> cdf
+  tensor([ 1,  1,  1,  3,  3,  3,  3,  4,  5,  5,  6,  6,  7,  7,  7,  7,  7,  7,
+           7,  8,  8,  8, 10, 11, 11, 11, 11, 11, 13, 14, 14, 14, 14, 14, 14, 15,
+          15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16,
+          17, 17, 17, 17, 18, 19, 19, 20, 21, 21, 22, 22, 23, 23, 23, 24, 24, 25,
+          25, 25, 26, 26, 26, 28, 28, 28, 28, 28, 28, 28, 30, 30, 30, 30, 30, 30,
+          30, 30, 31, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 34, 35, 37, 37, 37,
+          38, 39, 40, 40, 41, 41, 41, 42, 42, 43, 44, 44, 44, 44, 45, 45, 46, 47,
+          47, 48], device='npu:0', dtype=torch.int32)
+  >>> scale, offset = torch.npu_ifmr(input,
+                                     min_value,
+                                     max_value,
+                                     cdf,
+                                     min_percentile=0.999999,
+                                     max_percentile=0.999999,
+                                     search_start=0.7,
+                                     search_end=1.3,
+                                     search_step=0.01,
+                                     with_offset=False)
+  >>> scale
+  tensor(0.0080, device='npu:0')
+  >>> offset
+  tensor(0., device='npu:0')
+  ```
+
+>npu_max.dim(self, dim, keepdim=False) -> (Tensor, Tensor)
+
+Computes the max result with dim.
+
+- Parameters:
+  Similar to `torch.max`, with an implementation optimized for the npu device.
+
+  - **self** (Tensor) - the input tensor.
+  - **dim** (Number) - the dimension to reduce.
+  - **keepdim** (bool) - whether the output tensor has dim retained or not.
+
+- Returns:
+
+  - **values** - max values in the input tensor.
+  - **indices** - indices of the max values in the input tensor.
+
+- constraints:
+
+  None
+
+- Examples:
+
+  ```python
+  >>> input = torch.randn(2, 2, 2, 2, dtype = torch.float32).npu()
+  >>> input
+  tensor([[[[-1.8135,  0.2078],
+            [-0.6678,  0.7846]],
+
+           [[ 0.6458, -0.0923],
+            [-0.2124, -1.9112]]],
+
+          [[[-0.5800, -0.4979],
+            [ 0.2580,  1.1335]],
+
+           [[ 0.6669,  0.1876],
+            [ 0.1160, -0.1061]]]], device='npu:0')
+  >>> outputs, indices = torch.npu_max(input, 2)
+  >>> outputs
+  tensor([[[-0.6678,  0.7846],
+           [ 0.6458, -0.0923]],
+
+          [[ 0.2580,  1.1335],
+           [ 0.6669,  0.1876]]], device='npu:0')
+  >>> indices
+  tensor([[[1, 1],
+           [0, 0]],
+
+          [[1, 1],
+           [0, 0]]], device='npu:0', dtype=torch.int32)
+  ```
+
+>npu_min.dim(self, dim, keepdim=False) -> (Tensor, Tensor)
+
+Computes the min result with dim.
+
+- Parameters:
+  Similar to `torch.min`, with an implementation optimized for the npu device.
+  - **self** (Tensor) - the input tensor.
+  - **dim** (Number) - the dimension to reduce.
+  - **keepdim** (bool) - whether the output tensor has dim retained or not.
+
+- Returns:
+
+  - **values** - min values in the input tensor.
+  - **indices** - indices of the min values in the input tensor.
+
+- constraints:
+
+  None
+
+- Examples:
+
+  ```python
+  >>> input = torch.randn(2, 2, 2, 2, dtype = torch.float32).npu()
+  >>> input
+  tensor([[[[-0.9909, -0.2369],
+            [-0.9569, -0.6223]],
+
+           [[ 0.1157, -0.3147],
+            [-0.7761,  0.1344]]],
+
+          [[[ 1.6292,  0.5953],
+            [ 0.6940, -0.6367]],
+
+           [[-1.2335,  0.2131],
+            [ 1.0748, -0.7046]]]], device='npu:0')
+  >>> outputs, indices = torch.npu_min(input, 2)
+  >>> outputs
+  tensor([[[-0.9909, -0.6223],
+           [-0.7761, -0.3147]],
+
+          [[ 0.6940, -0.6367],
+           [-1.2335, -0.7046]]], device='npu:0')
+  >>> indices
+  tensor([[[0, 1],
+           [1, 0]],
+
+          [[1, 1],
+           [0, 1]]], device='npu:0', dtype=torch.int32)
+  ```
+
+>npu_scatter(self, indices, updates, dim) -> Tensor
+
+Computes the scatter result with dim.
+
+- Parameters:
+  Similar to `torch.scatter`, with an implementation optimized for the npu device.
+
+  - **self** (Tensor) - the input tensor.
+  - **indices** (Tensor) - the indices of elements to scatter; can be either empty or of the same dimensionality as src. When empty, the operation returns self unchanged.
+  - **updates** (Tensor) - the source element(s) to scatter.
+  - **dim** (Number) - the axis along which to index.
+
+- constraints:
+
+  None
+
+- Examples:
+
+  ```python
+  >>> input = torch.tensor([[1.6279, 0.1226], [0.9041, 1.0980]]).npu()
+  >>> input
+  tensor([[1.6279, 0.1226],
+          [0.9041, 1.0980]], device='npu:0')
+  >>> indices = torch.tensor([0, 1],dtype=torch.int32).npu()
+  >>> indices
+  tensor([0, 1], device='npu:0', dtype=torch.int32)
+  >>> updates = torch.tensor([-1.1993, -1.5247]).npu()
+  >>> updates
+  tensor([-1.1993, -1.5247], device='npu:0')
+  >>> dim = 0
+  >>> output = torch.npu_scatter(input, indices, updates, dim)
+  >>> output
+  tensor([[-1.1993,  0.1226],
+          [ 0.9041, -1.5247]], device='npu:0')
+  ```
+
+>npu_layer_norm_eval(input, normalized_shape, weight=None, bias=None, eps=1e-05) -> Tensor
+
+Computes the layer norm result.
+
+- Parameters:
+  The same as `torch.nn.functional.layer_norm`, with an implementation optimized for the npu device.
+  - **input** (Tensor) - The input Tensor.
+  - **normalized_shape** (ListInt) - The input shape from an expected input of that size.
+  - **weight** (Tensor) - The gamma Tensor.
+  - **bias** (Tensor) - The beta Tensor.
+  - **eps** (Float) - The epsilon value added to the denominator for numerical stability. Default: 1e-5.
+
+- constraints:
+
+  None
+
+- Examples:
+
+  ```python
+  >>> input = torch.rand((6, 4), dtype=torch.float32).npu()
+  >>> input
+  tensor([[0.1863, 0.3755, 0.1115, 0.7308],
+          [0.6004, 0.6832, 0.8951, 0.2087],
+          [0.8548, 0.0176, 0.8498, 0.3703],
+          [0.5609, 0.0114, 0.5021, 0.1242],
+          [0.3966, 0.3022, 0.2323, 0.3914],
+          [0.1554, 0.0149, 0.1718, 0.4972]], device='npu:0')
+  >>> normalized_shape = input.size()[1:]
+  >>> normalized_shape
+  torch.Size([4])
+  >>> weight = torch.Tensor(*normalized_shape).npu()
+  >>> weight
+  tensor([        nan,  6.1223e-41, -8.3159e-20,  9.1834e-41], device='npu:0')
+  >>> bias = torch.Tensor(*normalized_shape).npu()
+  >>> bias
+  tensor([5.6033e-39, 6.1224e-41, 6.1757e-39, 6.1224e-41], device='npu:0')
+  >>> output = torch.npu_layer_norm_eval(input, normalized_shape, weight, bias, 1e-5)
+  >>> output
+  tensor([[        nan,  6.7474e-41,  8.3182e-20,  2.0687e-40],
+          [        nan,  8.2494e-41, -9.9784e-20, -8.2186e-41],
+          [        nan, -2.6695e-41, -7.7173e-20,  2.1353e-41],
+          [        nan, -1.3497e-41, -7.1281e-20, -6.9827e-42],
+          [        nan,  3.5663e-41,  1.2002e-19,  1.4314e-40],
+          [        nan, -6.2792e-42,  1.7902e-20,  2.1050e-40]], device='npu:0')
+  ```
+
+>npu_alloc_float_status(self) -> Tensor
+
+Produces eight numbers with a value of zero.
+
+- Parameters:
+
+  - **self** (Tensor) - Any Tensor.
+
+- constraints:
+
+  None
+
+- Examples:
+
+  ```python
+  >>> input = torch.randn([1,2,3]).npu()
+  >>> output = torch.npu_alloc_float_status(input)
+  >>> input
+  tensor([[[ 2.2324,  0.2478, -0.1056],
+           [ 1.1273, -0.2573,  1.0558]]], device='npu:0')
+  >>> output
+  tensor([0., 0., 0., 0., 0., 0., 0., 0.], device='npu:0')
+  ```
+
+> npu_get_float_status(self) -> Tensor
+
+Computes the NPU get-float-status operator function.
+
+- Parameters:
+
+  - **self** (Tensor) - A Tensor of a data memory address. Must be float32.
+
+- Constraints:
+
+  None
+
+- Examples:
+
+  ```python
+  >>> x = torch.rand(2).npu()
+  >>> torch.npu_get_float_status(x)
+  tensor([0., 0., 0., 0., 0., 0., 0., 0.], device='npu:0')
+  ```
+
+> npu_clear_float_status(self) -> Tensor
+
+Sets the value of address 0x40000 to 0 in each core.
+
+- Parameters:
+
+  - **self** (Tensor) - A tensor of type float32.
+
+- Constraints:
+
+  None
+
+- Examples:
+
+  ```python
+  >>> x = torch.rand(2).npu()
+  >>> torch.npu_clear_float_status(x)
+  tensor([0., 0., 0., 0., 0., 0., 0., 0.], device='npu:0')
+  ```
+
+> npu_confusion_transpose(self, perm, shape, transpose_first) -> Tensor
+
+Fuses reshape and transpose.
+
+- Parameters:
+
+  - **self** (Tensor) - A Tensor. Must be one of the following types: float16, float32, int8, int16, int32, int64, uint8, uint16, uint32, uint64.
+  - **perm** (ListInt) - A permutation of the dimensions of "x".
+  - **shape** (ListInt) - The shape of the input.
+  - **transpose_first** (bool) - If true, the transpose is done first; otherwise the reshape is done first.
+
+- Constraints:
+
+  None
+
+- Examples:
+
+  ```python
+  >>> x = torch.rand(2, 3, 4, 6).npu()
+  >>> x.shape
+  torch.Size([2, 3, 4, 6])
+  >>> y = torch.npu_confusion_transpose(x, (0, 2, 1, 3), (2, 4, 18), True)
+  >>> y.shape
+  torch.Size([2, 4, 18])
+  >>> y2 = torch.npu_confusion_transpose(x, (0, 2, 1), (2, 12, 6), False)
+  >>> y2.shape
+  torch.Size([2, 6, 12])
+  ```
+
+> npu_bmmV2(self, mat2, output_sizes) -> Tensor
+
+Multiplies matrix "a" by matrix "b", producing "a * b".
+
+- Parameters:
+  - **self** (Tensor) - A matrix Tensor. Must be one of the following types: float16, float32, int32. 2D or higher. Has format [ND, NHWC, FRACTAL_NZ].
+  - **mat2** (Tensor) - A matrix Tensor.
Must be one of the following types: float16, float32, int32. 2D or higher. Has format [ND, NHWC, FRACTAL_NZ].
+  - **output_sizes** (ListInt) - The output's shape, used in matmul's backpropagation. Defaults to [].
+
+- Constraints:
+
+  None
+
+- Examples:
+
+  ```python
+  >>> mat1 = torch.randn(10, 3, 4).npu()
+  >>> mat2 = torch.randn(10, 4, 5).npu()
+  >>> res = torch.npu_bmmV2(mat1, mat2, [])
+  >>> res.shape
+  torch.Size([10, 3, 5])
+  ```
+
+> fast_gelu(self) -> Tensor
+
+Computes the fast_gelu of "x" element-wise.
+
+- Parameters:
+
+  - **self** (Tensor) - A Tensor. Must be one of the following types: float16, float32.
+
+- Constraints:
+
+  None
+
+- Examples:
+
+  ```python
+  >>> x = torch.rand(2).npu()
+  >>> x
+  tensor([0.5991, 0.4094], device='npu:0')
+  >>> torch.fast_gelu(x)
+  tensor([0.4403, 0.2733], device='npu:0')
+  ```
+
+> npu_sub_sample(self, per_images, positive_fraction) -> Tensor
+
+Randomly samples a subset of positive and negative examples, and overwrites the label vector with the ignore value (-1) for all elements that are not included in the sample.
+
+- Parameters:
+
+  - **self** (Tensor) - A label vector with shape (N, ).
+  - **per_images** (Number) - A required attribute of type int.
+  - **positive_fraction** (Float) - A required attribute of type float.
+
+- Constraints:
+
+  None
+
+- Examples:
+
+  ```python
+  >>> x = torch.tensor([-2, 3, 6, -7, -2, 8, 1, -5, 7, 4]).int().npu()
+  >>> x
+  tensor([-2,  3,  6, -7, -2,  8,  1, -5,  7,  4], device='npu:0',
+        dtype=torch.int32)
+  >>> torch.npu_sub_sample(x, 5, 0.6)
+  tensor([-1, -1, -1, -1, -1, -1,  1, -1, -1, -1], device='npu:0',
+        dtype=torch.int32)
+  ```
+
+> npu_deformable_conv2d(input, weight, offset, bias, kernel_size, stride, padding, dilation=[1,1,1,1], groups=1, deformable_groups=1, modulated=True) -> (Tensor, Tensor)
+
+Computes the deformed convolution output with the expected input.
+
+- Parameters:
+
+  - **input** (Tensor) - A 4D tensor of the input image. With the format "NHWC", the data is stored in the order of: [batch, in_height, in_width, in_channels].
+  - **weight** (Tensor) - A 4D tensor of learnable filters. Must have the same type as "x". With the format "HWCN", the data is stored in the order of: [filter_height, filter_width, in_channels / groups, out_channels].
+  - **offset** (Tensor) - A 4D tensor of x-y coordinate offsets and masks. With the format "NHWC", the data is stored in the order of: [batch, out_height, out_width, deformable_groups * filter_height * filter_width * 3].
+  - **bias** (Tensor) - An optional 1D tensor of additive biases to the filter outputs. The data is stored in the order of: [out_channels].
+  - **kernel_size** (ListInt) - A tuple/list of 2 integers. The kernel size.
+  - **stride** (ListInt) - Required. A list of 4 integers. The stride of the sliding window for each dimension of the input. The dimension order is interpreted according to the data format of "x". The N and C dimensions must be set to 1.
+  - **padding** (ListInt) - Required. A list of 4 integers. The number of pixels to add to each (top, bottom, left, right) side of the input.
+  - **dilation** (ListInt) - Optional. A list of 4 integers. The dilation factor for each dimension of the input. The dimension order is interpreted according to the data format of "x". The N and C dimensions must be set to 1. Defaults to [1, 1, 1, 1].
+  - **groups** (Number) - Optional. An integer of type int32. The number of blocked connections from input channels to output channels. In_channels and out_channels must both be divisible by "groups".
Defaults to 1.
+  - **deformable_groups** (Number) - Optional. An integer of type int32. The number of deformable group partitions. In_channels must be divisible by "deformable_groups". Defaults to 1.
+  - **modulated** (bool) - Optional. Specifies the version of DeformableConv2D: true means v2, false means v1. Currently only v2 is supported.
+
+- Constraints:
+
+  None
+
+- Examples:
+
+  ```python
+  >>> x = torch.rand(16, 32, 32, 32).npu()
+  >>> weight = torch.rand(32, 32, 5, 5).npu()
+  >>> offset = torch.rand(16, 75, 32, 32).npu()
+  >>> output, _ = torch.npu_deformable_conv2d(x, weight, offset, None, kernel_size=[5, 5], stride = [1, 1, 1, 1], padding = [2, 2, 2, 2])
+  >>> output.shape
+  torch.Size([16, 32, 32, 32])
+  ```
+
+> npu_mish(self) -> Tensor
+
+Computes the Mish activation of "x" element-wise.
+
+- Parameters:
+
+  - **self** (Tensor) - A Tensor. Must be one of the following types: float16, float32.
+
+- Constraints:
+
+  None
+
+- Examples:
+
+  ```python
+  >>> x = torch.rand(10, 30, 10).npu()
+  >>> y = torch.npu_mish(x)
+  >>> y.shape
+  torch.Size([10, 30, 10])
+  ```
+
+> npu_anchor_response_flags(self, featmap_size, stride, num_base_anchors) -> Tensor
+
+Generates the responsible flags of anchors in a single feature map.
+
+- Parameters:
+  - **self** (Tensor) - Ground truth boxes, a 2-D Tensor with shape [batch, 4].
+  - **featmap_size** (ListInt) - The size of the feature maps.
+  - **strides** (ListInt) - The stride of the current level.
+  - **num_base_anchors** (Number) - The number of base anchors.
+
+- Constraints:
+
+  None
+
+- Examples:
+
+  ```python
+  >>> x = torch.rand(100, 4).npu()
+  >>> y = torch.npu_anchor_response_flags(x, [60, 60], [2, 2], 9)
+  >>> y.shape
+  torch.Size([32400])
+  ```
+
+> npu_yolo_boxes_encode(self, gt_bboxes, stride, performance_mode=False) -> Tensor
+
+Generates bounding boxes based on yolo's "anchor" and "ground-truth" boxes. It is a customized mmdetection operator.
+
+- Parameters:
+  - **self** (Tensor) - Anchor boxes generated by the yolo training set. A 2D Tensor of type float32 or float16 with shape (N, 4). "N" indicates the number of ROIs, and the value "4" refers to (tx, ty, tw, th).
+  - **gt_bboxes** (Tensor) - The target of the transformation, e.g., ground-truth boxes. A 2D Tensor of type float32 or float16 with shape (N, 4). "N" indicates the number of ROIs, and 4 indicates "dx", "dy", "dw", and "dh".
+  - **strides** (Tensor) - The scale for each box. A 1D Tensor of type int32 with shape (N,). "N" indicates the number of ROIs.
+  - **performance_mode** (bool) - Selects the performance mode, "high_precision" or "high_performance". With "high_precision" and float32 input, the output precision is within 0.0001; with "high_performance" and float32 input, the op runs at its best performance, but the precision is only within 0.005.
+
+- Constraints:
+
+  The input anchor boxes support a maximum N of 20480.
+
+- Examples:
+
+  ```python
+  >>> anchor_boxes = torch.rand(2, 4).npu()
+  >>> gt_bboxes = torch.rand(2, 4).npu()
+  >>> stride = torch.tensor([2, 2], dtype=torch.int32).npu()
+  >>> output = torch.npu_yolo_boxes_encode(anchor_boxes, gt_bboxes, stride, False)
+  >>> output.shape
+  torch.Size([2, 4])
+  ```
+
+> npu_grid_assign_positive(self, overlaps, box_responsible_flags, max_overlaps, argmax_overlaps, gt_max_overlaps, gt_argmax_overlaps, num_gts, pos_iou_thr, min_pos_iou, gt_max_assign_all) -> Tensor
+
+Performs grid assignment of positive samples.
+
+- Parameters:
+  - **self** (Tensor) - A Tensor of type float16 or float32, shape (n, ).
+  - **overlaps** (Tensor) - A Tensor. Datatype is the same as assigned_gt_inds. IOU between gt_bboxes and bboxes. Shape (k, n).
+  - **box_responsible_flags** (Tensor) - A Tensor. Support uint8. Flag indicating whether a box is responsible.
+  - **max_overlaps** (Tensor) - A Tensor. Datatype is the same as assigned_gt_inds. overlaps.max(axis=0).
+  - **argmax_overlaps** (Tensor) - A Tensor. Support int32. overlaps.argmax(axis=0).
+  - **gt_max_overlaps** (Tensor) - A Tensor. Datatype is the same as assigned_gt_inds. overlaps.max(axis=1).
+  - **gt_argmax_overlaps** (Tensor) - A Tensor. Support int32. overlaps.argmax(axis=1).
+  - **num_gts** (Number) - A Tensor. Support int32. The real number of ground-truth boxes k. Shape (1, ).
+  - **pos_iou_thr** (Float) - float. The IOU threshold for positive bboxes.
+  - **min_pos_iou** (Float) - float. The minimum iou for a bbox to be considered a positive bbox.
+  - **gt_max_assign_all** (bool) - bool. Whether to assign all bboxes with the same highest overlap with some gt to that gt.
+
+- Constraints:
+
+  None
+
+- Examples:
+
+  ```python
+  >>> assigned_gt_inds = torch.rand(4).npu()
+  >>> overlaps = torch.rand(2,4).npu()
+  >>> box_responsible_flags = torch.tensor([1, 1, 1, 0], dtype=torch.uint8).npu()
+  >>> max_overlap = torch.rand(4).npu()
+  >>> argmax_overlap = torch.tensor([1, 0, 1, 0], dtype=torch.int32).npu()
+  >>> gt_max_overlaps = torch.rand(2).npu()
+  >>> gt_argmax_overlaps = torch.tensor([1, 0],dtype=torch.int32).npu()
+  >>> output = torch.npu_grid_assign_positive(assigned_gt_inds, overlaps, box_responsible_flags, max_overlap, argmax_overlap, gt_max_overlaps, gt_argmax_overlaps, 128, 0.5, 0., True)
+  >>> output.shape
+  torch.Size([4])
+  ```
+
+> npu_normalize_batch(self, seq_len, normalize_type=0) -> Tensor
+
+Performs batch normalization.
+
+- Parameters:
+
+  - **self** (Tensor) - A Tensor. Support float32. Shape (n, c, d).
+  - **seq_len** (Tensor) - A Tensor. The normalized data count of each batch. Support int32. Shape (n, ).
+  - **normalize_type** (Number) - The normalization type: 0 for "per_feature" and 1 for "all_features".
+
+- constraints:
+
+  None
+
+- Examples:
+  ```python
+  >>> a=np.random.uniform(1,10,(2,3,6)).astype(np.float32)
+  >>> b=np.random.uniform(3,6,(2)).astype(np.int32)
+  >>> x=torch.from_numpy(a).to("npu")
+  >>> seqlen=torch.from_numpy(b).to("npu")
+  >>> out = torch.npu_normalize_batch(x, seqlen, 0)
+  >>> out
+  tensor([[[ 1.1496, -0.6685, -0.4812,  1.7611, -0.5187,  0.7571],
+           [ 1.1445, -0.4393, -0.7051,  1.0474, -0.2646, -0.1582],
+           [ 0.1477,  0.9179, -1.0656, -6.8692, -6.7437,  2.8621]],
+
+          [[-0.6880,  0.1337,  1.3623, -0.8081, -1.2291, -0.9410],
+           [ 0.3070,  0.5489, -1.4858,  0.6300,  0.6428,  0.0433],
+           [-0.5387,  0.8204, -1.1401,  0.8584, -0.3686,  0.8444]]],
+         device='npu:0')
+  ```
+
+> npu_masked_fill_range(self, start, end, value, axis=-1) -> Tensor
+
+Masked-fills a tensor along one axis by ranges. It is a customized masked-fill-range operator.
+
+- Parameters:
+
+  - **self** (Tensor) - the input tensor. An ND Tensor of float32/float16/int32/int8 with shape 1-D (D,), 2-D (N, D), or 3-D (N, C, D).
+  - **start** (Tensor) - The masked-fill start positions. A 3D Tensor of int32 with shape (num, N).
+  - **end** (Tensor) - The masked-fill end positions. A 3D Tensor of int32 with shape (num, N).
+  - **value** (Tensor) - The masked-fill values. A 2D Tensor of float32/float16/int32/int8 with shape (num,).
+  - **axis** (Number) - The axis to masked-fill, of type int32. Defaults to -1.
+
+- constraints:
+
+  None
+
+- Examples:
+  ```python
+  >>> a=torch.rand(4,4).npu()
+  >>> a
+  tensor([[0.9419, 0.4919, 0.2874, 0.6560],
+          [0.6691, 0.6668, 0.0330, 0.1006],
+          [0.3888, 0.7011, 0.7141, 0.7878],
+          [0.0366, 0.9738, 0.4689, 0.0979]], device='npu:0')
+  >>> start = torch.tensor([[0,1,2]], dtype=torch.int32).npu()
+  >>> end = torch.tensor([[1,2,3]], dtype=torch.int32).npu()
+  >>> value = torch.tensor([1], dtype=torch.float).npu()
+  >>> out = torch.npu_masked_fill_range(a, start, end, value, 1)
+  >>> out
+  tensor([[1.0000, 0.4919, 0.2874, 0.6560],
+          [0.6691, 1.0000, 0.0330, 0.1006],
+          [0.3888, 0.7011, 1.0000, 0.7878],
+          [0.0366, 0.9738, 0.4689, 0.0979]], device='npu:0')
+  ```
+
+> npu_linear(input, weight, bias=None) -> Tensor
+
+Multiplies matrix "a" by matrix "b", producing "a * b".
+
+- Parameters:
+
+  - **input** (Tensor) - A matrix Tensor. 2D. Must be one of the following types: float32, float16, int32, int8. Has format [ND, NHWC, FRACTAL_NZ].
+  - **weight** (Tensor) - A matrix Tensor. 2D. Must be one of the following types: float32, float16, int32, int8. Has format [ND, NHWC, FRACTAL_NZ].
+  - **bias** (Tensor) - A 1D Tensor. Must be one of the following types: float32, float16, int32. Has format [ND, NHWC].
+
+- constraints:
+
+  None
+
+- Examples:
+  ```python
+  >>> x=torch.rand(2,16).npu()
+  >>> w=torch.rand(4,16).npu()
+  >>> b=torch.rand(4).npu()
+  >>> output = torch.npu_linear(x, w, b)
+  >>> output
+  tensor([[3.6335, 4.3713, 2.4440, 2.0081],
+          [5.3273, 6.3089, 3.9601, 3.2410]], device='npu:0')
+  ```
+
+> npu_bert_apply_adam.old(Tensor(a!) var, Tensor(b!) m, Tensor(c!) v, lr, beta1, beta2, epsilon, grad, max_grad_norm, global_grad_norm, weight_decay, step_size=None, adam_mode=0) -> (Tensor(a!), Tensor(b!), Tensor(c!))
+
+Computes the adam result.
+
+- Parameters:
+
+  - **var** (Tensor) - A Tensor. Support float16/float32.
+  - **m** (Tensor) - A Tensor. Datatype and shape are the same as exp_avg.
+  - **v** (Tensor) - A Tensor. Datatype and shape are the same as exp_avg.
+  - **lr** (Number) - A Tensor. Datatype is the same as exp_avg.
+  - **beta1** (Number) - A Tensor. Datatype is the same as exp_avg.
+  - **beta2** (Number) - A Tensor. Datatype is the same as exp_avg.
+  - **epsilon** (Number) - A Tensor. Datatype is the same as exp_avg.
+  - **grad** (Tensor) - A Tensor. Datatype and shape are the same as exp_avg.
+  - **max_grad_norm** (Number) - A Tensor. Datatype is the same as exp_avg.
+  - **global_grad_norm** (Number) - A Tensor. Datatype is the same as exp_avg.
+  - **weight_decay** (Number) - A Tensor. Datatype is the same as exp_avg.
+
+- constraints:
+  None
+
 - Examples:
   ```python
-  >>> var_in = torch.rand(321538).uniform_(-32.,21.).npu()
-  >>> var_in
-  tensor([  0.6119,   5.8193,   3.0683,  ..., -28.5832,  12.9402, -24.0488],
-         device='npu:0')
+  >>> var_in = torch.rand(321538).uniform_(-32., 21.).npu()
   >>> m_in = torch.zeros(321538).npu()
-  >>> v_in = torchzeros(321538).npu()
-  >>> grad = torch.rand(321538).uniform_(-0.05,0.03).npu()
-  >>> grad
-  tensor([-0.0315, -0.0113, -0.0132,  ...,  0.0106, -0.0226, -0.0252],
-         device='npu:0')
+  >>> v_in = torch.zeros(321538).npu()
+  >>> grad = torch.rand(321538).uniform_(-0.05, 0.03).npu()
   >>> max_grad_norm = -1.
   >>> beta1 = 0.9
   >>> beta2 = 0.99
   >>> weight_decay = 0.
-  >>> lr = 0.1
+  >>> lr = 0.
   >>> epsilon = 1e-06
   >>> global_grad_norm = 0.
>>> var_out, m_out, v_out = torch.npu_bert_apply_adam(var_in, m_in, v_in, lr, beta1, beta2, epsilon, grad, max_grad_norm, global_grad_norm, weight_decay)
   >>> var_out
-  tensor([  0.7118,   5.9192,   3.1682,  ..., -28.6831,  13.0402, -23.9489],
-         device='npu:0')
-  >>> m_out
-  tensor([-0.0032, -0.0011, -0.0013,  ...,  0.0011, -0.0023, -0.0025],
-         device='npu:0')
-  >>> v_out
-  tensor([9.9431e-06, 1.2659e-06, 1.7328e-06,  ..., 1.1206e-06, 5.0933e-06,
-         6.3495e-06], device='npu:0')
+  tensor([ 14.7733, -30.1218,  -1.3647,  ..., -16.6840,   7.1518,   8.4872],
+         device='npu:0')
+  ```
+
+> npu_bert_apply_adam(lr, beta1, beta2, epsilon, grad, max_grad_norm, global_grad_norm, weight_decay, step_size=None, adam_mode=0, *, out=(var,m,v))
+
+Computes the adam result.
+
+- Parameters:
+
+  - **var** (Tensor) - A Tensor. Support float16/float32.
+  - **m** (Tensor) - A Tensor. Datatype and shape are the same as exp_avg.
+  - **v** (Tensor) - A Tensor. Datatype and shape are the same as exp_avg.
+  - **lr** (Number) - Datatype is the same as exp_avg.
+  - **beta1** (Number) - Datatype is the same as exp_avg.
+  - **beta2** (Number) - Datatype is the same as exp_avg.
+  - **epsilon** (Number) - Datatype is the same as exp_avg.
+  - **grad** (Tensor) - A Tensor. Datatype and shape are the same as exp_avg.
+  - **max_grad_norm** (Number) - Datatype is the same as exp_avg.
+  - **global_grad_norm** (Number) - Datatype is the same as exp_avg.
+  - **weight_decay** (Number) - Datatype is the same as exp_avg.
+
+- Keyword Arguments:
+
+  - **out** - A tuple of Tensors, optional. The output tensors (var, m, v).
+
+- constraints:
+
+  None
+
+- Examples:
+  ```python
+  >>> var_in = torch.rand(321538).uniform_(-32., 21.).npu()
+  >>> m_in = torch.zeros(321538).npu()
+  >>> v_in = torch.zeros(321538).npu()
+  >>> grad = torch.rand(321538).uniform_(-0.05, 0.03).npu()
+  >>> max_grad_norm = -1.
+  >>> beta1 = 0.9
+  >>> beta2 = 0.99
+  >>> weight_decay = 0.
+  >>> lr = 0.
+  >>> epsilon = 1e-06
+  >>> global_grad_norm = 0.
+  >>> var_out, m_out, v_out = torch.npu_bert_apply_adam(lr, beta1, beta2, epsilon, grad, max_grad_norm, global_grad_norm, weight_decay, out=(var_in, m_in, v_in))
+  >>> var_out
+  tensor([ 14.7733, -30.1218,  -1.3647,  ..., -16.6840,   7.1518,   8.4872],
+         device='npu:0')
+  ```
+
+> npu_giou(self, gtboxes, trans=False, is_cross=False, mode=0) -> Tensor
+
+First calculates the minimum enclosing area of the two boxes and the IoU, then the proportion of the enclosing area that is not covered by either box, and finally subtracts this proportion from the IoU to obtain the GIoU.
+
+- Parameters:
+
+  - **self** (Tensor) - Bounding boxes, a 2D Tensor of type float16 or float32 with shape (N, 4). "N" indicates the number of bounding boxes, and the value "4" refers to [x1, y1, x2, y2] or [x, y, w, h].
+  - **gtboxes** (Tensor) - Ground-truth boxes, a 2D Tensor of type float16 or float32 with shape (M, 4). "M" indicates the number of ground truth boxes, and the value "4" refers to [x1, y1, x2, y2] or [x, y, w, h].
+  - **trans** (bool) - An optional bool, true for 'xywh', false for 'xyxy'.
+  - **is_cross** (bool) - An optional bool, controlling whether the output shape is [M, N] or [1, N].
+  - **mode** (Number) - Computation mode, a character string with the value range of [iou, iof].
+
+- constraints:
+
+  None
+
+- Examples:
+  ```python
+  >>> a=np.random.uniform(0,1,(4,10)).astype(np.float16)
+  >>> b=np.random.uniform(0,1,(4,10)).astype(np.float16)
+  >>> box1=torch.from_numpy(a).to("npu")
+  >>> box2=torch.from_numpy(a).to("npu")
+  >>> output = torch.npu_giou(box1, box2, trans=True, is_cross=False, mode=0)
+  >>> output
+  tensor([[1.],
+          [1.],
+          [1.],
+          [1.],
+          [1.],
+          [1.],
+          [1.],
+          [1.],
+          [1.],
+          [1.]], device='npu:0', dtype=torch.float16)
+  ```
+
+> npu_silu(self) -> Tensor
+
+Computes the Swish (SiLU) of "x".
+
+- Parameters:
+
+  - **self** (Tensor) - A Tensor. Must be one of the following types: float16, float32.
+
+- constraints:
+
+  None
+
+- Examples:
+```python
+>>> a=torch.rand(2,8).npu()
+>>> output = torch.npu_silu(a)
+>>> output
+tensor([[0.4397, 0.7178, 0.5190, 0.2654, 0.2230, 0.2674, 0.6051, 0.3522],
+        [0.4679, 0.1764, 0.6650, 0.3175, 0.0530, 0.4787, 0.5621, 0.4026]],
+       device='npu:0')
+```
+
+> npu_reshape(self, shape, bool can_refresh=False) -> Tensor
+
+Reshapes a tensor. Only the tensor shape is changed, without changing the data.
+
+- Parameters:
+
+  - **self** (Tensor) - A Tensor.
+  - **shape** (ListInt) - Defines the shape of the output tensor.
+  - **can_refresh** (bool) - Specifies whether the reshape can be refreshed in place.
+
+- constraints:
+
+  This operator cannot be directly called by the aclopExecute API.
+
+- Examples:
+  ```python
+  >>> a=torch.rand(2,8).npu()
+  >>> out=torch.npu_reshape(a,(4,4))
+  >>> out
+  tensor([[0.6657, 0.9857, 0.7614, 0.4368],
+          [0.3761, 0.4397, 0.8609, 0.5544],
+          [0.7002, 0.3063, 0.9279, 0.5085],
+          [0.1009, 0.7133, 0.8118, 0.6193]], device='npu:0')
+  ```
+
+> npu_rotated_overlaps(self, query_boxes, trans=False) -> Tensor
+
+Calculates the overlapping area of rotated boxes.
+
+- Parameters:
+
+  - **self** (Tensor) - Bounding boxes, a 3D Tensor of type float32 with shape (B, 5, N).
+  - **query_boxes** (Tensor) - Query boxes, a 3D Tensor of type float32 with shape (B, 5, K).
+  - **trans** (bool) - An optional attr, true for 'xyxyt', false for 'xywht'.
+
+- constraints:
+
+  None
+
+- Examples:
+  ```python
+  >>> a=np.random.uniform(0,1,(1,3,5)).astype(np.float16)
+  >>> b=np.random.uniform(0,1,(1,2,5)).astype(np.float16)
+  >>> box1=torch.from_numpy(a).to("npu")
+  >>> box2=torch.from_numpy(a).to("npu")
+  >>> output = torch.npu_rotated_overlaps(box1, box2, trans=False)
+  >>> output
+  tensor([[[0.0000, 0.1562, 0.0000],
+           [0.1562, 0.3713, 0.0611],
+           [0.0000, 0.0611, 0.0000]]], device='npu:0', dtype=torch.float16)
+  ```
+
+> npu_rotated_iou(self, query_boxes, trans=False, mode=0, is_cross=True) -> Tensor
+
+Calculates the IOU of rotated boxes.
+
+- Parameters:
+
+  - **self** (Tensor) - Bounding boxes, a 3D Tensor of type float32 with shape (B, 5, N).
+  - **query_boxes** (Tensor) - Query boxes, a 3D Tensor of type float32 with shape (B, 5, K).
+  - **trans** (bool) - An optional attr, true for 'xyxyt', false for 'xywht'.
+  - **is_cross** (bool) - Cross calculation when true, one-to-one calculation when false.
+  - **mode** (Number) - Computation mode, a character string with the value range of [iou, iof, giou].
+
+- constraints:
+
+  None
+
+- Examples:
+  ```python
+  >>> a=np.random.uniform(0,1,(2,2,5)).astype(np.float16)
+  >>> b=np.random.uniform(0,1,(2,3,5)).astype(np.float16)
+  >>> box1=torch.from_numpy(a).to("npu")
+  >>> box2=torch.from_numpy(a).to("npu")
+  >>> output = torch.npu_rotated_iou(box1, box2, trans=False, mode=0, is_cross=True)
+  >>> output
+  tensor([[[3.3325e-01, 1.0162e-01],
+           [1.0162e-01, 1.0000e+00]],
+
+          [[0.0000e+00, 0.0000e+00],
+           [0.0000e+00, 5.9605e-08]]], device='npu:0', dtype=torch.float16)
+  ```
+
+> npu_rotated_box_encode(anchor_box, gt_bboxes, weight) -> Tensor
+
+Rotated bounding box encoding.
+
+- Parameters:
+
+  - **anchor_box** (Tensor) - the input tensor. Anchor boxes. A 3D Tensor with shape (B, 5, N). "B" indicates the batch size, "N" indicates the number of bounding boxes, and the value "5" refers to "x0", "x1", "y0", "y1" and "angle".
+  - **gt_bboxes** (Tensor) - A 3D Tensor of float32 (float16) with shape (B, 5, N).
+  - **weight** (Tensor) - A float list for "x0", "x1", "y0", "y1" and "angle". Defaults to [1.0, 1.0, 1.0, 1.0, 1.0].
+
+- constraints:
+
+  None
+
+- Examples:
+
+  ```python
+  >>> anchor_boxes = torch.tensor([[[30.69], [32.6], [45.94], [59.88], [-44.53]]], dtype=torch.float16).to("npu")
+  >>> gt_bboxes = torch.tensor([[[30.44], [18.72], [33.22], [45.56], [8.5]]], dtype=torch.float16).to("npu")
+  >>> weight = torch.tensor([1., 1., 1., 1., 1.], dtype=torch.float16).npu()
+  >>> out = torch.npu_rotated_box_encode(anchor_boxes, gt_bboxes, weight)
+  >>> out
+  tensor([[[-0.4253],
+           [-0.5166],
+           [-1.7021],
+           [-0.0162],
+           [ 1.1328]]], device='npu:0', dtype=torch.float16)
+  ```
+
+> npu_rotated_box_decode(anchor_boxes, deltas, weight) -> Tensor
+
+Rotated bounding box decoding.
+
+- Parameters:
+
+  - **anchor_box** (Tensor) - the input tensor. Anchor boxes. A 3D Tensor with shape (B, 5, N). "B" indicates the batch size, "N" indicates the number of bounding boxes, and the value "5" refers to "x0", "x1", "y0", "y1" and "angle".
+  - **deltas** (Tensor) - A 3D Tensor of float32 (float16) with shape (B, 5, N).
+  - **weight** (Tensor) - A float list for "x0", "x1", "y0", "y1" and "angle". Defaults to [1.0, 1.0, 1.0, 1.0, 1.0].
+
+- constraints:
+
+  None
+
+- Examples:
+
+  ```python
+  >>> anchor_boxes = torch.tensor([[[4.137],[33.72],[29.4], [54.06], [41.28]]], dtype=torch.float16).to("npu")
+  >>> deltas = torch.tensor([[[0.0244], [-1.992], [0.2109], [0.315], [-37.25]]], dtype=torch.float16).to("npu")
+  >>> weight = torch.tensor([1., 1., 1., 1., 1.], dtype=torch.float16).npu()
+  >>> out = torch.npu_rotated_box_decode(anchor_boxes, deltas, weight)
+  >>> out
+  tensor([[[  1.7861],
+           [-10.5781],
+           [ 33.0000],
+           [ 17.2969],
+           [-88.4375]]], device='npu:0', dtype=torch.float16)
+  ```
diff --git "a/docs/zh/PyTorch\345\256\211\350\243\205\346\214\207\345\215\227/PyTorch\345\256\211\350\243\205\346\214\207\345\215\227.md" "b/docs/zh/PyTorch\345\256\211\350\243\205\346\214\207\345\215\227/PyTorch\345\256\211\350\243\205\346\214\207\345\215\227.md"
index 3bf9d29807a6a6b8813cc7d9478fe45d59ef9b2b..449b41865960cde4a8640c594d00d863e3d7099e 100644
--- "a/docs/zh/PyTorch\345\256\211\350\243\205\346\214\207\345\215\227/PyTorch\345\256\211\350\243\205\346\214\207\345\215\227.md"
+++ "b/docs/zh/PyTorch\345\256\211\350\243\205\346\214\207\345\215\227/PyTorch\345\256\211\350\243\205\346\214\207\345\215\227.md"
@@ -11,7 +11,7 @@
 - [Installing "torch-\*.whl" reports that "torch 1.5.0xxxx" does not match the version "torchvision" depends on](#安装-torch--whl-提示-torch-1-5-0xxxx-与-torchvision-所依赖的版本不匹配md)

Introduction

-When preparing an environment for developing and running PyTorch-based models, you can choose to manually build and install the PyTorch framework modules on the server.
+When preparing an environment for developing and running PyTorch-based models, you can choose to manually build and install the PyTorch framework modules on the server.
 
 **Figure 1** Environment preparation flowchart
 
@@ -33,10 +33,16 @@
 
 #### Prerequisites
 
-- The CANN development or runtime environment has been installed. For details, see the "CANN Software Installation Guide".
-- CMake 3.12.0 or later has been installed. For the installation method, see [CMake installation](#CMake安装方法md).
-- gcc 7.3.0 or later has been installed. For how to install and use gcc 7.3.0, see [Installing gcc 7.3.0](#安装7-3-0版本gccmd).
-- Python 3.7.5, 3.8, or 3.9 has been installed.
+- The CANN development or runtime environment has been installed. For details, see the "CANN Software Installation Guide".
+
+- CMake 3.12.0 or later has been installed. For the installation method, see [CMake installation](#CMake安装方法md).
+
+- gcc 7.3.0 or later has been installed. For how to install and use gcc 7.3.0, see [Installing gcc 7.3.0](#安装7-3-0版本gccmd).
+
+- Python 3.7.5, 3.8, or 3.9 has been installed.
+
+- Note that torch 1.5 does not support building and installing with Python 3.9 (consistent with the official release); only torch 1.8.1 supports building and installing with Python 3.9.
+
 - Make sure the patch and git tools are installed in the environment. Taking the Ubuntu and CentOS systems as examples, the commands are as follows:
     - Ubuntu
 
@@ -146,7 +152,7 @@
     or
     bash build.sh --python=3.8
    or
-    bash build.sh --python=3.9
+    bash build.sh --python=3.9  # torch 1.5 does not support building with Python 3.9
     ```
 
    Specify the Python version in the environment for the build. The generated binary package is in the current dist directory, i.e., the "pytorch/pytorch/dist" folder.
diff --git "a/docs/zh/PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227/PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227.md" "b/docs/zh/PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227/PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227.md"
index bd5c1d664dfa0c6c6321bf95c1044a048df8d198..176c1f806ab0bfb07a71941ac5fbd0ae1f5030ca 100644
--- "a/docs/zh/PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227/PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227.md"
+++ "b/docs/zh/PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227/PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227.md"
@@ -13,9 +13,7 @@
 - [Performance tuning and analysis](#性能调优和分析md)
 - [Accuracy debugging](#精度调测md)
 - [Model saving and conversion](#模型保存与转换md)
-- [Samples](#样例说明md)
-    - [ResNet50 model porting sample](#ResNet50模型迁移示例md)
-    - [ShuffleNet model tuning sample](#ShuffleNet模型调优示例md)
+- [ShuffleNet model tuning sample](#ShuffleNet模型调优示例md)
 - [Reference](#参考信息md)
 - [FAQ](#FAQmd)
     - [Common software installation issues](#软件安装常见问题md)
@@ -2050,8 +2048,101 @@ def main():
 - Solution: reduce compilation, or avoid compiling the operator.
 - For operator compilation settings, see [Compilation option settings](#编译选项设置md).
+### End-to-end profiling tool (E2E prof) usage
-

Affinity library

+#### E2E prof tool introduction
+
+The E2E prof tool integrates the framework-level data obtained by the PyTorch profiling tool with the operator performance data obtained by the CANN prof tool, providing end-to-end performance analysis of models and operators.
+
+#### E2E prof usage tutorial
+
+Add a with statement to enable the E2E prof function:
+
+```
+with torch.npu.profile(profiler_result_path="./result", use_e2e_profiler=True):
+
+    model_train()
+```
+
+- profiler_result_path indicates the path where the prof results are saved; it defaults to the current path.
+- use_e2e_profiler indicates whether to enable the E2E prof function; it defaults to False (only the CANN prof function is enabled).
+
+(Because NPU operators can only execute after being compiled, to ensure the accuracy of the data it is recommended to run 10 steps first and perform the E2E prof operation after the tenth step; in general, profiling one or two steps is sufficient.)
+
+#### E2E prof result parsing
+
+The results obtained by the E2E prof tool are raw data and must be parsed before viewing.
+
+1. Taking the path in the tutorial above as an example, the tool creates a folder under the profiler_result_path path to save the raw data.![](figures/1.png)
+
+2. Switch to the ./result path shown above and execute the script:
+
+   ```
+   /usr/local/Ascend/ascend-toolkit/latest/toolkit/tools/profiler/bin/msprof --export=on --output=./
+   ```
+
+   - output: the path of the raw data.
+
+3. After the run completes, a timeline directory is output under the raw-data path, as shown below:
+
+   ![](figures/2.png)
+
+4. The timeline path contains the parsed performance data, which can be opened with chrome://tracing/.
+
+   1. Open chrome://tracing/ in the browser.
+
+   2. Click load and upload the file to view it.
+
+
+
+   An example of the content is shown below:
+
+
+
+   The example has four levels. From top to bottom, the first level (MsprofTx) is PyTorch framework data, the second level (AscendCL) is ACL-level data, the third level (Task Scheduler) is device data, and the fourth level (AI CPU) is AICPU data.
+
+#### E2E prof advanced settings
+
+By default, the E2E prof tool collects data at all of the levels above. Collecting the data itself also affects performance; if too much data is collected, the performance data will not be representative. Therefore, the E2E prof tool provides configurable options for fine-grained control over which levels of data are collected.
+
+```
+with torch.npu.profile(profiler_result_path="./results", use_e2e_profiler=True, config=torch.npu.profileConfig(ACL_PROF_ACL_API=True, ACL_PROF_TASK_TIME=True, ACL_PROF_AICORE_METRICS=True, ACL_PROF_AICPU=True, ACL_PROF_L2CACHE=True, ACL_PROF_HCCL_TRACE=True, ACL_PROF_TRAINING_TRACE=True, aiCoreMetricsType=0)):
+```
+
+- ACL_PROF_ACL_API: collects performance data of AscendCL interfaces. Default: True.
+
+- ACL_PROF_TASK_TIME: collects the execution time of AI Core operators. Default: True.
+
+- ACL_PROF_AICORE_METRICS: collects AI Core performance-metric data; only the metric items configured in aicore_metrics are valid. Default: True.
+
+- ACL_PROF_AICPU (0x0008): collects the start and end trace data of AI CPU tasks. Default: True.
+
+- ACL_PROF_L2CACHE: collects L2 Cache data. Default: True.
+
+- ACL_PROF_HCCL_TRACE: collects HCCL data. Default: True.
+
+- ACL_PROF_TRAINING_TRACE: collects iteration-trace data, recording the forward and backward steps of the model. Default: True.
+
+The values of aiCoreMetricsType are defined as follows; the default is 0:
+
+- ACL_AICORE_ARITHMETIC_UTILIZATION = 0: ratios of the compute-class metrics, including the collection items mac_fp16_ratio, mac_int8_ratio, vec_fp32_ratio, vec_fp16_ratio, vec_int32_ratio, and vec_misc_ratio.
+
+- ACL_AICORE_PIPE_UTILIZATION = 1: time ratios of the compute units and the data-movement units, including the collection items vec_ratio, mac_ratio, scalar_ratio, mte1_ratio, mte2_ratio, mte3_ratio, and icache_miss_rate.
+
+- ACL_AICORE_MEMORY_BANDWIDTH = 2: ratios of external memory read/write instructions, including the collection items ub_read_bw, ub_write_bw, l1_read_bw, l1_write_bw, l2_read_bw, l2_write_bw, main_mem_read_bw, and main_mem_write_bw.
+
+- ACL_AICORE_L0B_AND_WIDTH: ratios of internal memory read/write instructions, including the collection items scalar_ld_ratio, scalar_st_ratio, l0a_read_bw, l0a_write_bw, l0b_read_bw, l0b_write_bw, l0c_read_bw, and l0c_write_bw.
+
+- ACL_AICORE_RESOURCE_CONFLICT_RATIO: ratios of pipeline-queue-class instructions, including the collection items vec_bankgroup_cflt_ratio, vec_bank_cflt_ratio, vec_resc_cflt_ratio, mte1_iq_full_ratio, mte2_iq_full_ratio, mte3_iq_full_ratio, cube_iq_full_ratio, vec_iq_full_ratio, and iq_full_ratio.
+
+- ACL_AICORE_NONE = 0xFF: no collection.
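+
+Putting the steps above together, the tutorial recommends warming up for about ten steps before profiling one or two steps. The sketch below is a minimal illustration of that flow; `train_one_step` is a hypothetical helper standing in for one training iteration and is not part of this document, while `torch.npu.profile` is used exactly as in the examples above.
+
+```python
+for step in range(10):
+    train_one_step()  # hypothetical helper: warm-up steps so NPU operators get compiled first
+
+# profile only one or two steps after the warm-up
+with torch.npu.profile(profiler_result_path="./result", use_e2e_profiler=True):
+    train_one_step()
+```
+
+### Affinity library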

Source introduction

diff --git "a/docs/zh/PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227/figures/1.png" "b/docs/zh/PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227/figures/1.png" new file mode 100644 index 0000000000000000000000000000000000000000..1c7c3c517beb810563232e71e93698a74106fc09 Binary files /dev/null and "b/docs/zh/PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227/figures/1.png" differ diff --git "a/docs/zh/PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227/figures/2.png" "b/docs/zh/PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227/figures/2.png" new file mode 100644 index 0000000000000000000000000000000000000000..927040832dcc49ff15f5a0d0e635179201e9b3a4 Binary files /dev/null and "b/docs/zh/PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227/figures/2.png" differ diff --git "a/docs/zh/PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227/figures/3.png" "b/docs/zh/PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227/figures/3.png" new file mode 100644 index 0000000000000000000000000000000000000000..ea9ce0c03c7dcdfd0a2042d8fc98378befbf0f8b Binary files /dev/null and "b/docs/zh/PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227/figures/3.png" differ diff --git "a/docs/zh/PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227/figures/chrometracing.png" "b/docs/zh/PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227/figures/chrometracing.png" new file mode 100644 index 0000000000000000000000000000000000000000..47532e82e270b0f2bd3f81e3b8315dfd6c95bf56 Binary files /dev/null and "b/docs/zh/PyTorch\347\275\221\347\273\234\346\250\241\345\236\213\347\247\273\346\244\215&\350\256\255\347\273\203\346\214\207\345\215\227/figures/chrometracing.png" differ