python, c, cuda, pycuda and embedded python
python, cpp, cuda, embedded python conversion sample code
- see practice_004_c_cuda_pycuda
- performance numpy (internally c vectorization) > cuda > cpp (for)
- need add embedded python
- need add npp
python
# add constant, multipy constant, add matrix, multipy matrix
# numpy
import os, copy
import numpy as np
pnsz = np.asarray([300, 1024, 760], dtype = np.int32)
mul = np.random.randn()
add = np.random.randn()
src = np.random.randn(pnsz[0], pnsz[1], pnsz[2]).astype(dtype=np.float32)
## numpy in CPU
src_numpy = copy.deepcopy(src)
dst_numpy = src_numpy
dst_numpy = dst_numpy * mul
dst_numpy = dst_numpy + add
dst_numpy = dst_numpy * dst_numpy
dst_numpy = dst_numpy + dst_numpy
cpp
- math_clang.c
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
/*
* Multiplies each element of the source array by a constant value.
*
* @param pddst Destination array to store the result.
* @param pdsrc Source array to be multiplied.
* @param dconst Constant value to multiply each element.
* @param pnsz Integer array containing the dimensions [nx, ny, nz].
*
* p represents pointer
* d represents data
*/
void mul_const(float *pddst, float *pdsrc, float dconst, int *pnsz) {
int nx = pnsz[0];
int ny = pnsz[1];
int nz = pnsz[2];
int id = 0;
for (int idz = 0; idz < nz; idz++) {
for (int idy = 0; idy < ny; idy++) {
for (int idx = 0; idx < nx; idx++) {
id = ny * nx * idz + nx * idy + idx;
pddst[id] = pdsrc[id] * dconst;
}
}
}
return ;
}
/*
* Adds a constant value to each element of the source array.
*
* @param pddst Destination array to store the result.
* @param pdsrc Source array to which the constant is added.
* @param dconst Constant value to be added to each element.
* @param pnsz Integer array containing the dimensions [nx, ny, nz].
*/
void add_const(float *pddst, float *pdsrc, float dconst, int *pnsz) {
int nx = pnsz[0];
int ny = pnsz[1];
int nz = pnsz[2];
int id = 0;
for (int idz = 0; idz < nz; idz++) {
for (int idy = 0; idy < ny; idy++) {
for (int idx = 0; idx < nx; idx++) {
id = ny * nx * idz + nx * idy + idx;
pddst[id] = pdsrc[id] + dconst;
}
}
}
return ;
}
/*
* Multiplies two arrays element-wise and stores the result in the destination array.
*
* @param pddst Destination array to store the result.
* @param pdsrc Source array to be multiplied with another source array.
* @param pdsrc2 Second source array for element-wise multiplication.
* @param pnsz Integer array containing the dimensions [nx, ny, nz].
*/
void mul_mat(float *pddst, float *pdsrc, float *pdsrc2, int *pnsz) {
int nx = pnsz[0];
int ny = pnsz[1];
int nz = pnsz[2];
int id = 0;
for (int idz = 0; idz < nz; idz++) {
for (int idy = 0; idy < ny; idy++) {
for (int idx = 0; idx < nx; idx++) {
id = ny * nx * idz + nx * idy + idx;
pddst[id] = pdsrc[id] * pdsrc2[id];
}
}
}
return ;
}
/*
* Adds two arrays element-wise and stores the result in the destination array.
*
* @param pddst Destination array to store the result.
* @param pdsrc Source array to which the second source array is added.
* @param pdsrc2 Second source array for element-wise addition.
* @param pnsz Integer array containing the dimensions [nx, ny, nz].
*/
void add_mat(float *pddst, float *pdsrc, float *pdsrc2, int *pnsz) {
int nx = pnsz[0];
int ny = pnsz[1];
int nz = pnsz[2];
int id = 0;
for (int idz = 0; idz < nz; idz++) {
for (int idy = 0; idy < ny; idy++) {
for (int idx = 0; idx < nx; idx++) {
id = ny * nx * idz + nx * idy + idx;
pddst[id] = pdsrc[id] + pdsrc2[id];
}
}
}
return ;
}
- compile math_clang.c -> math_clang.o -> math_clang.so
- gcc 설치 필요 (msys2 or other install managers)
gcc -c -fPIC math_clang.c -o math_clang.o
gcc -shared math_clang.o -o libmath_clang.so
- .py
import os, copy
import ctypes
from ctypes import *
import sys
import time
if sys.platform.startswith('win'):
print('platform is window')
platform = 'window'
elif sys.platform.startswith('linux'):
print('platform is linux')
platform = 'linux'
else:
print('platform is others')
platform = 'other platform'
pnsz = np.asarray([300, 1024, 760], dtype = np.int32)
mul = np.random.randn()
add = np.random.randn()
src = np.random.randn(pnsz[0], pnsz[1], pnsz[2]).astype(dtype=np.float32)
## Clang in CPU
if platform == 'window':
clang_file = os.path.join(os.path.dirname(__file__), 'libmath_clang.so')
elif platform == 'linux':
clang_file = os.path.join(os.getcwd(), 'libmath_clang.so')
_math_clang = ctypes.CDLL(clang_file)
__mul_const_clang = _math_clang.mul_const
__add_const_clang = _math_clang.add_const
__mul_mat_clang = _math_clang.mul_mat
__add_mat_clang = _math_clang.add_mat
## 초기화, 타입 변환에 주의한다.
# init
__mul_const_clang.argtypes = (POINTER(c_float), POINTER(c_float), c_float, POINTER(c_int)) # argument
__mul_const_clang.restypes = c_void_p # return variable
__add_const_clang.argtypes = (POINTER(c_float), POINTER(c_float), c_float, POINTER(c_int)) # argument
__add_const_clang.restypes = c_void_p # return variable
__mul_mat_clang.argtypes = (POINTER(c_float), POINTER(c_float), POINTER(c_float), POINTER(c_int)) # argument
__mul_mat_clang.restypes = c_void_p # return variable
__add_mat_clang.argtypes = (POINTER(c_float), POINTER(c_float), POINTER(c_float), POINTER(c_int)) # argument
__add_mat_clang.restypes = c_void_p # return variable
c_float_p = lambda x: x.ctypes.data_as(POINTER(c_float))
c_int_p = lambda x: x.ctypes.data_as(POINTER(c_int))
##
src_clang = copy.deepcopy(src)
dst_clang = src_clang
__mul_const_clang(c_float_p(dst_clang), c_float_p(dst_clang), mul, c_int_p(pnsz))
__add_const_clang(c_float_p(dst_clang), c_float_p(dst_clang), add, c_int_p(pnsz))
__mul_mat_clang(c_float_p(dst_clang), c_float_p(dst_clang), c_float_p(dst_clang), c_int_p(pnsz))
__add_mat_clang(c_float_p(dst_clang), c_float_p(dst_clang), c_float_p(dst_clang), c_int_p(pnsz))
cuda (.cu)
- .cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <cuda.h>
// #define DLLEXPORT extern "C" __declspec(dllexport) // window
__global__ void mul_const_kernel(float *pddst, float *pdsrc, float dconst, int *pnsz) {
int nx = pnsz[0];
int ny = pnsz[1];
int nz = pnsz[2];
int id = 0;
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int idy = blockDim.y * blockIdx.y + threadIdx.y;
int idz = blockDim.z * blockIdx.z + threadIdx.z;
if (idz >= nz || idy >= ny || idx >= nx) return;
id = ny * nx * idz + nx * idy + idx;
pddst[id] = pdsrc[id] * dconst;
return ;
}
__global__ void add_const_kernel(float *pddst, float *pdsrc, float dconst, int *pnsz) {
int nx = pnsz[0];
int ny = pnsz[1];
int nz = pnsz[2];
int id = 0;
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int idy = blockDim.y * blockIdx.y + threadIdx.y;
int idz = blockDim.z * blockIdx.z + threadIdx.z;
if (idz >= nz || idy >= ny || idx >= nx) return;
id = ny * nx * idz + nx * idy + idx;
pddst[id] = pdsrc[id] + dconst;
return ;
}
__global__ void mul_mat_kernel(float *pddst, float *pdsrc, float *pdsrc2, int *pnsz) {
int nx = pnsz[0];
int ny = pnsz[1];
int nz = pnsz[2];
int id = 0;
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int idy = blockDim.y * blockIdx.y + threadIdx.y;
int idz = blockDim.z * blockIdx.z + threadIdx.z;
if (idz >= nz || idy >= ny || idx >= nx) return;
id = ny * nx * idz + nx * idy + idx;
pddst[id] = pdsrc[id] * pdsrc2[id];
return ;
}
__global__ void add_mat_kernel(float *pddst, float *pdsrc, float *pdsrc2, int *pnsz) {
int nx = pnsz[0];
int ny = pnsz[1];
int nz = pnsz[2];
int id = 0;
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int idy = blockDim.y * blockIdx.y + threadIdx.y;
int idz = blockDim.z * blockIdx.z + threadIdx.z;
if (idz >= nz || idy >= ny || idx >= nx) return;
id = ny * nx * idz + nx * idy + idx;
pddst[id] = pdsrc[id] + pdsrc2[id];
return ;
}
// cpu interface
// DLLEXPORT void mul_const(float *pddst, float *pdsrc, float dconst, int *pnsz){ // window only expression
extern "C" void mul_const(float *pddst, float *pdsrc, float dconst, int *pnsz){ // available on linux, window
float *gpddst = 0;
float *gpdsrc = 0;
int *gpnsz = 0;
cudaMalloc((void **)&gpddst, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float));
cudaMalloc((void **)&gpdsrc, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float));
cudaMalloc((void **)&gpnsz, 3*sizeof(int));
cudaMemset(gpddst, 0, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float));
cudaMemset(gpdsrc, 0, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float));
cudaMemset(gpnsz, 0, 3*sizeof(float));
cudaMemcpy(gpdsrc, pdsrc, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float), cudaMemcpyHostToDevice); // destination, source, memory size, direction
cudaMemcpy(gpnsz, pnsz, 3*sizeof(int), cudaMemcpyHostToDevice); // destination, source, memory size, direction
int nthread = 8;
dim3 nblock(nthread, nthread, nthread);
dim3 ngrid ((pnsz[0] + nthread - 1) / nthread,
(pnsz[1] + nthread - 1) / nthread,
(pnsz[2] + nthread - 1) / nthread);
mul_const_kernel<<<ngrid, nblock>>>(gpddst, gpdsrc, dconst, gpnsz);
cudaMemcpy(pddst, gpddst, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(gpddst);
cudaFree(gpdsrc);
cudaFree(gpnsz);
gpddst = 0;
gpdsrc = 0;
gpnsz = 0;
return ;
}
// DLLEXPORT void add_const(float *pddst, float *pdsrc, float dconst, int *pnsz){ // window only
extern "C" void add_const(float *pddst, float *pdsrc, float dconst, int *pnsz){ // available on window and linux
float *gpddst = 0;
float *gpdsrc = 0;
int *gpnsz = 0;
cudaMalloc((void **)&gpddst, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float));
cudaMalloc((void **)&gpdsrc, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float));
cudaMalloc((void **)&gpnsz, 3*sizeof(int));
cudaMemset(gpddst, 0, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float));
cudaMemset(gpdsrc, 0, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float));
cudaMemset(gpnsz, 0, 3*sizeof(float));
cudaMemcpy(gpdsrc, pdsrc, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float), cudaMemcpyHostToDevice); // destination, source, memory size, direction
cudaMemcpy(gpnsz, pnsz, 3*sizeof(int), cudaMemcpyHostToDevice); // destination, source, memory size, direction
int nthread = 8;
dim3 nblock(nthread, nthread, nthread);
dim3 ngrid ((pnsz[0] + nthread - 1) / nthread,
(pnsz[1] + nthread - 1) / nthread,
(pnsz[2] + nthread - 1) / nthread);
add_const_kernel<<<ngrid, nblock>>>(gpddst, gpdsrc, dconst, gpnsz);
cudaMemcpy(pddst, gpddst, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(gpddst);
cudaFree(gpdsrc);
cudaFree(gpnsz);
gpddst = 0;
gpdsrc = 0;
gpnsz = 0;
return ;
}
// DLLEXPORT void mul_mat(float *pddst, float *pdsrc1, float *pdsrc2, int *pnsz){ // window only
extern "C" void mul_mat(float *pddst, float *pdsrc1, float *pdsrc2, int *pnsz){ // available on window and linux
float *gpddst = 0;
float *gpdsrc1 = 0;
float *gpdsrc2 = 0;
int *gpnsz = 0;
cudaMalloc((void **)&gpddst, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float));
cudaMalloc((void **)&gpdsrc1, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float));
cudaMalloc((void **)&gpdsrc2, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float));
cudaMalloc((void **)&gpnsz, 3*sizeof(int));
cudaMemset(gpddst, 0, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float));
cudaMemset(gpdsrc1, 0, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float));
cudaMemset(gpdsrc2, 0, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float));
cudaMemset(gpnsz, 0, 3*sizeof(int));
cudaMemcpy(gpdsrc1, pdsrc1, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float), cudaMemcpyHostToDevice); // destination, source, memory size, direction
cudaMemcpy(gpdsrc2, pdsrc2, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float), cudaMemcpyHostToDevice); // destination, source, memory size, direction
cudaMemcpy(gpnsz, pnsz, 3*sizeof(int), cudaMemcpyHostToDevice); // destination, source, memory size, direction
int nthread = 8;
dim3 nblock(nthread, nthread, nthread);
dim3 ngrid ((pnsz[0] + nthread - 1) / nthread,
(pnsz[1] + nthread - 1) / nthread,
(pnsz[2] + nthread - 1) / nthread);
mul_mat_kernel<<<ngrid, nblock>>>(gpddst, gpdsrc1, gpdsrc2, gpnsz);
cudaMemcpy(pddst, gpddst, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(gpddst);
cudaFree(gpdsrc1);
cudaFree(gpdsrc2);
cudaFree(gpnsz);
gpddst = 0;
gpdsrc1 = 0;
gpdsrc2 = 0;
gpnsz = 0;
return ;
}
// DLLEXPORT void add_mat(float *pddst, float *pdsrc1, float *pdsrc2, int *pnsz){ // window only
extern "C" void add_mat(float *pddst, float *pdsrc1, float *pdsrc2, int *pnsz){ // available on window and linux
float *gpddst = 0;
float *gpdsrc1 = 0;
float *gpdsrc2 = 0;
int *gpnsz = 0;
cudaMalloc((void **)&gpddst, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float));
cudaMalloc((void **)&gpdsrc1, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float));
cudaMalloc((void **)&gpdsrc2, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float));
cudaMalloc((void **)&gpnsz, 3*sizeof(int));
cudaMemset(gpddst, 0, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float));
cudaMemset(gpdsrc1, 0, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float));
cudaMemset(gpdsrc2, 0, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float));
cudaMemset(gpnsz, 0, 3*sizeof(int));
cudaMemcpy(gpdsrc1, pdsrc1, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float), cudaMemcpyHostToDevice); // destination, source, memory size, direction
cudaMemcpy(gpdsrc2, pdsrc2, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float), cudaMemcpyHostToDevice); // destination, source, memory size, direction
cudaMemcpy(gpnsz, pnsz, 3*sizeof(int), cudaMemcpyHostToDevice); // destination, source, memory size, direction
int nthread = 8;
dim3 nblock(nthread, nthread, nthread);
dim3 ngrid ((pnsz[0] + nthread - 1) / nthread,
(pnsz[1] + nthread - 1) / nthread,
(pnsz[2] + nthread - 1) / nthread);
add_mat_kernel<<<ngrid, nblock>>>(gpddst, gpdsrc1, gpdsrc2, gpnsz);
cudaMemcpy(pddst, gpddst, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(gpddst);
cudaFree(gpdsrc1);
cudaFree(gpdsrc2);
cudaFree(gpnsz);
gpddst = 0;
gpdsrc1 = 0;
gpdsrc2 = 0;
gpnsz = 0;
return ;
}
- compile math_cu_linux.cu -> libmath_cu.so
nvcc -Xcompiler -fPIC math_cu_linux.cu -shared -o libmath_cu.so
# nvcc error : 'cudafe++' died with status 0xC0000005 (ACCESS_VIOLATION).. need check compiler
- .py
## python, c, cu
import os, copy
import numpy as np
import ctypes
from ctypes import *
import sys
import time
if sys.platform.startswith('win'):
print('platform is window')
platform = 'window'
elif sys.platform.startswith('linux'):
print('platform is linux')
platform = 'linux'
else:
print('platform is others')
platform = 'other platform'
pnsz = np.asarray([300, 1024, 760], dtype = np.int32)
mul = np.random.randn()
add = np.random.randn()
src = np.random.randn(pnsz[0], pnsz[1], pnsz[2]).astype(dtype=np.float32)
## CU in GPU
if platform == 'window':
cu_file = os.path.join(os.path.dirname(__file__), 'libmath_cu.so')
elif platform == 'linux':
cu_file = os.path.join(os.getcwd(), 'libmath_cu.so')
_math_cu = ctypes.CDLL(cu_file)
__mul_const_cu = _math_cu.mul_const
__add_const_cu = _math_cu.add_const
__mul_mat_cu = _math_cu.mul_mat
__add_mat_cu = _math_cu.add_mat
__mul_const_cu.argtypes = (POINTER(c_float), POINTER(c_float), c_float, POINTER(c_int))
__mul_const_cu.restypes = c_void_p
__add_const_cu.argtypes = (POINTER(c_float), POINTER(c_float), c_float, POINTER(c_int))
__add_const_cu.restypes = c_void_p
__mul_mat_cu.argtypes = (POINTER(c_float), POINTER(c_float), POINTER(c_float), POINTER(c_int))
__mul_mat_cu.restypes = c_void_p
__add_mat_cu.argtypes = (POINTER(c_float), POINTER(c_float), POINTER(c_float), POINTER(c_int))
__add_mat_cu.restypes = c_void_p
c_float_p = lambda x: x.ctypes.data_as(POINTER(c_float))
c_int_p = lambda x: x.ctypes.data_as(POINTER(c_int))
pre_time_cu = time.time()
src_cu = copy.deepcopy(src)
dst_cu = src_cu
__mul_const_cu(c_float_p(dst_cu), c_float_p(dst_cu), mul, c_int_p(pnsz))
__add_const_cu(c_float_p(dst_cu), c_float_p(dst_cu), add, c_int_p(pnsz))
__mul_mat_cu(c_float_p(dst_cu), c_float_p(dst_cu), c_float_p(dst_cu), c_int_p(pnsz))
__add_mat_cu(c_float_p(dst_cu), c_float_p(dst_cu), c_float_p(dst_cu), c_int_p(pnsz))
pycuda
- .cu에서는 interface를 위한 코드가 있지만 여기선 없다.
- memcopy를 cuda.in, cuda.out으로 간단히 처리했다.
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy
import copy
import time
# 주어진 커널 코드
kernel_code = """
__global__ void mul_const_kernel(float *pddst, float *pdsrc, float dconst, int *pnsz) {
int nx = pnsz[0];
int ny = pnsz[1];
int nz = pnsz[2];
int id = 0;
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int idy = blockDim.y * blockIdx.y + threadIdx.y;
int idz = blockDim.z * blockIdx.z + threadIdx.z;
if (idz >= nz || idy >= ny || idx >= nx) return;
id = ny * nx * idz + nx * idy + idx;
pddst[id] = pdsrc[id] * dconst;
return ;
}
__global__ void add_const_kernel(float *pddst, float *pdsrc, float dconst, int *pnsz) {
int nx = pnsz[0];
int ny = pnsz[1];
int nz = pnsz[2];
int id = 0;
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int idy = blockDim.y * blockIdx.y + threadIdx.y;
int idz = blockDim.z * blockIdx.z + threadIdx.z;
if (idz >= nz || idy >= ny || idx >= nx) return;
id = ny * nx * idz + nx * idy + idx;
pddst[id] = pdsrc[id] + dconst;
return ;
}
__global__ void mul_mat_kernel(float *pddst, float *pdsrc, float *pdsrc2, int *pnsz) {
int nx = pnsz[0];
int ny = pnsz[1];
int nz = pnsz[2];
int id = 0;
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int idy = blockDim.y * blockIdx.y + threadIdx.y;
int idz = blockDim.z * blockIdx.z + threadIdx.z;
if (idz >= nz || idy >= ny || idx >= nx) return;
id = ny * nx * idz + nx * idy + idx;
pddst[id] = pdsrc[id] * pdsrc2[id];
return ;
}
__global__ void add_mat_kernel(float *pddst, float *pdsrc, float *pdsrc2, int *pnsz) {
int nx = pnsz[0];
int ny = pnsz[1];
int nz = pnsz[2];
int id = 0;
int idx = blockDim.x * blockIdx.x + threadIdx.x;
int idy = blockDim.y * blockIdx.y + threadIdx.y;
int idz = blockDim.z * blockIdx.z + threadIdx.z;
if (idz >= nz || idy >= ny || idx >= nx) return;
id = ny * nx * idz + nx * idy + idx;
pddst[id] = pdsrc[id] + pdsrc2[id];
return ;
}
"""
# 커널 코드를 컴파일
module = SourceModule(kernel_code)
mul_const_kernel = module.get_function("mul_const_kernel")
add_const_kernel = module.get_function("add_const_kernel")
mul_mat_kernel = module.get_function("mul_mat_kernel")
add_mat_kernel = module.get_function("add_mat_kernel")
pnsz = numpy.int32(numpy.array([300, 1024, 760]))
mul = numpy.float32(numpy.random.randn())
# mul = numpy.float32(3.0)
add = numpy.float32(numpy.random.randn())
# add = numpy.float32(5.0)
# src1 = numpy.float32(numpy.ones((pnsz[0], pnsz[1], pnsz[2]))*4)
src1 = numpy.float32(numpy.random.randn(pnsz[0], pnsz[1], pnsz[2]))
# src2 = numpy.float32(numpy.ones((pnsz[0], pnsz[1], pnsz[2]))*2)
src2 = numpy.float32(numpy.random.randn(pnsz[0], pnsz[1], pnsz[2]))
dst = numpy.float32(numpy.zeros_like(src1))
## numpy in CPU
src_numpy = copy.deepcopy(src1)
pre_time_numpy = time.time()
dst_numpy = src_numpy
dst_numpy = dst_numpy * mul
dst_numpy = dst_numpy + add
dst_numpy = dst_numpy * dst_numpy
dst_numpy = dst_numpy + dst_numpy
## cuda
block_size = (32, 8, 4)
# GTX 960: warp size 32 * streaming multiprocessor 8 * scheduler 4 = 1024
# RTX 3060: warp size 32 * streaming multiprocessor 28 * scheduler 4 = 3584 (cuda core)
grid_size = (int(pnsz[0] // block_size[0] + 1), int(pnsz[1] // block_size[1] + 1), int(pnsz[2] // block_size[2] + 1))
pre_time_pycuda = time.time()
mul_const_kernel(cuda.Out(dst), cuda.In(src1), numpy.float32(mul), cuda.In(pnsz), block=block_size, grid=grid_size)
add_const_kernel(cuda.Out(dst), cuda.In(dst), numpy.float32(add), cuda.In(pnsz), block=block_size, grid=grid_size)
mul_mat_kernel(cuda.Out(dst), cuda.In(dst), cuda.In(dst), cuda.In(pnsz), block=block_size, grid=grid_size)
add_mat_kernel(cuda.Out(dst), cuda.In(dst), cuda.In(dst), cuda.In(pnsz), block=block_size, grid=grid_size)