python, c, cuda, pycuda and embedded python

python, cpp, cuda, embedded python conversion sample code

see practice_004_c_cuda_pycuda
performance numpy (internally c vectorization) > cuda > cpp (for)
need add embedded python
need add npp

python

# add constant, multipy constant, add matrix, multipy matrix
# numpy
import os, copy
import numpy as np

pnsz = np.asarray([300, 1024, 760], dtype = np.int32)
mul = np.random.randn()
add = np.random.randn()
src = np.random.randn(pnsz[0], pnsz[1], pnsz[2]).astype(dtype=np.float32)

## numpy in CPU
src_numpy = copy.deepcopy(src)
dst_numpy = src_numpy
dst_numpy = dst_numpy * mul
dst_numpy = dst_numpy + add
dst_numpy = dst_numpy * dst_numpy
dst_numpy = dst_numpy + dst_numpy

cpp

math_clang.c

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>

/*
 * Multiplies each element of the source array by a constant value.
 * 
 * @param pddst     Destination array to store the result.
 * @param pdsrc     Source array to be multiplied.
 * @param dconst    Constant value to multiply each element.
 * @param pnsz      Integer array containing the dimensions [nx, ny, nz].
 * 
 * p represents pointer
 * d represents data
 */
void mul_const(float *pddst, float *pdsrc, float dconst, int *pnsz) {
    int nx = pnsz[0];
    int ny = pnsz[1];
    int nz = pnsz[2];
    int id = 0;

    for (int idz = 0; idz < nz; idz++) {
        for (int idy = 0; idy < ny; idy++) {
            for (int idx = 0; idx < nx; idx++) {
                id = ny * nx * idz + nx * idy + idx;
                pddst[id] = pdsrc[id] * dconst;
            }
        }
    }
    
    return ;
}

/*
 * Adds a constant value to each element of the source array.
 * 
 * @param pddst     Destination array to store the result.
 * @param pdsrc     Source array to which the constant is added.
 * @param dconst    Constant value to be added to each element.
 * @param pnsz      Integer array containing the dimensions [nx, ny, nz].
 */
void add_const(float *pddst, float *pdsrc, float dconst, int *pnsz) {
    int nx = pnsz[0];
    int ny = pnsz[1];
    int nz = pnsz[2];
    int id = 0;

    for (int idz = 0; idz < nz; idz++) {
        for (int idy = 0; idy < ny; idy++) {
            for (int idx = 0; idx < nx; idx++) {
                id = ny * nx * idz + nx * idy + idx;
                pddst[id] = pdsrc[id] + dconst;
            }
        }
    }
    
    return ;

}

/*
 * Multiplies two arrays element-wise and stores the result in the destination array.
 * 
 * @param pddst     Destination array to store the result.
 * @param pdsrc     Source array to be multiplied with another source array.
 * @param pdsrc2    Second source array for element-wise multiplication.
 * @param pnsz      Integer array containing the dimensions [nx, ny, nz].
 */
void mul_mat(float *pddst, float *pdsrc, float *pdsrc2, int *pnsz) {
    int nx = pnsz[0];
    int ny = pnsz[1];
    int nz = pnsz[2];
    int id = 0;

    for (int idz = 0; idz < nz; idz++) {
        for (int idy = 0; idy < ny; idy++) {
            for (int idx = 0; idx < nx; idx++) {
                id = ny * nx * idz + nx * idy + idx;
                pddst[id] = pdsrc[id] * pdsrc2[id];
            }
        }
    }
    
    return ;

}

/*
 * Adds two arrays element-wise and stores the result in the destination array.
 * 
 * @param pddst     Destination array to store the result.
 * @param pdsrc     Source array to which the second source array is added.
 * @param pdsrc2    Second source array for element-wise addition.
 * @param pnsz      Integer array containing the dimensions [nx, ny, nz].
 */
void add_mat(float *pddst, float *pdsrc, float *pdsrc2, int *pnsz) {
    int nx = pnsz[0];
    int ny = pnsz[1];
    int nz = pnsz[2];
    int id = 0;

    for (int idz = 0; idz < nz; idz++) {
        for (int idy = 0; idy < ny; idy++) {
            for (int idx = 0; idx < nx; idx++) {
                id = ny * nx * idz + nx * idy + idx;
                pddst[id] = pdsrc[id] + pdsrc2[id];
            }
        }
    }
    
    return ;

}

compile math_clang.c -> math_clang.o -> math_clang.so
gcc 설치 필요 (msys2 or other install managers)

gcc -c -fPIC math_clang.c -o math_clang.o
gcc -shared math_clang.o -o libmath_clang.so

import os, copy
import ctypes
from ctypes import *
import sys
import time

if sys.platform.startswith('win'): 
    print('platform is window')
    platform = 'window'
elif sys.platform.startswith('linux'): 
    print('platform is linux')
    platform = 'linux'
else: 
    print('platform is others')
    platform = 'other platform'

pnsz = np.asarray([300, 1024, 760], dtype = np.int32)
mul = np.random.randn()
add = np.random.randn()
src = np.random.randn(pnsz[0], pnsz[1], pnsz[2]).astype(dtype=np.float32)

## Clang in CPU
if platform == 'window':
    clang_file = os.path.join(os.path.dirname(__file__), 'libmath_clang.so')
elif platform == 'linux':
    clang_file = os.path.join(os.getcwd(), 'libmath_clang.so')

_math_clang = ctypes.CDLL(clang_file)

__mul_const_clang = _math_clang.mul_const
__add_const_clang = _math_clang.add_const
__mul_mat_clang = _math_clang.mul_mat
__add_mat_clang = _math_clang.add_mat

## 초기화, 타입 변환에 주의한다.
# init
__mul_const_clang.argtypes = (POINTER(c_float), POINTER(c_float), c_float, POINTER(c_int)) # argument
__mul_const_clang.restypes = c_void_p # return variable
__add_const_clang.argtypes = (POINTER(c_float), POINTER(c_float), c_float, POINTER(c_int)) # argument
__add_const_clang.restypes = c_void_p # return variable
__mul_mat_clang.argtypes = (POINTER(c_float), POINTER(c_float), POINTER(c_float), POINTER(c_int)) # argument
__mul_mat_clang.restypes = c_void_p # return variable
__add_mat_clang.argtypes = (POINTER(c_float), POINTER(c_float), POINTER(c_float), POINTER(c_int)) # argument
__add_mat_clang.restypes = c_void_p # return variable

c_float_p = lambda x: x.ctypes.data_as(POINTER(c_float))
c_int_p = lambda x: x.ctypes.data_as(POINTER(c_int))

##
src_clang = copy.deepcopy(src)
dst_clang = src_clang
__mul_const_clang(c_float_p(dst_clang), c_float_p(dst_clang), mul, c_int_p(pnsz))
__add_const_clang(c_float_p(dst_clang), c_float_p(dst_clang), add, c_int_p(pnsz))
__mul_mat_clang(c_float_p(dst_clang), c_float_p(dst_clang), c_float_p(dst_clang), c_int_p(pnsz))
__add_mat_clang(c_float_p(dst_clang), c_float_p(dst_clang), c_float_p(dst_clang), c_int_p(pnsz))

cuda (.cu)

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <cuda.h>
// #define DLLEXPORT extern "C" __declspec(dllexport) // window


__global__ void mul_const_kernel(float *pddst, float *pdsrc, float dconst, int *pnsz) {
    int nx = pnsz[0];
    int ny = pnsz[1];
    int nz = pnsz[2];
    int id = 0;

    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    int idy = blockDim.y * blockIdx.y + threadIdx.y;
    int idz = blockDim.z * blockIdx.z + threadIdx.z;

    if (idz >= nz || idy >= ny || idx >= nx) return;

    id = ny * nx * idz + nx * idy + idx;
    pddst[id] = pdsrc[id] * dconst;

    
    return ;
}


__global__ void add_const_kernel(float *pddst, float *pdsrc, float dconst, int *pnsz) {
    int nx = pnsz[0];
    int ny = pnsz[1];
    int nz = pnsz[2];
    int id = 0;

    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    int idy = blockDim.y * blockIdx.y + threadIdx.y;
    int idz = blockDim.z * blockIdx.z + threadIdx.z;

    if (idz >= nz || idy >= ny || idx >= nx) return;

    id = ny * nx * idz + nx * idy + idx;
    pddst[id] = pdsrc[id] + dconst;
    
    return ;

}


__global__ void mul_mat_kernel(float *pddst, float *pdsrc, float *pdsrc2, int *pnsz) {
    int nx = pnsz[0];
    int ny = pnsz[1];
    int nz = pnsz[2];
    int id = 0;

    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    int idy = blockDim.y * blockIdx.y + threadIdx.y;
    int idz = blockDim.z * blockIdx.z + threadIdx.z;

    if (idz >= nz || idy >= ny || idx >= nx) return;

    id = ny * nx * idz + nx * idy + idx;
    pddst[id] = pdsrc[id] * pdsrc2[id];

    return ;

}


__global__ void add_mat_kernel(float *pddst, float *pdsrc, float *pdsrc2, int *pnsz) {
    int nx = pnsz[0];
    int ny = pnsz[1];
    int nz = pnsz[2];
    int id = 0;

    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    int idy = blockDim.y * blockIdx.y + threadIdx.y;
    int idz = blockDim.z * blockIdx.z + threadIdx.z;

    if (idz >= nz || idy >= ny || idx >= nx) return;

    id = ny * nx * idz + nx * idy + idx;
    pddst[id] = pdsrc[id] + pdsrc2[id];
    
    return ;

}

// cpu interface
// DLLEXPORT void mul_const(float *pddst, float *pdsrc, float dconst, int *pnsz){ // window only expression
extern "C" void mul_const(float *pddst, float *pdsrc, float dconst, int *pnsz){ // available on linux, window
    float *gpddst = 0;
    float *gpdsrc = 0;
    int *gpnsz = 0;

    cudaMalloc((void **)&gpddst, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float));
    cudaMalloc((void **)&gpdsrc, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float));
    cudaMalloc((void **)&gpnsz, 3*sizeof(int));

    cudaMemset(gpddst, 0, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float));
    cudaMemset(gpdsrc, 0, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float));
    cudaMemset(gpnsz, 0, 3*sizeof(float));

    cudaMemcpy(gpdsrc, pdsrc, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float), cudaMemcpyHostToDevice); // destination, source, memory size, direction
    cudaMemcpy(gpnsz, pnsz, 3*sizeof(int), cudaMemcpyHostToDevice); // destination, source, memory size, direction

    int nthread = 8;
    dim3 nblock(nthread, nthread, nthread);
    dim3 ngrid ((pnsz[0] + nthread - 1) / nthread,
                (pnsz[1] + nthread - 1) / nthread,
                (pnsz[2] + nthread - 1) / nthread);

    mul_const_kernel<<<ngrid, nblock>>>(gpddst, gpdsrc, dconst, gpnsz);

    cudaMemcpy(pddst, gpddst, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree(gpddst);
    cudaFree(gpdsrc);
    cudaFree(gpnsz);

    gpddst = 0;
    gpdsrc = 0;
    gpnsz = 0;

    return ;
}

// DLLEXPORT void add_const(float *pddst, float *pdsrc, float dconst, int *pnsz){ // window only
extern "C" void add_const(float *pddst, float *pdsrc, float dconst, int *pnsz){ // available on window and linux 
    float *gpddst = 0;
    float *gpdsrc = 0;
    int *gpnsz = 0;

    cudaMalloc((void **)&gpddst, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float));
    cudaMalloc((void **)&gpdsrc, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float));
    cudaMalloc((void **)&gpnsz, 3*sizeof(int));

    cudaMemset(gpddst, 0, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float));
    cudaMemset(gpdsrc, 0, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float));
    cudaMemset(gpnsz, 0, 3*sizeof(float));

    cudaMemcpy(gpdsrc, pdsrc, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float), cudaMemcpyHostToDevice); // destination, source, memory size, direction
    cudaMemcpy(gpnsz, pnsz, 3*sizeof(int), cudaMemcpyHostToDevice); // destination, source, memory size, direction

    int nthread = 8;
    dim3 nblock(nthread, nthread, nthread);
    dim3 ngrid ((pnsz[0] + nthread - 1) / nthread,
                (pnsz[1] + nthread - 1) / nthread,
                (pnsz[2] + nthread - 1) / nthread);

    add_const_kernel<<<ngrid, nblock>>>(gpddst, gpdsrc, dconst, gpnsz);

    cudaMemcpy(pddst, gpddst, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree(gpddst);
    cudaFree(gpdsrc);
    cudaFree(gpnsz);

    gpddst = 0;
    gpdsrc = 0;
    gpnsz = 0;

    return ;
}

// DLLEXPORT void mul_mat(float *pddst, float *pdsrc1, float *pdsrc2, int *pnsz){ // window only
extern "C" void mul_mat(float *pddst, float *pdsrc1, float *pdsrc2, int *pnsz){ // available on window and linux
    float *gpddst = 0;
    float *gpdsrc1 = 0;
    float *gpdsrc2 = 0;
    int *gpnsz = 0;

    cudaMalloc((void **)&gpddst, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float));
    cudaMalloc((void **)&gpdsrc1, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float));
    cudaMalloc((void **)&gpdsrc2, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float));
    cudaMalloc((void **)&gpnsz, 3*sizeof(int));


    cudaMemset(gpddst, 0, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float));
    cudaMemset(gpdsrc1, 0, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float));
    cudaMemset(gpdsrc2, 0, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float));
    cudaMemset(gpnsz, 0, 3*sizeof(int));

    cudaMemcpy(gpdsrc1, pdsrc1, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float), cudaMemcpyHostToDevice); // destination, source, memory size, direction
    cudaMemcpy(gpdsrc2, pdsrc2, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float), cudaMemcpyHostToDevice); // destination, source, memory size, direction
    cudaMemcpy(gpnsz, pnsz, 3*sizeof(int), cudaMemcpyHostToDevice); // destination, source, memory size, direction

    int nthread = 8;
    dim3 nblock(nthread, nthread, nthread);
    dim3 ngrid ((pnsz[0] + nthread - 1) / nthread,
                (pnsz[1] + nthread - 1) / nthread,
                (pnsz[2] + nthread - 1) / nthread);

    mul_mat_kernel<<<ngrid, nblock>>>(gpddst, gpdsrc1, gpdsrc2, gpnsz);

    cudaMemcpy(pddst, gpddst, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree(gpddst);
    cudaFree(gpdsrc1);
    cudaFree(gpdsrc2);
    cudaFree(gpnsz);

    gpddst = 0;
    gpdsrc1 = 0;
    gpdsrc2 = 0;
    gpnsz = 0;

    return ;
}


// DLLEXPORT void add_mat(float *pddst, float *pdsrc1, float *pdsrc2, int *pnsz){ // window only
extern "C" void add_mat(float *pddst, float *pdsrc1, float *pdsrc2, int *pnsz){ // available on window and linux
    float *gpddst = 0;
    float *gpdsrc1 = 0;
    float *gpdsrc2 = 0;
    int *gpnsz = 0;

    cudaMalloc((void **)&gpddst, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float));
    cudaMalloc((void **)&gpdsrc1, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float));
    cudaMalloc((void **)&gpdsrc2, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float));
    cudaMalloc((void **)&gpnsz, 3*sizeof(int));


    cudaMemset(gpddst, 0, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float));
    cudaMemset(gpdsrc1, 0, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float));
    cudaMemset(gpdsrc2, 0, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float));
    cudaMemset(gpnsz, 0, 3*sizeof(int));

    cudaMemcpy(gpdsrc1, pdsrc1, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float), cudaMemcpyHostToDevice); // destination, source, memory size, direction
    cudaMemcpy(gpdsrc2, pdsrc2, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float), cudaMemcpyHostToDevice); // destination, source, memory size, direction
    cudaMemcpy(gpnsz, pnsz, 3*sizeof(int), cudaMemcpyHostToDevice); // destination, source, memory size, direction

    int nthread = 8;
    dim3 nblock(nthread, nthread, nthread);
    dim3 ngrid ((pnsz[0] + nthread - 1) / nthread,
                (pnsz[1] + nthread - 1) / nthread,
                (pnsz[2] + nthread - 1) / nthread);

    add_mat_kernel<<<ngrid, nblock>>>(gpddst, gpdsrc1, gpdsrc2, gpnsz);

    cudaMemcpy(pddst, gpddst, pnsz[0]*pnsz[1]*pnsz[2]*sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree(gpddst);
    cudaFree(gpdsrc1);
    cudaFree(gpdsrc2);
    cudaFree(gpnsz);

    gpddst = 0;
    gpdsrc1 = 0;
    gpdsrc2 = 0;
    gpnsz = 0;

    return ;
}

compile math_cu_linux.cu -> libmath_cu.so

nvcc -Xcompiler -fPIC math_cu_linux.cu -shared -o libmath_cu.so
# nvcc error   : 'cudafe++' died with status 0xC0000005 (ACCESS_VIOLATION).. need check compiler

## python, c, cu

import os, copy
import numpy as np
import ctypes
from ctypes import *
import sys
import time

if sys.platform.startswith('win'): 
    print('platform is window')
    platform = 'window'
elif sys.platform.startswith('linux'): 
    print('platform is linux')
    platform = 'linux'
else: 
    print('platform is others')
    platform = 'other platform'

pnsz = np.asarray([300, 1024, 760], dtype = np.int32)
mul = np.random.randn()
add = np.random.randn()
src = np.random.randn(pnsz[0], pnsz[1], pnsz[2]).astype(dtype=np.float32)

## CU in GPU

if platform == 'window':
    cu_file = os.path.join(os.path.dirname(__file__), 'libmath_cu.so')
elif platform == 'linux':
    cu_file = os.path.join(os.getcwd(), 'libmath_cu.so')

_math_cu = ctypes.CDLL(cu_file)

__mul_const_cu = _math_cu.mul_const
__add_const_cu = _math_cu.add_const
__mul_mat_cu = _math_cu.mul_mat
__add_mat_cu = _math_cu.add_mat

__mul_const_cu.argtypes = (POINTER(c_float), POINTER(c_float), c_float, POINTER(c_int))
__mul_const_cu.restypes = c_void_p
__add_const_cu.argtypes = (POINTER(c_float), POINTER(c_float), c_float, POINTER(c_int))
__add_const_cu.restypes = c_void_p
__mul_mat_cu.argtypes = (POINTER(c_float), POINTER(c_float), POINTER(c_float), POINTER(c_int))
__mul_mat_cu.restypes = c_void_p
__add_mat_cu.argtypes = (POINTER(c_float), POINTER(c_float), POINTER(c_float), POINTER(c_int))
__add_mat_cu.restypes = c_void_p

c_float_p = lambda x: x.ctypes.data_as(POINTER(c_float))
c_int_p = lambda x: x.ctypes.data_as(POINTER(c_int))
pre_time_cu = time.time()
src_cu = copy.deepcopy(src)
dst_cu = src_cu

__mul_const_cu(c_float_p(dst_cu), c_float_p(dst_cu), mul, c_int_p(pnsz))
__add_const_cu(c_float_p(dst_cu), c_float_p(dst_cu), add, c_int_p(pnsz))
__mul_mat_cu(c_float_p(dst_cu), c_float_p(dst_cu), c_float_p(dst_cu), c_int_p(pnsz))
__add_mat_cu(c_float_p(dst_cu), c_float_p(dst_cu), c_float_p(dst_cu), c_int_p(pnsz))

pycuda

.cu에서는 interface를 위한 코드가 있지만 여기선 없다.
memcopy를 cuda.in, cuda.out으로 간단히 처리했다.

import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy
import copy
import time

# 주어진 커널 코드
kernel_code = """
__global__ void mul_const_kernel(float *pddst, float *pdsrc, float dconst, int *pnsz) {
    int nx = pnsz[0];
    int ny = pnsz[1];
    int nz = pnsz[2];
    int id = 0;

    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    int idy = blockDim.y * blockIdx.y + threadIdx.y;
    int idz = blockDim.z * blockIdx.z + threadIdx.z;

    if (idz >= nz || idy >= ny || idx >= nx) return;

    id = ny * nx * idz + nx * idy + idx;
    pddst[id] = pdsrc[id] * dconst;

    
    return ;
}


__global__ void add_const_kernel(float *pddst, float *pdsrc, float dconst, int *pnsz) {
    int nx = pnsz[0];
    int ny = pnsz[1];
    int nz = pnsz[2];
    int id = 0;

    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    int idy = blockDim.y * blockIdx.y + threadIdx.y;
    int idz = blockDim.z * blockIdx.z + threadIdx.z;

    if (idz >= nz || idy >= ny || idx >= nx) return;

    id = ny * nx * idz + nx * idy + idx;
    pddst[id] = pdsrc[id] + dconst;
    
    return ;

}


__global__ void mul_mat_kernel(float *pddst, float *pdsrc, float *pdsrc2, int *pnsz) {
    int nx = pnsz[0];
    int ny = pnsz[1];
    int nz = pnsz[2];
    int id = 0;

    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    int idy = blockDim.y * blockIdx.y + threadIdx.y;
    int idz = blockDim.z * blockIdx.z + threadIdx.z;

    if (idz >= nz || idy >= ny || idx >= nx) return;

    id = ny * nx * idz + nx * idy + idx;
    pddst[id] = pdsrc[id] * pdsrc2[id];

    return ;

}


__global__ void add_mat_kernel(float *pddst, float *pdsrc, float *pdsrc2, int *pnsz) {
    int nx = pnsz[0];
    int ny = pnsz[1];
    int nz = pnsz[2];
    int id = 0;

    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    int idy = blockDim.y * blockIdx.y + threadIdx.y;
    int idz = blockDim.z * blockIdx.z + threadIdx.z;

    if (idz >= nz || idy >= ny || idx >= nx) return;

    id = ny * nx * idz + nx * idy + idx;
    pddst[id] = pdsrc[id] + pdsrc2[id];
    
    return ;

}
"""

# 커널 코드를 컴파일
module = SourceModule(kernel_code)
mul_const_kernel = module.get_function("mul_const_kernel")
add_const_kernel = module.get_function("add_const_kernel")
mul_mat_kernel = module.get_function("mul_mat_kernel")
add_mat_kernel = module.get_function("add_mat_kernel")

pnsz = numpy.int32(numpy.array([300, 1024, 760]))
mul = numpy.float32(numpy.random.randn())
# mul = numpy.float32(3.0)
add = numpy.float32(numpy.random.randn())
# add = numpy.float32(5.0)
# src1 = numpy.float32(numpy.ones((pnsz[0], pnsz[1], pnsz[2]))*4)
src1 = numpy.float32(numpy.random.randn(pnsz[0], pnsz[1], pnsz[2]))
# src2 = numpy.float32(numpy.ones((pnsz[0], pnsz[1], pnsz[2]))*2)
src2 = numpy.float32(numpy.random.randn(pnsz[0], pnsz[1], pnsz[2]))
dst = numpy.float32(numpy.zeros_like(src1))


## numpy in CPU
src_numpy = copy.deepcopy(src1)
pre_time_numpy = time.time()
dst_numpy = src_numpy
dst_numpy = dst_numpy * mul
dst_numpy = dst_numpy + add
dst_numpy = dst_numpy * dst_numpy
dst_numpy = dst_numpy + dst_numpy

## cuda 


block_size = (32, 8, 4) 
# GTX 960: warp size 32 * streaming multiprocessor 8 * scheduler 4 = 1024
# RTX 3060: warp size 32 * streaming multiprocessor 28 * scheduler 4 = 3584 (cuda core)
grid_size = (int(pnsz[0] // block_size[0] + 1), int(pnsz[1] // block_size[1] + 1), int(pnsz[2] // block_size[2] + 1))

pre_time_pycuda = time.time()

mul_const_kernel(cuda.Out(dst), cuda.In(src1), numpy.float32(mul), cuda.In(pnsz), block=block_size, grid=grid_size)
add_const_kernel(cuda.Out(dst), cuda.In(dst), numpy.float32(add), cuda.In(pnsz), block=block_size, grid=grid_size)
mul_mat_kernel(cuda.Out(dst), cuda.In(dst), cuda.In(dst), cuda.In(pnsz), block=block_size, grid=grid_size)
add_mat_kernel(cuda.Out(dst), cuda.In(dst), cuda.In(dst), cuda.In(pnsz), block=block_size, grid=grid_size)

참고

Share on

Twitter Facebook LinkedIn

python, c, cuda, pycuda and embedded python

python

cpp

cuda (.cu)

pycuda

참고

Share on

You may also enjoy

CUDA example code 1. main.cpp, kernel.cu, kernel.cuh 구성으로 빌드하기, ubuntu

CMake

Tesseract OCR and OpenCV in Ubuntu

Qt for Linux