JCudpp
        Java bindings for CUDPP
        
        
        This library enables Java applications to use the public interface of
        
        CUDPP, the CUDA Data Parallel Primitives Library, version 2.0,
        which contains methods for sparse matrix-vector multiplication,
        parallel scans and sorting, as well as methods for maintaining
        a hash table on the device.
        
        
        
        
        
        
        JCudpp is only a Java binding for CUDPP. That means, in order to use JCudpp,
        you need the CUDPP library. This library can be compiled from the source
        code that is available at the
        
CUDPP home page.
        
        
        
        
        
        
        
        The following table shows a comparison of a sample program performing a parallel
        prefix scan operation using CUDPP. The left side shows the C code, which is
        adapted from the
        
        "Simple CUDPP" code sample at the CUDPP home page
        (for simplicity, error checks are omitted here). The right side shows
        the same operation performed with JCudpp. The workflow
        and the involved operations are quite similar.
        
        (Using JCudpp for parallel prefix sums is easy! ;-) )
        
        There is also a complete, compilable 
        JCudpp sample on the samples page which sorts an array of integers, once in plain
        Java and once in JCudpp, and verifies the result.
        
        
        
        
            
                | Simple CUDPP in C | Simple JCudpp in Java | 
                
                
                
                
            
                | 
                    // includes
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
 #include <math.h>
 #include "cudpp.h"
 
 
 
 
 
 
 
 // Program main
 int main( int argc, char** argv)
 {
 unsigned int numElements = 32768;
 unsigned int memSize = sizeof( float) * numElements;
 
 // allocate host memory
 float* h_idata = (float*) malloc( memSize);
 
 // initialize the memory
 for (unsigned int i = 0; i < numElements; ++i)
 {
 h_idata[i] = (float) (rand() & 0xf);
 }
 
 // allocate device memory
 float* d_idata;
 cudaMalloc((void**) &d_idata, memSize);
 
 // copy host memory to device
 cudaMemcpy(d_idata, h_idata, memSize,
 cudaMemcpyHostToDevice);
 
 // allocate device memory for result
 float* d_odata;
 cudaMalloc((void**) &d_odata, memSize);
 
 // Initialize the CUDPP library
 CUDPPHandle theCudpp;
 cudppCreate(&theCudpp);
 
 CUDPPConfiguration config;
 config.op = CUDPP_ADD;
 config.datatype = CUDPP_FLOAT;
 config.algorithm = CUDPP_SCAN;
 config.options = CUDPP_OPTION_FORWARD |
 CUDPP_OPTION_EXCLUSIVE;
 
 CUDPPHandle scanplan = 0;
 cudppPlan(theCudpp, &scanplan, config, numElements, 1, 0);
 
 // Run the scan
 cudppScan(scanplan, d_odata, d_idata, numElements);
 
 // allocate mem for the result on host side
 float* h_odata = (float*) malloc( memSize);
 
 // copy result from device to host
 cudaMemcpy(h_odata, d_odata, memSize,
 cudaMemcpyDeviceToHost);
 
 cudppDestroyPlan(scanplan);
 cudppDestroy(theCudpp);
 free(h_idata);
 free(h_odata);
 cudaFree(d_idata);
 cudaFree(d_odata);
 return 0;
 }
 
 | 
                    // includes
 import static jcuda.runtime.JCuda.*;
 import static jcuda.jcudpp.JCudpp.*;
 import static jcuda.jcudpp.CUDPPOperator.*;
 import static jcuda.jcudpp.CUDPPDatatype.*;
 import static jcuda.jcudpp.CUDPPAlgorithm.*;
 import static jcuda.jcudpp.CUDPPOption.*;
 import jcuda.*;
 import jcuda.runtime.*;
 import jcuda.jcudpp.*;
 
 class JCudppSample
 {
 // Program main
 public static void main(String args[])
 {
 int numElements = 32768;
 int memSize = Sizeof.FLOAT * numElements;
 
 // allocate host memory
 float h_idata[] = new float[numElements];
 
 // initialize the memory
 for (int i = 0; i < numElements; ++i)
 {
 h_idata[i] = (float)Math.random();
 }
 
 // allocate device memory
 Pointer d_idata = new Pointer();
 cudaMalloc(d_idata, memSize);
 
 // copy host memory to device
 cudaMemcpy(d_idata, Pointer.to(h_idata), memSize,
 cudaMemcpyKind.cudaMemcpyHostToDevice);
 
 // allocate device memory for result
 Pointer d_odata = new Pointer();
 cudaMalloc(d_odata, memSize);
 
 // Initialize the CUDPP library
 CUDPPHandle theCudpp = new CUDPPHandle();
 cudppCreate(theCudpp);
 
 CUDPPConfiguration config = new CUDPPConfiguration();
 config.op = CUDPP_ADD;
 config.datatype = CUDPP_FLOAT;
 config.algorithm = CUDPP_SCAN;
 config.options = CUDPP_OPTION_FORWARD |
 CUDPP_OPTION_EXCLUSIVE;
 
 CUDPPHandle scanplan = new CUDPPHandle();
 cudppPlan(theCudpp, scanplan, config, numElements, 1, 0);
 
 // Run the scan
 cudppScan(scanplan, d_odata, d_idata, numElements);
 
 // allocate mem for the result on host side
 float h_odata[] = new float[numElements];
 
 // copy result from device to host
 cudaMemcpy(Pointer.to(h_odata), d_odata, memSize,
 cudaMemcpyKind.cudaMemcpyDeviceToHost);
 
 cudppDestroyPlan(scanplan);
 cudppDestroy(theCudpp);
 
 
 cudaFree(d_idata);
 cudaFree(d_odata);
 
 }
 }
 |