/*
 * JCuda - Java bindings for NVIDIA CUDA driver and runtime API
 * http://www.jcuda.org
 *
 * Copyright 2010 Marco Hutter - http://www.jcuda.org
 */

import static jcuda.driver.JCudaDriver.*;
import static jcuda.driver.CUfilter_mode.*;
import static jcuda.driver.CUaddress_mode.*;
import static jcuda.driver.CUarray_format.*;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;

import jcuda.*;
import jcuda.driver.*;

/**
 * This is a sample/test class for texture reference handling. <br />
 * <br />
 * It will create 1D, 2D and 3D arrays of float and float4
 * values, and access these arrays via texture references.<br />
 * <br />
 * The arrays will be of size 2 in each dimension. The float arrays
 * will have size N=2^dim, and be filled with consecutive values
 * 0...N-1. The float4 arrays will have size N=4*(2^dim), and filled
 * with values (0,0,0,0)...(N-1,N-1,N-1,N-1). <br />
 * <br />
 * The arrays will be read via a texture reference at position
 * 0.5, (0.5,0.5) or (0.5,0.5,0.5) respectively, and the value at
 * this position will be written into the output memory. Thus, the 
 * values that are read should be
 * <ul>
 *   <li>0.5 for the 1D float array</li> 
 *   <li>1.5 for the 2D float array</li> 
 *   <li>3.5 for the 3D float array</li> 
 *   <li>(0.5,0.5,0.5,0.5) for the 1D float4 array</li> 
 *   <li>(1.5,1.5,1.5,1.5) for the 2D float4 array</li> 
 *   <li>(3.5,3.5,3.5,3.5) for the 3D float4 array</li> 
 * </ul>
 */
public class JCudaDriverTextureTest
{
    /**
     * The module that is loaded from the CUBIN file
     */
    private static CUmodule module;

    // The size of the input arrays in each dimension
    private static int sizeX = 2;
    private static int sizeY = 2;
    private static int sizeZ = 2;

    // The float input arrays, 1D-3D 
    private static float input_float_1D[];
    private static float input_float_2D[];
    private static float input_float_3D[];

    // The float4 input arrays, 1D-3D 
    private static float input_float4_1D[];
    private static float input_float4_2D[];
    private static float input_float4_3D[];

    // The position at which the texture will be read
    private static float posX = 0.5f;
    private static float posY = 0.5f;
    private static float posZ = 0.5f;

    /**
     * The entry point of this test
     * 
     * @param args Not used
     * @throws IOException If an IO error occurs
     */
    public static void main(String args[]) throws IOException
    {
        JCudaDriver.setExceptionsEnabled(true);

        // Create the PTX file by calling the NVCC
        String ptxFileName = preparePtxFile("JCudaDriverTextureTestKernels.cu");

        // Initialize the driver and create a context for the first device.
        cuInit(0);
        CUcontext pctx = new CUcontext();
        CUdevice dev = new CUdevice();
        cuDeviceGet(dev, 0);
        cuCtxCreate(pctx, 0, dev);

        // Load the file containing the kernels
        module = new CUmodule();
        cuModuleLoad(module, ptxFileName);

        // Initialize the host input data
        initInputHost();

        // Perform the tests
        boolean passed = true;
        passed &= test_float_1D();
        passed &= test_float_2D();
        passed &= test_float_3D();
        passed &= test_float4_1D();
        passed &= test_float4_2D();
        passed &= test_float4_3D();
        System.out.println("Tests " + (passed ? "PASSED" : "FAILED"));
    }

    /**
     * Initialize all input arrays, namely the 1D-3D float and float4 arrays
     */
    private static void initInputHost()
    {
        input_float_1D = new float[sizeX];
        input_float_2D = new float[sizeX * sizeY];
        input_float_3D = new float[sizeX * sizeY * sizeZ];
        for (int x = 0; x < sizeX; x++)
        {
            input_float_1D[x] = x;
            for (int y = 0; y < sizeY; y++)
            {
                int xy = x + y * sizeY;
                input_float_2D[xy] = xy;
                for (int z = 0; z < sizeZ; z++)
                {
                    int xyz = xy + z * sizeX * sizeY;
                    input_float_3D[xyz] = xyz;
                }
            }
        }

        input_float4_1D = new float[sizeX * 4];
        input_float4_2D = new float[sizeX * sizeY * 4];
        input_float4_3D = new float[sizeX * sizeY * sizeZ * 4];
        for (int x = 0; x < sizeX; x++)
        {
            input_float4_1D[x * 4 + 0] = x;
            input_float4_1D[x * 4 + 1] = x;
            input_float4_1D[x * 4 + 2] = x;
            input_float4_1D[x * 4 + 3] = x;
            for (int y = 0; y < sizeY; y++)
            {
                int xy = x + y * sizeY;
                input_float4_2D[xy * 4 + 0] = xy;
                input_float4_2D[xy * 4 + 1] = xy;
                input_float4_2D[xy * 4 + 2] = xy;
                input_float4_2D[xy * 4 + 3] = xy;
                for (int z = 0; z < sizeZ; z++)
                {
                    int xyz = xy + z * sizeX * sizeY;
                    input_float4_3D[xyz * 4 + 0] = xyz;
                    input_float4_3D[xyz * 4 + 1] = xyz;
                    input_float4_3D[xyz * 4 + 2] = xyz;
                    input_float4_3D[xyz * 4 + 3] = xyz;
                }
            }
        }
    }

    /**
     * Test the 1D float texture access
     */
    private static boolean test_float_1D()
    {
        // Create the array on the device
        CUarray array = new CUarray();
        CUDA_ARRAY_DESCRIPTOR ad = new CUDA_ARRAY_DESCRIPTOR();
        ad.Format = CU_AD_FORMAT_FLOAT;
        ad.Width = sizeX;
        ad.Height = 1;
        ad.NumChannels = 1;
        cuArrayCreate(array, ad);

        // Copy the host input to the array
        Pointer pInput = Pointer.to(input_float_1D);
        cuMemcpyHtoA(array, 0, pInput, sizeX * Sizeof.FLOAT);

        // Set up the texture reference
        CUtexref texref = new CUtexref();
        cuModuleGetTexRef(texref, module, "texture_float_1D");
        cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR);
        cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP);
        cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES);
        cuTexRefSetFormat(texref, CU_AD_FORMAT_FLOAT, 1);
        cuTexRefSetArray(texref, array, CU_TRSA_OVERRIDE_FORMAT);

        // Prepare the output device memory
        CUdeviceptr dOutput = new CUdeviceptr();
        cuMemAlloc(dOutput, Sizeof.FLOAT * 1);

        // Obtain the test function
        CUfunction function = new CUfunction();
        cuModuleGetFunction(function, module, "test_float_1D");
        
        // Set up the kernel parameters 
        Pointer kernelParameters = Pointer.to(
            Pointer.to(dOutput),
            Pointer.to(new float[]{ posX })
        );

        // Call the kernel function.
        cuLaunchKernel(function, 1, 1, 1, 
        	1, 1, 1, 0, null, kernelParameters, null);
        cuCtxSynchronize();
        
        // Obtain the output on the host
        float hOutput[] = new float[1];
        cuMemcpyDtoH(Pointer.to(hOutput), dOutput, Sizeof.FLOAT * 1);

        // Print the results
        System.out.println("Result float  1D " + Arrays.toString(hOutput));
        float expected[] = new float[]{ 0.5f };
        boolean passed = Arrays.equals(hOutput, expected);
        System.out.println("Test   float  1D " + 
            (passed ? "PASSED" : "FAILED"));

        // Clean up
        cuArrayDestroy(array);
        cuMemFree(dOutput);

        return passed;
    }

    /**
     * Test the 2D float texture access
     */
    private static boolean test_float_2D()
    {
        // Create the array on the device
        CUarray array = new CUarray();
        CUDA_ARRAY_DESCRIPTOR ad = new CUDA_ARRAY_DESCRIPTOR();
        ad.Format = CU_AD_FORMAT_FLOAT;
        ad.Width = sizeX;
        ad.Height = sizeY;
        ad.NumChannels = 1;
        cuArrayCreate(array, ad);

        // Copy the host input to the array
        CUDA_MEMCPY2D copyHD = new CUDA_MEMCPY2D();
        copyHD.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST;
        copyHD.srcHost = Pointer.to(input_float_2D);
        copyHD.srcPitch = sizeX * Sizeof.FLOAT;
        copyHD.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_ARRAY;
        copyHD.dstArray = array;
        copyHD.WidthInBytes = sizeX * Sizeof.FLOAT;
        copyHD.Height = sizeY;
        cuMemcpy2D(copyHD);

        // Set up the texture reference
        CUtexref texref = new CUtexref();
        cuModuleGetTexRef(texref, module, "texture_float_2D");
        cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR);
        cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP);
        cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP);
        cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES);
        cuTexRefSetFormat(texref, CU_AD_FORMAT_FLOAT, 1);
        cuTexRefSetArray(texref, array, CU_TRSA_OVERRIDE_FORMAT);

        // Prepare the output device memory
        CUdeviceptr dOutput = new CUdeviceptr();
        cuMemAlloc(dOutput, Sizeof.FLOAT * 1);

        // Obtain the test function
        CUfunction function = new CUfunction();
        cuModuleGetFunction(function, module, "test_float_2D");

        // Set up the kernel parameters 
        Pointer kernelParameters = Pointer.to(
            Pointer.to(dOutput),
            Pointer.to(new float[]{ posX }),
         	Pointer.to(new float[]{ posY })
        );

        // Call the kernel function.
        cuLaunchKernel(function, 1, 1, 1, 
        	1, 1, 1, 0, null, kernelParameters, null);
        cuCtxSynchronize();

        // Obtain the output on the host
        float hOutput[] = new float[1];
        cuMemcpyDtoH(Pointer.to(hOutput), dOutput, Sizeof.FLOAT * 1);

        // Print the results
        System.out.println("Result float  2D " + Arrays.toString(hOutput));
        float expected[] = new float[]{ 1.5f };
        boolean passed = Arrays.equals(hOutput, expected);
        System.out.println("Test   float  2D " + 
            (passed ? "PASSED" : "FAILED"));

        // Clean up
        cuArrayDestroy(array);
        cuMemFree(dOutput);

        return passed;
    }

    /**
     * Test the 3D float texture access
     */
    private static boolean test_float_3D()
    {
        // Create the array on the device
        CUarray array = new CUarray();
        CUDA_ARRAY3D_DESCRIPTOR ad = new CUDA_ARRAY3D_DESCRIPTOR();
        ad.Format = CU_AD_FORMAT_FLOAT;
        ad.Width = sizeX;
        ad.Height = sizeY;
        ad.Depth = sizeZ;
        ad.NumChannels = 1;
        cuArray3DCreate(array, ad);

        // Copy the host input to the array
        CUDA_MEMCPY3D copy = new CUDA_MEMCPY3D();
        copy.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST;
        copy.srcHost = Pointer.to(input_float_3D);
        copy.srcPitch = sizeX * Sizeof.FLOAT;
        copy.srcHeight = sizeY;
        copy.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_ARRAY;
        copy.dstArray = array;
        copy.dstHeight = sizeX;
        copy.WidthInBytes = sizeX * Sizeof.FLOAT;
        copy.Height = sizeY;
        copy.Depth = sizeZ;
        cuMemcpy3D(copy);

        // Set up the texture reference
        CUtexref texref = new CUtexref();
        cuModuleGetTexRef(texref, module, "texture_float_3D");
        cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR);
        cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP);
        cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP);
        cuTexRefSetAddressMode(texref, 2, CU_TR_ADDRESS_MODE_CLAMP);
        cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES);
        cuTexRefSetFormat(texref, CU_AD_FORMAT_FLOAT, 1);
        cuTexRefSetArray(texref, array, CU_TRSA_OVERRIDE_FORMAT);

        // Prepare the output device memory
        CUdeviceptr dOutput = new CUdeviceptr();
        cuMemAlloc(dOutput, Sizeof.FLOAT * 1);

        // Obtain the test function
        CUfunction function = new CUfunction();
        cuModuleGetFunction(function, module, "test_float_3D");

        // Set up the kernel parameters 
        Pointer kernelParameters = Pointer.to(
            Pointer.to(dOutput),
            Pointer.to(new float[]{ posX }),
         	Pointer.to(new float[]{ posY }),
         	Pointer.to(new float[]{ posZ })
        );

        // Call the kernel function.
        cuLaunchKernel(function, 1, 1, 1, 
        	1, 1, 1, 0, null, kernelParameters, null);
        cuCtxSynchronize();

        // Obtain the output on the host
        float hOutput[] = new float[1];
        cuMemcpyDtoH(Pointer.to(hOutput), dOutput, Sizeof.FLOAT * 1);

        // Print the results
        System.out.println("Result float  3D " + Arrays.toString(hOutput));
        float expected[] = new float[]{ 3.5f };
        boolean passed = Arrays.equals(hOutput, expected);
        System.out.println("Test   float  3D " + 
            (passed ? "PASSED" : "FAILED"));

        // Clean up
        cuArrayDestroy(array);
        cuMemFree(dOutput);

        return passed;
    }

    /**
     * Test the 1D float4 texture access
     */
    private static boolean test_float4_1D()
    {
        // Create the array on the device
        CUarray array = new CUarray();
        CUDA_ARRAY_DESCRIPTOR ad = new CUDA_ARRAY_DESCRIPTOR();
        ad.Format = CU_AD_FORMAT_FLOAT;
        ad.Width = sizeX;
        ad.Height = 1;
        ad.NumChannels = 4;
        cuArrayCreate(array, ad);

        // Copy the host input to the array
        Pointer pInput = Pointer.to(input_float4_1D);
        cuMemcpyHtoA(array, 0, pInput, sizeX * Sizeof.FLOAT * 4);

        // Set up the texture reference
        CUtexref texref = new CUtexref();
        cuModuleGetTexRef(texref, module, "texture_float4_1D");
        cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR);
        cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP);
        cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES);
        cuTexRefSetFormat(texref, CU_AD_FORMAT_FLOAT, 4);
        cuTexRefSetArray(texref, array, CU_TRSA_OVERRIDE_FORMAT);

        // Prepare the output device memory
        CUdeviceptr dOutput = new CUdeviceptr();
        cuMemAlloc(dOutput, Sizeof.FLOAT * 4);

        // Obtain the test function
        CUfunction function = new CUfunction();
        cuModuleGetFunction(function, module, "test_float4_1D");

        // Set up the kernel parameters 
        Pointer kernelParameters = Pointer.to(
            Pointer.to(dOutput),
            Pointer.to(new float[]{ posX })
        );

        // Call the kernel function.
        cuLaunchKernel(function, 1, 1, 1, 
        	1, 1, 1, 0, null, kernelParameters, null);
        cuCtxSynchronize();

        // Obtain the output on the host
        float hOutput[] = new float[4];
        cuMemcpyDtoH(Pointer.to(hOutput), dOutput, Sizeof.FLOAT * 4);

        // Print the results
        System.out.println("Result float4 1D " + Arrays.toString(hOutput));
        float expected[] = new float[]{ 0.5f, 0.5f, 0.5f, 0.5f };
        boolean passed = Arrays.equals(hOutput, expected);
        System.out.println("Test   float4 1D " + 
            (passed ? "PASSED" : "FAILED"));

        // Clean up
        cuArrayDestroy(array);
        cuMemFree(dOutput);

        return passed;
    }

    /**
     * Test the 2D float4 texture access
     */
    private static boolean test_float4_2D()
    {
        // Create the array on the device
        CUarray array = new CUarray();
        CUDA_ARRAY_DESCRIPTOR ad = new CUDA_ARRAY_DESCRIPTOR();
        ad.Format = CU_AD_FORMAT_FLOAT;
        ad.Width = sizeX;
        ad.Height = sizeY;
        ad.NumChannels = 4;
        cuArrayCreate(array, ad);

        // Copy the host input to the array
        CUDA_MEMCPY2D copyHD = new CUDA_MEMCPY2D();
        copyHD.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST;
        copyHD.srcHost = Pointer.to(input_float4_2D);
        copyHD.srcPitch = sizeX * Sizeof.FLOAT * 4;
        copyHD.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_ARRAY;
        copyHD.dstArray = array;
        copyHD.WidthInBytes = sizeX * Sizeof.FLOAT * 4;
        copyHD.Height = sizeY;
        cuMemcpy2D(copyHD);

        // Set up the texture reference
        CUtexref texref = new CUtexref();
        cuModuleGetTexRef(texref, module, "texture_float4_2D");
        cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR);
        cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP);
        cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP);
        cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES);
        cuTexRefSetFormat(texref, CU_AD_FORMAT_FLOAT, 4);
        cuTexRefSetArray(texref, array, CU_TRSA_OVERRIDE_FORMAT);

        // Prepare the output device memory
        CUdeviceptr dOutput = new CUdeviceptr();
        cuMemAlloc(dOutput, Sizeof.FLOAT * 4);

        // Obtain the test function
        CUfunction function = new CUfunction();
        cuModuleGetFunction(function, module, "test_float4_2D");

        // Set up the kernel parameters 
        Pointer kernelParameters = Pointer.to(
            Pointer.to(dOutput),
            Pointer.to(new float[]{ posX }),
         	Pointer.to(new float[]{ posY })
        );

        // Call the kernel function.
        cuLaunchKernel(function, 1, 1, 1, 
        	1, 1, 1, 0, null, kernelParameters, null);
        cuCtxSynchronize();

        // Obtain the output on the host
        float hOutput[] = new float[4];
        cuMemcpyDtoH(Pointer.to(hOutput), dOutput, Sizeof.FLOAT * 4);

        // Print the results
        System.out.println("Result float4 2D " + Arrays.toString(hOutput));
        float expected[] = new float[]{ 1.5f, 1.5f, 1.5f, 1.5f };
        boolean passed = Arrays.equals(hOutput, expected);
        System.out.println("Test   float4 2D " + 
            (passed ? "PASSED" : "FAILED"));

        // Clean up
        cuArrayDestroy(array);
        cuMemFree(dOutput);

        return passed;
    }

    /**
     * Test the 3D float4 texture access
     */
    private static boolean test_float4_3D()
    {
        // Create the array on the device
        CUarray array = new CUarray();
        CUDA_ARRAY3D_DESCRIPTOR ad = new CUDA_ARRAY3D_DESCRIPTOR();
        ad.Format = CU_AD_FORMAT_FLOAT;
        ad.Width = sizeX;
        ad.Height = sizeY;
        ad.Depth = sizeZ;
        ad.NumChannels = 4;
        cuArray3DCreate(array, ad);

        // Copy the host input to the array
        CUDA_MEMCPY3D copy = new CUDA_MEMCPY3D();
        copy.srcMemoryType = CUmemorytype.CU_MEMORYTYPE_HOST;
        copy.srcHost = Pointer.to(input_float4_3D);
        copy.srcPitch = sizeX * Sizeof.FLOAT * 4;
        copy.srcHeight = sizeY;
        copy.dstMemoryType = CUmemorytype.CU_MEMORYTYPE_ARRAY;
        copy.dstArray = array;
        copy.dstHeight = sizeX;
        copy.WidthInBytes = sizeX * Sizeof.FLOAT * 4;
        copy.Height = sizeY;
        copy.Depth = sizeZ;
        cuMemcpy3D(copy);

        // Set up the texture reference
        CUtexref texref = new CUtexref();
        cuModuleGetTexRef(texref, module, "texture_float4_3D");
        cuTexRefSetFilterMode(texref, CU_TR_FILTER_MODE_LINEAR);
        cuTexRefSetAddressMode(texref, 0, CU_TR_ADDRESS_MODE_CLAMP);
        cuTexRefSetAddressMode(texref, 1, CU_TR_ADDRESS_MODE_CLAMP);
        cuTexRefSetAddressMode(texref, 2, CU_TR_ADDRESS_MODE_CLAMP);
        cuTexRefSetFlags(texref, CU_TRSF_NORMALIZED_COORDINATES);
        cuTexRefSetFormat(texref, CU_AD_FORMAT_FLOAT, 4);
        cuTexRefSetArray(texref, array, CU_TRSA_OVERRIDE_FORMAT);

        // Prepare the output device memory
        CUdeviceptr dOutput = new CUdeviceptr();
        cuMemAlloc(dOutput, Sizeof.FLOAT * 4);

        // Obtain the test function
        CUfunction function = new CUfunction();
        cuModuleGetFunction(function, module, "test_float4_3D");

        // Set up the kernel parameters 
        Pointer kernelParameters = Pointer.to(
            Pointer.to(dOutput),
            Pointer.to(new float[]{ posX }),
         	Pointer.to(new float[]{ posY }),
         	Pointer.to(new float[]{ posZ })
        );

        // Call the kernel function.
        cuLaunchKernel(function, 1, 1, 1, 
        	1, 1, 1, 0, null, kernelParameters, null);
        cuCtxSynchronize();

        // Obtain the output on the host
        float hOutput[] = new float[4];
        cuMemcpyDtoH(Pointer.to(hOutput), dOutput, Sizeof.FLOAT * 4);

        // Print the results
        System.out.println("Result float4 3D " + Arrays.toString(hOutput));
        float expected[] = new float[]{ 3.5f, 3.5f, 3.5f, 3.5f };
        boolean passed = Arrays.equals(hOutput, expected);
        System.out.println("Test   float4 3D " + 
            (passed ? "PASSED" : "FAILED"));

        // Clean up
        cuArrayDestroy(array);
        cuMemFree(dOutput);

        return passed;
    }
    
    /**
     * The extension of the given file name is replaced with "ptx".
     * If the file with the resulting name does not exist, it is
     * compiled from the given file using NVCC. The name of the
     * PTX file is returned.
     *
     * @param cuFileName The name of the .CU file
     * @return The name of the PTX file
     * @throws IOException If an I/O error occurs
     */
    private static String preparePtxFile(String cuFileName) throws IOException
    {
        int endIndex = cuFileName.lastIndexOf('.');
        if (endIndex == -1)
        {
            endIndex = cuFileName.length()-1;
        }
        String ptxFileName = cuFileName.substring(0, endIndex+1)+"ptx";
        File ptxFile = new File(ptxFileName);
        if (ptxFile.exists())
        {
            return ptxFileName;
        }

        File cuFile = new File(cuFileName);
        if (!cuFile.exists())
        {
            throw new IOException("Input file not found: "+cuFileName);
        }
        String modelString = "-m"+System.getProperty("sun.arch.data.model");
        String command =
            "nvcc " + modelString + " -ptx "+
            cuFile.getPath()+" -o "+ptxFileName;

        System.out.println("Executing\n"+command);
        Process process = Runtime.getRuntime().exec(command);

        String errorMessage =
            new String(toByteArray(process.getErrorStream()));
        String outputMessage =
            new String(toByteArray(process.getInputStream()));
        int exitValue = 0;
        try
        {
            exitValue = process.waitFor();
        }
        catch (InterruptedException e)
        {
            Thread.currentThread().interrupt();
            throw new IOException(
                "Interrupted while waiting for nvcc output", e);
        }

        if (exitValue != 0)
        {
            System.out.println("nvcc process exitValue "+exitValue);
            System.out.println("errorMessage:\n"+errorMessage);
            System.out.println("outputMessage:\n"+outputMessage);
            throw new IOException(
                "Could not create .ptx file: "+errorMessage);
        }

        System.out.println("Finished creating PTX file");
        return ptxFileName;
    }

    /**
     * Fully reads the given InputStream and returns it as a byte array
     *
     * @param inputStream The input stream to read
     * @return The byte array containing the data from the input stream
     * @throws IOException If an I/O error occurs
     */
    private static byte[] toByteArray(InputStream inputStream)
        throws IOException
    {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        byte buffer[] = new byte[8192];
        while (true)
        {
            int read = inputStream.read(buffer);
            if (read == -1)
            {
                break;
            }
            baos.write(buffer, 0, read);
        }
        return baos.toByteArray();
    }
    

}