Loading...
/********************************************************************
*  This is a example of the OpenCL program.
*********************************************************************/




#include <mex.h>
#define PrintErrMsgQuit    mexErrMsgTxt
#define PRINTF  mexPrintf
  
#include <stdlib.h>
#include <stdio.h>
#include <string.h>


/* Note there is a fixed overhead for issuing kernel call for execution on GPU (about 5ms on NVIDIA GPUs) */
/* However with profiling, the timing is accurate.  Therefore iteration is not neccessary  */ 
// #define ITERATION    100




#include <CL/cl.h>


struct float2
{
	float x;
	float y;
};

int complexNumber = 0;
float* matlab_ar;
float* matlab_ai;

//////////////////////////////////////// matlab complex to float


void pack_r2c_sp(float2  *output_float,
              float *input_re,
              int M,
			  int N)
{
     int i, j;
     for (i = 0; i < M; i++)
     for (j = 0; j < N; j++)
     {
               output_float[i+M*j].x = input_re[i+M*j];
               output_float[i+M*j].y = 0.0f;
     }
}


void pack_c2c_sp(float2  *output_float,
              float *input_re,
              float *input_im,
              int M,
			  int N)
{
     int i, j;
     for (i = 0; i < M; i++)
     for (j = 0; j < N; j++)
     {
               output_float[i+M*j].x = input_re[i+M*j];
               output_float[i+M*j].y = input_im[i+M*j];
     }
}


void unpack_c2c_sp(float2  *input_float,
                float *output_re,
                float *output_im,
                int M,
			    int N)
{
     int i, j;
     for (i = 0; i < M; i++)
     for (j = 0; j < N; j++)
     {
                output_re[i+M*j] = input_float[i+M*j].x;
                output_im[i+M*j] = input_float[i+M*j].y;
     }

}
//////////////////////////////////////// matlab complex to float






/****************************************** CODE CHANGE ZONE STARTS *****************************************************/

char* kernel_file_name = "FFT_Kernels.cl";
char* kernel_function_2 = "FFT_2";
char* kernel_function_4 = "FFT_4";
char* kernel_function_8 = "FFT_8";
char* kernel_function_16 = "FFT_16";
char* kernel_function_32 = "FFT_32";
char* kernel_function_8_stride = "FFT_8_stride";

char* kernel_function_pack_complex = "pack_complex";
char* kernel_function_unpack_complex = "unpack_complex";


#define DATA_TYPE    float



#define CL_DATA_TYPE    cl_float



// note:make BLOCK_X 16 and BLOCK_Y 16 for NV and make BLOCK_X 64 and BLOCK_Y 1
#define BLOCK_X 64
#define BLOCK_Y 1

// mxUINT8_CLASS/uchar, mxINT8_CLASS/char, mxUINT16_CLASS/ushort, mxINT16_CLASS/short, mxUINT32_CLASS/uint, mxINT32_CLASS/int,mxSINGLE_CLASS/float, mXDOUBLE_CLASS/double    
#define MATRIX_DATA_TYPE    mxSINGLE_CLASS      
    
DATA_TYPE* input_ptr;
DATA_TYPE* output_ptr;


cl_int status;
cl_context    context;            /**< CL context */
cl_device_id *devices;            /**< CL device list */
cl_mem        inputBuffer;        /**< CL memory input buffer */
cl_mem        outputBuffer;        /**< CL memory output buffer */
cl_mem        inputBuffer_matlab;        /**< CL memory input buffer */
cl_mem        outputBuffer_ar_matlab;        /**< CL memory output buffer */
cl_mem        outputBuffer_ai_matlab;        /**< CL memory output buffer */
    
cl_command_queue commandQueue;    /**< CL command queue */
cl_program    program;            /**< CL program  */
cl_kernel    kernel_2 = NULL;             /**< CL kernel */
cl_kernel    kernel = NULL;             /**< CL kernel */
cl_kernel    kernel_4 = NULL;             /**< CL kernel */
cl_kernel    kernel_8 = NULL;             /**< CL kernel */
cl_kernel    kernel_16 = NULL;             /**< CL kernel */
cl_kernel    kernel_32 = NULL;             /**< CL kernel */
cl_kernel    kernel_64 = NULL;             /**< CL kernel */
cl_kernel    kernel_128 = NULL;             /**< CL kernel */
cl_kernel    kernel_256 = NULL;             /**< CL kernel */
cl_kernel    kernel_512 = NULL;             /**< CL kernel */
cl_kernel    kernel_1024 = NULL;             /**< CL kernel */
cl_kernel	 kernel_pack_complex;
cl_kernel	 kernel_unpack_complex;


/****************************************** CODE CHANGE ZONE ENDS ******************************************************/

int M, N;
int height, width;






void init_CL(){
    size_t deviceListSize;

    cl_device_type dType;
    
    dType = CL_DEVICE_TYPE_GPU;
    
    
/****************************************** CODE CHANGE ZONE STARTS *****************************************************/
    
    /**************************  ATI production release update *************/
 
    /*
     * Have a look at the available platforms and pick either
     * the AMD one if available or a reasonable default.
     */


    cl_uint numPlatforms;
    cl_platform_id platform = NULL;
    status = clGetPlatformIDs(0, NULL, &numPlatforms);
    if(status != CL_SUCCESS) PrintErrMsgQuit("\n clGetPlatformIDs (numPlatforms) failed\n");
    
    if (0 < numPlatforms) 
    {
        cl_platform_id* platforms = new cl_platform_id[numPlatforms];
        status = clGetPlatformIDs(numPlatforms, platforms, NULL);
        if(status != CL_SUCCESS) PrintErrMsgQuit("\n clGetPlatformIDs failed\n");
        
        
        for (unsigned i = 0; i < numPlatforms; ++i) 
        {
            char pbuf[100];
            status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL);


            if(status != CL_SUCCESS) PrintErrMsgQuit("\n clGetPlatformInfo failed\n");


            platform = platforms[i];
            if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) 
            {
                break;
            }
        }
        delete[] platforms;
    }


    /*
     * If we could find our platform, use it. Otherwise pass a NULL and get whatever the
     * implementation thinks we should be using.
     */


    cl_context_properties cps[3] = 
    {
        CL_CONTEXT_PLATFORM, 
        (cl_context_properties)platform, 
        0
    };
    /* Use NULL for backward compatibility */
    cl_context_properties* cprops = (NULL == platform) ? NULL : cps;


    context = clCreateContextFromType(cprops, dType, NULL, NULL, &status);
    
    
    /**********  End of the ATI production release update ***************/
    
    
    //context = clCreateContextFromType(0, dType, NULL, NULL, &status);

/*****The only runtime difference between ATI and NV OpenCL implementations is ATI production release update ***********/
/****************************************** CODE CHANGE ZONE ENDS ******************************************************/

    if(status != CL_SUCCESS) PrintErrMsgQuit("\n CreateContext failed\n");
    

    /* First, get the size of device list data */
    status = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize);
    if(status != CL_SUCCESS) PrintErrMsgQuit("\n GetContextInfo failed\n");
    

    /* Now allocate memory for device list based on the size we got earlier */
    devices = (cl_device_id *)malloc(deviceListSize);
    if(devices==NULL)  PrintErrMsgQuit("Failed to allocate memory (devices).");
    

    /* Now, get the device list data */
    status = clGetContextInfo(context, CL_CONTEXT_DEVICES, deviceListSize, devices, NULL);
    if(status != CL_SUCCESS)  PrintErrMsgQuit("\n GetContextInfo failed\n");

    {
        /* The block is to move the declaration of prop closer to its use */
        cl_command_queue_properties prop = 0;
        prop |= CL_QUEUE_PROFILING_ENABLE;

        commandQueue = clCreateCommandQueue(context, devices[0], prop, &status);
        if(status != CL_SUCCESS) PrintErrMsgQuit("\n CreateCommandQueue failed\n");
    }
    
/****************************************** CODE CHANGE ZONE STARTS *****************************************************/
    
    inputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(CL_DATA_TYPE ) * width * height, NULL,  &status);    
    if(status != CL_SUCCESS)  PrintErrMsgQuit("\n CreateBuffer failed\n");

    outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(CL_DATA_TYPE ) * width * height, NULL, &status);
    if(status != CL_SUCCESS) PrintErrMsgQuit("\n CreateBuffer failed\n");


    inputBuffer_matlab = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(CL_DATA_TYPE ) * width * height, NULL,  &status);    
    if(status != CL_SUCCESS)  PrintErrMsgQuit("\n CreateBuffer failed\n");

    outputBuffer_ar_matlab = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(CL_DATA_TYPE ) * width * height/2, NULL, &status);
    if(status != CL_SUCCESS) PrintErrMsgQuit("\n CreateBuffer failed\n");

    outputBuffer_ai_matlab = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(CL_DATA_TYPE ) * width * height/2, NULL, &status);
    if(status != CL_SUCCESS) PrintErrMsgQuit("\n CreateBuffer failed\n");


    
    
/****************************************** CODE CHANGE ZONE ENDS ******************************************************/
    
    
    /***********************************************************************************************/
    /***********************************DO NOT CHANGE THE CODE BELOW *******************************/
    /****************************** create a CL program using the kernel source ********************/

    char* source;
    
    // locals 
    FILE* file_ptr = NULL;
    size_t source_length;

    if(fopen_s(&file_ptr, kernel_file_name, "rb") != 0)  PrintErrMsgQuit("\nError: Open cl file \n");

    // get the length of the source code
    fseek(file_ptr, 0, SEEK_END); 
    source_length = ftell(file_ptr);
    fseek(file_ptr, 0, SEEK_SET); 

    // allocate a buffer for the source code string and read it in
    char* source_string = (char *)mxMalloc(source_length + 1); 
    if (fread((source_string), source_length, 1, file_ptr) != 1)
    {
        fclose(file_ptr);
        mxFree(source_string);
        PrintErrMsgQuit("\nError: Read cl file \n");
    }

    // close the file and return the total length of the combined (preamble + source) string
    fclose(file_ptr);
    source_string[source_length] = '\0';
    
    //get the source in string
    source = source_string;
    
    /******************************  END OF LOADING KERNEL CODE INTO STRING  ***********************/
    /***********************************************************************************************/
    

    size_t sourceSize[] = { strlen(source) };
    program = clCreateProgramWithSource(context, 1, (const char **)&source, sourceSize, &status);
    if(status != CL_SUCCESS) { PRINTF("\n CreateProgramWithSource failed, %d\n", status); PrintErrMsgQuit(" "); }

    /* create a cl program executable for all the devices specified */
    status = clBuildProgram(program, 1, devices, NULL, NULL, NULL);
    if(status != CL_SUCCESS) {
         //print kernel compilation error
            char programLog[2048];    
        PRINTF("\n BuildProgram failed:%d\n", status);
            status = clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 2048, programLog, 0);
                    
        for (unsigned int i=0; i<2048; i++)
        PRINTF("%c", programLog[i]);
    
        PrintErrMsgQuit(" ");
    }

	kernel_2 = clCreateKernel(program, "FFT_2", &status);
    if(status != CL_SUCCESS)  { PRINTF("\n CreateKernel failed %d\n", status);    PrintErrMsgQuit(" ");}
	kernel_4 = clCreateKernel(program, "FFT_4", &status);
    if(status != CL_SUCCESS)  { PRINTF("\n CreateKernel failed %d\n", status);    PrintErrMsgQuit(" ");}
	kernel_8 = clCreateKernel(program, "FFT_8", &status);
    if(status != CL_SUCCESS)  { PRINTF("\n CreateKernel failed %d\n", status);    PrintErrMsgQuit(" ");}
	kernel_16 = clCreateKernel(program, "FFT_16", &status);
    if(status != CL_SUCCESS)  { PRINTF("\n CreateKernel failed %d\n", status);    PrintErrMsgQuit(" ");}
	kernel_32 = clCreateKernel(program, "FFT_32", &status);
    if(status != CL_SUCCESS)  { PRINTF("\n CreateKernel failed %d\n", status);    PrintErrMsgQuit(" ");}
	kernel_64 = clCreateKernel(program, "FFT_64", &status);
    if(status != CL_SUCCESS)  { PRINTF("\n CreateKernel failed %d\n", status);    PrintErrMsgQuit(" ");}
	kernel_128 = clCreateKernel(program, "FFT_128", &status);
    if(status != CL_SUCCESS)  { PRINTF("\n CreateKernel failed %d\n", status);    PrintErrMsgQuit(" ");}
	kernel_256 = clCreateKernel(program, "FFT_256", &status);
    if(status != CL_SUCCESS)  { PRINTF("\n CreateKernel failed %d\n", status);    PrintErrMsgQuit(" ");}
	kernel_512 = clCreateKernel(program, "FFT_512", &status);
    if(status != CL_SUCCESS)  { PRINTF("\n CreateKernel failed %d\n", status);    PrintErrMsgQuit(" ");}
	kernel_1024 = clCreateKernel(program, "FFT_1024", &status);
    if(status != CL_SUCCESS)  { PRINTF("\n CreateKernel failed %d\n", status);    PrintErrMsgQuit(" ");}

	kernel_pack_complex = clCreateKernel(program, "pack_complex", &status);
    if(status != CL_SUCCESS)  { PRINTF("\n CreateKernel failed %d\n", status);    PrintErrMsgQuit(" ");}

	kernel_unpack_complex = clCreateKernel(program, "unpack_complex", &status);
    if(status != CL_SUCCESS)  { PRINTF("\n CreateKernel failed %d\n", status);    PrintErrMsgQuit(" ");}

    mxFree(source_string);
    
}


void run_CL() {
    cl_event events[2];

    size_t globalThreads[1];
    size_t localThreads[1];
	int shareSize;

    /********************  memcpy and kernel execution ************************************************/
    
    status = clEnqueueWriteBuffer(commandQueue, inputBuffer_matlab, CL_TRUE, 0, complexNumber?width*height*sizeof(CL_DATA_TYPE):width*height*sizeof(CL_DATA_TYPE)/2, input_ptr, 0, NULL, &events[0]);
	PRINTF("%f, %f\n", input_ptr[80], input_ptr[81]);
    if (status != CL_SUCCESS)  PrintErrMsgQuit("Error: Write Buffer. 'clWriteBuffer'\n");
    /* Wait for the read buffer to finish execution */
    
    status = clWaitForEvents(1, &events[0]);
    if(status != CL_SUCCESS)    PrintErrMsgQuit("Error: Waiting for write buffer call to finish. (clWaitForEvents)\n");
    clReleaseEvent(events[0]);
    //clFinish(commandQueue);
    

    //may not be neccessary    
    clFinish(commandQueue);
    
    

	/***************************************** transfer memory from matlab**************************/

	{
		int argv = 0;
		status = clSetKernelArg(kernel_pack_complex, argv++, sizeof(cl_mem), (void *)&inputBuffer);
	    if (status != CL_SUCCESS)  PrintErrMsgQuit("Error: clSetKernelArg\n");

		status = clSetKernelArg(kernel_pack_complex, argv++, sizeof(cl_mem), (void *)&inputBuffer_matlab);
	    if (status != CL_SUCCESS)  PrintErrMsgQuit("Error: clSetKernelArg\n");

		int w0 = width/2*height;
		status = clSetKernelArg(kernel_pack_complex, argv++, sizeof(cl_int), (void*)&w0);
	    if (status != CL_SUCCESS)  PrintErrMsgQuit("Error: clSetKernelArg\n");

		status = clSetKernelArg(kernel_pack_complex, argv++, sizeof(cl_int), (void*)&complexNumber);
	    if (status != CL_SUCCESS)  PrintErrMsgQuit("Error: clSetKernelArg\n");

		globalThreads[0] = w0;
		localThreads[0] = 64;
		status = clEnqueueNDRangeKernel(commandQueue, kernel_pack_complex, 1, NULL, globalThreads, localThreads, 0, NULL, &events[0]);
		if(status != CL_SUCCESS)  PrintErrMsgQuit("Error: clEnqueueNDRangeKernel failed.\n");
		status = clWaitForEvents(1, &events[0]);
		clReleaseEvent(events[0]);
	}
	


/****************************************** CODE CHANGE ZONE STARTS *****************************************************/


    /*** Set appropriate arguments to the kernel ***/
	switch (width/2) {
		case 2:
			globalThreads[0] = height*width/2/2;
			localThreads[0] = 64;
			if (localThreads[0]>globalThreads[0]) localThreads[0] = globalThreads[0];
			shareSize = 16;
			kernel = kernel_2;
//			std::cout << "(" << globalThreads[0] << ", " << localThreads[0] << ")" << std::endl;
			break;
		case 4:
			globalThreads[0] = height*width/2/4;
			localThreads[0] = 64;
			if (localThreads[0]>globalThreads[0]) localThreads[0] = globalThreads[0];
			shareSize = 16;
			kernel = kernel_4;
//			std::cout << "(" << globalThreads[0] << ", " << localThreads[0] << ")" << std::endl;
			break;
		case 8:
			globalThreads[0] = height*width/2/8;
			localThreads[0] = 64;
			if (localThreads[0]>globalThreads[0]) localThreads[0] = globalThreads[0];
			shareSize = 16;
			kernel = kernel_8;
//			std::cout << "(" << globalThreads[0] << ", " << localThreads[0] << ")" << std::endl;
			break;
		case 16:
			globalThreads[0] = height*width/2/16;
			localThreads[0] = 64;
			if (localThreads[0]>globalThreads[0]) localThreads[0] = globalThreads[0];
			shareSize = 16;
			kernel = kernel_16;
//			std::cout << "(" << globalThreads[0] << ", " << localThreads[0] << ")" << std::endl;
			break;
		case 32:
			globalThreads[0] = height*width/2/32;
			localThreads[0] = 64;
			if (localThreads[0]>globalThreads[0]) localThreads[0] = globalThreads[0];
			shareSize = 4*32;
			kernel = kernel_32;
//			std::cout << "(" << globalThreads[0] << ", " << localThreads[0] << ")" << std::endl;
			break;
		case 64:
			globalThreads[0] = height*width/2/4;
			localThreads[0] = 64;
			if (localThreads[0]>globalThreads[0]) localThreads[0] = globalThreads[0];
			shareSize = (64)*4;
			kernel = kernel_64;
//			std::cout << "(" << globalThreads[0] << ", " << localThreads[0] << ")" << std::endl;
			break;
		case 128:
			globalThreads[0] = height*width/2/4;
			localThreads[0] = 64;
			if (localThreads[0]>globalThreads[0]) localThreads[0] = globalThreads[0];
			shareSize = (128)*2;
			kernel = kernel_128;
//			std::cout << "(" << globalThreads[0] << ", " << localThreads[0] << ")" << std::endl;
			break;
		case 256:
			globalThreads[0] = height*width/2/16;
			localThreads[0] = 64;
			if (localThreads[0]>globalThreads[0]) localThreads[0] = globalThreads[0];
			shareSize = (256+16)*4;
			kernel = kernel_256;
//			std::cout << "(" << globalThreads[0] << ", " << localThreads[0] << ")" << std::endl;
			break;
		case 512:
			globalThreads[0] = height*width/2/8;
			localThreads[0] = 64;
			if (localThreads[0]>globalThreads[0]) localThreads[0] = globalThreads[0];
			shareSize = 512;
			kernel = kernel_512;
//			std::cout << "(" << globalThreads[0] << ", " << localThreads[0] << ")" << std::endl;
			break;
		case 1024:
			globalThreads[0] = height*width/2/32;
			localThreads[0] = 64;
			if (localThreads[0]>globalThreads[0]) localThreads[0] = globalThreads[0];
			shareSize = (1024+32)*2;
			kernel = kernel_1024;
//			std::cout << "(" << globalThreads[0] << ", " << localThreads[0] << ")" << std::endl;
	}

	{
		int argv = 0;
		status = clSetKernelArg(kernel, argv++, sizeof(cl_mem), (void *)&outputBuffer);
	    if (status != CL_SUCCESS)  PrintErrMsgQuit("Error: clSetKernelArg\n");

		status = clSetKernelArg(kernel, argv++, sizeof(cl_mem), (void *)&inputBuffer);
	    if (status != CL_SUCCESS)  PrintErrMsgQuit("Error: clSetKernelArg\n");

		int w0 = width/2;
		status = clSetKernelArg(kernel, argv++, sizeof(cl_int), (void*)&w0);
	    if (status != CL_SUCCESS)  PrintErrMsgQuit("Error: clSetKernelArg\n");

		status = clSetKernelArg(kernel, argv++, sizeof(cl_float)*shareSize, NULL);
	    if (status != CL_SUCCESS)  PrintErrMsgQuit("Error: clSetKernelArg\n");
	}


/****************************************** CODE CHANGE ZONE ENDS *****************************************************/
    

	status = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, globalThreads, localThreads, 0, NULL, &events[0]);
	if(status != CL_SUCCESS)  PrintErrMsgQuit("Error: clEnqueueNDRangeKernel failed.\n");



	/* wait for the kernel call to finish execution */
	status = clWaitForEvents(1, &events[0]);
	if(status != CL_SUCCESS)  PrintErrMsgQuit("Error: clWaitForEvents failed.\n");

    {
		long long kernelsStartTime, kernelsEndTime;
		long long enqueueTime, submitTime;
		double totalKernelTime;
		
		status = clGetEventProfilingInfo(events[0], CL_PROFILING_COMMAND_QUEUED, sizeof(long long), &enqueueTime, NULL);
		if(status != CL_SUCCESS)  PrintErrMsgQuit("Error:clGetEventProfilingInfo start failed.\n");
		
		status = clGetEventProfilingInfo(events[0], CL_PROFILING_COMMAND_SUBMIT, sizeof(long long), &submitTime, NULL);
		if(status != CL_SUCCESS)  PrintErrMsgQuit("Error:clGetEventProfilingInfo start failed.\n");
		
        	status = clGetEventProfilingInfo(events[0], CL_PROFILING_COMMAND_START, sizeof(long long), &kernelsStartTime, NULL);
		if(status != CL_SUCCESS)  PrintErrMsgQuit("Error:clGetEventProfilingInfo start failed.\n");
		

        	status = clGetEventProfilingInfo(events[0], CL_PROFILING_COMMAND_END, sizeof(long long), &kernelsEndTime, NULL);
		if(status != CL_SUCCESS)  PrintErrMsgQuit("Error:clGetEventProfilingInfo end failed.\n");

        	/* Compute total time (also convert from nanoseconds to miliseconds) */
        	totalKernelTime = (double)(kernelsEndTime - kernelsStartTime)/1e6;
        	PRINTF("\nThe kernel execution time is %.4f\n", totalKernelTime);
    }
	clReleaseEvent(events[0]);


    
	{
		int argv = 0;
		status = clSetKernelArg(kernel_unpack_complex, argv++, sizeof(cl_mem), (void *)&outputBuffer_ar_matlab);
	    if (status != CL_SUCCESS)  PrintErrMsgQuit("Error: clSetKernelArg\n");

		status = clSetKernelArg(kernel_unpack_complex, argv++, sizeof(cl_mem), (void *)&outputBuffer_ai_matlab);
	    if (status != CL_SUCCESS)  PrintErrMsgQuit("Error: clSetKernelArg\n");

		status = clSetKernelArg(kernel_unpack_complex, argv++, sizeof(cl_mem), (void *)&outputBuffer);
	    if (status != CL_SUCCESS)  PrintErrMsgQuit("Error: clSetKernelArg\n");

		int w0 = width/2*height;
		status = clSetKernelArg(kernel_unpack_complex, argv++, sizeof(cl_int), (void*)&w0);
	    if (status != CL_SUCCESS)  PrintErrMsgQuit("Error: clSetKernelArg\n");

		globalThreads[0] = w0;
		localThreads[0] = 64;
		status = clEnqueueNDRangeKernel(commandQueue, kernel_unpack_complex, 1, NULL, globalThreads, localThreads, 0, NULL, &events[0]);
		if(status != CL_SUCCESS)  PrintErrMsgQuit("Error: clEnqueueNDRangeKernel failed.\n");
		status = clWaitForEvents(1, &events[0]);
		clReleaseEvent(events[0]);
	}

/****************************************** CODE CHANGE ZONE STARTS *****************************************************/
    /* Enqueue readBuffer*/
    status = clEnqueueReadBuffer(commandQueue, outputBuffer_ar_matlab, CL_TRUE, 0, width * height * sizeof(CL_DATA_TYPE)/2, matlab_ar, 0, NULL, &events[1]);
    if(status != CL_SUCCESS)  PrintErrMsgQuit("Error: clEnqueueReadBuffer failed. (clEnqueueReadBuffer)\n");
    
    /* Wait for the read buffer to finish execution */
    status = clWaitForEvents(1, &events[1]);
    if(status != CL_SUCCESS)    PrintErrMsgQuit("Error: Waiting for read buffer call to finish. (clWaitForEvents)\n");
    
    status = clEnqueueReadBuffer(commandQueue, outputBuffer_ai_matlab, CL_TRUE, 0, width * height * sizeof(CL_DATA_TYPE)/2, matlab_ai, 0, NULL, &events[1]);
    if(status != CL_SUCCESS)  PrintErrMsgQuit("Error: clEnqueueReadBuffer failed. (clEnqueueReadBuffer)\n");
    
    /* Wait for the read buffer to finish execution */
    status = clWaitForEvents(1, &events[1]);
    if(status != CL_SUCCESS)    PrintErrMsgQuit("Error: Waiting for read buffer call to finish. (clWaitForEvents)\n");

/****************************************** CODE CHANGE ZONE ENDS *****************************************************/
	PRINTF("%f, %f\n", input_ptr[80], input_ptr[81]);
    
    clReleaseEvent(events[1]);     
    
}

void cleanup_CL() {

    /*******************************CLEAN UP ***************************************/
    /*******************************************************************************/
    status = clReleaseKernel(kernel_2);
    if(status != CL_SUCCESS) PrintErrMsgQuit("Error: In clReleaseKernel \n");
        
    status = clReleaseKernel(kernel_4);
    if(status != CL_SUCCESS) PrintErrMsgQuit("Error: In clReleaseKernel \n");
    
    status = clReleaseKernel(kernel_8);
    if(status != CL_SUCCESS) PrintErrMsgQuit("Error: In clReleaseKernel \n");

    status = clReleaseKernel(kernel_16);
    if(status != CL_SUCCESS) PrintErrMsgQuit("Error: In clReleaseKernel \n");

    status = clReleaseKernel(kernel_32);
    if(status != CL_SUCCESS) PrintErrMsgQuit("Error: In clReleaseKernel \n");
    
    status = clReleaseKernel(kernel_64);
    if(status != CL_SUCCESS) PrintErrMsgQuit("Error: In clReleaseKernel \n");
    status = clReleaseKernel(kernel_128);
    if(status != CL_SUCCESS) PrintErrMsgQuit("Error: In clReleaseKernel \n");
    status = clReleaseKernel(kernel_256);
    if(status != CL_SUCCESS) PrintErrMsgQuit("Error: In clReleaseKernel \n");
    status = clReleaseKernel(kernel_512);
    if(status != CL_SUCCESS) PrintErrMsgQuit("Error: In clReleaseKernel \n");
    status = clReleaseKernel(kernel_1024);
    if(status != CL_SUCCESS) PrintErrMsgQuit("Error: In clReleaseKernel \n");

/****************************************** CODE CHANGE ZONE STARTS *****************************************************/    
    
    status = clReleaseMemObject(inputBuffer);
    if(status != CL_SUCCESS) PrintErrMsgQuit("Error: In clReleaseMemObject (inputBuffer)\n");
    
    status = clReleaseMemObject(outputBuffer);
    if(status != CL_SUCCESS) PrintErrMsgQuit("Error: In clReleaseMemObject (outputBuffer)\n");



/****************************************** CODE CHANGE ZONE ENDS *****************************************************/


    status = clReleaseCommandQueue(commandQueue);
    if(status != CL_SUCCESS) PrintErrMsgQuit("Error: In clReleaseCommandQueue\n");
        
    status = clReleaseContext(context);
    if(status != CL_SUCCESS) PrintErrMsgQuit("Error: In clReleaseContext\n");
    
    
}






void init_Host_MATLAB() {
 
}

void cleanup_Host_MATLAB(){
    
    
}



void mexFunction(int nlhs, mxArray  *plhs[], int nrhs, const mxArray  *prhs[])
{
    mxClassID category;
    float *ar,*ai;
    N = mxGetM(prhs[0]);
    
	// batch size
	M = height = mxGetN(prhs[0]);
    width = 2*N;
    PRINTF("w%d, h%d, nrhs%d\n", width, height, nrhs);


	complexNumber = 0;
//    input_ptr = (float*) mxMalloc(sizeof(float2)*N*M);
    category = mxGetClassID(prhs[0]);
    if( category == mxSINGLE_CLASS)  
    {
        float *ar,*ai;
        /* Pointer for the real part of the input */
        ar =  (float *) mxGetData(prhs[0]);
		PRINTF("ar %f, %f\n", ar[80], ar[81]);
		input_ptr = ar;
		complexNumber = mxIsComplex(prhs[0])?1:0;
		/*
        if(mxIsComplex(prhs[0]))
        {
            ai =  (float *) mxGetImagData(prhs[0]);
            if(nrhs ==1) pack_c2c_sp((float2*)input_ptr, ar, ai, M, N);
        }
        else
        {
            if(nrhs ==1) pack_r2c_sp((float2*)input_ptr, ar, M, N);

        }
		*/

    }
	PRINTF("%f, %f\n", input_ptr[80], input_ptr[81]);


/****************************************** CODE CHANGE ZONE STARTS *****************************************************/


    
    plhs[0]=mxCreateNumericMatrix(N, M, category, mxCOMPLEX);
//    output_ptr = input_ptr;
	matlab_ar = (float *) mxGetPr(plhs[0]); 
	matlab_ai = (float *) mxGetPi(plhs[0]);
//	output_ptr = ar;
//    unpack_c2c_sp((float2*)input_ptr, ar, ai, M, N); 


/****************************************** CODE CHANGE ZONE ENDS *****************************************************/

    
    init_Host_MATLAB();
    
    //JK  We only need one-time setup for the MEX function
    if (kernel_8 == NULL) 
            init_CL();
    run_CL();
    cleanup_Host_MATLAB();

//    mxFree(input_ptr);
    
    //JK comment out cleanup function to keep those global variables alive in MATLAB
//    cleanup_CL(); 
    

/*
for i=1:2048
    for j=1:2048
        if result(i,j)==0
        else
        fprintf('%d, %d, %f\n', i,j, result(i,j))
        end
    end
end


k = 1
for N = 1:10
    k = k*2;
    A = rand(k, 1);
    A = single(A);
    C = FFT_small_ATI(A);
    B = ifft(C);
    D = real(B);
    result = D-A;
    max(result)
end
*/
}