@@ -91,6 +91,38 @@ public Matrix relu() {
9191 return result ;
9292 }
9393
94+ public Matrix softmax () {
95+ Matrix result = new Matrix (rows , cols );
96+
97+ float [] buffer = new float [rows ];
98+ for (int row = 0 ; row < rows ; row ++) {
99+ int offset = row * cols ;
100+
101+ // calculate the max values
102+ buffer [row ] = -Float .MAX_VALUE ;
103+ for (int i = 0 ; i < cols ; i ++) {
104+ float value = data [offset + i ];
105+ if (value > buffer [row ]) {
106+ buffer [row ] = value ;
107+ }
108+ }
109+
110+ // calculate the sums
111+ float sum = 0 ;
112+ float max = buffer [row ];
113+ for (int i = 0 ; i < cols ; i ++) {
114+ sum += Math .exp (data [offset + i ] - max );
115+ }
116+
117+ // calculate the softmax vectors
118+ for (int i = 0 ; i < cols ; i ++) {
119+ result .data [offset + i ] = (float ) (Math .exp (data [offset + i ] - max ) / sum );
120+ }
121+ }
122+
123+ return result ;
124+ }
125+
94126 public static boolean isCompatibleWithGPU (GPU gpu ) {
95127 return gpu .isInitialized () &&
96128 gpu .getKernel ("Matrices::matrixMultiply" ) != null &&
@@ -222,21 +254,21 @@ public Matrix relu(GPU gpu) {
222254
223255 Matrix result = new Matrix (rows , cols );
224256
225- Pointer pointerA = Pointer .to (data );
257+ Pointer pointerIn = Pointer .to (data );
226258 Pointer pointerOut = Pointer .to (result .data );
227259
228260 // Allocate the memory objects for the input- and output data
229- cl_mem memoryA = clCreateBuffer (context ,
261+ cl_mem memoryIn = clCreateBuffer (context ,
230262 CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR ,
231- Sizeof .cl_float * data .length , pointerA , null );
263+ Sizeof .cl_float * data .length , pointerIn , null );
232264 cl_mem memoryOut = clCreateBuffer (context ,
233265 CL_MEM_READ_WRITE ,
234266 Sizeof .cl_float * result .data .length , null , null );
235267
236268 // Set the arguments for the kernel
237269 int argNum = 0 ;
238270 clSetKernelArg (kernel , argNum ++, Sizeof .cl_mem , Pointer .to (memoryOut ));
239- clSetKernelArg (kernel , argNum ++, Sizeof .cl_mem , Pointer .to (memoryA ));
271+ clSetKernelArg (kernel , argNum ++, Sizeof .cl_mem , Pointer .to (memoryIn ));
240272 clSetKernelArg (kernel , argNum ++, Sizeof .cl_uint , Pointer .to (new int []{cols }));
241273
242274 // Set the work-item dimensions
@@ -251,7 +283,53 @@ public Matrix relu(GPU gpu) {
251283 clEnqueueReadBuffer (commandQueue , memoryOut , CL_TRUE , 0 ,
252284 result .data .length * Sizeof .cl_float , pointerOut , 0 , null , null );
253285
254- clReleaseMemObject (memoryA );
286+ clReleaseMemObject (memoryIn );
287+ clReleaseMemObject (memoryOut );
288+
289+ return result ;
290+ }
291+
292+ public Matrix softmax (GPU gpu ) {
293+ cl_context context = gpu .getContext ();
294+ cl_command_queue commandQueue = gpu .getCommandQueue ();
295+ cl_kernel kernel = gpu .getKernel ("Matrices::softmax" );
296+
297+ if (kernel == null ) {
298+ return null ;
299+ }
300+
301+ Matrix result = new Matrix (rows , cols );
302+
303+ Pointer pointerIn = Pointer .to (data );
304+ Pointer pointerOut = Pointer .to (result .data );
305+
306+ // Allocate the memory objects for the input- and output data
307+ cl_mem memoryIn = clCreateBuffer (context ,
308+ CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR ,
309+ Sizeof .cl_float * data .length , pointerIn , null );
310+ cl_mem memoryOut = clCreateBuffer (context ,
311+ CL_MEM_READ_WRITE ,
312+ Sizeof .cl_float * result .data .length , null , null );
313+
314+ // Set the arguments for the kernel
315+ int argNum = 0 ;
316+ clSetKernelArg (kernel , argNum ++, Sizeof .cl_mem , Pointer .to (memoryOut ));
317+ clSetKernelArg (kernel , argNum ++, Sizeof .cl_mem , Pointer .to (memoryIn ));
318+ clSetKernelArg (kernel , argNum ++, Sizeof .cl_uint , Pointer .to (new int []{cols }));
319+
320+ // Set the work-item dimensions
321+ long local_work_sizes [] = new long []{1 };
322+ long global_work_sizes [] = new long []{rows };
323+
324+ // Execute the kernel
325+ clEnqueueNDRangeKernel (commandQueue , kernel , 1 , null ,
326+ global_work_sizes , local_work_sizes , 0 , null , null );
327+
328+ // Read the output data
329+ clEnqueueReadBuffer (commandQueue , memoryOut , CL_TRUE , 0 ,
330+ result .data .length * Sizeof .cl_float , pointerOut , 0 , null , null );
331+
332+ clReleaseMemObject (memoryIn );
255333 clReleaseMemObject (memoryOut );
256334
257335 return result ;
0 commit comments