@@ -122,7 +122,7 @@ public Matrix vectorizedReluDerivative() {
122122 return result ;
123123 }
124124
125- public Matrix softmax () {
125+ public Matrix horizontalSoftmax () {
126126 Matrix result = new Matrix (rows , cols );
127127
128128 float [] buffer = new float [rows ];
@@ -154,6 +154,37 @@ public Matrix softmax() {
154154 return result ;
155155 }
156156
157+ public Matrix verticalSoftmax () {
158+ Matrix result = new Matrix (rows , cols );
159+
160+ float [] buffer = new float [cols ];
161+ for (int col = 0 ; col < cols ; col ++) {
162+ // calculate the max values
163+ buffer [col ] = -Float .MAX_VALUE ;
164+ for (int i = 0 ; i < rows ; i ++) {
165+ float value = data [col + i * cols ];
166+ if (value > buffer [col ]) {
167+ buffer [col ] = value ;
168+ }
169+ }
170+
171+ // calculate the sums
172+ float sum = 0 ;
173+ float max = buffer [col ];
174+ for (int i = 0 ; i < rows ; i ++) {
175+ sum += Math .exp (data [col + i * cols ] - max );
176+ }
177+
178+ // calculate the softmax vectors
179+ for (int i = 0 ; i < rows ; i ++) {
180+ int index = col + i * cols ;
181+ result .data [index ] = (float ) (Math .exp (data [index ] - max ) / sum );
182+ }
183+ }
184+
185+ return result ;
186+ }
187+
157188// public Matrix fastBatchSoftmaxDerivative(Matrix output) {
158189// Matrix partialDerivatives = new Matrix(cols, cols);
159190//
@@ -187,20 +218,25 @@ public static boolean isCompatibleWithGPU(GPU gpu) {
187218 return gpu .isInitialized () &&
188219 gpu .getKernel ("Matrices::matrixMultiply" ) != null &&
189220 gpu .getKernel ("Matrices::addRowToRows" ) != null &&
190- gpu .getKernel ("Matrices::relu" ) != null ;
221+ gpu .getKernel ("Matrices::addColToCols" ) != null &&
222+ gpu .getKernel ("Matrices::relu" ) != null &&
223+ gpu .getKernel ("Matrices::horizontalSoftmax" ) != null &&
224+ gpu .getKernel ("Matrices::verticalSoftmax" ) != null ;
191225 }
192226
193227 public Matrix multiply (GPU gpu , Matrix other ) {
194228 if (cols != other .rows ) {
195- return null ;
229+ final int [] dimensionsA = {rows , cols };
230+ final int [] dimensionsB = {other .rows , other .cols };
231+ throw new DimensionsMismatchException (dimensionsA , dimensionsB );
196232 }
197233
198234 cl_context context = gpu .getContext ();
199235 cl_command_queue commandQueue = gpu .getCommandQueue ();
200236 cl_kernel kernel = gpu .getKernel ("Matrices::matrixMultiply" );
201237
202238 if (kernel == null ) {
203- return null ;
239+ throw new NullPointerException ( "Matrices::matrixMultiply not found to be loaded in GPU" ) ;
204240 }
205241
206242 Matrix result = new Matrix (rows , other .cols );
@@ -369,7 +405,7 @@ public Matrix relu(GPU gpu) {
369405 cl_kernel kernel = gpu .getKernel ("Matrices::relu" );
370406
371407 if (kernel == null ) {
372- return null ;
408+ throw new NullPointerException ( "Matrices::relu not found to be loaded in GPU" ) ;
373409 }
374410
375411 Matrix result = new Matrix (rows , cols );
@@ -409,13 +445,13 @@ public Matrix relu(GPU gpu) {
409445 return result ;
410446 }
411447
412- public Matrix softmax (GPU gpu ) {
448+ public Matrix horizontalSoftmax (GPU gpu ) {
413449 cl_context context = gpu .getContext ();
414450 cl_command_queue commandQueue = gpu .getCommandQueue ();
415- cl_kernel kernel = gpu .getKernel ("Matrices::softmax " );
451+ cl_kernel kernel = gpu .getKernel ("Matrices::horizontalSoftmax " );
416452
417453 if (kernel == null ) {
418- return null ;
454+ throw new NullPointerException ( "Matrices::horizontalSoftmax not found to be loaded in GPU" ) ;
419455 }
420456
421457 Matrix result = new Matrix (rows , cols );
@@ -454,4 +490,51 @@ public Matrix softmax(GPU gpu) {
454490
455491 return result ;
456492 }
493+
494+ public Matrix verticalSoftmax (GPU gpu ) {
495+ cl_context context = gpu .getContext ();
496+ cl_command_queue commandQueue = gpu .getCommandQueue ();
497+ cl_kernel kernel = gpu .getKernel ("Matrices::verticalSoftmax" );
498+
499+ if (kernel == null ) {
500+ throw new NullPointerException ("Matrices::verticalSoftmax not found to be loaded in GPU" );
501+ }
502+
503+ Matrix result = new Matrix (rows , cols );
504+
505+ Pointer pointerIn = Pointer .to (data );
506+ Pointer pointerOut = Pointer .to (result .data );
507+
508+ // Allocate the memory objects for the input- and output data
509+ cl_mem memoryIn = clCreateBuffer (context ,
510+ CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR ,
511+ Sizeof .cl_float * data .length , pointerIn , null );
512+ cl_mem memoryOut = clCreateBuffer (context ,
513+ CL_MEM_READ_WRITE ,
514+ Sizeof .cl_float * result .data .length , null , null );
515+
516+ // Set the arguments for the kernel
517+ int argNum = 0 ;
518+ clSetKernelArg (kernel , argNum ++, Sizeof .cl_mem , Pointer .to (memoryOut ));
519+ clSetKernelArg (kernel , argNum ++, Sizeof .cl_mem , Pointer .to (memoryIn ));
520+ clSetKernelArg (kernel , argNum ++, Sizeof .cl_uint , Pointer .to (new int []{cols }));
521+ clSetKernelArg (kernel , argNum ++, Sizeof .cl_uint , Pointer .to (new int []{rows }));
522+
523+ // Set the work-item dimensions
524+ long local_work_sizes [] = new long []{1 };
525+ long global_work_sizes [] = new long []{cols };
526+
527+ // Execute the kernel
528+ clEnqueueNDRangeKernel (commandQueue , kernel , 1 , null ,
529+ global_work_sizes , local_work_sizes , 0 , null , null );
530+
531+ // Read the output data
532+ clEnqueueReadBuffer (commandQueue , memoryOut , CL_TRUE , 0 ,
533+ result .data .length * Sizeof .cl_float , pointerOut , 0 , null , null );
534+
535+ clReleaseMemObject (memoryIn );
536+ clReleaseMemObject (memoryOut );
537+
538+ return result ;
539+ }
457540}
0 commit comments