softmax

TannerLow · TannerLow · commit 74155b0f1abc · 2024-05-08T00:33:35.000-05:00
diff --git a/build.gradle b/build.gradle
@@ -3,7 +3,7 @@ plugins {
 }
 
 group 'com.github.TannerLow'
-version '0.3'
+version '0.4'
 description 'Matrix Library with GPU compatibility.'
 
 repositories {
diff --git a/src/main/java/com/github/TannerLow/JavaMatrixMath/Matrix.java b/src/main/java/com/github/TannerLow/JavaMatrixMath/Matrix.java
@@ -91,6 +91,38 @@ public Matrix relu() {
         return result;
     }
 
+    /**
+     * Row-wise softmax computed on the CPU; this Matrix is only read, never modified.
+     * @return a new {@code rows x cols} Matrix whose rows each hold the softmax of the matching input row
+     */
+    public Matrix softmax() {
+        Matrix result = new Matrix(rows, cols);
+        for(int row = 0; row < rows; row++) {
+            int offset = row * cols;
+
+            // Row maximum; subtracting it before exp() keeps every exponent <= 0 so exp cannot overflow.
+            float max = -Float.MAX_VALUE;
+            for(int i = 0; i < cols; i++) {
+                float value = data[offset + i];
+                if(value > max) {
+                    max = value;
+                }
+            }
+
+            // Sum of the shifted exponentials (the softmax denominator for this row).
+            float sum = 0;
+            for(int i = 0; i < cols; i++) {
+                sum += Math.exp(data[offset + i] - max);
+            }
+
+            // Normalize each shifted exponential by the row sum.
+            for(int i = 0; i < cols; i++) {
+                result.data[offset + i] = (float) (Math.exp(data[offset + i] - max) / sum);
+            }
+        }
+        return result;
+    }
+
     public static boolean isCompatibleWithGPU(GPU gpu) {
         return  gpu.isInitialized() &&
                 gpu.getKernel("Matrices::matrixMultiply") != null &&
@@ -222,21 +254,21 @@ public Matrix relu(GPU gpu) {
 
         Matrix result = new Matrix(rows, cols);
 
-        Pointer pointerA = Pointer.to(data);
+        Pointer pointerIn = Pointer.to(data);
         Pointer pointerOut = Pointer.to(result.data);
 
         // Allocate the memory objects for the input- and output data
-        cl_mem memoryA = clCreateBuffer(context,
+        cl_mem memoryIn = clCreateBuffer(context,
                 CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
-                Sizeof.cl_float * data.length, pointerA, null);
+                Sizeof.cl_float * data.length, pointerIn, null);
         cl_mem memoryOut = clCreateBuffer(context,
                 CL_MEM_READ_WRITE,
                 Sizeof.cl_float * result.data.length, null, null);
 
         // Set the arguments for the kernel
         int argNum = 0;
         clSetKernelArg(kernel, argNum++, Sizeof.cl_mem, Pointer.to(memoryOut));
-        clSetKernelArg(kernel, argNum++, Sizeof.cl_mem, Pointer.to(memoryA));
+        clSetKernelArg(kernel, argNum++, Sizeof.cl_mem, Pointer.to(memoryIn));
         clSetKernelArg(kernel, argNum++, Sizeof.cl_uint, Pointer.to(new int[]{cols}));
 
         // Set the work-item dimensions
@@ -251,7 +283,53 @@ public Matrix relu(GPU gpu) {
         clEnqueueReadBuffer(commandQueue, memoryOut, CL_TRUE, 0,
                 result.data.length * Sizeof.cl_float, pointerOut, 0, null, null);
 
-        clReleaseMemObject(memoryA);
+        clReleaseMemObject(memoryIn);
+        clReleaseMemObject(memoryOut);
+
+        return result;
+    }
+
+    public Matrix softmax(GPU gpu) {
+        cl_context context = gpu.getContext();
+        cl_command_queue commandQueue = gpu.getCommandQueue();
+        cl_kernel kernel = gpu.getKernel("Matrices::softmax");
+
+        if(kernel == null) {
+            return null;
+        }
+
+        Matrix result = new Matrix(rows, cols);
+
+        Pointer pointerIn = Pointer.to(data);
+        Pointer pointerOut = Pointer.to(result.data);
+
+        // Allocate the memory objects for the input- and output data
+        cl_mem memoryIn = clCreateBuffer(context,
+                CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+                Sizeof.cl_float * data.length, pointerIn, null);
+        cl_mem memoryOut = clCreateBuffer(context,
+                CL_MEM_READ_WRITE,
+                Sizeof.cl_float * result.data.length, null, null);
+
+        // Set the arguments for the kernel
+        int argNum = 0;
+        clSetKernelArg(kernel, argNum++, Sizeof.cl_mem, Pointer.to(memoryOut));
+        clSetKernelArg(kernel, argNum++, Sizeof.cl_mem, Pointer.to(memoryIn));
+        clSetKernelArg(kernel, argNum++, Sizeof.cl_uint, Pointer.to(new int[]{cols}));
+
+        // Set the work-item dimensions
+        long local_work_sizes[] = new long[]{1};
+        long global_work_sizes[] = new long[]{rows};
+
+        // Execute the kernel
+        clEnqueueNDRangeKernel(commandQueue, kernel, 1, null,
+                global_work_sizes, local_work_sizes, 0, null, null);
+
+        // Read the output data
+        clEnqueueReadBuffer(commandQueue, memoryOut, CL_TRUE, 0,
+                result.data.length * Sizeof.cl_float, pointerOut, 0, null, null);
+
+        clReleaseMemObject(memoryIn);
         clReleaseMemObject(memoryOut);
 
         return result;
diff --git a/src/main/resources/kernels/Matrices.cl b/src/main/resources/kernels/Matrices.cl
@@ -62,10 +62,10 @@ addRowToRows(__global float* C,
     }
 }
 
-// Add row to rows: C = ReLu(A).
+// ReLU: output = max(input, 0), applied element-wise.
 __kernel void
-relu(__global float* C,
-     __global float* A,
+relu(__global float* output,
+     __global float* input,
      const int rowSize)
 {
     int globalRow = get_global_id(0);
@@ -75,13 +75,46 @@ relu(__global float* C,
     {
         int index = globalRow * rowSize + i;
 
-        // C[i] = max(C[i], 0)
-        float value = A[index];
+        // output[index] = max(input[index], 0)
+        float value = input[index];
         float newValue = 0;
         if(value > 0) {
             newValue = value;
         }
 
-        C[index] = newValue;
+        output[index] = newValue;
     }
-}
+}
+
+// Softmax: each work-item handles one row (get_global_id(0)) of rowSize floats and writes
+// output[i] = exp(input[i] - rowMax) / sum_j exp(input[j] - rowMax) for that row.
+__kernel void softmax(__global float* output,
+                      __global float* input,
+                      const int rowSize)
+{
+    int globalRow = get_global_id(0);
+    int offset = globalRow * rowSize;
+
+    // Get the max value of the row. Seed with -FLT_MAX (most negative finite float, matching
+    // the CPU path's -Float.MAX_VALUE) so that no finite input can fall below the seed.
+    float rowMax = -FLT_MAX;
+    float value;
+    for (int i = 0; i < rowSize; i++) {
+        value = input[offset + i];
+        if(value > rowMax) {
+            rowMax = value;
+        }
+    }
+
+    // Calculate sum of exponentials, shifted by rowMax so every exponent is <= 0
+    float sum = 0.0f;
+    for (int i = 0; i < rowSize; i++) {
+        sum += exp(input[offset + i] - rowMax);
+    }
+
+    // Calculate softmax for each element
+    for (int i = 0; i < rowSize; i++) {
+        int index = offset + i;
+        output[index] = exp(input[index] - rowMax) / sum;
+    }
+}
diff --git a/src/test/java/com/github/TannerLow/JavaMatrixMath/CpuTest.java b/src/test/java/com/github/TannerLow/JavaMatrixMath/CpuTest.java
@@ -8,6 +8,7 @@ public static void testAll() {
         testMultiply();
         testAddRowToRows();
         testRelu();
+        testSoftmax();
     }
 
     private static void testMultiply() {
@@ -70,4 +71,23 @@ private static void testRelu() {
             }
         }
     }
+
+    private static void testSoftmax() {
+        // A single 1x4 row and the softmax values it is expected to produce.
+        float[] input = {1.1f,2.2f,0.2f,-1.7f};
+        float[] expected = {0.223636f,0.671841f,0.090923f,0.013599f};
+        Matrix m = new Matrix(1, 4, input);
+        Matrix result = m.softmax();
+
+        // The result must keep the shape of the input.
+        if(!(result.rows == m.rows && result.cols == m.cols)) {
+            throw new TestFailedException();
+        }
+        // Every element must be within 0.0005 of its expected value.
+        for(int idx = 0; idx < result.data.length; idx++) {
+            boolean close = TestMath.withinMariginOfError(expected[idx], result.data[idx], 0.0005f);
+            if(!close) {
+                throw new TestFailedException();
+            }
+        }
+    }
 }
diff --git a/src/test/java/com/github/TannerLow/JavaMatrixMath/GpuTest.java b/src/test/java/com/github/TannerLow/JavaMatrixMath/GpuTest.java
@@ -18,6 +18,7 @@ public static void testAll() throws IOException {
             testMultiply();
             testAddRowToRows();
             testRelu();
+            testSoftmax();
         }
     }
 
@@ -34,6 +35,7 @@ private static void setup() throws IOException {
         gpu.loadKernel(programId, "Matrices", "matrixMultiply");
         gpu.loadKernel(programId, "Matrices", "addRowToRows");
         gpu.loadKernel(programId, "Matrices", "relu");
+        gpu.loadKernel(programId, "Matrices", "softmax");
 
         if(!gpu.isInitialized()) {
             throw new IllegalStateException("GPU in unexpected state.");
@@ -101,6 +103,26 @@ private static void testRelu() {
         }
     }
 
+    private static void testSoftmax() {
+        // Softmax of a single 1x4 row on the GPU, checked against precomputed expected values.
+        float[] data = {1.1f,2.2f,0.2f,-1.7f};
+        float[] expected = {0.223636f,0.671841f,0.090923f,0.013599f};
+        Matrix m = new Matrix(1, 4, data);
+        Matrix result = m.softmax(gpu);
+
+        // softmax(GPU) returns null when its kernel is missing: fail the test instead of hitting an NPE.
+        if(result == null || result.rows != m.rows || result.cols != m.cols) {
+            throw new TestFailedException();
+        }
+
+        for(int i = 0; i < result.data.length; i++) {
+            if(!TestMath.withinMariginOfError(expected[i], result.data[i], 0.0005f)) {
+                System.out.println(expected[i] + " vs. " + result.data[i]); // show the mismatch
+                throw new TestFailedException();
+            }
+        }
+    }
+
     private static String readFromInternalFile(String filepath) {
         try(InputStream fileInputStream = InternalFile.getInstance().getFileInputStream(filepath)) {
             byte[] bytes = fileInputStream.readAllBytes();

Original file line number	Diff line number	Diff line change
`@@ -3,7 +3,7 @@ plugins {`
`3`	`3`	`}`
`4`	`4`
`5`	`5`	`group 'com.github.TannerLow'`
`6`		`-version '0.3'`
	`6`	`+version '0.4'`
`7`	`7`	`description 'Matrix Library with GPU compatibility.'`
`8`	`8`
`9`	`9`	`repositories {`