Hot questions for Using Lightweight Java Game Library in opencl

Question:

I've been looking into OpenCL for use with optimizing code and running tasks in parallel to achieve greater speed over pure Java. Now I'm having a bit of an issue.

I've put together a Java program using LWJGL, which, as far as I can tell, should be able to do nearly identical tasks -- in this case adding elements from two arrays together and storing the result in another array -- two separate ways: one with pure Java, and the other with an OpenCL kernel. I'm using System.currentTimeMillis() to keep track of how long each one takes for arrays with a large number of elements (~10,000,000). For whatever reason, the pure Java loop seems to be executing around 3 to 10 times faster than the CL program, depending on array size. My code is as follows (imports omitted):

/**
 * Benchmarks element-wise addition of two float arrays, once in pure Java on
 * the CPU and once via an OpenCL kernel (LWJGL bindings), printing the wall
 * clock time of each run to stdout.
 */
public class TestCL {

    /** Number of elements in each test array; this value is changed sometimes in between tests. */
    private static final int SIZE = 9999999;

    private static CLContext context;           // OpenCL context
    private static CLPlatform platform;         // OpenCL platform
    private static List<CLDevice> devices;      // List of CL devices on the platform
    private static CLCommandQueue queue;        // Command queue for the context
    private static float[] aData, bData, rData; // Host-side test data (rData only used for CPU test)

    //---Kernel Code---
    // For each global work item below 'size': result[i] = a[i] + b[i].
    //-----------------
    private static String kernel = "kernel void sum(global const float* a, global const float* b, global float* result, int const size){\n" + 
            "const int itemId = get_global_id(0);\n" + 
            "if(itemId < size){\n" + 
            "result[itemId] = a[itemId] + b[itemId];\n" +
            "}\n" +
            "}";

    public static void main(String[] args){

        aData = new float[SIZE];
        bData = new float[SIZE];
        rData = new float[SIZE]; //Only used for CPU testing

        //arbitrary testing data
        for(int i=0; i<SIZE; i++){
            aData[i] = i;
            bData[i] = SIZE - i;
        }

        try {
            testCPU(); //How long does it take running in traditional Java code on the CPU?
            testGPU(); //How long does the GPU take to run it w/ CL?
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Test the CPU with pure Java code: adds the arrays element-wise and
     * prints the elapsed time.
     */
    private static void testCPU(){
        long time = System.currentTimeMillis();
        for(int i=0; i<SIZE; i++){
            rData[i] = aData[i] + bData[i];
        }
        //Print the time FROM THE START OF THE testCPU() FUNCTION UNTIL NOW
        System.out.println("CPU processing time for " + SIZE + " elements: " + (System.currentTimeMillis() - time));
    }

    /**
     * Test the GPU with OpenCL: builds the kernel, uploads both input arrays,
     * runs the kernel over SIZE work items, reads the result back, and prints
     * the elapsed time (measured from argument setup through the blocking read).
     * @throws LWJGLException if CL initialization fails
     */
    private static void testGPU() throws LWJGLException {
        CLInit(); //Initialize CL and CL Objects

        //Create the CL Program
        CLProgram program = CL10.clCreateProgramWithSource(context, kernel, null);

        int error = CL10.clBuildProgram(program, devices.get(0), "", null);
        Util.checkCLError(error);

        //Create the Kernel
        CLKernel sum = CL10.clCreateKernel(program, "sum", null);

        //Error checker
        IntBuffer eBuf = BufferUtils.createIntBuffer(1);

        //Floatbuffer for the first array of floats.
        //BUG FIX: the kernel only READS a and b, so these buffers must be
        //CL_MEM_READ_ONLY from the device's perspective (they were WRITE_ONLY).
        FloatBuffer aBuf = BufferUtils.createFloatBuffer(SIZE);
        aBuf.put(aData);
        aBuf.rewind();
        CLMem aMem = CL10.clCreateBuffer(context, CL10.CL_MEM_READ_ONLY | CL10.CL_MEM_COPY_HOST_PTR, aBuf, eBuf);
        Util.checkCLError(eBuf.get(0));

        //And the second
        FloatBuffer bBuf = BufferUtils.createFloatBuffer(SIZE);
        bBuf.put(bData);
        bBuf.rewind();
        CLMem bMem = CL10.clCreateBuffer(context, CL10.CL_MEM_READ_ONLY | CL10.CL_MEM_COPY_HOST_PTR, bBuf, eBuf);
        Util.checkCLError(eBuf.get(0));

        //Memory object to store the result.
        //BUG FIX: the kernel WRITES the result, so this buffer must be
        //CL_MEM_WRITE_ONLY from the device's perspective (it was READ_ONLY).
        CLMem rMem = CL10.clCreateBuffer(context, CL10.CL_MEM_WRITE_ONLY, SIZE * 4, eBuf);
        Util.checkCLError(eBuf.get(0));

        //Get time before setting kernel arguments
        long time = System.currentTimeMillis();

        sum.setArg(0, aMem);
        sum.setArg(1, bMem);
        sum.setArg(2, rMem);
        sum.setArg(3, SIZE);

        //One-dimensional NDRange: SIZE global work items, local size left to the driver.
        final int dim = 1;
        PointerBuffer workSize = BufferUtils.createPointerBuffer(dim);
        workSize.put(0, SIZE);

        //Actually running the program
        CL10.clEnqueueNDRangeKernel(queue, sum, dim, null, workSize, null, null, null);
        CL10.clFinish(queue);

        //Read results back into a FloatBuffer (CL_TRUE = blocking read)
        FloatBuffer res = BufferUtils.createFloatBuffer(SIZE);
        CL10.clEnqueueReadBuffer(queue, rMem, CL10.CL_TRUE, 0, res, null, null);

        //How long did it take?
        //Print the time FROM THE SETTING OF KERNEL ARGUMENTS UNTIL NOW
        System.out.println("GPU processing time for " + SIZE + " elements: " + (System.currentTimeMillis() - time));

        //Cleanup objects
        CL10.clReleaseKernel(sum);
        CL10.clReleaseProgram(program);
        CL10.clReleaseMemObject(aMem);
        CL10.clReleaseMemObject(bMem);
        CL10.clReleaseMemObject(rMem);

        CLCleanup();
    }

    /**
     * Initialize CL objects: first platform, its GPU devices, a context, and a
     * command queue (with profiling enabled) on the first device.
     * @throws LWJGLException if CL cannot be created
     */
    private static void CLInit() throws LWJGLException {
        IntBuffer eBuf = BufferUtils.createIntBuffer(1);

        CL.create();

        platform = CLPlatform.getPlatforms().get(0);
        devices = platform.getDevices(CL10.CL_DEVICE_TYPE_GPU);
        context = CLContext.create(platform, devices, eBuf);
        queue = CL10.clCreateCommandQueue(context, devices.get(0), CL10.CL_QUEUE_PROFILING_ENABLE, eBuf);

        Util.checkCLError(eBuf.get(0));
    }

    /**
     * Cleanup after CL completion: release the queue and context, then tear
     * down the CL binding.
     */
    private static void CLCleanup(){
        CL10.clReleaseCommandQueue(queue);
        CL10.clReleaseContext(context);
        CL.destroy();
    }

}

Here are a few example console results from various tests:

CPU processing time for 10000000 elements: 24
GPU processing time for 10000000 elements: 88

CPU processing time for 1000000 elements: 7
GPU processing time for 1000000 elements: 10

CPU processing time for 100000000 elements: 193
GPU processing time for 100000000 elements: 943

Is there something wrong with my code that's causing the CL version to run slower, or is that actually to be expected in cases such as this? If it's the latter, when is CL preferable?


Answer:

I revised the test to do something which I believe is more computationally expensive than simple addition.

Regarding the CPU test, the line:

rData[i] = aData[i] + bData[i];

was changed to:

rData[i] = (float)(Math.sin(aData[i]) * Math.cos(bData[i]));

And in the CL kernel, the line:

result[itemId] = a[itemId] + b[itemId];

was changed to:

result[itemId] = sin(a[itemId]) * cos(b[itemId]);

I'm now getting console results such as:

CPU processing time for 1000000 elements: 154
GPU processing time for 1000000 elements: 11

CPU processing time for 10000000 elements: 8699
GPU processing time for 10000000 elements: 98

(The CPU is taking longer than I'd like to bother with for tests of 100000000 elements.)

For checking accuracy, I added checks that compare an arbitrary element of rData and res to ensure they're the same. I omitted the result here, as it should suffice to say that they were equal.

Now that the function is more complicated(two trigonometric functions being multiplied together), it appears that the CL kernel is much more efficient than the pure Java loop.

Question:

I am creating a voxel engine. I have created chunk generation in addition to some simple simplex noise integration but it is extremely laggy due to all of the face of each quad being drawn even the ones you can't see.

To my understanding this is commonly dealt with using ray casting, of which I understand the basic theory: you draw several rays from the camera and check for collision; if no collision is found, then the face is not within view and therefore should not be rendered. Even though I understand the theory of it all, I haven't yet been able to implement it due to a lack of prior knowledge, and because what I found on the internet was lacking — i.e., it gives the code but not the knowledge.

The steps I could imagine I need to take are as follows:

  1. Learn OpenCL (though I haven't used it before to my understanding it allows you to better make use of your graphics card by the use of 'kernels' which I mentally associate with OpenGL 'shaders').
  2. Learn the theory and math behind Ray casting. I have also have heard of ray tracing which I believe has a different use.
  3. Learn how to use this information to not render hidden faces. Assuming I get a working implementation how would I go about telling OpenGL not to render the hidden faces? The cube is one object and to the best of my knowledge there is no way to manipulate the faces of an object in OpenGL only the vertices. Also how would OpenCL communicate with OpenGL? OpenCL isn't a graphics api so it isn't capable of drawing the rays.

Could anyone point me in the right direction? I also believe that there are pure OpenGL implementations as well but I would like to keep the OpenCL aspect as this is a learning experience.


Answer:

I wouldn't recommend working with OpenCL or OpenGL in developing your first game, both will slow you down extraordinarily because each requires a different mindset. Well done though on getting as far as you have.

You mentioned that you are currently rendering all quads all the time and want to remove the hidden ones. I have written a voxel engine for practice too, ran into this issue, and spent a lot of time thinking about how to fix it. My solution was to not draw faces that are facing another voxel. Imagine two voxels next to each other: the two faces that are touching can't be seen and don't need to be rendered.

However, this will not make any difference if your method of talking with the GPU is the bottleneck. You will have to use buffered methods, I used Display Lists but it is also possible (but harder) to use VBOs.

I'd also recommend grouping large numbers of voxels into chunks for many reasons. Then you only need to recalculate the visible quads on the chunk that changed.

Regarding ray casting: if you adopt the chunk system I just described, calculating which entire chunks are visible will be easier. E.g. chunks behind the player don't need to be rendered, and that can be determined with just one dot-product calculation per chunk.

Question:

So I'm trying to start using LWJGL 3 and I want to make a simple window that draws 2 triangles with the vertices stored on the GPU. I keep getting this error when I run the code: A fatal error has been detected by the Java

Runtime Environment:
#
#  SIGSEGV (0xb) at pc=0x00007f92f01996ec, pid=4580, tid=0x00007f93664c4700
#
# JRE version: Java(TM) SE Runtime Environment (8.0_171-b11) (build 1.8.0_171-b11)
# Java VM: Java HotSpot(TM) 64-Bit Server VM (25.171-b11 mixed mode linux-amd64 compressed oops)
# Problematic frame:
# C  [liblwjgl_opengl.so+0x446ec]  Java_org_lwjgl_opengl_GL30_nglGenVertexArrays__IJ+0xc
#
# Failed to write core dump. Core dumps have been disabled. To enable core dumping, try "ulimit -c unlimited" before starting Java again

Here is my code

 /**
  * Main render loop: creates the GL capabilities for the current context,
  * then clears and redraws the triangle VAO every frame until the window
  * is closed.
  */
 private void loop() {
    // This line is critical for LWJGL's interoperation with GLFW's
    // OpenGL context, or any context that is managed externally.
    // LWJGL detects the context that is current in the current thread,
    // creates the GLCapabilities instance and makes the OpenGL
    // bindings available for use.
    // NOTE(review): any GL call made before this point (e.g. in
    // initTriangle()) will crash — see the accepted answer below.
    GL.createCapabilities();

    // Run the rendering loop until the user has attempted to close
    // the window or has pressed the ESCAPE key.
    while (!glfwWindowShouldClose(window)) {
        glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); // clear the framebuffer

        // Bind the VAO built in initTriangle() and enable attribute 0
        // (the vertex positions) for this draw call.
        glBindVertexArray(vaoId);
        glEnableVertexAttribArray(0);

        glDrawArrays(GL_TRIANGLES, 0, vertexCount);
        glDisableVertexAttribArray(0);
        glBindVertexArray(0);


        glfwSwapBuffers(window); // swap the color buffers
        // Poll for window events. The key callback above will only be
        // invoked during this call.
        glfwPollEvents();

    }
}

/**
 * Uploads the triangle vertex data into a VBO and records the attribute
 * layout in a VAO. Must be called AFTER GL.createCapabilities(), otherwise
 * the GL bindings are not yet available and glGenVertexArrays crashes.
 */
private void initTriangle() {

    // Off-heap buffer for the vertex data. MemoryUtil allocations are NOT
    // garbage collected and must be freed explicitly with memFree.
    FloatBuffer vertBuf = MemoryUtil.memAllocFloat(vertices.length);
    vertBuf.put(vertices);
    vertBuf.flip();

    vaoId = GL30.glGenVertexArrays();
    GL30.glBindVertexArray(vaoId);

    vboId = GL15.glGenBuffers();
    GL15.glBindBuffer(GL15.GL_ARRAY_BUFFER, vboId);
    GL15.glBufferData(GL15.GL_ARRAY_BUFFER, vertBuf, GL15.GL_STATIC_DRAW);

    // BUG FIX: glBufferData has copied the data to the GPU, so release the
    // native buffer here — the original version leaked this allocation.
    MemoryUtil.memFree(vertBuf);

    // Attribute 0: 3 floats per vertex, tightly packed, no offset.
    GL20.glVertexAttribPointer(0, 3, GL11.GL_FLOAT, false, 0, 0);

    // Unbind so later GL calls can't accidentally modify this state.
    GL15.glBindBuffer(GL15.GL_ARRAY_BUFFER, 0);
    GL30.glBindVertexArray(0);
}

I am calling initTriangle before the loop, and I get the error at vaoId = GL30.glGenVertexArrays();

anyone that could tell me what im doing wrong or lead me in the right direction, i would appreciate it.


Answer:

I called initTriangle before I created the GLCapabilities instance (GL.createCapabilities()), so the OpenGL bindings were not yet available when glGenVertexArrays was called.

Question:


Answer:

You've to create and use a Vertex Array Object.

The Vertex Array Object stores the specification of the arrays of generic vertex attribute data.

Create and bind the VAO when you specify the vertex arrays.e.g:

public class Mesh {

    private int vao;  // Vertex Array Object: stores the vertex attribute specification
    private int vbo;  // Vertex Buffer Object: holds the raw vertex data
    private int size; // number of vertices to draw (0 until data is uploaded elsewhere)

    /**
     * Creates the VAO and VBO and records the attribute layout: while the VAO
     * is bound, glVertexAttribPointer and glEnableVertexAttribArray store
     * their state into it, so it can be replayed with a single bind at draw time.
     */
    public Mesh() {
       this.size = 0;

       this.vao = glGenVertexArrays();
       this.vbo = glGenBuffers();

       glBindVertexArray(this.vao);

       // Attribute 0: 3 floats per vertex, tightly packed, no offset.
       // NOTE(review): the vertex data itself is presumably uploaded with
       // glBufferData in the omitted code below — confirm against the full class.
       glBindBuffer(GL_ARRAY_BUFFER, vbo);
       glVertexAttribPointer(0, 3, GL_FLOAT, false, 0, 0);
       glEnableVertexAttribArray(0);
    }

    // [...]
}

And use it when you draw the geometry:

public class Mesh {

    // [...]

    /**
     * Draws the mesh: binding the VAO restores the full attribute
     * specification recorded in the constructor, so no per-draw
     * glVertexAttribPointer calls are needed.
     */
    public void draw(){

        glBindVertexArray(this.vao);
        glDrawArrays(GL_TRIANGLES, 0, this.size);
        glBindVertexArray(0); // <--- Note, this is not needed
    }
}