Opengl - get rid of glu dll [Archive] - Pascal Game Development

JernejL

10-03-2018, 10:04 AM

This is based on opengl reference implementation, which has a super permissive license:
https://spdx.org/licenses/SGI-B-2.0.html

My implementations are a mess, some stuff might be missing (let mek now what, and i will provide).

Some parts are just chopped from a class i use that has a lot of nonsense, but the projection functions also have commented out portions to just use opengl calls to get some data.

This lets you use most often used functionality from glu, without actually linking to glu :)

// util, if anything is missing, please let me know

function MatrixInverseV2(var src: Tmatrix4f): boolean;
var
i, j, k, swap: integer;
t: double;
src_access: array[0..16] of single absolute src;
temp: array[0..3,0..3] of single;
inverse: Tmatrix4f;
inverse_access: array[0..16] of single absolute inverse;
begin

result:= false;

Move(src, temp, sizeof(src)); // faster

{ for i:=0 to 3 do begin
for j:= 0 to 3 do begin
temp[i][j] := src_access[i*4+j];
end;
end;}

inverse:= Identity;

for i:= 0 to 3 do begin

// Look for largest element in column

swap := i;

for j := i + 1 to 3 do begin
if (fabs(temp[j][i]) > fabs(temp[i][i])) then begin
swap := j;
end;
end;

if (swap <> i) then begin
// Swap rows.
for k := 0 to 3 do begin
t := temp[i][k];
temp[i][k] := temp[swap][k];
temp[swap][k] := t;

t := inverse_access[i*4+k];
inverse_access[i*4+k] := inverse_access[swap*4+k];
inverse_access[swap*4+k] := t;
end;
end;

if (temp[i][i] = 0) then begin
// No non-zero pivot. The matrix is singular, which shouldn't happen. This means the user gave us a bad matrix.
result:= false;
exit;
end;

t := temp[i][i];

for k := 0 to 3 do begin
//temp[i][k] /= t;
temp[i][k] := temp[i][k] / t;
inverse_access[i*4+k]:= inverse_access[i*4+k] / t;
//inverse_access[i*4+k] /= t;
end;

for j := 0 to 3 do begin
if (j <> i) then begin
t := temp[j][i];
for k:= 0 to 3 do begin
//temp[j][k] -= temp[i][k]*t;
//inverse_access[j*4+k] -= inverse_access[i*4+k]*t;

temp[j][k] := temp[j][k] - temp[i][k]*t;
inverse_access[j*4+k] := inverse_access[j*4+k] - inverse_access[i*4+k]*t;

end;
end;
end;
end;

result:= true;

Move(inverse_access, src, sizeof(src));

end;

function MultMatrix(const matrixa, matrixb: TMatrix4f): TMatrix4f; // the result depends on proper parameter order, don't mix it up!
begin
Result[0,0]:= matrixa[0,0] * matrixb[0,0] + matrixa[0,1] * matrixb[1,0] + matrixa[0,2] * matrixb[2,0] + matrixa[0,3] * matrixb[3,0];
Result[0,1]:= matrixa[0,0] * matrixb[0,1] + matrixa[0,1] * matrixb[1,1] + matrixa[0,2] * matrixb[2,1] + matrixa[0,3] * matrixb[3,1];
Result[0,2]:= matrixa[0,0] * matrixb[0,2] + matrixa[0,1] * matrixb[1,2] + matrixa[0,2] * matrixb[2,2] + matrixa[0,3] * matrixb[3,2];
Result[0,3]:= matrixa[0,0] * matrixb[0,3] + matrixa[0,1] * matrixb[1,3] + matrixa[0,2] * matrixb[2,3] + matrixa[0,3] * matrixb[3,3];
Result[1,0]:= matrixa[1,0] * matrixb[0,0] + matrixa[1,1] * matrixb[1,0] + matrixa[1,2] * matrixb[2,0] + matrixa[1,3] * matrixb[3,0];
Result[1,1]:= matrixa[1,0] * matrixb[0,1] + matrixa[1,1] * matrixb[1,1] + matrixa[1,2] * matrixb[2,1] + matrixa[1,3] * matrixb[3,1];
Result[1,2]:= matrixa[1,0] * matrixb[0,2] + matrixa[1,1] * matrixb[1,2] + matrixa[1,2] * matrixb[2,2] + matrixa[1,3] * matrixb[3,2];
Result[1,3]:= matrixa[1,0] * matrixb[0,3] + matrixa[1,1] * matrixb[1,3] + matrixa[1,2] * matrixb[2,3] + matrixa[1,3] * matrixb[3,3];
Result[2,0]:= matrixa[2,0] * matrixb[0,0] + matrixa[2,1] * matrixb[1,0] + matrixa[2,2] * matrixb[2,0] + matrixa[2,3] * matrixb[3,0];
Result[2,1]:= matrixa[2,0] * matrixb[0,1] + matrixa[2,1] * matrixb[1,1] + matrixa[2,2] * matrixb[2,1] + matrixa[2,3] * matrixb[3,1];
Result[2,2]:= matrixa[2,0] * matrixb[0,2] + matrixa[2,1] * matrixb[1,2] + matrixa[2,2] * matrixb[2,2] + matrixa[2,3] * matrixb[3,2];
Result[2,3]:= matrixa[2,0] * matrixb[0,3] + matrixa[2,1] * matrixb[1,3] + matrixa[2,2] * matrixb[2,3] + matrixa[2,3] * matrixb[3,3];
Result[3,0]:= matrixa[3,0] * matrixb[0,0] + matrixa[3,1] * matrixb[1,0] + matrixa[3,2] * matrixb[2,0] + matrixa[3,3] * matrixb[3,0];
Result[3,1]:= matrixa[3,0] * matrixb[0,1] + matrixa[3,1] * matrixb[1,1] + matrixa[3,2] * matrixb[2,1] + matrixa[3,3] * matrixb[3,1];
Result[3,2]:= matrixa[3,0] * matrixb[0,2] + matrixa[3,1] * matrixb[1,2] + matrixa[3,2] * matrixb[2,2] + matrixa[3,3] * matrixb[3,2];
Result[3,3]:= matrixa[3,0] * matrixb[0,3] + matrixa[3,1] * matrixb[1,3] + matrixa[3,2] * matrixb[2,3] + matrixa[3,3] * matrixb[3,3];
end;

function MatrixTransformVector34(vec: Vector; m: Tmatrix4f): Vector4; overload;
begin

Result.x := vec.x * m[0][0] + vec.y * m[1][0] + vec.z * m[2][0] + m[3][0];
Result.y := vec.x * m[0][1] + vec.y * m[1][1] + vec.z * m[2][1] + m[3][1];
Result.z := vec.x * m[0][2] + vec.y * m[1][2] + vec.z * m[2][2] + m[3][2];
Result.w := vec.x * m[0][3] + vec.y * m[1][3] + vec.z * m[2][3] + m[3][3];

end;

function MatrixTransformVector44(vec: Vector4; m: Tmatrix4f): Vector4; overload;
begin

Result.x := vec.x * m[0][0] + vec.y * m[1][0] + vec.z * m[2][0] + vec.w * m[3][0];
Result.y := vec.x * m[0][1] + vec.y * m[1][1] + vec.z * m[2][1] + vec.w * m[3][1];
Result.z := vec.x * m[0][2] + vec.y * m[1][2] + vec.z * m[2][2] + vec.w * m[3][2];
Result.w := vec.x * m[0][3] + vec.y * m[1][3] + vec.z * m[2][3] + vec.w * m[3][3];

end;

procedure Tcamera.LookAt(eyex, eyey, eyez, centerx, centery, centerz, upx, upy, upz: single);
var
forw, side, up: array[0..2] of single;
begin
forw[0] := centerx - eyex;
forw[1] := centery - eyey;
forw[2] := centerz - eyez;

up[0] := upx;
up[1] := upy;
up[2] := upz;

normalize(forw);

// Side:= forw x up
crossp(forw, up, side);
normalize(side);

// Recompute up as: up:= side x forw
crossp(side, forw, up);

modelviewmatrix := Identity;
modelviewmatrix[0][0] := side[0];
modelviewmatrix[0][1] := side[1];
modelviewmatrix[0][2] := side[2];

modelviewmatrix[1][0] := up[0];
modelviewmatrix[1][1] := up[1];
modelviewmatrix[1][2] := up[2];

modelviewmatrix[2][0] := -forw[0];
modelviewmatrix[2][1] := -forw[1];
modelviewmatrix[2][2] := -forw[2];

modelviewmatrix[3][0] := -eyex;
modelviewmatrix[3][1] := -eyey;
modelviewmatrix[3][2] := -eyez;

glMultMatrixf(@modelviewmatrix[0][0]);

glGetFloatv(GL_PROJECTION_MATRIX, @projectionmatrix[0][0]);

end;

// this is gluPickingMatrix equivalent translated from ogl-sample.
procedure Tcamera.PickingMatrix(x, y, deltax, deltay: GLfloat; viewport: array of integer);
begin
if (deltax <= 0) and (deltay <= 0) then
exit;

// Translate and scale the picked region to the entire window
glTranslatef( (viewport[2] - 2 * (x - viewport[0])) / deltax, (viewport[3] - 2 * (y - viewport[1])) / deltay, 0);

glScalef(viewport[2] / deltax, viewport[3] / deltay, 1.0);
end;

Screen / world transformation (gluproject, gluunproject)

function Tcamera.worldtoscreen(location: Vector): Vector2;
var
view, projection, tmp: TMatrix4f;
screen: Vector4;
viewport: array[0..3] of integer;
begin

// our data (so this works even when switched to 2d gui projection)
view:= modelviewmatrix;
projection:= projectionmatrix;

viewport[0]:= 0;
viewport[1]:= 0;
viewport[2]:= engine.win_width;
viewport[3]:= engine.win_height;

// alternative - works, gets opengl data:
// glGetIntegerv(GL_VIEWPORT, @viewport[0]); // returns viewport / window dimensions
// glGetFloatv(GL_MODELVIEW_MATRIX, @view[0][0]);
// glGetFloatv(GL_PROJECTION_MATRIX, @projection[0][0]);

tmp:= MultMatrix(view, projection);

screen:= MatrixTransformVector34(location, tmp); // MUST use vector4.

result.x := (screen.x * 0.5 / screen.w + 0.5) * viewport[2]; // engine.win_width;
result.y := ((-screen.y) * 0.5 / screen.w + 0.5) * viewport[3]; //engine.win_height; // y is flipped since opengl considers screen space to start at bottom-left ang go up-right

end;

{
winX and winY will be the corners of your screen in pixels.
winZ is a number in [0,1] which will specify where between zNear and zFar (clipping planes) the points should fall.
objX-Z will hold the results.
The middle variables are the relevant matrices.
They can be queried if needed.
}

function Tcamera.screenToWorld(location: Vector): Vector;
var
view, projection, tmp, finalMatrix: TMatrix4f;
viewport: array[0..3] of integer;
x, y, z: glDouble;
in_: Vector4;
depthrange: array[0..1] of single;
far_, near_, clipw: single;
out_: Vector4;
depth_here, clip_z, world_z: single;
oldbuff: integer;
begin

clipw:= 1.0; // 1.0; // TODO: INPUT, but for vector3 it is 1

// our data (so this works even when switched to 2d gui projection)
view:= modelviewmatrix;
projection:= projectionmatrix;

viewport[0]:= 0;
viewport[1]:= 0;
viewport[2]:= engine.win_width;
viewport[3]:= engine.win_height;

// alternative - works, gets opengl data:
// glGetIntegerv(GL_VIEWPORT, @viewport[0]); // returns viewport / window dimensions
// glGetFloatv(GL_MODELVIEW_MATRIX, @view[0][0]);
// glGetFloatv(GL_PROJECTION_MATRIX, @projection[0][0]);

location.y:= viewport[3] - location.y; // since screen space in opengl starts at bottom-left and goes up-right.

//glGetIntegerv(GL_READ_BUFFER, @oldbuff);

glReadBuffer(GL_BACK); // use back buffer for speed?

if location.z = infinite then begin

glReadPixels( trunc(location.x), trunc(location.y), 1, 1, GL_DEPTH_COMPONENT, GL_FLOAT, @depth_here);
location.z:= depth_here;

end;

//glReadBuffer(oldbuff);

glGetFloatv(GL_DEPTH_RANGE, @depthrange[0]);

clip_z := (depth_here - 0.5) * 2.0;
world_z:= 2*depthrange[1]*depthrange[0]/(clip_z*(depthrange[1]-depthrange[0])-(depthrange[1]+depthrange[0]));

clipw:= 1.0;

finalMatrix:= MultMatrix(view, projection);

//if (!__gluInvertMatrixd(finalMatrix, finalMatrix)) return(GL_FALSE);
MatrixInverseV2(finalMatrix);

in_.components[0]:= location.x;
in_.components[1]:= location.y;
in_.components[2]:= location.z;
in_.components[3]:= clipw;

// Map x and y from window coordinates
in_.components[0] := (in_.components[0] - viewport[0]) / viewport[2];
in_.components[1] := (in_.components[1] - viewport[1]) / viewport[3];
// in_.components[2] := (in_.components[2] - near_) / (far_ - near_); not for vector3

// Map to range -1 to 1
in_.components[0] := in_.components[0] * 2 - 1;
in_.components[1] := in_.components[1] * 2 - 1;
in_.components[2] := in_.components[2] * 2 - 1;

out_:= MatrixTransformVector44(in_, finalMatrix);

if (out_.components[3] = 0.0) then begin
result:= nullvector;
exit;
end;

out_.components[0] := out_.components[0] / out_.components[3];
out_.components[1] := out_.components[1] / out_.components[3];
out_.components[2] := out_.components[2] / out_.components[3];

Result:= makevector(out_.components[0], out_.components[1], out_.components[2]);

end;

phibermon

11-03-2018, 04:11 PM

Thanks for sharing your code! :)

I wasn't aware of the 'absolute' keyword - you learn something new every day.

For that kind of access I either typecast or use the 'case' syntax with records (example here from Jink) :

TVec4 = packed record
case UInt8 of
0:(XYZ : TVec3);
1:(XY, ZW : TVec2);
2:(X,Y,Z,W : JFloat);
3:(V : packed array[0..3] of JFloat);
end;

TJMatrix = packed record
case UInt8 of
0:(_11, _12, _13, _14,
_21, _22, _23, _24,
_31, _32, _33, _34,
_41, _42, _43, _44 : JFloat);
1 : (M : packed array[0..15] of JFloat) ;
2 : (RC : packed array[0..3, 0..3] of JFloat);
3 : (Rows : packed array[0..3] of TVec4);
end;

I'll certainly make use of the 'absolute' keyword in future to make things a little more succinct :)

JernejL

12-03-2018, 10:04 AM

if it's a simple case like dealing with a variable in a single function like this, it makes things kinda easier, otherwise i make use of case within records too, everything has its purpose :)

Chebmaster

13-03-2018, 08:12 AM

I wasn't aware of the 'absolute' keyword
Be wary it doesn't always work. If you have register variables optimization on, the compiler sometimes gets confused.
Example: my attempt to implement Carmack's fast inverse square root.

function FastInverseSquareRoot(a: float): float; inline;
var
i: longint absolute Result;
begin
Result:= a;
i:= $5f3759df - (i shr 1);
Result*= 1.5 - (a * 0.5 * Result * Result);
Result*= 1.5 - (a * 0.5 * Result * Result);
end;
- it failed craptacularly. The compiler did not give an error but generated totally rubbish code seqence (I turned on ASM output to check).

So I had to settle on

// https://en.wikipedia.org/wiki/Fast_inverse_square_root
function FastInverseSquareRoot(a: float): float; inline;
var
i: longint;// absolute Result; //code generator FAILS to marry SSE2 and general-purpose registers
begin
//Result:= a;
i:= longint(pointer(@a)^);
i:= $5f3759df - (i shr 1);
Result:= float(pointer(@i)^);
Result*= 1.5 - (a * 0.5 * Result * Result);
Result*= 1.5 - (a * 0.5 * Result * Result);
end;

P.S. So I'd group absolute together with goto: necessary for backward compatibility but not recommended to use unless you have a very good understanding of processes involved.

JernejL

13-03-2018, 02:51 PM

I have my own variant of that:

function NSQRT(X: single): single; {$ifdef fpc} inline; {$endif}
var
XHalf: single;
I: integer Absolute X;
X2: single Absolute I;
XB: single;
begin
XB := X;
XHalf := 0.5 * X;
I := $5f375a86 - (I shr 1);
// ($5f375a86 for best range, $5F375A84 for whole fpu range, 5F3759DF for carmacks range)
X := X2 * (1.5 - XHalf * X2 * X2);
Result := XB * X;
end;

Perhaps not as optimal, but look at how 2 variables are on same absolute variable address, iirc this version works with optimizations on.

phibermon

14-03-2018, 02:06 AM

I'd not come across this approximation either - thanks very much for the info both! exactly the kind of esoteric little trick I love to explore :)

Reading up, it was apparently used before our lord and saviour Carmack did, as was the famous 'Carmack's Reverse' technique for shadow volumes.

But John is a master of applied mathematics in the field of 3D - had he been born 10 years earlier we'd probably of seen Doom on the Amiga at full speed because he found some trick you could pull with the floppy drive ;)

Chebmaster

15-03-2018, 12:49 PM

exactly the kind of esoteric little trick I love to explore
Just don't forget: on modern hardware this vintage method has the same speed(x86) or is three times *slower*(Raspberry Pi) than honest 1/sqrt(x) while the dedicated SSE instruction RSQRTPS leaves it in the dust, being more than 8 times faster (although it is not deterministic) because it does that trick *in hardware* operating on 4 floats in parallel.

See my research results @ http://openarena.ws/board/index.php?topic=5379.0

So, be wary of stale methodologies and remember: modern hardware demands different optimizations!

JernejL

17-03-2018, 08:11 AM

Just don't forget: on modern hardware this vintage method has the same speed(x86) or is three times *slower*(Raspberry Pi) than honest 1/sqrt(x) while the dedicated SSE instruction RSQRTPS leaves it in the dust, being more than 8 times faster (although it is not deterministic) because it does that trick *in hardware* operating on 4 floats in parallel.

See my research results @ http://openarena.ws/board/index.php?topic=5379.0

So, be wary of stale methodologies and remember: modern hardware demands different optimizations!

This is gold, i didn't even consider this, and yeah we did come into era of 8 and 16 core processor with massive IPCs and internal optimizations, i guess it is time to put some of these old methods to rest.