This is based on opengl reference implementation, which has a super permissive license:
https://spdx.org/licenses/SGI-B-2.0.html

My implementations are a mess, some stuff might be missing (let mek now what, and i will provide).

Some parts are just chopped from a class i use that has a lot of nonsense, but the projection functions also have commented out portions to just use opengl calls to get some data.

This lets you use most often used functionality from glu, without actually linking to glu

Code:
// util, if anything is missing, please let me know

function MatrixInverseV2(var src: Tmatrix4f): boolean;
var
	i, j, k, swap: integer;
	t: double;
	src_access: array[0..16] of single absolute src;
    temp: array[0..3,0..3] of single;
    inverse: Tmatrix4f;
    inverse_access: array[0..16] of single absolute inverse;
begin

  result:= false;

  Move(src, temp, sizeof(src)); // faster

{  for i:=0 to 3 do begin
	  for j:= 0 to 3 do begin
	      temp[i][j] := src_access[i*4+j];
	  end;
  end;}

  inverse:= Identity;

  for i:= 0 to 3 do begin

	// Look for largest element in column

	swap := i;

	for j := i + 1 to 3 do begin
		if (fabs(temp[j][i]) > fabs(temp[i][i])) then begin
			swap := j;
		end;
	end;

	if (swap <> i) then begin
		// Swap rows.
		for k := 0 to 3 do begin
			t := temp[i][k];
			temp[i][k] := temp[swap][k];
			temp[swap][k] := t;

			t := inverse_access[i*4+k];
			inverse_access[i*4+k] := inverse_access[swap*4+k];
			inverse_access[swap*4+k] := t;
		end;
	end;

	if (temp[i][i] = 0) then begin
		// No non-zero pivot. The matrix is singular, which shouldn't happen. This means the user gave us a bad matrix.
        result:= false;
        exit;
	end;

	t := temp[i][i];

	for k := 0 to 3 do begin
		//temp[i][k] /= t;
		temp[i][k] := temp[i][k] / t;
		inverse_access[i*4+k]:= inverse_access[i*4+k] / t;
		//inverse_access[i*4+k] /= t;
	end;

	for j := 0 to 3 do begin
		if (j <> i) then begin
			t := temp[j][i];
			for k:= 0 to 3 do begin
				//temp[j][k] -= temp[i][k]*t;
				//inverse_access[j*4+k] -= inverse_access[i*4+k]*t;

				temp[j][k] := temp[j][k] - temp[i][k]*t;
				inverse_access[j*4+k] := inverse_access[j*4+k] - inverse_access[i*4+k]*t;

			end;
		end;
	end;
  end;

  result:= true;

  Move(inverse_access, src, sizeof(src));

end;

function MultMatrix(const matrixa, matrixb: TMatrix4f): TMatrix4f; // the result depends on proper parameter order, don't mix it up!
begin
  Result[0,0]:= matrixa[0,0] * matrixb[0,0] + matrixa[0,1] * matrixb[1,0] + matrixa[0,2] * matrixb[2,0] + matrixa[0,3] * matrixb[3,0];
  Result[0,1]:= matrixa[0,0] * matrixb[0,1] + matrixa[0,1] * matrixb[1,1] + matrixa[0,2] * matrixb[2,1] + matrixa[0,3] * matrixb[3,1];
  Result[0,2]:= matrixa[0,0] * matrixb[0,2] + matrixa[0,1] * matrixb[1,2] + matrixa[0,2] * matrixb[2,2] + matrixa[0,3] * matrixb[3,2];
  Result[0,3]:= matrixa[0,0] * matrixb[0,3] + matrixa[0,1] * matrixb[1,3] + matrixa[0,2] * matrixb[2,3] + matrixa[0,3] * matrixb[3,3];
  Result[1,0]:= matrixa[1,0] * matrixb[0,0] + matrixa[1,1] * matrixb[1,0] + matrixa[1,2] * matrixb[2,0] + matrixa[1,3] * matrixb[3,0];
  Result[1,1]:= matrixa[1,0] * matrixb[0,1] + matrixa[1,1] * matrixb[1,1] + matrixa[1,2] * matrixb[2,1] + matrixa[1,3] * matrixb[3,1];
  Result[1,2]:= matrixa[1,0] * matrixb[0,2] + matrixa[1,1] * matrixb[1,2] + matrixa[1,2] * matrixb[2,2] + matrixa[1,3] * matrixb[3,2];
  Result[1,3]:= matrixa[1,0] * matrixb[0,3] + matrixa[1,1] * matrixb[1,3] + matrixa[1,2] * matrixb[2,3] + matrixa[1,3] * matrixb[3,3];
  Result[2,0]:= matrixa[2,0] * matrixb[0,0] + matrixa[2,1] * matrixb[1,0] + matrixa[2,2] * matrixb[2,0] + matrixa[2,3] * matrixb[3,0];
  Result[2,1]:= matrixa[2,0] * matrixb[0,1] + matrixa[2,1] * matrixb[1,1] + matrixa[2,2] * matrixb[2,1] + matrixa[2,3] * matrixb[3,1];
  Result[2,2]:= matrixa[2,0] * matrixb[0,2] + matrixa[2,1] * matrixb[1,2] + matrixa[2,2] * matrixb[2,2] + matrixa[2,3] * matrixb[3,2];
  Result[2,3]:= matrixa[2,0] * matrixb[0,3] + matrixa[2,1] * matrixb[1,3] + matrixa[2,2] * matrixb[2,3] + matrixa[2,3] * matrixb[3,3];
  Result[3,0]:= matrixa[3,0] * matrixb[0,0] + matrixa[3,1] * matrixb[1,0] + matrixa[3,2] * matrixb[2,0] + matrixa[3,3] * matrixb[3,0];
  Result[3,1]:= matrixa[3,0] * matrixb[0,1] + matrixa[3,1] * matrixb[1,1] + matrixa[3,2] * matrixb[2,1] + matrixa[3,3] * matrixb[3,1];
  Result[3,2]:= matrixa[3,0] * matrixb[0,2] + matrixa[3,1] * matrixb[1,2] + matrixa[3,2] * matrixb[2,2] + matrixa[3,3] * matrixb[3,2];
  Result[3,3]:= matrixa[3,0] * matrixb[0,3] + matrixa[3,1] * matrixb[1,3] + matrixa[3,2] * matrixb[2,3] + matrixa[3,3] * matrixb[3,3];
end;

function MatrixTransformVector34(vec: Vector; m: Tmatrix4f): Vector4; overload;
begin

	Result.x := vec.x * m[0][0] + vec.y * m[1][0] + vec.z * m[2][0] + m[3][0];
	Result.y := vec.x * m[0][1] + vec.y * m[1][1] + vec.z * m[2][1] + m[3][1];
	Result.z := vec.x * m[0][2] + vec.y * m[1][2] + vec.z * m[2][2] + m[3][2];
	Result.w := vec.x * m[0][3] + vec.y * m[1][3] + vec.z * m[2][3] + m[3][3];

end;


function MatrixTransformVector44(vec: Vector4; m: Tmatrix4f): Vector4; overload;
begin

	Result.x := vec.x * m[0][0] + vec.y * m[1][0] + vec.z * m[2][0] + vec.w * m[3][0];
	Result.y := vec.x * m[0][1] + vec.y * m[1][1] + vec.z * m[2][1] + vec.w * m[3][1];
	Result.z := vec.x * m[0][2] + vec.y * m[1][2] + vec.z * m[2][2] + vec.w * m[3][2];
	Result.w := vec.x * m[0][3] + vec.y * m[1][3] + vec.z * m[2][3] + vec.w * m[3][3];

end;
Code:
procedure Tcamera.LookAt(eyex, eyey, eyez, centerx, centery, centerz, upx, upy, upz: single);
var
	forw, side, up: array[0..2] of single;
begin
	forw[0] := centerx - eyex;
	forw[1] := centery - eyey;
	forw[2] := centerz - eyez;

	up[0] := upx;
	up[1] := upy;
	up[2] := upz;

	normalize(forw);

	// Side:= forw x up
	crossp(forw, up, side);
	normalize(side);

	// Recompute up as: up:= side x forw
	crossp(side, forw, up);

	modelviewmatrix := Identity;
	modelviewmatrix[0][0] := side[0];
	modelviewmatrix[0][1] := side[1];
	modelviewmatrix[0][2] := side[2];

	modelviewmatrix[1][0] := up[0];
	modelviewmatrix[1][1] := up[1];
	modelviewmatrix[1][2] := up[2];

	modelviewmatrix[2][0] := -forw[0];
	modelviewmatrix[2][1] := -forw[1];
	modelviewmatrix[2][2] := -forw[2];

	modelviewmatrix[3][0] := -eyex;
	modelviewmatrix[3][1] := -eyey;
	modelviewmatrix[3][2] := -eyez;

	glMultMatrixf(@modelviewmatrix[0][0]);

	glGetFloatv(GL_PROJECTION_MATRIX, @projectionmatrix[0][0]);

end;
Code:
// this is gluPickingMatrix equivalent translated from ogl-sample.
procedure Tcamera.PickingMatrix(x, y, deltax, deltay: GLfloat;	viewport: array of integer);
begin
	if (deltax <= 0) and (deltay <= 0) then
		exit;

	// Translate and scale the picked region to the entire window
	glTranslatef( (viewport[2] - 2 * (x - viewport[0])) / deltax, (viewport[3] - 2 * (y - viewport[1])) / deltay, 0);

	glScalef(viewport[2] / deltax, viewport[3] / deltay, 1.0);
end;
Screen / world transformation (gluproject, gluunproject)

Code:
function Tcamera.worldtoscreen(location: Vector): Vector2;
var
	view, projection, tmp: TMatrix4f;
	screen: Vector4;
    viewport: array[0..3] of integer;
begin

	// our data (so this works even when switched to 2d gui projection)
  	view:= modelviewmatrix;
    projection:= projectionmatrix;

    viewport[0]:= 0;
    viewport[1]:= 0;
    viewport[2]:= engine.win_width;
    viewport[3]:= engine.win_height;

// alternative - works, gets opengl data:
// glGetIntegerv(GL_VIEWPORT, @viewport[0]); // returns viewport / window dimensions
//	glGetFloatv(GL_MODELVIEW_MATRIX, @view[0][0]);
//	glGetFloatv(GL_PROJECTION_MATRIX, @projection[0][0]);

	tmp:= MultMatrix(view, projection);

	screen:= MatrixTransformVector34(location, tmp); // MUST use vector4.

    result.x := (screen.x * 0.5 / screen.w + 0.5) * viewport[2]; // engine.win_width;
	result.y := ((-screen.y) * 0.5 / screen.w + 0.5) *  viewport[3]; //engine.win_height; // y is flipped since opengl considers screen space to start at bottom-left ang go up-right

end;

{
winX and winY will be the corners of your screen in pixels.
winZ is a number in [0,1] which will specify where between zNear and zFar (clipping planes) the points should fall.
objX-Z will hold the results.
The middle variables are the relevant matrices.
They can be queried if needed.
}

function Tcamera.screenToWorld(location: Vector): Vector;
var
	view, projection, tmp, finalMatrix: TMatrix4f;
    viewport: array[0..3] of integer;
    x, y, z: glDouble;
    in_: Vector4;
    depthrange: array[0..1] of single;
    far_, near_, clipw: single;
    out_: Vector4;
    depth_here, clip_z, world_z: single;
    oldbuff: integer;
begin

  clipw:= 1.0; // 1.0; // TODO: INPUT, but for vector3 it is 1

  // our data (so this works even when switched to 2d gui projection)
  view:= modelviewmatrix;
  projection:= projectionmatrix;

  viewport[0]:= 0;
  viewport[1]:= 0;
  viewport[2]:= engine.win_width;
  viewport[3]:= engine.win_height;

// alternative - works, gets opengl data:
// glGetIntegerv(GL_VIEWPORT, @viewport[0]); // returns viewport / window dimensions
//	glGetFloatv(GL_MODELVIEW_MATRIX, @view[0][0]);
//	glGetFloatv(GL_PROJECTION_MATRIX, @projection[0][0]);

	location.y:= viewport[3] - location.y; // since screen space in opengl starts at bottom-left and goes up-right.

    //glGetIntegerv(GL_READ_BUFFER, @oldbuff);

    glReadBuffer(GL_BACK); // use back buffer for speed?

    if location.z = infinite then begin

		glReadPixels( trunc(location.x), trunc(location.y), 1, 1, GL_DEPTH_COMPONENT, GL_FLOAT, @depth_here);
	    location.z:= depth_here;

	end;

    //glReadBuffer(oldbuff);

	glGetFloatv(GL_DEPTH_RANGE, @depthrange[0]);

	clip_z := (depth_here - 0.5) * 2.0;
	world_z:= 2*depthrange[1]*depthrange[0]/(clip_z*(depthrange[1]-depthrange[0])-(depthrange[1]+depthrange[0]));

	clipw:= 1.0;

	finalMatrix:= MultMatrix(view, projection);

    //if (!__gluInvertMatrixd(finalMatrix, finalMatrix)) return(GL_FALSE);
    MatrixInverseV2(finalMatrix);

	in_.components[0]:= location.x;
	in_.components[1]:= location.y;
	in_.components[2]:= location.z;
	in_.components[3]:= clipw;

	// Map x and y from window coordinates
	in_.components[0] := (in_.components[0] - viewport[0]) / viewport[2];
	in_.components[1] := (in_.components[1] - viewport[1]) / viewport[3];
//	in_.components[2] := (in_.components[2] - near_) / (far_ - near_); not for vector3

	// Map to range -1 to 1
	in_.components[0] := in_.components[0] * 2 - 1;
	in_.components[1] := in_.components[1] * 2 - 1;
	in_.components[2] := in_.components[2] * 2 - 1;

	out_:= MatrixTransformVector44(in_, finalMatrix);

	if (out_.components[3] = 0.0) then begin
    	result:= nullvector;
        exit;
    end;

    out_.components[0] := out_.components[0] / out_.components[3];
    out_.components[1] := out_.components[1] / out_.components[3];
    out_.components[2] := out_.components[2] / out_.components[3];

	Result:= makevector(out_.components[0], out_.components[1], out_.components[2]);

end;