For comparison, here is my Vector unit. It is not yet tested for accuracy. It compiles under both Delphi and FPC. Unfortunately, under FPC if you want operator overloads, you cannot use Delphi compatibility mode. Without Delphi compatibility mode, you do not have the implied Result variable in functions. Hence the old-school method of assigning to the function name. I used to do that all the time when I was first learning Pascal at uni (Mac Pascal on a Macintosh 128K), but since I've been using Delphi I've become so used to having the Result variable. You will note that there was one function (VectorCreate) where I had no choice but to use the Result variable in Delphi because it failed compilation using the function name.

[pascal]unit Vector;

interface

uses
Scalar, Matrix;

type
PVector = ^TVector;
TVector = record
case Integer of
0:
(
x: TScalar;
y: TScalar;
z: TScalar;
);
1:
(
v: array [0..3] of TScalar;
);
end;

{$IFDEF FPC}
operator + (const v1: TVector; const v2: TVector) Result: TVector;
operator - (const v1: TVector; const v2: TVector) Result: TVector;
operator * (const v1: TVector; const v2: TVector) Result: TVector;
operator * (const v1: TVector; s: TScalar) Result: TVector;
operator * (const v1: TVector; const m: TMatrix) Result: TVector;
operator / (const v1: TVector; s: TScalar) Result: TVector;
operator = (const v1: TVector; const v2: TVector) Result: Boolean;
{$ENDIF FPC}

function VectorAdd(const v1: TVector; const v2: TVector): TVector;
function VectorSubtract(const v1: TVector; const v2: TVector): TVector;
function VectorMultiply(const v1: TVector; const v2: TVector): TVector; overload;
function VectorMultiply(const v1: TVector; s: TScalar): TVector; overload;
function VectorDivide(const v1: TVector; s: TScalar): TVector;
function VectorEqual(const v1: TVector; const v2: TVector): Boolean;
function VectorDot(const v1: TVector; const v2: TVector): TScalar;
function VectorCross(const v1: TVector; const v2: TVector): TVector;
function VectorLength(const v1: TVector): TScalar;
function VectorLengthSq(const v1: TVector): TScalar;
function VectorInterpolate(const v1: TVector; const v2: TVector; t: TScalar): TVector;
procedure VectorNormalize(var v1: TVector);
function VectorNormal(const v1: TVector): TVector;
function VectorNormalTri(const v1: TVector; const v2: TVector; const v3: TVector): TVector;
function VectorAngle(const v1: TVector; const v2: TVector): TScalar;

function VectorCreate(const v1: TVector): TVector; overload;
function VectorCreate(x, y, z: TScalar): TVector; overload;
function VectorCreate(a: PScalar): TVector; overload;
procedure VectorToArray(const v1: TVector; a: PScalar);

const
NullVector: TVector = (x: 0.0; y: 0.0; z: 0.0);
FwdVector: TVector = (x: 0.0; y: 0.0; z: 1.0);
LeftVector: TVector = (x: 1.0; y: 0.0; z: 0.0);
UpVector: TVector = (x: 0.0; y: 1.0; z: 0.0);

implementation

{$IFDEF FPC}
operator + (const v1: TVector; const v2: TVector) Result: TVector;
begin
Result.x := v1.x + v2.x;
Result.y := v1.y + v2.y;
Result.z := v1.z + v2.z;
end;

operator - (const v1: TVector; const v2: TVector) Result: TVector;
begin
Result.x := v1.x - v2.x;
Result.y := v1.y - v2.y;
Result.z := v1.z - v2.z;
end;

operator * (const v1: TVector; const v2: TVector) Result: TVector;
begin
Result.x := v1.x * v2.x;
Result.y := v1.y * v2.y;
Result.z := v1.z * v2.z;
end;

operator * (const v1: TVector; s: TScalar) Result: TVector;
begin
Result.x := v1.x * s;
Result.y := v1.y * s;
Result.z := v1.z * s;
end;

operator * (const v1: TVector; const m: TMatrix) Result: TVector;
begin
Result.x := v1.x * m.m[0, 0] + v1.y * m.m[1, 0] + v1.z * m.m[2, 0] + m.m[3, 0];
Result.y := v1.x * m.m[0, 1] + v1.y * m.m[1, 1] + v1.z * m.m[2, 1] + m.m[3, 1];
Result.z := v1.x * m.m[0, 2] + v1.y * m.m[1, 2] + v1.z * m.m[2, 2] + m.m[3, 2];
end;

operator / (const v1: TVector; s: TScalar) Result: TVector;
var
r: TScalar;
begin
r := 1.0 / s;
Result.x := v1.x * r;
Result.y := v1.y * r;
Result.z := v1.z * r;
end;

operator = (const v1: TVector; const v2: TVector) Result: Boolean;
begin
Result := (v1.x = v2.x) and (v1.y = v2.y) and (v1.z = v2.z);
end;
{$ENDIF FPC}

function VectorAdd(const v1: TVector; const v2: TVector): TVector;
begin
VectorAdd.x := v1.x + v2.x;
VectorAdd.y := v1.y + v2.y;
VectorAdd.z := v1.z + v2.z;
end;

function VectorSubtract(const v1: TVector; const v2: TVector): TVector;
begin
VectorSubtract.x := v1.x - v2.x;
VectorSubtract.y := v1.y - v2.y;
VectorSubtract.z := v1.z - v2.z;
end;

function VectorMultiply(const v1: TVector; const v2: TVector): TVector;
begin
VectorMultiply.x := v1.x * v2.x;
VectorMultiply.y := v1.y * v2.y;
VectorMultiply.z := v1.z * v2.z;
end;

function VectorMultiply(const v1: TVector; s: TScalar): TVector;
begin
VectorMultiply.x := v1.x * s;
VectorMultiply.y := v1.y * s;
VectorMultiply.z := v1.z * s;
end;

function VectorDivide(const v1: TVector; s: TScalar): TVector;
var
r: TScalar;
begin
r := 1.0 / s;
VectorDivide.x := v1.x * r;
VectorDivide.y := v1.y * r;
VectorDivide.z := v1.z * r;
end;

function VectorEqual(const v1: TVector; const v2: TVector): Boolean;
begin
VectorEqual := (v1.x = v2.x) and (v1.y = v2.y) and (v1.z = v2.z);
end;

function VectorDot(const v1: TVector; const v2: TVector): TScalar;
begin
VectorDot := v1.x * v2.x + v1.y * v2.y + v1.z * v2.z;
end;

function VectorCross(const v1: TVector; const v2: TVector): TVector;
begin
VectorCross.x := v1.y * v2.z - v1.z * v2.y;
VectorCross.y := v1.z * v2.x - v1.x * v2.z;
VectorCross.z := v1.x * v2.y - v1.y * v2.x;
end;

function VectorLength(const v1: TVector): TScalar;
begin
VectorLength := Sqrt(v1.x * v1.x + v1.y * v1.y + v1.z * v1.z);
end;

function VectorLengthSq(const v1: TVector): TScalar;
begin
VectorLengthSq := v1.x * v1.x + v1.y * v1.y + v1.z * v1.z;
end;

function VectorInterpolate(const v1: TVector; const v2: TVector; t: TScalar): TVector;
begin
VectorInterpolate.x := v1.x + (v2.x - v1.x) * t;
VectorInterpolate.y := v1.y + (v2.y - v1.y) * t;
VectorInterpolate.z := v1.z + (v2.z - v1.z) * t;
end;

procedure VectorNormalize(var v1: TVector);
var
s: TScalar;
begin
s := VectorLength(v1);
if s > 0.0 then
begin
v1.x := v1.x * s;
v1.y := v1.y * s;
v1.z := v1.z * s;
end;
end;

function VectorNormal(const v1: TVector): TVector;
var
s: TScalar;
begin
s := VectorLength(v1);
if s > 0.0 then
begin
VectorNormal.x := v1.x * s;
VectorNormal.y := v1.y * s;
VectorNormal.z := v1.z * s;
end
else
begin
VectorNormal := NullVector;
end;
end;

function VectorNormalTri(const v1: TVector; const v2: TVector; const v3: TVector): TVector;
begin
VectorNormalTri := VectorNormal(VectorCross(VectorSubtract(v2, v1), VectorSubtract(v3, v1)));
end;

function VectorAngle(const v1: TVector; const v2: TVector): TScalar;
begin
VectorAngle := VectorDot(VectorNormal(v1), VectorNormal(v2));
end;

function VectorProject(const v1: TVector; const v2: TVector): TVector;
var
t: TVector;
begin
t := VectorNormal(v2);
VectorProject := VectorMultiply(t, VectorDot(v1, t));
end;

function VectorReflect(const v1: TVector; const n: TVector): TVector;
begin
if VectorAngle(v1, n) < 0.0 then
VectorReflect := VectorAdd(v1, VectorMultiply(VectorProject(v1, n), -2.0))
else
VectorReflect := v1;
end;

function VectorCreate(const v1: TVector): TVector;
begin
VectorCreate := v1;
end;

function VectorCreate(x, y, z: TScalar): TVector;
begin
VectorCreate.x := x;
VectorCreate.y := y;
VectorCreate.z := z;
end;

function VectorCreate(a: PScalar): TVector;
begin
{$IFDEF FPC}
Move(a^, VectorCreate, SizeOf(TVector));
{$ELSE}
Move(a^, Result, SizeOf(TVector));
{$ENDIF FPC}
end;

procedure VectorToArray(const v1: TVector; a: PScalar);
begin
Move(v1, a^, SizeOf(TVector));
end;

end.[/pascal]

I tried some timings using these functions. Note that they may be rough, but the more iterations reduces the effect of test setup overhead.

Code:
VectorCreate &#40;10 iterations&#41; = 7892 cycles &#40;789 cycles per iteration&#41;
VectorCreate &#40;100 iterations&#41; = 4040 cycles &#40;40 cycles per iteration&#41;
VectorCreate &#40;1000 iterations&#41; = 37200 cycles &#40;37 cycles per iteration&#41;
VectorCreate &#40;10000 iterations&#41; = 370192 cycles &#40;37 cycles per iteration&#41;
VectorLength &#40;10 iterations&#41; = 1100 cycles &#40;110 cycles per iteration&#41;
VectorLength &#40;100 iterations&#41; = 4940 cycles &#40;49 cycles per iteration&#41;
VectorLength &#40;1000 iterations&#41; = 45240 cycles &#40;45 cycles per iteration&#41;
VectorLength &#40;10000 iterations&#41; = 448244 cycles &#40;44 cycles per iteration&#41;
VectorNormalTri &#40;10 iterations&#41; = 3228 cycles &#40;322 cycles per iteration&#41;
VectorNormalTri &#40;100 iterations&#41; = 27268 cycles &#40;272 cycles per iteration&#41;
VectorNormalTri &#40;1000 iterations&#41; = 269816 cycles &#40;269 cycles per iteration&#41;
VectorNormalTri &#40;10000 iterations&#41; = 2695328 cycles &#40;269 cycles per iteration&#41;
I do not know how much effect Delphi's compiler optimizations may have had on these. I tried turning optimizations off, but that does not disable all optimizations.