In reversed order:
You want something like this: correction = unsigned 16-bit value, calculation =
saturate8( src[x] * corr[x] shr SHIFTBITS )? I would rather avoid the int -> floating point -> int conversion, and the intermediate saturation is unnecessary IMHO - I assume that w2 can be bigger than a word, because it's saturated to 8 bits later anyway, so it shouldn't matter.
The width must be divisible by 8 for the sse (sse2 actually) version.

[pascal]
const
// Number of bits the product src * corr is shifted right before it is clipped
// to 8 bits; a corr value of (1 shl SHIFTBITS) therefore leaves a pixel unchanged.
SHIFTBITS = 9; //just an example

{ Scalar reference implementation of the per-column correction.

  For every pixel of a width x height 8-bit image (rows stored back to back,
  stride = width) it computes
      dest := min(255, (src * corr[column]) shr SHIFTBITS)
  src, dest : first byte of the input / output image.
  corr      : one 16-bit unsigned factor per column (width entries), shared by
              all rows.
  The 32-bit product is deliberately NOT saturated to 16 bits before the
  shift; only the final result is clipped to a byte. }
procedure correction_pas(src, dest: pbyte; width, height: integer; corr: pword);
var
  row, col: integer;
  rowIn, rowOut: pbyte;
  scaled: longword;
begin
  rowIn := src;
  rowOut := dest;
  for row := 0 to height - 1 do
  begin
    for col := 0 to width - 1 do
    begin
      // multiply first, then shift: "*" and "shr" share precedence, left to right
      scaled := (rowIn[col] * corr[col]) shr SHIFTBITS;
      if scaled > 255 then
        rowOut[col] := 255
      else
        rowOut[col] := scaled;
    end;
    // step both cursors to the start of the next row
    Inc(rowIn, width);
    Inc(rowOut, width);
  end;
end;


//unpack, no intermed. saturation
{ SSE2 implementation of the per-column correction (i386, Intel asm syntax).

  For every pixel of a width x height 8-bit image (rows stored back to back,
  stride = width) it computes
      dest := min(255, (src * corr[column]) shr SHIFTBITS)
  src, dest : first byte of the input / output image.
  corr      : one 16-bit UNSIGNED factor per column (width entries), shared by
              all rows; it is read with unaligned loads.

  Eight pixels are processed per asm iteration. Any remaining width mod 8
  pixels of a row are handled by a scalar loop, and the asm is skipped
  entirely when a row holds fewer than eight pixels. }
procedure correction_sse(src, dest: pbyte; width, height: integer; corr: pword);
var
  i, x, blocks, tailStart: integer;
  v: longword;
begin
  blocks := width div 8;      // number of full 8-pixel groups per row
  tailStart := blocks * 8;    // first column not covered by the asm loop
  for i := 0 to height - 1 do begin
    // Guard: with zero blocks the dec/jnz loop below would wrap around and
    // run ~2^32 iterations.
    if blocks > 0 then begin
      asm
        mov eax, src
        mov edx, dest
        mov ebx, corr
        mov ecx, blocks
        pxor xmm7, xmm7 // 0

      @loop_x:
        movq xmm0, [eax] // load 8 bytes = pixels
        punpcklbw xmm0, xmm7 // unpack to 8 words
        movdqa xmm1, xmm0 // duplicate

        movdqu xmm6, [ebx] // load 8 words = light/correction
        pmullw xmm0, xmm6 // 8 low words of the 32-bit products
        pmulhuw xmm1, xmm6 // 8 high words, UNSIGNED (pmulhw would misread corr >= $8000 as negative)
        movdqa xmm2, xmm0 // keep the low words for the upper four products

        punpcklwd xmm0, xmm1 // interleave low/high words -> products 0..3 as dwords
        punpckhwd xmm2, xmm1 // same for products 4..7 (xmm1 is only read above)

        psrld xmm0, SHIFTBITS // logical right shift of 4 dwords
        psrld xmm2, SHIFTBITS // same

        packssdw xmm0, xmm2 // 8 dwords to 8 words; inputs are non-negative, values > 32767 clamp and end up as 255 anyway
        packuswb xmm0, xmm7 // pack 8 words to 8 bytes with unsigned saturation

        movq [edx], xmm0 // store 8 bytes = pixels
        add eax, 8
        add edx, 8
        add ebx, 16

        dec ecx
        jnz @loop_x
      end['eax', 'ebx', 'ecx', 'edx'];
    end;

    // Scalar tail: the last width mod 8 pixels of the row.
    for x := tailStart to width - 1 do begin
      v := (src[x] * corr[x]) shr SHIFTBITS;
      if v > 255 then
        dest[x] := 255
      else
        dest[x] := v;
    end;

    // advance to the next row; corr is reused from its start for every row
    Inc(src, width);
    Inc(dest, width);
  end;
end;
[/pascal]

For tutorials: the Intel developer manuals should be helpful, for example the "Intel 64 and IA-32 Architectures Optimization Reference Manual", or try this webpage: http://webster.cs.ucr.edu/AoA/Window...2.html#1004358 . Drawing the computations and data flow on a sheet of paper is also often very useful. I learned most things from experimenting. Documents with MMX/SSE instructions and their descriptions are very handy, too - for example "AMD64 Architecture Programmer’s Manual Volume 4: 128-Bit Media Instructions" or the nasm 0.99.x manual with its x86 instruction listing (up to SSE2).