I went ahead and did it, though this implementation is for 32-bit color and FPC only. It will probably port to Delphi or other compilers with very little effort. My CPU stays at 60% with an 800x600, 32-bit SDL window running at 60 fps. That could probably be sped up by moving the final render (the one onto the screen) to OpenGL.


(a screen shot of Cave Story run through the scaler.)

Code:
(*

   32-bit only scale2x for SDL; by James Hofmann 2007
   based on Pete Shinners' SDL scale2x.
   Doubles the size of a surface with a smoothing algorithm.
   This code is public domain.

*)

{$inline on}
{$mode objfpc}

unit scale2x32;

interface

uses sdl,sdl_video;

procedure scale2x(const src : PSDL_Surface; var dst : PSDL_Surface);

implementation

{ Returns the larger of two signed integers.

  The operands are deliberately signed: the border clamp in scale2x
  evaluates expressions such as max(0, index-1) with index = 0, which
  would wrap around instead of going negative if unsigned types were used. }
function max(const a,b : longint) : longint; inline;
begin
    result := b;
    if a > b then
        result := a;
end;

{ Returns the smaller of two signed integers.
  Used by scale2x to clamp neighbour coordinates at the right and
  bottom borders of the source surface. }
function min(const a,b : longint) : longint; inline;
begin
    if (a < b) then
        result := a
    else
        result := b;
end;

(*
  scale2x: writes a smoothed, 2x-enlarged copy of src into dst.

  Requirements on the caller:
    - both surfaces hold 32-bit pixels in the same format (the format
      is NOT checked here);
    - dst is already allocated and is at least twice as wide and twice
      as high as src.

  The procedure returns without writing anything when either surface
  or its pixel pointer is nil, or when dst is smaller than 2x src, so
  that a wrongly sized destination cannot cause out-of-bounds writes.

  NOTE(review): the surfaces are not locked here; confirm whether the
  callers need to lock them before calling.
*)

procedure scale2x(const src : PSDL_Surface; var dst : PSDL_Surface);
var
    srclinewidth : longint;
    dstlinewidth : longint;
    width : longint;
    height : longint;
    srcpix, dstpix, rowabove, rowcur, rowbelow, dsttop, dstbottom : ^longint;
    E0, E1, E2, E3, B, D, E, F, H : longint;
    looph, loopw : longint;
begin

    { refuse to work on missing surfaces or missing pixel data }
    if (src = nil) or (dst = nil) then exit;
    if (src^.pixels = nil) or (dst^.pixels = nil) then exit;

    width := src^.w;
    height := src^.h;

    { the destination must have room for a 2x2 cell per source pixel }
    if (dst^.w < width*2) or (dst^.h < height*2) then exit;

    { pitch is divided by the pixel size to get the row stride in pixels }
    srclinewidth := (src^.pitch) div sizeof(longword);
    dstlinewidth := (dst^.pitch) div sizeof(longword);

    srcpix := src^.pixels;
    dstpix := dst^.pixels;

    for looph := 0 to height-1 do
    begin

        { row pointers only depend on looph, so compute them once per
          row; rows above/below are clamped to the surface borders }
        rowabove := srcpix + srclinewidth * max(0, looph-1);
        rowcur := srcpix + srclinewidth * looph;
        rowbelow := srcpix + srclinewidth * min(height-1, looph+1);

        { each source row produces two destination rows }
        dsttop := dstpix + (looph*2) * dstlinewidth;
        dstbottom := dsttop + dstlinewidth;

        for loopw := 0 to width-1 do
        begin

            { from this grid we get the values BDEFH:

                    ABC
                    DEF
                    GHI

              and map them into a E0,E1,E2,E3:
                    E0 E1
                    E2 E3

              we must also account for the borders (using the pixels
              nearest to the border)
            }

            B := rowabove[loopw];
            D := rowcur[max(0, loopw-1)];
            E := rowcur[loopw];
            F := rowcur[min(width-1, loopw+1)];
            H := rowbelow[loopw];

            { determine smoothing result }

            if ((D = B) and (B <> F) and (D <> H)) then E0 := D else E0 := E;
            if ((B = F) and (B <> D) and (F <> H)) then E1 := F else E1 := E;
            if ((D = H) and (D <> B) and (H <> F)) then E2 := D else E2 := E;
            if ((H = F) and (D <> H) and (B <> F)) then E3 := F else E3 := E;

            { blit the doubled pixel }

            dsttop[loopw*2] := E0;
            dsttop[loopw*2 + 1] := E1;
            dstbottom[loopw*2] := E2;
            dstbottom[loopw*2 + 1] := E3;

        end;
    end;

end;

end.
edit: optimized to remove unnecessary pointer and temp-value assignments.