Results 1 to 9 of 9

Thread: ASM includes broken in fpc 3.0.4 / x86_64

  1. #1

    ASM includes broken in fpc 3.0.4 / x86_64

    Wait. I'm now not sure it wasn't MY fault.
    x86_64 asm is a veritable minefield for someone who learned x86 asm during DOS times. So many instructions that compile, seem to work, but fail with addresses above 4 GB, causing AVs or trashed memory.

    Will investigate in detail later.















    I will investigate it and file a bug report later, got no time now.

    But the fact is, ASM includes are broken in this combination.
    Everything works fine for i686; the same ASM blocks work fine with fpc 2.6.4 so it's not my fault (probably).

    Example of code that silently trashes memory:

    Code:
    function GetExceptionState(): PMotherSehState;
    {
       Returns the exception-state record whose ThreadId equals the calling
       thread's ID, searching the linked list that starts at
       Mother^.ExceptionState. If no such record exists, a new one is
       allocated, zero-filled, appended to the end of the list and returned.
       The whole search/append runs under Mother^.SehCriticalSection.

       Note: while it *can* allocate the block for a thread
       it knows nothing about, that is *not* a desired behavior.
    
       Thus, any thread better call this function beforehand,
       in the beginning of its Execute method.
    
       There is Mother^.Core.AllocateThreadExceptionState for the module
    }
    var
      T: TThreadId;
    begin
      EnterCriticalsection(Mother^.SehCriticalSection);
      T:= GetCurrentThreadId();
      // Start at the list head, which is embedded in the Mother record itself.
      Result:= @Mother^.ExceptionState;
      while true do begin
        // Found the record of the calling thread: unlock and return it.
        if Result^.ThreadId = T then begin
          LeaveCriticalsection(Mother^.SehCriticalSection);
          Exit;
        end;
        // Reached the tail without a match: append a fresh record.
        if not Assigned(Result^.Next) then begin
          VerboseLog('  Allocating exception state for thread ID=' + IntToHex(T, 8) + 'h...');
          Result^.Next:= new(PMotherSehState);
          Result:= Result^.Next;
          // Zeroing also leaves Next = nil, so the new record is the new tail.
          FillChar(Result^, SizeOf(Result^), 0);
          Result^.ThreadId:= T;
          // Mark the last slot of the events array as the terminator entry.
          Result^.events[High(Result^.events)].eType:= deke_Terminator;
        // x86_64 path. The $fatal guard aborts compilation for every compiler
        // version strictly between 2.6.4 and 3.99.99.
        {$ifdef CPUX86_64}
           {$if (FPC_FULLVERSION>20604) and (FPC_FULLVERSION<39999)}
             {$fatal This #@$# trashes memory. Use FPC 2.6.4 instead.}
           {$endif}
          asm
            mov rsi, qword[Result]  // rsi := pointer to the new record
            mov ax, ss              // ax := current stack segment selector
            mov word[rsi + TMotherSEHState.StackSegment], ax  // Result^.StackSegment := ax
          end['rax', 'rsi'];
        {$endif}
        // i386 path: same three steps with 32-bit registers.
        {$ifdef CPU386}
          asm
            mov esi, dword[Result]  // esi := pointer to the new record
            mov ax, ss              // ax := current stack segment selector
            mov word[esi + TMotherSEHState.StackSegment], ax  // Result^.StackSegment := ax
          end['eax', 'esi'];
        {$endif}
    
          LeaveCriticalsection(Mother^.SehCriticalSection);
          Exit;
        end;
        // Not this one and not the tail yet: advance to the next record.
        Result:= Result^.Next;
      end;
    end;
    -- it seems to work, but then the subsequent exception processing crashes with "exception state not found for thread <ID>". Commenting the ASM block out fixes that.

    where
    Code:
      PMother = ^TMother;
      // Central record aggregating the engine's subsystem records.
      // "maybepacked" is a macro that expands either to "packed" or to
      // nothing, depending on FPC_REQUIRES_PROPER_ALIGNMENT.
      TMother = maybepacked record
        Initialized,
        Safe: boolean;
        Validate: function(sizes: array of const): boolean; cdecl;
        Criticalsection,
        SehCriticalSection: TRTLCriticalSection; // guards the ExceptionState chain in GetExceptionState
        State: TMotherState_;
        Core: TMotherCore;
        ExceptionState
          : TMotherSehState; //contains a pointer-chain of states for other threads
        CPU: TMotherCPU;
        Timer: TMotherTimer;
        Config: TMotherConfig;
        AssetKeeper: TMotherAssetKeeper;
        Memory: TMotherMemory;
        Stream: TMotherStream_;
        Module: TMotherModule_;
        GAPI: TMotherGAPI;
        Sound: TMotherSound;
        Text: TMotherText;
        Display: TMotherDisplay;
        Image: TMotherImaging;
    	  Input: TMotherInput_;
        Gamepad: TMotherGamepad;
        PenTablet: TMotherPenTablet;
        // Kernel builds carry the lockup guard; other builds carry a
        // pointer-sized placeholder instead.
        // NOTE(review): presumably meant to keep both layouts the same size -
        // confirm SizeOf(TLockupGuard) = SizeOf(ptruint).
        {$ifdef cgekernel}
         LockupGuard: TLockupGuard;
        {$else}
         stub1: ptruint;
        {$endif}
        Debug: TMotherDebug;
        ReservedStub: array[0..10000] of ptruint; // 10001 pointer-sized reserved slots
      end;
    where
    Code:
      PMotherSehState = ^TMotherSEHState;
      // Per-thread exception state. Records form a singly linked list through
      // Next; GetExceptionState looks a record up by ThreadId and appends a new
      // one when none matches.
      TMotherSEHState = maybepacked record
        // see cge.ResetMotherErrorState() and cl_die.GetExceptionState()
    
        ThreadId: TThreadId; // ID of the owning thread; the lookup key in GetExceptionState
    
        ExceptionCode, //true exception codes in Windows, not used in Linux (see cl_seh_hack.inc)
        ExceptionAddress: ptruint;
        StackFrameAddress: ptruint; //NOT used //used to restore rbp
    
        NowDying,
        DyingAfterTrueException, {if the exception handling chain was set off
           by a real exception like an AV and not by a controlled call to Die() }
        CallStackLogged,
        ModuleIsAlreadyProcessingUnhandledException,
        IsModuleThread, {The current thread is created by the module and should be
          processed differently}
        ThreadFinished
          : boolean;
    
        AbbrTitleForIndicator: ShortString3;
        StackSegment: Word; // written from the SS register by the asm in GetExceptionState (x86 / x86_64)
        ThreadTitle: {$ifdef cgekernel} UnicodeString {$else} PUnicodeChar {$endif};
    
        AVMemoryAddress,
        AVMemoryOperation: ptruint; //0 - read, 1 - write, 8-Data Execution Prevention. See MSDN.
    
        //thread load details are stored here: I did not find a better place
        tlic: TThreadLoadKind; // used as the index into tli
        tlitsc: {$ifdef cpuarm} timeval {$else} qword {$endif}; // presumably the last sampled timestamp - confirm against the sampling code
        tli: array [TThreadLoadKind] of int64; //accumulates RDTSC cycles
    
        ReservedStub: array[0..3] of ptruint;
    
        Next: PMotherSehState; // next record in the chain; unassigned (nil) on the last one
    
        events: array[0..DyingEventsLimit] of TDyingEventRec; //approximately 60 kilobytes
      end;
    where

    Code:
    // Defines the "maybepacked" macro used in the record declarations:
    // it expands to nothing (with 4-byte record packing) on targets that
    // require proper alignment, and to "packed" everywhere else.
    {$ifdef FPC_REQUIRES_PROPER_ALIGNMENT}
      // arm on Raspberry Pi
      {$define maybepacked:=}
      {$packrecords 4}
    {$else}
      {$define maybepacked:=packed}
    {$endif}
    Last edited by Chebmaster; 07-09-2018 at 07:52 AM.

  2. #2
    This Cheb is stumped.

    2.6.4:
    Code:
    ; [37] Result^.events[High(Result^.events)].eType:= deke_Terminator;
    		mov	rax,qword ptr [rbp-8]
    ; Register raxreleased
    		mov	dword ptr [rax+59118],1
    ; Register raxallocated
    ; Register rsiallocated
    @@l3527:
    ; [43] mov rsi, qword[Result]
    		mov	rsi,qword ptr [rbp-8]
    @@l3528:
    ; [44] mov ax, ss
    		mov	ax,ss
    @@l3529:
    ; [45] mov word[rsi + TMotherSEHState.StackSegment], ax
    		mov	word ptr [rsi+60],ax
    ; Register raxreleased
    ; Register rsireleased
    ; Register raxallocated
    @@l3530:
    ; [56] LeaveCriticalsection(Mother^.SehCriticalSection);
    		mov	rax,qword ptr [U_CL_CGE_MOTHER+$]
    3.0.4:
    Code:
    # [37] Result^.events[High(Result^.events)].eType:= deke_Terminator;
    	movq	-8(%rbp),%rax
    	movl	$1,59118(%rax)
    	# Register rax released
    	# Register rax,rsi allocated
    .Ll4092:
    # [43] mov rsi, qword[Result]
    	movq	-8(%rbp),%rsi
    .Ll4093:
    # [44] mov ax, ss
    	movw	%ss,%ax
    .Ll4094:
    # [45] mov word[rsi + TMotherSEHState.StackSegment], ax
    	movw	%ax,60(%rsi)
    	# Register rsi released
    	# Register rax allocated
    .Ll4095:
    # [56] LeaveCriticalsection(Mother^.SehCriticalSection);
    	movq	U_$CL_CGE_$$_MOTHER(%rip),%rax
    These look the same to me.

  3. #3
    Did you figure it out? If you can glue together some test program that fails, I can take a look.

  4. #4
    Sorry, I have postponed investigating this problem until much later, when I have the 32-bit version of my engine running many real tasks without glitching or crashing.

    Add to that I install my own SEH handler for Win32, which I had to change to VEH handler for Win64...
    Add to that that I use pointer arithmetic in many places and have since weeded out several coding horrors via a simple search for the "cardinal" keyword...
    No, this was probably stupid me crying wolf because of trashed memory elsewhere

  5. #5
    No need to apologize, glad you managed to fix the issue - one sad panda less in the world

  6. #6
    PGDCE Developer de_jean_7777's Avatar
    Join Date
    Nov 2006
    Location
    Bosnia and Herzegovina (Herzegovina)
    Posts
    287
    Quote Originally Posted by Chebmaster View Post
    No, this was probably stupid me crying wolf because of trashed memory elsewhere
    Too often this is the case for me too, where weird behavior is because I trashed memory due to the hacks I employ. A lot less of these nowadays, but still can cause a lot of hours wasted because of memory mismanagement
    Existence is pain

  7. #7
    So I've found *yet another* chunk of assembly code I botched while porting from i386 to x86_64. It compiles and works in 64-bit compatibility mode (all addresses limited to the lower 4 GB; that mode is only generated by fpc 2.6.4 when you set debugging info to stabs) but happily trashes memory in true 64-bit addressing mode (because, as I was informed, most MOV variations silently clip the address to its lower 32 bits unless the register you move to/from memory is RAX/EAX).
    Namely, instead of
    Code:
          asm
            // Adds the TSC cycles elapsed since the stored tlitsc value to
            // tli[tlic] and stores the fresh TSC reading into tlitsc.
            // Every memory load/store of data goes through RAX/EAX.
            // NOTE(review): pes is declared outside this snippet - presumably
            // a PMotherSehState; confirm.
            mov rax, [pes]
            mov rbx, rax            // rbx := record base address
            mov rax, qword[rbx + TMotherSEHState.tlitsc]
            mov rcx, rax            // rcx := previous timestamp
            xor rax, rax
            rdtsc                   // edx:eax := time-stamp counter
            shl rdx, 32
            or rax, rdx             // rax := full 64-bit timestamp
            mov qword[rbx + TMotherSEHState.tlitsc], rax;
            sub rax, rcx            // rax := cycles since previous timestamp
            mov rdx, rax            // park the delta while rax is reused for the load below
            xor rax, rax
            // NOTE(review): reads tlic as 32 bits - assumes TThreadLoadKind
            // occupies 4 bytes in the record; confirm.
            mov eax, dword[rbx + TMotherSEHState.tlic]
            mov ecx, eax            // writing ecx zero-extends into rcx: rcx := index
            mov rax, rdx            // rax := delta again
            add qword[rbx + rcx * 8 + TMotherSEHState.tli], rax  // tli[tlic] += delta
          end ['rax', 'rcx', 'rdx', 'rbx'];
    I had this heresy:
    Code:
              asm
                // Adds the TSC cycles elapsed since the stored tlitsc value
                // to tli[tlic] and stores the fresh TSC reading into tlitsc,
                // using RBX/RCX/RDX directly for the memory operands.
                // NOTE(review): pes is declared outside this snippet -
                // presumably a PMotherSehState; confirm.
                mov rbx, [pes]          // rbx := record base address
                mov rcx, qword[rbx + TMotherSEHState.tlitsc]  // rcx := previous timestamp
                mov eax, 0
                rdtsc                   // edx:eax := time-stamp counter
                // Store the new timestamp as two 32-bit halves (low, then high).
                mov dword[rbx + TMotherSEHState.tlitsc], eax;
                mov dword[rbx + 4 + TMotherSEHState.tlitsc], edx;
                shl rdx, 32
                add rdx, rax            // rdx := full 64-bit timestamp
                sub rdx, rcx            // rdx := cycles since previous timestamp
                xor rcx, rcx
                // NOTE(review): reads tlic as 32 bits - assumes TThreadLoadKind
                // occupies 4 bytes in the record; confirm.
                mov ecx, dword[rbx + TMotherSEHState.tlic]
                add qword[rbx + rcx * 8 + TMotherSEHState.tli], rdx  // tli[tlic] += delta
              end ['rax', 'rcx', 'rdx', 'rbx'];
    I hope this compiles and works when my project compiles again.

  8. #8
    Hi, did it work?
    Last edited by davido; 28-08-2024 at 04:20 PM.

  9. #9
    Cannot say yet because I have frozen work on both Linux and x86_64 until I finish overhauling both my ODBMS and GUI systems and get a perfectly working test using fpc 2.6.4 for Win32.
    Also, I need FPC 3.2 released before trying x86_64 again: 3.0.x is lacking some much needed features so I cannot debug using my tools.

Bookmarks

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •