Convolution filter with x86 and SSE2 assembly

BugInTheSYS · Post by **BugInTheSYS** » Sun Apr 28, 2013 5:24 am

I just worked out a little program that applies a 5x5 convolution filter onto a GDI bitmap in the 32-bit per pixel ARGB format. I'm posting it for anyone who's trying to get into ASM or anyone who wants to give me hints about performance improvements and etc.

This is written for Microsoft Visual C++ and is meant to operate with GDI bitmaps in .NET programs, where Scan0 points to the beginning of the bitmap pixel array, Stride is the amount of bytes in a line of the bitmap, Width and Height refer to the dimensions of the bitmap.
These values can be obtained in the BitmapData structure returned from .LockBits() in the System.Drawing.Bitmap class in C# and Visual Basic.

To include the algorithm in a C# program, you will need to use P/Invoke syntax, like this:

[System.Runtime.InteropServices.DllImport("DoConvolution.dll", CallingConvention = System.Runtime.InteropServices.CallingConvention.Cdecl, EntryPoint = "Convolute")]
public static extern void CConvolution(IntPtr Scan0, int Stride, int Width, int Height);

And now for the code (I set the syntax to MASM because that's what most of the code consists of. The rest is C.)

I'm heading out so I hope I haven't forgotten to tell you anything, always ask.

Edit: Spoilered the outdated code so it doesn't occupy half your screen when you're scrolling in this topic. Scroll down for a new version.

Click here to see the hidden message (It might contain spoilers)

extern "C" _declspec( dllexport ) void Convolute( char *Scan0, int Stride, int Width, int Height ) {
#define MULTIPLIER 1.0f / 256.0f
#define REPEAT_FOUR_TIMES( X ) X, X, X, X
static float Kernel[5][20] = {	{ REPEAT_FOUR_TIMES( 1.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 4.0f * MULTIPLIER  ), REPEAT_FOUR_TIMES( 6.0f * MULTIPLIER  ), REPEAT_FOUR_TIMES( 4.0f * MULTIPLIER  ), REPEAT_FOUR_TIMES( 1.0f * MULTIPLIER ) },
									{ REPEAT_FOUR_TIMES( 4.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 16.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 24.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 16.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 4.0f * MULTIPLIER ) },
									{ REPEAT_FOUR_TIMES( 6.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 24.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 36.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 24.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 6.0f * MULTIPLIER ) },
									{ REPEAT_FOUR_TIMES( 4.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 16.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 24.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 16.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 4.0f * MULTIPLIER ) },
									{ REPEAT_FOUR_TIMES( 1.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 4.0f * MULTIPLIER  ), REPEAT_FOUR_TIMES( 6.0f * MULTIPLIER  ), REPEAT_FOUR_TIMES( 4.0f * MULTIPLIER  ), REPEAT_FOUR_TIMES( 1.0f * MULTIPLIER ) } };

void *KernelPtr = (void*)Kernel;
__asm {
 jmp asml_convolute ; Start of the algorithm is behind that label, go down a couple of lines

asml_roix_loopbody_conditional:
 ; This code checks whether the current roi-x and roi-y refer to pixels that are
 ; inside the boundaries of the GDI bitmap, and if not, jumps back to asml_roix_loopbody_conditional_false
 mov edx, dword ptr ss:[esp] ;PixelX
 add edx, ecx ;RoiX
 sub edx, 2h
 cmp edx, 0
 jl asml_roix_loopbody_conditional_false
 cmp edx, Width
 jge asml_roix_loopbody_conditional_false
 mov edx, dword ptr ss:[esp + 4] ;PixelY
 add edx, ebx ;RoiY
 sub edx, 2h
 cmp edx, 0
 jl asml_roix_loopbody_conditional_false
 cmp edx, Height
 jge asml_roix_loopbody_conditional_false ; CONDITION is now ASSERTED
 
 ; This code calculates Scan0 + ( PixelY + RoiY - 2 ) * Stride + ( PixelX + RoiX - 2 ) * 4
 ; Which is the pointer to the current bitmap pixel we're working on
 mov edx, Scan0
 mov eax, dword ptr ss:[esp + 4] ;PixelY
 add eax, ebx ;RoiY
 sub eax, 2h
 imul eax, Stride
 add edx, eax
 mov eax, dword ptr ss:[esp] ;PixelX
 add eax, ecx ;RoiX
 sub eax, 2h
 imul eax, 4h
 add edx, eax ; Pixel Pointer is now stored in EDX

 ; Read the color from the bitmap pixel pointer now in EDX
 pxor mm0, mm0
 pxor xmm1, xmm1
 movd xmm2, [edx] ; Load color (32 bits) from memory into register
 punpcklbw xmm2, xmm1 ; Expand the 32-bit color consisting of 8-bit channel values 
 punpcklwd xmm2, xmm1 ; to a 128-bit color of 32-bit channel values
 cvtdq2ps xmm1, xmm2 ; Convert the 32-bit integer channel values to 32-bit (single-precision) floats
 ; Color read from pixel pointer and converted to floats is now stored in xmm1

 ; Get pointer to value from the Kernel
 ; Calculate KernelPtr + RoiY * (Kernel array stride) + RoiX * (Kernel element width (in this case 16, 
 ; because there are 5 * (4 repeated floats) per line))
 ; The kernel value we need is the one that tells us how much influence the current ROI pixel will have
 ; in the calculation of the color for the current bitmap pixel
 mov eax, KernelPtr
 mov edx, ebx ;RoiY
 imul edx, 50h
 add eax, edx
 mov edx, ecx ;RoiX
 imul edx, 10h
 add eax, edx ; 

 ; Load Kernel value pointer into xmm2
 movups xmm2, [eax]
 mulps xmm1, xmm2 ; Kernel-weighted color is now in xmm1
 addps xmm0, xmm1 ; Kernel-weighted color is now added to the current accumulator color value in xmm0
 jmp asml_roix_loopbody_return

asml_convolute:
 ; START
 ; Begin a loop (PIXEL-Y) that iterates over the height of the bitmap (from Height - 1 down to 0)
 mov ecx, Height
 sub ecx, 1
asml_pixy_loopbegin:
 push ecx ; Push PixelY onto the stack

 ; PIXEL-Y LOOP BODY
 ; Begin a loop (PIXEL-X) that iterates over the width of the bitmap (from Width - 1 down to 0)
 mov ecx, Width
 sub ecx, 1
asml_pixx_loopbegin:
 push ecx ; Push PixelX onto the stack

 ; PIXEL-X LOOP BODY

 ; set xmm0 to zero (xmm0 always stores the "Current Color", 
 ; the color that is accumulated through the convolution algorithm and later written to the bitmap)
 pxor xmm0, xmm0

 ; The loop over the y-axis of the Region of Interest (from two pixels below the current PIXEL-Y to two pixels above it
 ; goes from 4 down to 0 to access the Kernel values)
 ; The current roi-y is permanently stored in EBX until the loop ends
 mov ebx, 4h
asml_roiy_loopbegin:
 
 ; ROI-Y LOOP BODY

 ; The loop over the x-axis of the region of interest
 ; The current roi-x is permanently stored in ECX until the loop ends
 mov ecx, 4h
asml_roix_loopbegin:
 ;push ecx ; Push RoiX

 ; ROIX LOOP BODY
 jmp asml_roix_loopbody_conditional ; Go up to the beginning of the asm code
asml_roix_loopbody_return:
asml_roix_loopbody_conditional_false:

 ; ROIX LOOP BODY END
 dec ecx
 cmp ecx, 0
 jge asml_roix_loopbegin
 
 ; ROIY LOOP BODY END
 ;pop ecx
 dec ebx
 cmp ebx, 0
 jge asml_roiy_loopbegin
 ;asml_roiy_loop_end:
 
 ; Calculate the pointer of the current pixel in the bitmap (x = PixelX, y = PixelY, which are on the stack)
 mov eax, Scan0
 mov ebx, dword ptr ss:[esp + 4] ; PixelY
 imul ebx, Stride
 add eax, ebx
 mov ebx, dword ptr ss:[esp] ; PixelX
 imul ebx, 4
 add eax, ebx ; Dest ptr is now in eax

 ; Copy calculated color into bitmap
 cvtps2dq xmm1, xmm0
 packssdw xmm2, xmm1
 psrldq xmm2, 8
 movdq2q mm1, xmm2
 packsswb mm0, mm1
 psrlq mm0, 20h
 movd [eax], mm0

 ;move on with the next pixel

 ; PIXELX LOOP BODY END
 pop ecx ; Pull PixelX from the stack, increment, jump to beginning
 dec ecx
 cmp ecx, 0
 jge asml_pixx_loopbegin
 
 ; PIXELY LOOP BODY END
 pop ecx ; Pull PixelY from the stack, increment, jump to beginning
 dec ecx
 cmp ecx, 0
 jge asml_pixy_loopbegin
 }
}

Post by **Falco Girgis** » Sun Apr 28, 2013 11:36 am

Damn, that's really fucking cool. Finding or seeing somebody find a great use for hardware accelerating an algorithm through SIMD instructions always gives me a boner.

I would recommend being more const-correct with your C-code, though. Your static LUT especially should be made const.

BugInTheSYS · Post by **BugInTheSYS** » Sun Apr 28, 2013 12:19 pm

Falco Girgis wrote:Damn, that's really fucking cool. Finding or seeing somebody find a great use for hardware accelerating an algorithm through SIMD instructions always gives me a boner.

I would recommend being more const-correct with your C-code, though. Your static LUT especially should be made const.

Thanks for the kind words and advice. And yeah, I should do that.

(For those who are not familiar with convolution filters: Maybe it will be helpful for you to know that in this case the algorithm is set to achieve a Gaussian blur with a radius of 2. I could change the values in the LUT which Falco mentioned to achieve something like an emboss effect or a filter that will highlight edges in an image.)

I believe I'm jumping too much for this few lines of code - cut out a couple distracting jumps (especially the big one at the beginning of the code in my first post) and got away with no mentionable performance drawbacks.

Also I've changed an instruction from SSE2 movups (formerly l. 69) to SSE3 lddqu (now l. 102) for experimental purposes. If the CPUs you're targeting don't support SSE3, you can change it back to movups.

Lastly, Visual Studio told me in a warning I was missing an emms instruction, which is supposed to clean up the MMX status information. Cleaning up after oneself is always a good thing to do. I inserted it at the end.

extern "C" _declspec( dllexport ) void Convolute( char * Scan0, int Stride, int Width, int Height ) {
#define MULTIPLIER 1.0f / 256.0f
#define REPEAT_FOUR_TIMES( X ) X, X, X, X
    static const float Kernel[5][20] = {{ REPEAT_FOUR_TIMES( 1.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 4.0f * MULTIPLIER  ), REPEAT_FOUR_TIMES( 6.0f * MULTIPLIER  ), REPEAT_FOUR_TIMES( 4.0f * MULTIPLIER  ), REPEAT_FOUR_TIMES( 1.0f * MULTIPLIER ) },
                                        { REPEAT_FOUR_TIMES( 4.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 16.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 24.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 16.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 4.0f * MULTIPLIER ) },
                                        { REPEAT_FOUR_TIMES( 6.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 24.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 36.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 24.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 6.0f * MULTIPLIER ) },
                                        { REPEAT_FOUR_TIMES( 4.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 16.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 24.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 16.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 4.0f * MULTIPLIER ) },
                                        { REPEAT_FOUR_TIMES( 1.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 4.0f * MULTIPLIER  ), REPEAT_FOUR_TIMES( 6.0f * MULTIPLIER  ), REPEAT_FOUR_TIMES( 4.0f * MULTIPLIER  ), REPEAT_FOUR_TIMES( 1.0f * MULTIPLIER ) } };

    const void *KernelPtr = (const void*)Kernel;
    __asm {
        ; START
        ; Begin a loop (PIXEL-Y) that iterates over the height of the bitmap (from Height - 1 down to 0)
        mov     ecx, Height
        sub     ecx, 1
asml_pixy_loopbegin:
        push    ecx                     ; Push PixelY onto the stack

        ; PIXEL-Y LOOP BODY
        ; Begin a loop (PIXEL-X) that iterates over the width of the bitmap (from Width - 1 down to 0)
        mov     ecx, Width
        sub     ecx, 1
asml_pixx_loopbegin:
        push    ecx                     ; Push PixelX onto the stack

        ; PIXEL-X LOOP BODY

        ; set xmm0 to zero (xmm0 always stores the "Current Color", 
        ; the color that is accumulated through the convolution algorithm and later written to the bitmap)
        pxor    xmm0, xmm0

        ; The loop over the y-axis of the Region of Interest (from two pixels below the current PIXEL-Y to two pixels above it
        ; goes from 4 down to 0 to access the Kernel values)
        ; The current roi-y is permanently stored in EBX until the loop ends
        mov     ebx, 4h
asml_roiy_loopbegin:
        
        ; ROI-Y LOOP BODY

        ; The loop over the x-axis of the region of interest
        ; The current roi-x is permanently stored in ECX until the loop ends
        mov     ecx, 4h
asml_roix_loopbegin:
        ;push   ecx                     ; Push RoiX

        ; ROIX LOOP BODY
        ; This code checks whether the current roi-x and roi-y refer to pixels that are
        ; inside the boundaries of the GDI bitmap, and if not, jumps back to asml_roix_loopbody_conditional_false
        mov     edx, dword ptr ss:[esp] ;PixelX
        add     edx, ecx                ;RoiX
        sub     edx, 2h
        cmp     edx, 0
        jl      asml_roix_loopbody_conditional_false
        cmp     edx, Width
        jge     asml_roix_loopbody_conditional_false
        mov     edx, dword ptr ss:[esp + 4] ;PixelY
        add     edx, ebx                ;RoiY
        sub     edx, 2h
        cmp     edx, 0
        jl      asml_roix_loopbody_conditional_false
        cmp     edx, Height
        jge     asml_roix_loopbody_conditional_false ; CONDITION is now ASSERTED
        
        ; This code calculates Scan0 + ( PixelY + RoiY - 2 ) * Stride + ( PixelX + RoiX - 2 ) * 4
        ; Which is the pointer to the current bitmap pixel we're working on
        mov     edx, Scan0
        mov     eax, dword ptr ss:[esp + 4] ;PixelY
        add     eax, ebx                ;RoiY
        sub     eax, 2h
        imul    eax, Stride
        add     edx, eax
        mov     eax, dword ptr ss:[esp] ;PixelX
        add     eax, ecx                ;RoiX
        sub     eax, 2h
        imul    eax, 4h
        add     edx, eax    ; Pixel Pointer is now stored in EDX

        ; Read the color from the bitmap pixel pointer now in EDX
        pxor    mm0, mm0
        pxor    xmm1, xmm1
        movd    xmm2, [edx]             ; Load color (32 bits) from memory into register
        punpcklbw xmm2, xmm1            ; Expand the 32-bit color consisting of 8-bit channel values 
        punpcklwd xmm2, xmm1            ;   to a 128-bit color of 32-bit channel values
        cvtdq2ps xmm1, xmm2             ; Convert the 32-bit integer channel values to 32-bit (single-precision) floats
                                        ; Color read from pixel pointer and converted to floats is now stored in xmm1

        ; Get pointer to value from the Kernel
        ; Calculate KernelPtr + RoiY * (Kernel array stride) + RoiX * (Kernel element width (in this case 16, 
        ;   because there are 5 * (4 repeated floats) per line))
        ; The kernel value we need is the one that tells us how much influence the current ROI pixel will have
        ;   in the calculation of the color for the current bitmap pixel
        mov     eax, KernelPtr
        mov     edx, ebx                ;RoiY
        imul    edx, 50h
        add     eax, edx
        mov     edx, ecx                ;RoiX
        imul    edx, 10h
        add     eax, edx

        ; Load Kernel value pointer into xmm2
        lddqu   xmm2, [eax]
        mulps   xmm1, xmm2              ; Kernel-weighted color is now in xmm1
        addps   xmm0, xmm1              ; Kernel-weighted color is now added to the current accumulator color value in xmm0
asml_roix_loopbody_conditional_false:

        ; ROIX LOOP BODY END
        dec     ecx
        cmp     ecx, 0
        jge     asml_roix_loopbegin
        
        ; ROIY LOOP BODY END
        ;pop    ecx
        dec     ebx
        cmp     ebx, 0
        jge     asml_roiy_loopbegin
        ;asml_roiy_loop_end:
        
        ; Calculate the pointer of the current pixel in the bitmap (x = PixelX, y = PixelY, which are on the stack)
        mov     eax, Scan0
        mov     ebx, dword ptr ss:[esp + 4] ; PixelY
        imul    ebx, Stride
        add     eax, ebx
        mov     ebx, dword ptr ss:[esp] ; PixelX
        imul    ebx, 4
        add     eax, ebx                ; Dest ptr is now in eax

        ; Copy calculated color into bitmap
        cvtps2dq xmm1, xmm0             ; Convert floats in xmm0 to int32s in xmm1
        packssdw xmm2, xmm1             ; pack these int32s into int16s
        psrldq  xmm2, 8                 ; right-shift the contents of the xmm1 register by 8 bytes (so the int16s occupy only the low half)
        movdq2q mm1, xmm2               ; copy the low int16s from xmm2 to mm1
        packsswb mm0, mm1               ; pack the int16s in mm1 into bytes in mm0
        psrlq   mm0, 20h                ; right-shift the contents of mm0 by 32 bytes, so the bytes occupy only the low half of mm0
        movd    [eax], mm0              ; copy the data (now in 32-bit ARGB format) back to memory

        ;move on with the next pixel

        ; PIXELX LOOP BODY END
        pop     ecx                     ; Pull PixelX from the stack, increment, jump to beginning
        dec     ecx
        cmp     ecx, 0
        jge     asml_pixx_loopbegin
        
        ; PIXELY LOOP BODY END
        pop     ecx                     ; Pull PixelY from the stack, increment, jump to beginning
        dec     ecx
        cmp     ecx, 0
        jge     asml_pixy_loopbegin

        emms
    }
}

Post by **Falco Girgis** » Mon Apr 29, 2013 1:24 pm

Very sexy!

By the way, dude, have you looked into auto-detecting the SSE version of a CPU? I haven't actually done any SSE programming, but I do a good deal of reading. I believe there should be some SSE version register that you can poll to determine what the CPU supports. Once you know that, you can call a routine that only uses older SSE instructions or even a routine that does all of the processing in standard C/++ without using the SIMD instructions.

Elysian Shadows

Convolution filter with x86 and SSE2 assembly

Convolution filter with x86 and SSE2 assembly

Re: Convolution filter with x86 and SSE2 assembly

Re: Convolution filter with x86 and SSE2 assembly

Re: Convolution filter with x86 and SSE2 assembly