Falco Girgis wrote:Damn, that's really fucking cool. Finding or seeing somebody find a great use for hardware accelerating an algorithm through SIMD instructions always gives me a boner.
I would recommend being more const-correct with your C-code, though. Your static LUT especially should be made const.
Thanks for the kind words and advice. And yeah, I should do that.
(For those who are not familiar with convolution filters: Maybe it will be helpful for you to know that in this case the algorithm is set to achieve a Gaussian blur with a radius of 2. I could change the values in the LUT which Falco mentioned to achieve something like an emboss effect or a filter that will highlight edges in an image.)
I believe I was jumping around too much for so few lines of code, so I cut out a couple of distracting jumps (especially the big one at the beginning of the code in my first post) and got away with no noticeable performance drawbacks.
Also, I've changed an instruction from the SSE movups (formerly l. 69) to the SSE3 lddqu (now l. 102) for experimental purposes. If the CPUs you're targeting don't support SSE3, you can change it back to movups.
Lastly, Visual Studio told me in a warning I was missing an emms instruction, which is supposed to clean up the MMX status information. Cleaning up after oneself is always a good thing to do. I inserted it at the end.
extern "C" _declspec( dllexport ) void Convolute( char * Scan0, int Stride, int Width, int Height ) {
#define MULTIPLIER 1.0f / 256.0f
#define REPEAT_FOUR_TIMES( X ) X, X, X, X
static const float Kernel[5][20] = {{ REPEAT_FOUR_TIMES( 1.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 4.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 6.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 4.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 1.0f * MULTIPLIER ) },
{ REPEAT_FOUR_TIMES( 4.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 16.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 24.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 16.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 4.0f * MULTIPLIER ) },
{ REPEAT_FOUR_TIMES( 6.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 24.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 36.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 24.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 6.0f * MULTIPLIER ) },
{ REPEAT_FOUR_TIMES( 4.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 16.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 24.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 16.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 4.0f * MULTIPLIER ) },
{ REPEAT_FOUR_TIMES( 1.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 4.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 6.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 4.0f * MULTIPLIER ), REPEAT_FOUR_TIMES( 1.0f * MULTIPLIER ) } };
const void *KernelPtr = (const void*)Kernel;
__asm {
; START
; Begin a loop (PIXEL-Y) that iterates over the height of the bitmap (from Height - 1 down to 0)
mov ecx, Height
sub ecx, 1
asml_pixy_loopbegin:
push ecx ; Push PixelY onto the stack
; PIXEL-Y LOOP BODY
; Begin a loop (PIXEL-X) that iterates over the width of the bitmap (from Width - 1 down to 0)
mov ecx, Width
sub ecx, 1
asml_pixx_loopbegin:
push ecx ; Push PixelX onto the stack
; PIXEL-X LOOP BODY
; set xmm0 to zero (xmm0 always stores the "Current Color",
; the color that is accumulated through the convolution algorithm and later written to the bitmap)
pxor xmm0, xmm0
; The loop over the y-axis of the Region of Interest (from two pixels below the current PIXEL-Y to two pixels above it
; goes from 4 down to 0 to access the Kernel values)
; The current roi-y is permanently stored in EBX until the loop ends
mov ebx, 4h
asml_roiy_loopbegin:
; ROI-Y LOOP BODY
; The loop over the x-axis of the region of interest
; The current roi-x is permanently stored in ECX until the loop ends
mov ecx, 4h
asml_roix_loopbegin:
;push ecx ; Push RoiX
; ROIX LOOP BODY
; This code checks whether the current roi-x and roi-y refer to pixels that are
; inside the boundaries of the GDI bitmap, and if not, jumps back to asml_roix_loopbody_conditional_false
mov edx, dword ptr ss:[esp] ;PixelX
add edx, ecx ;RoiX
sub edx, 2h
cmp edx, 0
jl asml_roix_loopbody_conditional_false
cmp edx, Width
jge asml_roix_loopbody_conditional_false
mov edx, dword ptr ss:[esp + 4] ;PixelY
add edx, ebx ;RoiY
sub edx, 2h
cmp edx, 0
jl asml_roix_loopbody_conditional_false
cmp edx, Height
jge asml_roix_loopbody_conditional_false ; CONDITION is now ASSERTED
; This code calculates Scan0 + ( PixelY + RoiY - 2 ) * Stride + ( PixelX + RoiX - 2 ) * 4
; Which is the pointer to the current bitmap pixel we're working on
mov edx, Scan0
mov eax, dword ptr ss:[esp + 4] ;PixelY
add eax, ebx ;RoiY
sub eax, 2h
imul eax, Stride
add edx, eax
mov eax, dword ptr ss:[esp] ;PixelX
add eax, ecx ;RoiX
sub eax, 2h
imul eax, 4h
add edx, eax ; Pixel Pointer is now stored in EDX
; Read the color from the bitmap pixel pointer now in EDX
pxor mm0, mm0
pxor xmm1, xmm1
movd xmm2, [edx] ; Load color (32 bits) from memory into register
punpcklbw xmm2, xmm1 ; Expand the 32-bit color consisting of 8-bit channel values
punpcklwd xmm2, xmm1 ; to a 128-bit color of 32-bit channel values
cvtdq2ps xmm1, xmm2 ; Convert the 32-bit integer channel values to 32-bit (single-precision) floats
; Color read from pixel pointer and converted to floats is now stored in xmm1
; Get pointer to value from the Kernel
; Calculate KernelPtr + RoiY * (Kernel array stride) + RoiX * (Kernel element width (in this case 16,
; because there are 5 * (4 repeated floats) per line))
; The kernel value we need is the one that tells us how much influence the current ROI pixel will have
; in the calculation of the color for the current bitmap pixel
mov eax, KernelPtr
mov edx, ebx ;RoiY
imul edx, 50h
add eax, edx
mov edx, ecx ;RoiX
imul edx, 10h
add eax, edx
; Load Kernel value pointer into xmm2
lddqu xmm2, [eax]
mulps xmm1, xmm2 ; Kernel-weighted color is now in xmm1
addps xmm0, xmm1 ; Kernel-weighted color is now added to the current accumulator color value in xmm0
asml_roix_loopbody_conditional_false:
; ROIX LOOP BODY END
dec ecx
cmp ecx, 0
jge asml_roix_loopbegin
; ROIY LOOP BODY END
;pop ecx
dec ebx
cmp ebx, 0
jge asml_roiy_loopbegin
;asml_roiy_loop_end:
; Calculate the pointer of the current pixel in the bitmap (x = PixelX, y = PixelY, which are on the stack)
mov eax, Scan0
mov ebx, dword ptr ss:[esp + 4] ; PixelY
imul ebx, Stride
add eax, ebx
mov ebx, dword ptr ss:[esp] ; PixelX
imul ebx, 4
add eax, ebx ; Dest ptr is now in eax
; Copy calculated color into bitmap
cvtps2dq xmm1, xmm0 ; Convert floats in xmm0 to int32s in xmm1
packssdw xmm2, xmm1 ; pack these int32s into int16s
psrldq xmm2, 8 ; right-shift the contents of the xmm1 register by 8 bytes (so the int16s occupy only the low half)
movdq2q mm1, xmm2 ; copy the low int16s from xmm2 to mm1
packsswb mm0, mm1 ; pack the int16s in mm1 into bytes in mm0
psrlq mm0, 20h ; right-shift the contents of mm0 by 32 bytes, so the bytes occupy only the low half of mm0
movd [eax], mm0 ; copy the data (now in 32-bit ARGB format) back to memory
;move on with the next pixel
; PIXELX LOOP BODY END
pop ecx ; Pull PixelX from the stack, increment, jump to beginning
dec ecx
cmp ecx, 0
jge asml_pixx_loopbegin
; PIXELY LOOP BODY END
pop ecx ; Pull PixelY from the stack, increment, jump to beginning
dec ecx
cmp ecx, 0
jge asml_pixy_loopbegin
emms
}
}
Someday, everything will go to /dev/null. - Bug's prophecy 13:37