# Dispatch on the selector in %esi through chains of compare-and-branch.
#
# This variant tries to cram as many targets as possible into 2 run-time
# I-cache line accesses, by using a more linear and less tree-oriented
# search within each cache line -- somewhat like a B-tree works.
#
# foo_btree comes a little later in the file, so that it can use some short
# backward branches instead of long forward branches.  Likewise, foo_0_7 is
# in the middle of its cache line, not at the start, to avoid making the
# backward branches too long.
#
# In this proof of concept only foo0..foo7 are ever called, so that fewer
# functions have to be defined.  The called function is the selector
# modulo 8; e.g., foo1 is called instead of foo33.
#
# Every conditional branch below (jb/je/ja/jae) acts on the flags of the
# preceding cmpl, i.e. an unsigned comparison of %esi with the immediate.
# The trailing "# n" comments name the selector value that takes the branch.

        .globl  foo_btree
        .globl  foo_btree1

# ---- selectors 0..7 -------------------------------------------------------
        .align 64
foo_0_3:
        cmpl    $1, %esi
        jb      foo0                    # 0
        je      foo1                    # 1
        cmpl    $3, %esi
        jb      foo2                    # 2
        jmp     foo3                    # 3 (only reached with %esi < 4)

        .align 32
# foo_btree1 is a second exported entry point at the same address as
# foo_0_7.  It only discriminates 0..6; any larger selector ends in foo7.
foo_btree1:
foo_0_7:
        cmpl    $4, %esi
        jb      foo_0_3                 # 0..3: short backward branch
        je      foo4                    # 4
        cmpl    $6, %esi
        jb      foo5                    # 5
        je      foo6                    # 6
        jmp     foo7                    # 7

# ---- selectors 8..15 ------------------------------------------------------
# The immediates here must be the real selector values (9, 11, 12, 14), as
# in every other group.  Comparing against 1/3/4/6 -- as for the 0..7 group --
# would send every selector in 8..15 to foo7.
        .align 64
foo_8_11:
        cmpl    $9, %esi
        jb      foo0                    # 8
        je      foo1                    # 9
        cmpl    $11, %esi
        jb      foo2                    # 10
        jmp     foo3                    # 11

        .align 32
foo_8_15:
        cmpl    $12, %esi
        jb      foo_8_11                # 8..11: short backward branch
        je      foo4                    # 12
        cmpl    $14, %esi
        jb      foo5                    # 13
        je      foo6                    # 14
        jmp     foo7                    # 15

# ---- top-level entry: pick the group of 8 (or 9) selectors -----------------
        .align 64
foo_btree:
        cmpl    $40, %esi
        jae     foo_40_72               # 40 and above
        cmpl    $8, %esi
        jb      foo_0_7                 # 0..7
        cmpl    $16, %esi
        jb      foo_8_15                # 8..15
        cmpl    $24, %esi
        jb      foo_16_23               # 16..23
        cmpl    $32, %esi
        jb      foo_24_31               # 24..31
        jmp     foo_32_39               # 32..39
foo_40_72:
        cmpl    $65, %esi
        jae     foo_65_72               # 65 and above
        cmpl    $57, %esi
        jae     foo_57_64               # 57..64
        cmpl    $49, %esi
        jae     foo_49_56               # 49..56
        # fall into foo_40_48
        # Here we can use leftover space from the previous block to
        # handle 9 targets.
foo_40_48:
        cmpl    $45, %esi
        ja      foo_46_48               # 46..48
        je      foo5                    # 45
        cmpl    $41, %esi
        jb      foo0                    # 40
        je      foo1                    # 41
        cmpl    $43, %esi
        jb      foo2                    # 42
        je      foo3                    # 43
        jmp     foo4                    # 44
foo_46_48:
        cmpl    $47, %esi
        jb      foo6                    # 46
        je      foo7                    # 47
        jmp     foo0                    # 48 (stands in for foo48)

# ---- selectors 16..23 -----------------------------------------------------
        .align 64
foo_16_23:
        cmpl    $19, %esi
        ja      foo_20_23               # 20..23
        je      foo3                    # 19
        cmpl    $17, %esi
        jb      foo0                    # 16
        je      foo1                    # 17
        jmp     foo2                    # 18

        .align 32
foo_20_23:
        cmpl    $21, %esi
        jb      foo4                    # 20
        je      foo5                    # 21
        cmpl    $23, %esi
        jb      foo6                    # 22
        jmp     foo7                    # 23

# ---- selectors 24..31 -----------------------------------------------------
        .align 64
foo_24_31:
        cmpl    $27, %esi
        ja      foo_28_31               # 28..31
        je      foo3                    # 27
        cmpl    $25, %esi
        jb      foo0                    # 24
        je      foo1                    # 25
        jmp     foo2                    # 26

        .align 32
foo_28_31:
        cmpl    $29, %esi
        jb      foo4                    # 28
        je      foo5                    # 29
        cmpl    $31, %esi
        jb      foo6                    # 30
        jmp     foo7                    # 31

# ---- selectors 32..35 (36..39 continue at foo_36_39) ----------------------
        .align 64
foo_32_39:
        cmpl    $35, %esi
        ja      foo_36_39               # 36..39
        je      foo3                    # 35
        cmpl    $33, %esi
        jb      foo0                    # 32
        je      foo1                    # 33
        jmp     foo2                    # 34
# Leaf dispatch blocks for selectors 36..39 and 49..72.
#
# Each block compares %esi (unsigned) against real selector values and jumps
# to the handler numbered "selector modulo 8".  The trailing "# n" comments
# name the selector value that takes each branch.

# -- 36..39: entered with %esi above 35 --
	.align 32
foo_36_39:
	cmpl	$37, %esi
	jb	foo4			# 36
	je	foo5			# 37
	cmpl	$39, %esi
	jb	foo6			# 38
	jmp	foo7			# 39

# -- 49..56 --
	.align 64
foo_49_56:
	cmpl	$52, %esi
	ja	foo_53_56		# 53..56
	je	foo4			# 52
	cmpl	$50, %esi
	jb	foo1			# 49
	je	foo2			# 50
	jmp	foo3			# 51

	.align 32
foo_53_56:
	cmpl	$54, %esi
	jb	foo5			# 53
	je	foo6			# 54
	cmpl	$56, %esi
	jb	foo7			# 55
	jmp	foo0			# 56

# -- 57..64 --
	.align 64
foo_57_64:
	cmpl	$60, %esi
	ja	foo_61_64		# 61..64
	je	foo4			# 60
	cmpl	$58, %esi
	jb	foo1			# 57
	je	foo2			# 58
	jmp	foo3			# 59

	.align 32
foo_61_64:
	cmpl	$62, %esi
	jb	foo5			# 61
	je	foo6			# 62
	cmpl	$64, %esi
	jb	foo7			# 63
	jmp	foo0			# 64

# -- 65..72, plus the out-of-range exit --
	.align 64
foo_65_72:
	cmpl	$68, %esi
	ja	foo_69_72		# 69 and above
	je	foo4			# 68
	cmpl	$66, %esi
	jb	foo1			# 65
	je	foo2			# 66
	jmp	foo3			# 67

	.align 32
foo_69_72:
	cmpl	$70, %esi
	jb	foo5			# 69
	je	foo6			# 70
	cmpl	$72, %esi
	jb	foo7			# 71
	je	foo0			# 72
	jmp	abort			# anything above 72 is not a valid selector