Ticket #8632: diff.2

File diff.2, 7.3 KB (added by SF/robinwatts, 13 years ago)

New PocketPCHalf implementation (unrolled), plus C changes to call it

Line 
1Index: CEScaler.cpp
2===================================================================
3--- CEScaler.cpp (revision 26438)
4+++ CEScaler.cpp (working copy)
5@@ -128,8 +128,19 @@
6 }
7 }
8
9+#ifdef ARM
10+extern "C" {
11+ void PocketPCHalfARM(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint32 dstPitch, int width, int height, int mask, int round);
12+ // Per-format rounding terms and channel masks, indexed by maskUsed: entry 0 matches a 555 layout, entry 1 a 565 layout
13+ int roundingconstants[] = { 0x00400802, 0x00401002 }; // +2 units per channel ahead of the >>2, so red, green and blue all round to nearest (green LSB is bit 21, so its +2 is bit 22)
14+ int redbluegreenMasks[] = { 0x03E07C1F, 0x07E0F81F }; // green field in the high halfword, red and blue fields in the low halfword
15+}
16+#endif
17
18 void PocketPCHalf(const uint8 *srcPtr, uint32 srcPitch, uint8 *dstPtr, uint32 dstPitch, int width, int height) {
19+#ifdef ARM
20+ PocketPCHalfARM(srcPtr, srcPitch, dstPtr, dstPitch, width, height, redbluegreenMasks[maskUsed],roundingconstants[maskUsed]);
21+#else
22 uint8 *work;
23 int i;
24 uint16 srcPitch16 = (uint16)(srcPitch / sizeof(uint16));
25@@ -151,6 +162,7 @@
26 srcPtr += 2 * srcPitch;
27 dstPtr += dstPitch;
28 }
29+#endif
30 }
31
32
33Index: ARMscaler.s
34===================================================================
35--- ARMscaler.s (revision 0)
36+++ ARMscaler.s (revision 0)
37@@ -0,0 +1,184 @@
38+@ ScummVM Scumm Interpreter
39+@ Copyright (C) 2007 The ScummVM project
40+@
41+@ This program is free software; you can redistribute it and/or
42+@ modify it under the terms of the GNU General Public License
43+@ as published by the Free Software Foundation; either version 2
44+@ of the License, or (at your option) any later version.
45+@
46+@ This program is distributed in the hope that it will be useful,
47+@ but WITHOUT ANY WARRANTY; without even the implied warranty of
48+@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
49+@ GNU General Public License for more details.
50+@
51+@ You should have received a copy of the GNU General Public License
52+@ along with this program; if not, write to the Free Software
53+@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
54+@
55+@ $URL$
56+@ $Id$
57+@
58+@ @author Robin Watts (robin@wss.co.uk)
59+
60+ .text
61+
62+ .global PocketPCHalfARM
63+
64+ @ ARM implementation of the PocketPCHalf scaler.
65+ @ Halves a width x height block of 16bpp pixels from srcPtr into
66+ @ dstPtr: each 2x2 source square is averaged into one output pixel.
67+ @ srcPitch and dstPitch are the byte strides between lines; redblueMask
68+ @ and round allow for one routine to do both 565 and 555 formats.
69+PocketPCHalfARM:
70+ @ r0 = srcPtr
71+ @ r1 = srcPitch
72+ @ r2 = dstPtr
73+ @ r3 = dstPitch
74+ MOV r12,r13 @ r12= sp on entry, where the remaining arguments sit
75+ STMFD r13!,{r4-r11,r14} @ preserve the registers this routine overwrites
76+ LDMIA r12,{r4-r7} @ load the four stack-passed arguments
77+ @ r4 = width
78+ @ r5 = height
79+ @ r6 = redblueMask
80+ @ r7 = round
81+
82+ SUB r3,r3,r4 @ dstPitch -= width (a row of output is width bytes when width is even)
83+ SUBS r5,r5,#2 @ while ((height-=2) >= 0)
84+ BLT end @ fewer than two source lines => nothing to do
85+height_loop:
86+
87+ SUBS r11,r4,#8 @ r11= width_minus_8
88+ BLT thin @ fewer than 8 source pixels => single-pixel loop only
89+
90+width_loop:
91+ @ unrolled 4 times: 8 source pixels in, 4 output pixels out per pass
92+ LDRH r8,[r0],r1 @ r8 = A = srcPtr[0]
93+ LDRH r9,[r0],#2 @ r9 = C = srcPtr[srcPitch]
94+ LDRH r12,[r0],-r1 @ r12= D = srcPtr[srcPitch+2]
95+ LDRH r14,[r0],#2 @ r14= B = srcPtr[2]
96+
97+ ORR r8, r8, r8, LSL #16 @ r8 = b | g | r | b | g | r
98+ ORR r9, r9, r9, LSL #16 @ r9 = b | g | r | b | g | r
99+ ORR r12,r12,r12,LSL #16 @ r12= b | g | r | b | g | r
100+ ORR r14,r14,r14,LSL #16 @ r14= b | g | r | b | g | r
101+ AND r8, r8, r6 @ r8 = 0 | g | 0 | b | 0 | r
102+ AND r9, r9, r6 @ r9 = 0 | g | 0 | b | 0 | r
103+ AND r12,r12,r6 @ r12= 0 | g | 0 | b | 0 | r
104+ AND r14,r14,r6 @ r14= 0 | g | 0 | b | 0 | r
105+ ADD r8, r8, r9
106+ ADD r8, r8, r12
107+ ADD r8, r8, r14
108+ ADD r8, r8, r7 @ r8 = summed pixels + rounding
109+ AND r8, r6, r8, LSR #2 @ r8 = 0 | g | 0 | b | 0 | r
110+ ORR r10,r8, r8, LSR #16 @ r10= 0 | g | 0 | b | g | r
111+
112+ LDRH r8,[r0],r1 @ r8 = A = srcPtr[0]
113+ LDRH r9,[r0],#2 @ r9 = C = srcPtr[srcPitch]
114+ LDRH r12,[r0],-r1 @ r12= D = srcPtr[srcPitch+2]
115+ LDRH r14,[r0],#2 @ r14= B = srcPtr[2]
116+
117+ STRH r10,[r2],#2 @ *dstPtr++ = r10 (pixel computed before these loads)
118+
119+ ORR r8, r8, r8, LSL #16 @ r8 = b | g | r | b | g | r
120+ ORR r9, r9, r9, LSL #16 @ r9 = b | g | r | b | g | r
121+ ORR r12,r12,r12,LSL #16 @ r12= b | g | r | b | g | r
122+ ORR r14,r14,r14,LSL #16 @ r14= b | g | r | b | g | r
123+ AND r8, r8, r6 @ r8 = 0 | g | 0 | b | 0 | r
124+ AND r9, r9, r6 @ r9 = 0 | g | 0 | b | 0 | r
125+ AND r12,r12,r6 @ r12= 0 | g | 0 | b | 0 | r
126+ AND r14,r14,r6 @ r14= 0 | g | 0 | b | 0 | r
127+ ADD r8, r8, r9
128+ ADD r8, r8, r12
129+ ADD r8, r8, r14
130+ ADD r8, r8, r7 @ r8 = summed pixels + rounding
131+ AND r8, r6, r8, LSR #2 @ r8 = 0 | g | 0 | b | 0 | r
132+ ORR r10,r8, r8, LSR #16 @ r10= 0 | g | 0 | b | g | r
133+
134+ LDRH r8,[r0],r1 @ r8 = A = srcPtr[0]
135+ LDRH r9,[r0],#2 @ r9 = C = srcPtr[srcPitch]
136+ LDRH r12,[r0],-r1 @ r12= D = srcPtr[srcPitch+2]
137+ LDRH r14,[r0],#2 @ r14= B = srcPtr[2]
138+
139+ STRH r10,[r2],#2 @ *dstPtr++ = r10 (pixel computed before these loads)
140+
141+ ORR r8, r8, r8, LSL #16 @ r8 = b | g | r | b | g | r
142+ ORR r9, r9, r9, LSL #16 @ r9 = b | g | r | b | g | r
143+ ORR r12,r12,r12,LSL #16 @ r12= b | g | r | b | g | r
144+ ORR r14,r14,r14,LSL #16 @ r14= b | g | r | b | g | r
145+ AND r8, r8, r6 @ r8 = 0 | g | 0 | b | 0 | r
146+ AND r9, r9, r6 @ r9 = 0 | g | 0 | b | 0 | r
147+ AND r12,r12,r6 @ r12= 0 | g | 0 | b | 0 | r
148+ AND r14,r14,r6 @ r14= 0 | g | 0 | b | 0 | r
149+ ADD r8, r8, r9
150+ ADD r8, r8, r12
151+ ADD r8, r8, r14
152+ ADD r8, r8, r7 @ r8 = summed pixels + rounding
153+ AND r8, r6, r8, LSR #2 @ r8 = 0 | g | 0 | b | 0 | r
154+ ORR r10,r8, r8, LSR #16 @ r10= 0 | g | 0 | b | g | r
155+
156+ LDRH r8,[r0],r1 @ r8 = A = srcPtr[0]
157+ LDRH r9,[r0],#2 @ r9 = C = srcPtr[srcPitch]
158+ LDRH r12,[r0],-r1 @ r12= D = srcPtr[srcPitch+2]
159+ LDRH r14,[r0],#2 @ r14= B = srcPtr[2]
160+
161+ STRH r10,[r2],#2 @ *dstPtr++ = r10 (pixel computed before these loads)
162+
163+ ORR r8, r8, r8, LSL #16 @ r8 = b | g | r | b | g | r
164+ ORR r9, r9, r9, LSL #16 @ r9 = b | g | r | b | g | r
165+ ORR r12,r12,r12,LSL #16 @ r12= b | g | r | b | g | r
166+ ORR r14,r14,r14,LSL #16 @ r14= b | g | r | b | g | r
167+ AND r8, r8, r6 @ r8 = 0 | g | 0 | b | 0 | r
168+ AND r9, r9, r6 @ r9 = 0 | g | 0 | b | 0 | r
169+ AND r12,r12,r6 @ r12= 0 | g | 0 | b | 0 | r
170+ AND r14,r14,r6 @ r14= 0 | g | 0 | b | 0 | r
171+ ADD r8, r8, r9
172+ ADD r8, r8, r12
173+ ADD r8, r8, r14
174+ ADD r8, r8, r7 @ r8 = summed pixels + rounding
175+ AND r8, r6, r8, LSR #2 @ r8 = 0 | g | 0 | b | 0 | r
176+ ORR r10, r8, r8, LSR #16 @ r10= 0 | g | 0 | b | g | r
177+
178+ STRH r10,[r2],#2 @ *dstPtr++
179+
180+ SUBS r11,r11,#8 @ width_minus_8 -= 8
181+ BGE width_loop @ (width_minus_8 >= 0) => do 8+ more
182+
183+thin:
184+ ADDS r11,r11,#8 @ r11= source pixels still left in this row
185+ BEQ width_end @ if no more left to do, then bail
186+thin_lp:
187+ @ leftover loop: one output pixel (two source pixels) per pass
188+ LDRH r8,[r0],r1 @ r8 = A = srcPtr[0]
189+ LDRH r9,[r0],#2 @ r9 = C = srcPtr[srcPitch]
190+ LDRH r12,[r0],-r1 @ r12= D = srcPtr[srcPitch+2]
191+ LDRH r14,[r0],#2 @ r14= B = srcPtr[2]
192+
193+ ORR r8, r8, r8, LSL #16 @ r8 = b | g | r | b | g | r
194+ ORR r9, r9, r9, LSL #16 @ r9 = b | g | r | b | g | r
195+ ORR r12,r12,r12,LSL #16 @ r12= b | g | r | b | g | r
196+ ORR r14,r14,r14,LSL #16 @ r14= b | g | r | b | g | r
197+ AND r8, r8, r6 @ r8 = 0 | g | 0 | b | 0 | r
198+ AND r9, r9, r6 @ r9 = 0 | g | 0 | b | 0 | r
199+ AND r12,r12,r6 @ r12= 0 | g | 0 | b | 0 | r
200+ AND r14,r14,r6 @ r14= 0 | g | 0 | b | 0 | r
201+ ADD r8, r8, r9
202+ ADD r8, r8, r12
203+ ADD r8, r8, r14
204+ ADD r8, r8, r7 @ r8 = summed pixels + rounding
205+ AND r8, r6, r8, LSR #2 @ r8 = 0 | g | 0 | b | 0 | r
206+ ORR r8, r8, r8, LSR #16 @ r8 = 0 | g | 0 | b | g | r
207+
208+ STRH r8,[r2],#2 @ *dstPtr++
209+
210+ SUBS r11,r11,#2 @ two source pixels consumed
211+ BGT thin_lp @ repeat while any are left
212+width_end:
213+ ADD r2,r2,r3 @ dstPtr += dstPitch
214+ ADD r0,r0,r1,LSL #1 @ srcPtr += 2*srcPitch
215+ SUB r0,r0,r4,LSL #1 @ srcPtr -= 2*width
216+
217+ SUBS r5,r5,#2 @ two source lines consumed
218+ BGE height_loop @ repeat while at least two lines remain
219+
220+end:
221+ LDMFD r13!,{r4-r11,PC} @ restore saved registers and return
222
223Property changes on: ARMscaler.s
224___________________________________________________________________
225Name: svn:executable
226 + *
227