cg14_render.c revision 72fd264f
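The listing below is the SX render-acceleration module of the Xorg cgfourteen (cg14) driver. Each CG14Comp_* entry point implements one X Render compositing case (Over or Add, for various source and mask formats) by streaming pixels through the SX rendering coprocessor found in SPARCstation 10SX/20 class hardware; sxi() appears to queue an SX ALU instruction, while sxm()/write_sx_io() issue memory-referencing SX instructions against framebuffer offsets.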
/* $NetBSD: cg14_render.c,v 1.14 2021/12/24 04:41:40 macallan Exp $ */
/*
 * Copyright (c) 2013 Michael Lorenz
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * - Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 * - Redistributions in binary form must reproduce the above
 *   copyright notice, this list of conditions and the following
 *   disclaimer in the documentation and/or other materials provided
 *   with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <sys/types.h>

/* all drivers need this */
#include "xf86.h"
#include "xf86_OSproc.h"
#include "compiler.h"

#include "cg14.h"

/*#define SX_SINGLE*/
/*#define SX_RENDER_DEBUG*/
/*#define SX_ADD_SOFTWARE*/

#ifdef SX_RENDER_DEBUG
#define ENTER xf86Msg(X_ERROR, "%s>\n", __func__);
#define DPRINTF xf86Msg
#else
#define ENTER
#define DPRINTF while (0) xf86Msg
#endif

/* coarse intensity ramp used by the SX_DEBUG ASCII dumps below */
char c[8] = " .,:+*oX";


/*
 * Solid colour OVER a 32-bit destination, driven by the alpha channel of
 * a 32-bit mask.  The solid source pixel is expected in SX registers 8..11.
 */
void CG14Comp_Over32Solid(Cg14Ptr p,
    uint32_t src, uint32_t srcpitch,
    uint32_t dst, uint32_t dstpitch,
    int width, int height)
{
	uint32_t msk = src, mskx, dstx, m;
	int line, x, i;

	ENTER;

	for (line = 0; line < height; line++) {
		mskx = msk;
		dstx = dst;
#ifndef SX_SINGLE
		int rest;
		for (x = 0; x < width; x += 4) {
			rest = width - x;
			/* fetch 4 mask values */
			sxm(SX_LDUQ0, mskx, 12, 3);
			/* fetch destination pixels */
			sxm(SX_LDUQ0, dstx, 60, 3);
			/* duplicate them for all channels */
			sxi(SX_ORS(0, 12, 13, 2));
			sxi(SX_ORS(0, 16, 17, 2));
			sxi(SX_ORS(0, 20, 21, 2));
			sxi(SX_ORS(0, 24, 25, 2));
			/* generate inverted alpha */
			sxi(SX_XORS(12, 8, 28, 15));
			/* multiply source */
			sxi(SX_MUL16X16SR8(8, 12, 44, 3));
			sxi(SX_MUL16X16SR8(8, 16, 48, 3));
			sxi(SX_MUL16X16SR8(8, 20, 52, 3));
			sxi(SX_MUL16X16SR8(8, 24, 56, 3));
			/* multiply dest */
			sxi(SX_MUL16X16SR8(28, 60, 76, 15));
			/* add up */
			sxi(SX_ADDV(44, 76, 92, 15));
			/* write back */
			if (rest < 4) {
				sxm(SX_STUQ0C, dstx, 92, rest - 1);
			} else {
				sxm(SX_STUQ0C, dstx, 92, 3);
			}
			dstx += 16;
			mskx += 16;
		}
#else /* SX_SINGLE */
		for (x = 0; x < width; x++) {
			m = *(volatile uint32_t *)(p->fb + mskx);
			m = m >> 24;
			if (m == 0) {
				/* nothing to do - all transparent */
			} else if (m == 0xff) {
				/* all opaque */
				sxm(SX_STUQ0, dstx, 8, 0);
			} else {
				/* fetch alpha value, stick it into scam */
				/* mask is in R[12:15] */
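				/*
				 * Single-pixel fallback: the CPU reads the
				 * mask alpha itself, queues it into SX
				 * register 12 and copies it into SCAM, so
				 * the SAXP16X16SR8 ops below can form
				 * src * alpha and then dst * (1 - alpha)
				 * plus that intermediate result.
				 */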
				/*write_sx_io(p, mskx,
				    SX_LDUQ0(12, 0, mskx & 7));*/
				write_sx_reg(p, SX_QUEUED(12), m);
				/* fetch dst pixel */
				sxm(SX_LDUQ0, dstx, 20, 0);
				sxi(SX_ORV(12, 0, R_SCAM, 0));
				/*
				 * src * alpha + R0
				 * R[9:11] * SCAM + R0 -> R[17:19]
				 */
				sxi(SX_SAXP16X16SR8(9, 0, 17, 2));

				/* invert SCAM */
				sxi(SX_XORV(12, 8, R_SCAM, 0));
#ifdef SX_DEBUG
				sxi(SX_XORV(12, 8, 13, 0));
#endif
				/* dst * (1 - alpha) + R[17:19] */
				sxi(SX_SAXP16X16SR8(21, 17, 25, 2));
				sxm(SX_STUQ0C, dstx, 24, 0);
			}
			dstx += 4;
			mskx += 4;
		}
#endif /* SX_SINGLE */
		dst += dstpitch;
		msk += srcpitch;
	}
}

/*
 * Solid colour OVER a 32-bit destination, driven by an 8-bit (A8) mask.
 * The solid source pixel is expected in SX registers 8..11.
 */
void CG14Comp_Over8Solid(Cg14Ptr p,
    uint32_t src, uint32_t srcpitch,
    uint32_t dst, uint32_t dstpitch,
    int width, int height)
{
	uint32_t msk = src, mskx, dstx, m;
	int line, x, i;
#ifdef SX_DEBUG
	char buffer[256];
#endif
	ENTER;

	DPRINTF(X_ERROR, "src: %d %d %d, %08x\n", read_sx_reg(p, SX_QUEUED(9)),
	    read_sx_reg(p, SX_QUEUED(10)), read_sx_reg(p, SX_QUEUED(11)),
	    *(uint32_t *)(p->fb + p->srcoff));
	for (line = 0; line < height; line++) {
		mskx = msk;
		dstx = dst;
#ifndef SX_SINGLE
		int rest;
		for (x = 0; x < width; x += 4) {
			rest = width - x;
			/* fetch 4 mask values */
			sxm(SX_LDB, mskx, 12, 3);
			/* fetch destination pixels */
			sxm(SX_LDUQ0, dstx, 60, 3);
			/* duplicate them for all channels */
			sxi(SX_ORS(0, 13, 16, 3));
			sxi(SX_ORS(0, 14, 20, 3));
			sxi(SX_ORS(0, 15, 24, 3));
			sxi(SX_ORS(0, 12, 13, 2));
			/* generate inverted alpha */
			sxi(SX_XORS(12, 8, 28, 15));
			/* multiply source */
			sxi(SX_MUL16X16SR8(8, 12, 44, 3));
			sxi(SX_MUL16X16SR8(8, 16, 48, 3));
			sxi(SX_MUL16X16SR8(8, 20, 52, 3));
			sxi(SX_MUL16X16SR8(8, 24, 56, 3));
			/* multiply dest */
			sxi(SX_MUL16X16SR8(28, 60, 76, 15));
			/* add up */
			sxi(SX_ADDV(44, 76, 92, 15));
			/* write back */
			if (rest < 4) {
				sxm(SX_STUQ0C, dstx, 92, rest - 1);
			} else {
				sxm(SX_STUQ0C, dstx, 92, 3);
			}
			dstx += 16;
			mskx += 4;
		}
#else /* SX_SINGLE */
		for (x = 0; x < width; x++) {
			m = *(volatile uint8_t *)(p->fb + mskx);
#ifdef SX_DEBUG
			buffer[x] = c[m >> 5];
#endif
			if (m == 0) {
				/* nothing to do - all transparent */
			} else if (m == 0xff) {
				/* all opaque */
				sxm(SX_STUQ0, dstx, 8, 0);
			} else {
				/* fetch alpha value, stick it into scam */
				/* mask is in R[12:15] */
				/*write_sx_io(p, mskx & ~7,
				    SX_LDB(12, 0, mskx & 7));*/
				write_sx_reg(p, SX_QUEUED(12), m);
				/* fetch dst pixel */
				sxm(SX_LDUQ0, dstx, 20, 0);
				sxi(SX_ORV(12, 0, R_SCAM, 0));
				/*
				 * src * alpha + R0
				 * R[9:11] * SCAM + R0 -> R[17:19]
				 */
				sxi(SX_SAXP16X16SR8(9, 0, 17, 2));

				/* invert SCAM */
				sxi(SX_XORV(12, 8, R_SCAM, 0));
#ifdef SX_DEBUG
				sxi(SX_XORV(12, 8, 13, 0));
#endif
				/* dst * (1 - alpha) + R[17:19] */
				sxi(SX_SAXP16X16SR8(21, 17, 25, 2));
				sxm(SX_STUQ0C, dstx, 24, 0);
			}
			dstx += 4;
			mskx += 1;
		}
#endif /* SX_SINGLE */
#ifdef SX_DEBUG
		buffer[x] = 0;
		xf86Msg(X_ERROR, "%s\n", buffer);
#endif
		dst += dstpitch;
		msk += srcpitch;
	}
}

/* Add 32-bit source pixels into a 32-bit destination, 8 pixels per pass. */
void CG14Comp_Add32(Cg14Ptr p,
    uint32_t src, uint32_t srcpitch,
    uint32_t dst, uint32_t dstpitch,
    int width, int height)
{
	int line;
	uint32_t srcx, dstx;
	int full, part, x;

	ENTER;
	full = width >> 3;	/* chunks of 8 */
	part = width & 7;	/* leftovers */
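	/*
	 * Register convention in the loops below: source data is loaded
	 * starting at SX register 8, destination data at 40, and the sums
	 * are built at 72 before being stored back.  ADDV appears to handle
	 * at most 16 registers per instruction, hence the paired calls.
	 */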
	/* we do this up to 8 pixels at a time */
	for (line = 0; line < height; line++) {
		srcx = src;
		dstx = dst;
		for (x = 0; x < full; x++) {
			sxm(SX_LDUQ0, srcx, 8, 31);
			sxm(SX_LDUQ0, dstx, 40, 31);
			sxi(SX_ADDV(8, 40, 72, 15));
			sxi(SX_ADDV(24, 56, 88, 15));
			sxm(SX_STUQ0, dstx, 72, 31);
			srcx += 128;
			dstx += 128;
		}

		/* do leftovers */
		sxm(SX_LDUQ0, srcx, 8, part - 1);
		sxm(SX_LDUQ0, dstx, 40, part - 1);
		if (part & 16) {
			sxi(SX_ADDV(8, 40, 72, 15));
			sxi(SX_ADDV(24, 56, 88, part - 17));
		} else {
			sxi(SX_ADDV(8, 40, 72, part - 1));
		}
		sxm(SX_STUQ0, dstx, 72, part - 1);

		/* next line */
		src += srcpitch;
		dst += dstpitch;
	}
}

/* Add an 8-bit (A8) source into an 8-bit destination, up to 32 pixels per pass. */
void CG14Comp_Add8(Cg14Ptr p,
    uint32_t src, uint32_t srcpitch,
    uint32_t dst, uint32_t dstpitch,
    int width, int height)
{
	int line;
	uint32_t srcx, dstx, srcoff, dstoff;
	int pre, full, part, x;
	uint8_t *d;
	char buffer[256];
	ENTER;

	srcoff = src & 7;
	src &= ~7;
	dstoff = dst & 7;
	dst &= ~7;
	full = width >> 5;	/* chunks of 32 */
	part = width & 31;	/* leftovers */

#ifdef SX_DEBUG
	xf86Msg(X_ERROR, "%d %d, %d x %d, %d %d\n", srcpitch, dstpitch,
	    width, height, full, part);
#endif
	/* we do this up to 32 pixels at a time */
	for (line = 0; line < height; line++) {
		srcx = src;
		dstx = dst;
#ifdef SX_ADD_SOFTWARE
		uint8_t *s = (uint8_t *)(p->fb + srcx + srcoff);
		d = (uint8_t *)(p->fb + dstx + dstoff);
		for (x = 0; x < width; x++) {
			d[x] = min(255, s[x] + d[x]);
		}
#else
		for (x = 0; x < full; x++) {
			write_sx_io(p, srcx, SX_LDB(8, 31, srcoff));
			write_sx_io(p, dstx, SX_LDB(40, 31, dstoff));
			sxi(SX_ADDV(8, 40, 72, 15));
			sxi(SX_ADDV(24, 56, 88, 15));
			write_sx_io(p, dstx, SX_STBC(72, 31, dstoff));
			srcx += 32;
			dstx += 32;
		}

		if (part > 0) {
			/* do leftovers */
			write_sx_io(p, srcx, SX_LDB(8, part - 1, srcoff));
			write_sx_io(p, dstx, SX_LDB(40, part - 1, dstoff));
			if (part > 16) {
				sxi(SX_ADDV(8, 40, 72, 15));
				sxi(SX_ADDV(24, 56, 88, part - 17));
			} else {
				sxi(SX_ADDV(8, 40, 72, part - 1));
			}
			write_sx_io(p, dstx, SX_STBC(72, part - 1, dstoff));
		}
#endif
#ifdef SX_DEBUG
		d = (uint8_t *)(p->fb + src + srcoff);
		for (x = 0; x < width; x++) {
			buffer[x] = c[d[x]>>5];
		}
		buffer[x] = 0;
		xf86Msg(X_ERROR, "%s\n", buffer);
#endif
		/* next line */
		src += srcpitch;
		dst += dstpitch;
	}
}

/* Add an 8-bit (A8) source into the alpha channel of a 32-bit destination. */
void CG14Comp_Add8_32(Cg14Ptr p,
    uint32_t src, uint32_t srcpitch,
    uint32_t dst, uint32_t dstpitch,
    int width, int height)
{
	int line;
	uint32_t srcx, dstx, srcoff, dstoff;
	int pre, full, part, x;
	uint8_t *d;
	char buffer[256];
	ENTER;

	srcoff = src & 7;
	src &= ~7;
	dstoff = dst & 7;
	dst &= ~7;
	full = width >> 5;	/* chunks of 32 */
	part = width & 31;	/* leftovers */

#ifdef SX_DEBUG
	xf86Msg(X_ERROR, "%d %d, %d x %d, %d %d\n", srcpitch, dstpitch,
	    width, height, full, part);
#endif
	/* we do this up to 32 pixels at a time */
	for (line = 0; line < height; line++) {
		srcx = src;
		dstx = dst;
		for (x = 0; x < full; x++) {
			/* load source bytes */
			write_sx_io(p, srcx, SX_LDB(8, 31, srcoff));
			/* load alpha from destination */
			write_sx_io(p, dstx, SX_LDUC0(40, 31, dstoff));
			sxi(SX_ADDV(8, 40, 72, 15));
			sxi(SX_ADDV(24, 56, 88, 15));
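			/*
			 * Note the strides below: the A8 source advances 32
			 * bytes per chunk while the 32-bit destination
			 * advances 128, since LDUC0/STUC0C only touch the
			 * first (alpha) byte of each destination pixel.
			 */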
			/* write clamped values back into dest alpha */
			write_sx_io(p, dstx, SX_STUC0C(72, 31, dstoff));
			srcx += 32;
			dstx += 128;
		}

		if (part > 0) {
			/* do leftovers */
			write_sx_io(p, srcx, SX_LDB(8, part - 1, srcoff));
			write_sx_io(p, dstx, SX_LDUC0(40, part - 1, dstoff));
			if (part > 16) {
				sxi(SX_ADDV(8, 40, 72, 15));
				sxi(SX_ADDV(24, 56, 88, part - 17));
			} else {
				sxi(SX_ADDV(8, 40, 72, part - 1));
			}
			write_sx_io(p, dstx, SX_STUC0C(72, part - 1, dstoff));
		}
#ifdef SX_DEBUG
		d = (uint8_t *)(p->fb + src + srcoff);
		for (x = 0; x < width; x++) {
			buffer[x] = c[d[x]>>5];
		}
		buffer[x] = 0;
		xf86Msg(X_ERROR, "%s\n", buffer);
#endif
		/* next line */
		src += srcpitch;
		dst += dstpitch;
	}
}

/*
 * 32-bit source OVER a 32-bit destination using the source's own alpha.
 * 'flip' swaps two colour channels of each source pixel via GATHER/SCATTER,
 * apparently to convert between ARGB and ABGR layouts.
 */
void CG14Comp_Over32(Cg14Ptr p,
    uint32_t src, uint32_t srcpitch,
    uint32_t dst, uint32_t dstpitch,
    int width, int height, int flip)
{
	uint32_t srcx, dstx, mskx, m;
	int line, x, i, num;

	ENTER;

	write_sx_reg(p, SX_QUEUED(8), 0xff);
	for (line = 0; line < height; line++) {
		srcx = src;
		dstx = dst;

		for (x = 0; x < width; x += 4) {
			/* we do up to 4 pixels at a time */
			num = min(4, width - x);
			if (num <= 0) {
				xf86Msg(X_ERROR, "wtf?!\n");
				continue;
			}
			/* fetch source pixels */
			sxm(SX_LDUQ0, srcx, 12, num - 1);
			if (flip) {
				sxi(SX_GATHER(13, 4, 40, num - 1));
				sxi(SX_GATHER(15, 4, 44, num - 1));
				sxi(SX_SCATTER(40, 4, 15, num - 1));
				sxi(SX_SCATTER(44, 4, 13, num - 1));
			}
			/* fetch dst pixels */
			sxm(SX_LDUQ0, dstx, 44, num - 1);
			/* now process up to 4 pixels */
			for (i = 0; i < num; i++) {
				int ii = i << 2;
				/* write inverted alpha into SCAM */
				sxi(SX_XORS(12 + ii, 8, R_SCAM, 0));
				/* dst * (1 - alpha) + src */
				sxi(SX_SAXP16X16SR8(44 + ii, 12 + ii, 76 + ii, 3));
			}
			sxm(SX_STUQ0C, dstx, 76, num - 1);
			srcx += 16;
			dstx += 16;
		}
		src += srcpitch;
		dst += dstpitch;
	}
}

/* As CG14Comp_Over32, but additionally modulated by an 8-bit (A8) mask. */
void CG14Comp_Over32Mask(Cg14Ptr p,
    uint32_t src, uint32_t srcpitch,
    uint32_t msk, uint32_t mskpitch,
    uint32_t dst, uint32_t dstpitch,
    int width, int height, int flip)
{
	uint32_t srcx, dstx, mskx, m;
	int line, x, i, num;

	ENTER;

	write_sx_reg(p, SX_QUEUED(8), 0xff);
	for (line = 0; line < height; line++) {
		srcx = src;
		mskx = msk;
		dstx = dst;

		for (x = 0; x < width; x += 4) {
			/* we do up to 4 pixels at a time */
			num = min(4, width - x);
			if (num <= 0) {
				xf86Msg(X_ERROR, "wtf?!\n");
				continue;
			}
			/* fetch source pixels */
			sxm(SX_LDUQ0, srcx, 12, num - 1);
			if (flip) {
				sxi(SX_GATHER(13, 4, 40, num - 1));
				sxi(SX_GATHER(15, 4, 44, num - 1));
				sxi(SX_SCATTER(40, 4, 15, num - 1));
				sxi(SX_SCATTER(44, 4, 13, num - 1));
			}
			/* fetch mask */
			sxm(SX_LDB, mskx, 28, num - 1);
			/* fetch dst pixels */
			sxm(SX_LDUQ0, dstx, 44, num - 1);
			/* now process up to 4 pixels */
			for (i = 0; i < num; i++) {
				int ii = i << 2;
				/* mask alpha to SCAM */
				sxi(SX_ORS(28 + i, 0, R_SCAM, 0));
				/* src * alpha */
				sxi(SX_SAXP16X16SR8(12 + ii, 0, 60 + ii, 3));
				/* write inverted alpha into SCAM */
				sxi(SX_XORS(28 + i, 8, R_SCAM, 0));
				/* dst * (1 - alpha) + R[60:] */
				sxi(SX_SAXP16X16SR8(44 + ii, 60 + ii, 76 + ii, 3));
			}
			sxm(SX_STUQ0C, dstx, 76, num - 1);
			srcx += 16;
			mskx += 4;
			dstx += 16;
		}
		src += srcpitch;
		msk += mskpitch;
		dst += dstpitch;
	}
}

/*
 * As CG14Comp_Over32Mask, but the source is treated as having no alpha:
 * its alpha bytes are overwritten with 0xff before blending.
 */
void CG14Comp_Over32Mask_noalpha(Cg14Ptr p,
    uint32_t src, uint32_t srcpitch,
    uint32_t msk, uint32_t mskpitch,
    uint32_t dst, uint32_t dstpitch,
    int width, int height, int flip)
{
	uint32_t srcx, dstx, mskx, m;
	int line, x, i, num;

	ENTER;

	write_sx_reg(p, SX_QUEUED(8), 0xff);
	write_sx_reg(p, SX_QUEUED(9), 0xff);
	sxi(SX_ORS(8, 0, 10, 1));
	for (line = 0; line < height; line++) {
		srcx = src;
		mskx = msk;
		dstx = dst;

		for (x = 0; x < width; x += 4) {
			/* we do up to 4 pixels at a time */
			num = min(4, width - x);
			if (num <= 0) {
				xf86Msg(X_ERROR, "wtf?!\n");
				continue;
			}
			/* fetch source pixels */
			sxm(SX_LDUQ0, srcx, 12, num - 1);
			if (flip) {
				sxi(SX_GATHER(13, 4, 40, num - 1));
				sxi(SX_GATHER(15, 4, 44, num - 1));
				sxi(SX_SCATTER(40, 4, 15, num - 1));
				sxi(SX_SCATTER(44, 4, 13, num - 1));
			}
			/* fetch mask */
			sxm(SX_LDB, mskx, 28, num - 1);
			/* fetch dst pixels */
			sxm(SX_LDUQ0, dstx, 44, num - 1);
			/* set src alpha to 0xff */
			sxi(SX_SCATTER(8, 4, 12, num - 1));
			/* now process up to 4 pixels */
			for (i = 0; i < num; i++) {
				int ii = i << 2;
				/* mask alpha to SCAM */
				sxi(SX_ORS(28 + i, 0, R_SCAM, 0));
				/* src * alpha */
				sxi(SX_SAXP16X16SR8(12 + ii, 0, 60 + ii, 3));
				/* write inverted alpha into SCAM */
				sxi(SX_XORS(28 + i, 8, R_SCAM, 0));
				/* dst * (1 - alpha) + R[60:] */
				sxi(SX_SAXP16X16SR8(44 + ii, 60 + ii, 76 + ii, 3));
			}
			sxm(SX_STUQ0C, dstx, 76, num - 1);
			srcx += 16;
			mskx += 4;
			dstx += 16;
		}
		src += srcpitch;
		msk += mskpitch;
		dst += dstpitch;
	}
}

/*
 * As CG14Comp_Over32Mask_noalpha, but with a 32-bit mask whose alpha
 * channel drives the blend.
 */
void CG14Comp_Over32Mask32_noalpha(Cg14Ptr p,
    uint32_t src, uint32_t srcpitch,
    uint32_t msk, uint32_t mskpitch,
    uint32_t dst, uint32_t dstpitch,
    int width, int height, int flip)
{
	uint32_t srcx, dstx, mskx, m;
	int line, x, i, num;

	ENTER;

	write_sx_reg(p, SX_QUEUED(8), 0xff);
	write_sx_reg(p, SX_QUEUED(9), 0xff);
	sxi(SX_ORS(8, 0, 10, 1));
	for (line = 0; line < height; line++) {
		srcx = src;
		mskx = msk;
		dstx = dst;

		for (x = 0; x < width; x += 4) {
			/* we do up to 4 pixels at a time */
			num = min(4, width - x);
			if (num <= 0) {
				xf86Msg(X_ERROR, "wtf?!\n");
				continue;
			}
			/* fetch source pixels */
			sxm(SX_LDUQ0, srcx, 12, num - 1);
			if (flip) {
				sxi(SX_GATHER(13, 4, 40, num - 1));
				sxi(SX_GATHER(15, 4, 44, num - 1));
				sxi(SX_SCATTER(40, 4, 15, num - 1));
				sxi(SX_SCATTER(44, 4, 13, num - 1));
			}
			/* fetch mask */
			sxm(SX_LDUQ0, mskx, 28, num - 1);
			/* fetch dst pixels */
			sxm(SX_LDUQ0, dstx, 44, num - 1);
			/* set src alpha to 0xff */
			sxi(SX_SCATTER(8, 4, 12, num - 1));
			/* now process up to 4 pixels */
			for (i = 0; i < num; i++) {
				int ii = i << 2;
				/* mask alpha to SCAM */
				sxi(SX_ORS(28 + ii, 0, R_SCAM, 0));
				/* src * alpha */
				sxi(SX_SAXP16X16SR8(12 + ii, 0, 60 + ii, 3));
				/* write inverted alpha into SCAM */
				sxi(SX_XORS(28 + ii, 8, R_SCAM, 0));
				/* dst * (1 - alpha) + R[60:] */
				sxi(SX_SAXP16X16SR8(44 + ii, 60 + ii, 76 + ii, 3));
			}
			sxm(SX_STUQ0C, dstx, 76, num - 1);
			srcx += 16;
			mskx += 16;
			dstx += 16;
		}
		src += srcpitch;
		msk += mskpitch;
		dst += dstpitch;
	}
}
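For reference, the per-channel arithmetic implemented by the batched Over paths above is the usual Render OVER blend with a right shift by 8 approximating division by 255: the mask (or source) alpha scales the source, its complement scales the destination, and the clamped sum is stored. The plain-C model below is an illustrative sketch only; blend_over_channel is a hypothetical helper written for this note, not something the driver defines.

#include <stdint.h>

/*
 * Illustrative model, not driver code: one colour channel of
 * result = (alpha * src >> 8) + ((alpha ^ 0xff) * dst >> 8), clamped,
 * which is what the MUL16X16SR8/XORS/ADDV (and SAXP16X16SR8) sequences
 * above compute in SX registers before the clamping stores.
 */
static inline uint8_t
blend_over_channel(uint8_t src, uint8_t dst, uint8_t alpha)
{
	uint32_t v = ((uint32_t)alpha * src >> 8)
	    + ((uint32_t)(alpha ^ 0xff) * dst >> 8);

	return v > 255 ? 255 : (uint8_t)v;
}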