4,887,235 473 474 ;; <- 32-s -> ;; ................................|SSSSSSSSSSssssssssssssssssssssss ;; DDDDDDDDDDDDDDDD DDDDDDDDDDdddddd (parallel-with-s-access bb-s-offset (assign byte-r (32- bb-s-bitpos)) (assign b-temp bb-s-bitpos) (assign bb-s-word (logxor bb-constant (rotate memory-data byte-r)))) (incr-wrap-s-offset) ;; <----- s-d ----> <- 32-s -> (32-d)-(32-s)-s-d ;; ssssssssssssssssSSSSSSSSSSSSSSSS|1111111111...................... ;; DDDDDDDDDDDDDDDD DDDDDDDDDDdddddd (parallel-with-s-access bb-s-offset (assign byte-r (2- bb-s-bitpos)) (assign byte-s (- b-temp bb-d-bitpos 1)) (assign bb-s-word (logxor bb-constant memory-data))) (assign bb-s-word (dpb bb-s-word2 byte-s byte-r bb-s-word)) (assign bb-s-bitpos (- b-temp bb-d-bitpos)) ;;XXXbrad - missing ;;alu depends only on source bits (defucode ubitbit-long-row-source (parallel (assign b-temp bb-d-bitpos) (if (zero-fixnum bb-d-bitpos) (if (zero-fixnum bb-s-bitpos) (parallel (assign bb-s-offset (1- bb-s-oftset)) ;bb-aligned-row-source will increment first (lisp (trace-path #/a)) (jump ubitblt-aligned-row-source)) ;; SSSSSSSSSS3SSSSSSSSSSSSSSsssssss ;; dddddddddddddddddddddddddddddddd (parallel-with-s-access bb-s-offset (assign byte-r (32- bb-s-bitpos)) (parallel (assign bb-s-word (logxor bb-constant (rotate memory-data byte-r))) (lisp (trace-path #/c)) (jump ubitblt-d-aligned-row-source)))) (if (equal-fixnum b-temp bb-s-bitpos) ;;SSSSSSSSSSSSSSSSSSSSSSSSSsssssss ;;DDDDDDDDDDDDDDDDDDDDDDDDDddddddd (sequential (parallel-with-s-access bb-s-offset (assign a-temp (32- bb-d-bitpos)) (assign byte-r a-temp) (assign bb-s-word (logxor bb-constant (rotate memory-data byte-r)))) (parallel-with-d-access bb-d-offset (assign byte-r bb-d-bitpos) (assign byte-s (1- a-temp)) (store-word (dpb bb-s-word byte-s byte-r memory-data))) (incr-d-offset) (parallel (assign bb-width (- bb-width a-temp)) (lisp (trace-path #/b)) (jump ubitblt-aligned-row-source))) (if (lesser-fixnum bb-s-bitpos b-temp) ;;sssssssssSSSSSSSSSSSSSSSS....... ;; DDDDDDDDDDDDDDDDdddddddddddddddd ;; <- 32-d.bitpos-> (sequential (parallel-with-s-access bb-s-offset (assign byte-r (32- bb-s-bitpos)) (parallel (assign b-temp (32- bb-d-bitpos)) (assign a-temp obus)) (assign bb-s-word (logxor bb-constant (rotate memory-data byte-r)))) ;;.......sssssssssSSSSSSSSSSSSSSSS (parallel-with-d-access bb-d-offset (assign byte-r bb-d-bitpos) (assign byte-s (1- b-temp)) (store-word (dpb bb-s-word byte-s byte-r memory-data))) (incr-d-offset) ;;rotate s-word further to right by 32-d.bitpos ;;SSSSSSSSSSSSSSSS.......sssssssss (assign byte-r- bb-d-bitpos) ;or left by -(32-d.bitpos) (assign bb-s-word (rotate bb-s-word byte-r)) (assign bb-width (- bb-width a-temp)) (parallel (assign bb-s-bitpos (+ bb-s-bitpos b-temp)) (lisp (trace-path #/d)) (jump ubitblt-d-aligned-row-source))) (sequential ;;The high part of the first source word is not as long as the high part of the ;;first destination word. So extract the useful part of the first source word, ;;and deposit into it as much of the secand source word as needed to fill out the rest ;;of the first destination word. Then position the rest of the second source word ;;appropriately for the inner loop. 4,887,235 475 476 ;; <- 32-s -> ;; ................................|SSSSSSSSSSsssssssssssssssssssssss ;; DDDDDDDDDDDDDDDD DDDDDDDDDDdddddddd (parallel-with-d-access bb-s-offset (assign byte-r (32- bb-s-bitpos)) (assign b-temp bb-s-bitpos) (assign bb-s-word (logxor bb-constant (rotate memory-data byte-r)))) (incr-wrap-s-offset) ;; <----- s-d ----> <- 32-s -> (32-d)-(32-s)-s-d ;; ssssssssssssssssSSSSSSSSSSSSSSSS|1111111111...................... ;; DDDDDDDDDDDDDDDD DDDDDDDDDDdddddd (parallel-with-s-access bb-s-offset (assign byte-r (32- bb-s-bitpos)) (assign byte-s (- b-temp bb-d-bitpos 1)) (assign bb-s-word2 (logxor bb-constant memory-data))) (assign bb-s-word (dpb bb-s-word2 byte-s byte-r bb-s-word))a (assign bb-s-bitpos (- b-temp bb-d-bitpos)) (assign a-temp (32- bb-d-bitpos)) (assign bb-width (- bb-width a-temp)) (parallel-with-d-access bb-d-offset (assign byte-r bb-d-bitpos) (assign byte-s (1- a-temp)) (store-word (dpb bb-s-word byte-s byte-r memory-data))) (incr-d-offset) (assign byte-r (32- bb-s-bitpos)) (parallel (assign bb-s-word (rotate bb-s-word2 byte-r)) (lisp (trace-path #/e)) (jump ubitblt-d-aligned-row-source1)))))))) (defucode ubitblt-aligned-row-source (if (greater-or-equal-fixnum bb-width (a-constant (* 8. 32.))) ;;Fetch a block of words onto the block of amem past top of stack, and move sp there. (sequential (assign b-temp (+ bb-s-offset (a-constant 8.))) (if (greater-or-equal-fixnum b-temp bb-s-row-length) (goto ubitblt-aligned-row-source-slow-loop) (sequential (assign-vma-offset s 1) (parallel (assign a-temp (b-constant 8.)) (assign b-temp obus) (start-memory block read)) ;start first word (parallel (waiting-for-memory) (start-memory block read) ;waiting for first word (call ubitblt-block-read-push-8)) ;start eccond word (parallel (assign-vma-offset d) (call ubitblt-block-write-pop-8)) (parallel (assign bb-s-offset (+ bb-s-offset (a-constant 8.))) (jump ubitblt-aligned-row-source))))) ;;Frob with what’s left. Too bad dispatch blocks are expensive. (if (greater-or-equal-fixnum bb-width (a-constant (* 4 32.))) (sequential (assign b-temp (+ bb-s-offset (a-constant 4))) (it (greater-or-equal-fixnum b-temp bb-s-row-length) (goto ubitblt-aligned-row-source-slow-loop) (sequential (assign-vma-offset s 1) (parallel (assign a-temp (b-constant 4)) (assign b-temp obus) (start-memory block read)) ;start first word (parallel (waiting-for-memory) ;waiting for first word (start-memory block read) ;start second word (call ubitblt-block-read-push-4)) (parallel (assign-vma-offset d) (call ubitblt-block-write-pop-4)) (parallel (assign bb-s-offset (+ bb-s-offset (a-constant 4))) (jump ubitblt-aligned-row-source-slow-loop))))) (goto ubitbtt-aligned-row-source-slow-loop)))) (defucode ubitblt-aligned-row-source-slow-loop ;9 cycles per word (parallel (assign bb-width (- bb-width (a-constant 32.))) ;1 cycle (trap-if (minus-fixnum obus) ubitblt-aligned-row-source-slow-loop-done)) (incr-wrap-s-offset) ;2 (parallel-with-s-access ;3 bb-s-offset (assign bb-s-word (logxor bb-constant memory-data))) (assign-vma-offset d) ;1 (store-word bb-s-word) ;1 (parallel ;1 4,887,235 477 478 (incr-d-offset) (lisp (trace-path #/,)) (jump ubitblt-aligned-row-source-slow-loop))) (defucode ubitblt-aligned-row-source-slow-loop-done (trap-no-save) (if (plus-fixnum bb-width) (sequential (incr-wrap-s-offset) (parallel-with-s-access bb-s-offset (assign bb-s-word (logxor bb-constant memory-data))) (parallel-with-d-access bb-d-offset (assign byte-r (a-constant 0)) (assign byte-s (1- bb-width)) (parallel-with-return (store-word (dpb bb-s-word byte-s byte-r memory-data)) (lisp (trace-path #/2))))) (parallel-with-return (lisp (trace-path #/1))))) ;;Each pass through this loop stores exactly one d word. Each time through, ;;bb-s-word will have the bits to use for the lower part of the d word (already ;;rotated into position), and another s word will bo fetched into bb-s-word. ;;Then s-word2 will get rotated when transferred into s-word in preparation for ;;next loop pass. (defucode ubitblt-d-aligned-row-source (if (greater-or-equal-fixnum bb-width (a-constant (* 8. 32.))) ;;Fetch a block of words onto the block of amem past top of stack, and move ep there. (sequential (assign b-temp (+ bb-s-offset (a-constant 8.))) (if (greater-or-equal-fixnum b-temp bb-s-row-length) (goto ubitblt-d-aligned-row-source-slow-loop) (sequential (assign-vma-offset s 1) (parallel (assign a-temp (b-constant 8.)) (assign b-temp obus) (start-memory block read)) ;start first word (parallel (waiting-for-memory) ;waiting for first word (start-memory block read) ;start second word (call ubitblt-block-read-push-8)) (parallel (assign-vma-offset d) (call ubitblt-d-aligned-block-write-pop-8)) (parallel (assign bb-s-offset (+ bb-s-offset (a-constant 8.))) (jump ubitblt-d-aligned-row-source))))) (if (greater-or-equal-fixnum bb-width (a-constant (* 4. 32.))) (sequential (assign b-temp (+ bb-s-offset (a-constant 4))) (if (greater-or-equal-fixnum b-temp bb-s-row-length) (goto ubitblt-d-aligned-row-source-slow-loop) (sequential (assign-vma-offset s 1) (parallel (assign a-temp (b-constant 4.)) (assign b-temp obus) (start-memory block read)) ;start first word (parallel (waiting-for-memory) ;waiting for first word (start-memory block read) ;start second word (call ubitblt-block-read-push-4)) (parallel (assign-vma-offset d) (call ubitblt-d-aligned-block-write-pop-4)) (parallel (assign bb-s-offset (+ bb-s-offset (a-constant 4.))) (jump ubitblt-d-aligned-row-source))))) (goto ubitblt-d-aligned-row-source-slow-loop)))) (defmacro def-d-aligned-block-write-pop (name n) `(defucode ,name (assign byte-s (1- bb-s-bitpos)) (assign byte-r (- (b-constant 32.) bb-s-bitpos)) ,@(loop for i from n downto 1 append ((parallel (assign memory-data (dpb (amem (stack-pointer ,(- (- n i)))) byte-s byte-r bb-s-word)) (start-memory block write) (lisp (trace-path #/.))) (assign bb-s-word (rotate (amem (stack-pointer ,(- (- n i)))) byte-r)))) (assign stack-pointer (- stack-pointer b-temp)) (assign first-part-done (b-constant 0)) (assign bb-d-offset (+ bb-d-offset a-temp)) (parallel-with-return (assign bb-width (- bb-width (rotate a-temp 5))) ;2’~S - bits-per-word ))) 4,887,235 479 480 (def-d-aligned-block-write-pop ubitblt-d-aligned-block-write-pop-8 8.) (def-d-aligned-block-write-pop ubitblt-d-aligned-block-write-pop-4 4.) (defucode ubitblt-d-aligned-row-source-slow-loop (parallel (assign bb-width (- bb-width (a-constant 32.))) (trap-if (minus-fixnum obus) ubitblt-d-aligned-row-source-done)) ;aborts the assign (incr-wrap-s-offset) (assign-vma-offset s) (parallel (assign byte-s (1- bb-s-bitpos)) (start-memory read)) (parallel (assign byte-r (- (b-constant 32.) bb-s-bitpos)) (waiting-for-memory)) (abus-array-data (assign bb-s-word2 (logxor bb-constant memory-data))) (assign-vma-offset d) (store-word (dpb bb-s-word2 byte-s byte-r bb-s-word)) (incr-d-offset) (parallel (assign bb-s-word (rotate bb-s-word2 byte-r)) (lisp (trace-path #/.)) (jump ubitblt-d-aligned-row-source))) (defucode ubitblt-d-aligned-row-source-done (trap-no-save) (if (plus-fixnum bb-width) (sequential (assign a-temp (32- bb-s-bitpos)) ;how many bits are valid in bb-s-word (if (lesser-or-equal-fixnum bb-width a-temp) ;;we have enough s bits (parallel-with-d-access bb-d-offset (assign byte-s (1- bb-width)) (assign byte-r (a-constant 0)) (parallel (lisp (trace-path #/4)) (parallel-with-return (store-word (dpb bb-s-word byte-s byte-r memory-data))))) ;;need to get another source word (sequential (incr-wrap-s-offset) (parallel-with-s-access bb-s-offset (assign byte-r (32- bb-s-bitpos)) (assign byte-s (1- bb-s-bitpos)) (assign bb-s-word2 (logxor bb-constant memory-data))) (assign bb-s-word (dpb bb-s-word2 byte-s byte-r bb-s-word)) (lisp (trace-path #/5)) (parallel-with-d-access bb-d-offset (assign byte-s (1- bb-width)) (assign byte-r (a-constant 0)) (parallel-with-return (store-word (dpb bb-s-word byte-s byte-r memory-data))))))) (parallel (lisp (trace-path #/3)) (return)))) ;;alu depends only on destination bits (defucode ubitblt-long-row-destination (if (bit first-part-done) (goto ubitblt-long-row-destination-pclsr-restart) (if (plus-fixnum bb-d-bitpos) (sequential ;frob the first partial word (assign a-temp (32- bb-d-bitpos)) (assign byte-r bb-d-bitpos) (parallel-with-d-accees bb-d-offset (assign byte-s (1- a-temp)) (assign b-temp (dpb bb-constant byte-s byte-r (a-constant 0))) (store-word (logxor b-temp memory-data))) (incr-d-offset) (parallel (assign bb-width (- bb-width a-temp)) (lisp (trace-path #/b)) (jump ubitblt-long-row-destination-loop))) (parallel (lisp (trace-path #/a)) ;---this debug crap costs a cycle here. (jump ubitblt-long-row-destination-loop))))) ;---shouid be goto. not jump. (defucode ubitblt-long-row-destination-loop (if (greater-or-equal-fixnum bb-width (a-constant (* 8. 32.))) ;;Fetch a block of words onto the block of amem past top of stack, and move sp there. (sequential (assign-vma-offset d) (parallel (assign a-temp (b-constant 8.)) (assign b-temp obus) (start-memory block read)) ;start first word 4,887,235 481 482 (parallel (waiting-for-memory) ;waiting for first word (start-memory block read) ;start second word (call ubitblt-block-read-push-8)) (parallel (assign-vma-offset d) (call-and-return-to ubitblt-block-write-pop-8 ubitblt-long-row-destination-loop))) ;;Frob with what’s left. Too bad dispatch blocks are expensive. (if (greater-or-equal-fixnum bb-width (a-constant (* 4 32.))) (sequential (assign-vma-offset d) (parallel (assign a-temp (b-constant 4)) (assign b-temp obus) (start-memory block read)) ;start first word (parallel (waiting-for-memory) ;waiting for- first word (start-memory block read) ;start second-word (call ubitbtt-block-read-push-4)) (parallel (assign-vma-offset d) (call-and-return-to ubitblt-block-write-pop-4 ubitbtt-long-row-destination-slow-loop))) (goto ubitblt-long-row-destination-slow-loop)))) ;;Write this when pclsring can happen (defucode ubitblt-long-row-destination-pclsr-restart (lisp (tell-the-simulator-that-it-is-supposed-to-halt-the-machine)) (halt bitblt-pclsring-now-yet-written)) (defucode ubitblt-long-row-destination-slow-loop (parallel (assign bb-width (- bb-width (a-constant 32.))) (trap-if (minus-fixnum obus) ubitblt-long-row-destination-done)) ;aborts the assign (lisp (trace-path #/,)) (parallel-with-d-access bb-d-offset (incr-d-offset) (parallel (store-word (logxor bb-constant memory-data)) (jump ubitblt-long-row-destination-slow-loop)))) (defucode ubitblt-long-row-destination-done (trap-no-save) (if (plus-fixnum bb-width) (sequential (assign byte-r (a-constant 0)) (parallel-with-d-access bb-d-offset (assign byte-s (1- bb-width)) (assign b-temp (dpb bb-constant byte-s byte-r (a-constant 0))) (parallel (lisp (trace-path #/2)) (parallel-with-return (store-word (logxor b-temp memory-data)))))) (parallel (lisp (trace-path #/1)) (return)))) (defmacro def-block-read-push (name n) `(defucode ,name ,@(loop for i from n downto 1 collect `(parallel (declare-memory-timing data-cycle) (check-data-type memory-data dtp-fix) (assign (amem (stack-pointer ,i)) (logxor bb-constant memory-data)) ,(when (> i 2) '(start-memory block read)))) (assign first-part-done (b-constant 1)) (parallel-with-return (assign stack-pointer (+ stack-pointer b-temp))))) (def-block-read-push ubitblt-block-read-push-8 8) ;I suppose this when interned... (def-block-read-push ubitblt-block-read-push-4 4) ;... will subsume this. (defmacro def-block-write-pop (name n) `(defucode ,name ,@(loop for i from n downto 1 collect `(parallel (assign memory-data (amem (stack-pointer ,(- (- n i))))) (start-memory block write) (lisp (trace-path #/.)))) (assign stack-pointer (- stack-pointer b-temp)) (assign first-part-done (b-constant 0)) (assign bb-d-offset (+ bb-d-offset a-temp)) (parallel-with-return (assign bb-width (- bb-width (rotate a-temp 5))) ;2^5 = bits-per-word ))) (def-block-write-pop ubitblt-block-write-pop-8 8) (def-block-write-pop ubitblt-block-write-pop-4 4) 4,887,235 483 484 ;;alu depends on neither source nor destination bits (defucode ubitblt-long-row-neither (if (plus-fixnum bb-d-bitpos) (sequential (assign a-temp (32- bb-d-bitpos)) (parallel-with-d-access bb-d-offset (assign byte-r bb-d-bitpos) (assign byte-s (1- a-temp)) (store-word (dpb bb-constant byte-s byte-r memory-data))) (incr-d-offset) (parallel (assign bb-width (- bb-width a-temp)) (lisp (trace-path #/b)) (jump ubitblt-long-row-neither-loop))) (parallel (lisp (trace-path #/a)) (jump ubitblt-long-row-neither-loop)))) (defucode ubitblt-long-row-neither-loop (if (greater-or-equal-fixnum bb-width (a-constant (* 8. 32.))) (sequential (parallel (assign-vma-offset d) (call store-block-bb-constant-8)) (assign bb-d-offset (+ bb-d-offset (a-constant 8.))) (parallel (assign bb-width (- bb-width (a-constant (* 8. 32.)))) (jump ubitblt-long-row-neither-loop))) (sequential (dispatch-after-next (parallel (assign a-temp (ldb bb-width 3 5)) (ldb bb-width 3 5)) ((7) (parallel (assign-vma-offset d) (call-and-return-to store-block-bb-constant-7 ubitblt-long-row-neither-finish))) ((6) (parallel (assign-vma-offset d) (call-and-return-to store-block-bb-constant-6 ubitblt-long-row-neither-finish))) ((5) (parallel (assign-vma-offset d) (call-and-return-to store-block-bb-constant-5 ubitbtt-long-row-neither-finish))) ((4) (parallel (assign-vma-offset d) (call-and-return-to store-block-bb-constant-4 ubitblt-long-row-neither-finish))) ((3) (parallel (assign-vma-offset d) (call-and-return-to store-b lock-bb-constant-3 ubitblt-long-row-neither-finish))) ((2) (parallel (assign-vma-offset d) (call-and-return-to store-b lock-bb-constant-2 ubitblt-long-row-neither-finish))) ((1) (assign-vma-offset d) (parallel (lisp (trace-path #/.)) (store-word bb-constant) (jump ubitblt-long-row-neither-finish))) (otherwise (goto cant-happen))) (if (zero-fixnum a-temp) (goto ubitblt-long-row-neither-finish) (take-dispatch))))) (defucode ubitblt-long-row-neither-finish (assign bb-d-offset (+ bb-d-offset a-temp)) (assign bb-width (logand bb-width (a-constant #o37))) (if (plus-fixnum bb-width) (parallel-with-d-accees bb-d-offset (assign bute-r (a-constant 0)) (assign byte-s (1- bb-width)) (parallel (lisp (trace-path #/2)) (store-word (dpb bb-constant byte-s byte-r memory-data)) (return))) (parallel (lisp (trace-path #/1)) (return)))) (defmacro store-block-bb-constant-routines (n) `(progn 'compile ,@(loop with s = "STORE-BLOCK-BB-CONSTANT-~d" for i from n downto 1 collect `(defucode ,(fintern s i) (parallel (assign memory-data (set-type bb-constant dtp-fix)) ,(if (> i 1) '(start-memory block write) '(start-memory write)) (lisp (trace-path #/,)) ,(if (> i 1) `(jump ,(fintern s (1- i))) (return))))))) 4,887,235 485 486 (store-block-bb-constant-routines 8.) ;;alu depends both source and destination bits (defucode ubitblt-long-row-both (parallel (assign b-temp bb-d-bitpos) (if (zero-fixnum bb-d-bitpos) (if (zero-fixnum bb-s-bitpos) (parallel (lisp (trace-path #/a)) (assign bb-s-offset (1- bb-s-offset)) ;bb-aligned-row-both will increment first (jump ubitblt-aligned-row-both)) (parallel-with-s-access bb-s-offset ;; SSSSSSSSSSSSSSSSSSSSSSSSSSSS.ssss ;;dddddddddddddddddddddddddddddddd. (ansign byte-r (32- bb-s-bitpos)) (parallel (assign bb-s-word (logxor bb-constant (rotate memory-data byte-r))) (lisp (trace-path #/c)) (jump ubitblt-d-aligned-row-both)))) (if (equal-fixnum bb-s-bitpos b-temp) (sequential (parallel-with-s-access bb-s-offset ;;SSSSSSSSSSSSSSSSSSSSSSSSSS.ssssss ;;dddddddddddddddddddddddddd.dddddd (parallel (assign byte-r (32- bb-s-bitpos)) (assign a-temp obus)) (assign byte-s (31- bb-s-bitpos)) (assign bb-s-word (logxor bb-constant (ldb memory-data byte-s byte-r)))) (assign byte-r bb-s-bitpos) (parallel (assign-vma-offset d) ;;sssssssssssssssssssmssssss.ssssss ;;DDDDDDDDDDDDDDDDDDDDDODDDD.dddddd (call bb-byte-alu-operation-dispatch)) (incr-d-offset) (parallel (assign bb-width (- bb-width a-temp)) (lisp (trace-path #/b)) (jump ubitblt-aligned-row-both))) (if (lesser-fixnum bb-s-bitpos b-temp) (goto ubitblt-long-row-both-s-longer) (goto ubitblt-long-row-both-s-shorter)))))) (defucode ubitblt-long-row-both-s-longer (assign a-temp (32- bb-d-bitpos)) (parallel-with-s-access bb-s-offset (assign byte-r (32- bb-s-bitpos)) (assign byte-s (1- a-temp)) (assign bb-s-word2 (logxor bb-constant memory-data))) ;;ssssSSSSSSSSSSSSSSSSSSSS........ ;; DDDDDDDDDDDDDDDDDDDDdddddddddddd ;; <----- a-temp -----> (assign bb-s-word (rotate bb-s-word2 byte-r)) ;;........ssssSSSSSSSSSSSSSSSSSSSS (assign byte-r bb-d-bitpos) (parallel (assign-vma-offset d) ;;ssssssssssssssssssssssss.ssssssss ;; DDDDDDDDDDDDDDDDDDDD.dddddddddddd (call bb-byte-alu-operation-dispatch)) (incr-d-offset) ;;Remaining are (32-(s.bitpos+(32-d.bitpos))) - d.bitpos-s.bitpos ;; <-- 32-d.bitpos ---> <-s.bitpos-> ;;SSSSssssssssssssssssssss.ssssssss ;; dddddddddddddddddddd.dddddddddddd (assign b-temp bb-s-bitpos) (assign byte-r (- bb-d-bitpos b-temp)) (assign bb-s-word (rotate bb-s-word2 byte-r)) (assign bb-width (- bb-width a-temp)) (parallel (assign bb-s-bitpos (+ b-temp a-temp)) (lisp (trace-path #/d)) (jump ubitblt-d-aligned-row-both))) (defucode ubitblt-long-row-both-s-shorter ;; sssssssssssessssssssssss.ssssssss ;;dddddddddddddddddddddddddddd.dddd (parallel-with-s-access bb-s-offset (assign byte-r (32- bb-s-bitpos)) (assign byte-s (31- bb-s-bitpos)) ;; SSSSSSSSSSSSSSSSSSSSSSSS.ssssssss ;;dddddddddddddddddddddddddddd.dddd (assign bb-s-word (logxor bb-constant (ldb memory-data byte-s byte-r)))) (incr-wrap-s-offset) ;; <--> s.bitpos-d.bitpos ;;...SSSS|ssssssssssssssssssssssss.ssssssss 4,887,235 487 488 ;; dddd dddddddddddddddddddddddd.dddd (assign b-temp bb-d-bitpos) (parallel-with-s-access bb-s-oftset (assign byte-s (- bb-s-bitpos b-temp 1)) (assign byte-r (32- bb-s-bitpos)) (assign bb-s-word2 (logxor bb-constant memory-data))) ;;...SSSS|SSSSSSSSSSSSSSSSSSSSSSSS.ssssssss ;; dddd dddddddddddddddddddddddd.dddd (assign bb-s-word (dpb bb-s-word2 byte-s byte-r bb-s-word)) (assign byte-r bb-d-bitpos) (assign byte-s (31- bb-d-bitpos)) ;;...ssss|ssssssssssssssssssssssss.ssssssss ;; DDDD DDDDDDDDDDDDDDDDDDDDDDDD.dddd (parallel (assign-vma-offset d) (call bb-byte-alu-operation-dispatch)) (incr-d-offset) ;;...SSSssss|ssssssssssssssssssssssss.ssssssss ;; dddd dddddddddddddddddddddddd.dddd (assign a-temp (32- bb-d-bitpos)) (assign bb-width (- bb-width a-temp)) ;Try to find some more cleverness here. (assign b-temp bb-d-bitpos) (assign byte-r (- b-temp bb-s-bitpos)) (assign bb-s-bitpos (- bb-s-bitpos b-temp)) (parallel (assign bb-s-word (rotate bb-s-word2 byte-r)) (lisp (trace-path #/e)) (jump ubitblt-d-aligned-row-both))) (defucode ubitblt-aligned-row-both (if (greater-or-equal-fixnum bb-width (a-constant (* 8. 32.))) ;;Fetch a block of words onto the block of amem past top of stack, and move sp there. (sequential (assign b-temp (+ bb-s-offset (a-constant 8.))) (if (greater-or-egual-fixnum b-temp bb-s-row-length) (goto ubitblt-aligned-row-both-slow-loop) (sequential (assign-vma-offset s 1) (parallel (assign a-temp (b-constant 8.)) (assign b-temp obus) (start-memory block read)) ;start first word (parallel (waiting-for-memory) ;waiting for first word (start-memory block read) ;start second word (cal ubitblt-block-read-push-8)) (assign-vma-offset d) (dispatch-after-this (ldb bb-alu-operation 4 0) (parallel (assign a-temp (a-constant 8.)) (assign b-temp (a-constant 8.)) (start-memory block read)) ;start first word ((1 2) ;; x*y ~x*y (goto ubitblt-block-logand-8)) ((4 8.) ;; x*~y ~x*~y (goto ubitblt-block-andc2-8)) ((6 9.) ;; x xor y, ~x xor y (goto ubitblt-block-logxor-8)) ((7 11.) ;; x+y ~x+y (goto ubitblt-block-logior-8)) ((13. 14.) ;; ~(~x*y), ~(x*y) (goto ubitblt-block-lognand-8)) (otherwise (goto cant-happen)))))) ;;Frob with what’s left. Too bad dispatch blocks are expensive. ;;(if (greater-or-equal-fixnum bb-width (a-constant (* 4 32.))) ...) (goto ubitblt-aligned-row-both-slow-loop))) (defmacro def-block-aluop (name n alu &optional complement) `(defucode ,name ,@(loop for i from n downto 1 append `((parallel (declare-memory-timing active-cycle) ;wait for first word (waiting-for-memory) (assign b-temp-2 (amem (stack-pointer ,(- (- n i)))))) ,@(if (not complement) `((parallel (abus-array-data (assign (amem (stack-pointer ,i)) (,alu b-temp-a memory-data))) ,(when (> i 1) '(start-memory block read) ;start next word ))) `((abus-array-data (assign a-temp-2 (,alu b-temp-2 memory-data))) (parallel (assign (amem (stack-pointer ,i)) (logxor a-temp-2 (b-constant -1))) ,(when (> I 1) '(start-memory block read) )))))) (parallel 4,887,235 489 490 (assign stack-pointer (+ stack-pointer b-temp)) (jump ,(fintern "UBITBLT-BL0CK-ALU-WRITE-~d" n))))) (def-block-aluop ubitblt-block-logand-8 8 logand) (def-block-aluop ubitblt-block-logior-8 8 logior) (def-block-aluop ubitblt-block-logxor-8 8 logxor) (def-block-aluop ubitblt-block-andc2-8 8 andc2) (def-block-aluop ubitblt-block-lognand-8 8 logand complement) (defmacro def-block-alu-write (name n) `(defucode ,name (assign-vma-offset d) ,@(loop for i from n downto 1 collect `(parallel (assign memory-data (amem (stack-pointer ,(- (- n i))))) (start-memory block write) (lisp (trace-path #/.)))) (assign stack-pointer (- stack-pointer (rotate b-temp 1))) (assign first-part-done (b-constant 0)) (assign bb-d-offset (+ bb-d-offset a-temp)) (assign bb-width (- bb-width (rotate a-temp 5))) ;2^5 - bits-per-word (parallel (assign bb-s-offset (+ bb-s-offset a-temp)) (jump ubitblt-alignod-row-both)))) (def-block-alu-write ubitblt-block-alu-write-8 8) (defucode ubitblt-aligned-row-both-slow-loop ;11 cycles per word, or 12 for nand (parallel ;1 cycle (assign bb-width (- bb-width (a-constant 32.))) (trap-if (minus-fixnum obus) ubitblt-aligned-row-both-slow-loop-done)) (incr-wrap-s-offset) ;2 cycles (parallel-with-s-access ;3 cycles bb-s-offset (assign bb-s-word (logxor bb-constant memory-data))) (parallel ;1+3 cycles, or 1+4 for nand (assign-vma-offset d) (call bb-word-alu-operation-dispatch)) (parallel ;1 cycle (incr-d-offset) (lisp (trace-path #/,)) (jump ubitblt-aligned-row-both))) (defucode ubitblt-aligned-row-both-slow-loop-done (if (plus-fixnum bb-width) (sequential (incr-wrap-s-offset) (parallel-with-s-access bb-s-offset (assign byte-r (b-constant 0)) (assign byte-s (1- bb-width)) (assign bb-s-word (logxor bb-constant memory-data))) (parallel (lisp (trace-path #/2)) (assign-vma-offset d) (jump bb-byte-alu-operation-dispatch))) ;jcall (parallel-with-return (lisp (trace-path #/1))))) ;;Each time through the loop, s-word was fetched from memory like ;; <------s.bitpos------> ;;ssssssssss...................... ;;and then rotated so it looks like ;;......................sssssssses ;;<------s.bitpos------> ;; ;;Each time, another s-word2 gets fetched and deposited into s-word like ;; |<------s.bitpos------> ;; |......................1111111111 ;;2222222222 2222222222222222222222 ;; ;;The rotation for the dpb equals the rotation for setup for next loop. (defucode ubitblt-d-aligned-row-both (parallel (assign bb-width (- bb-width (a-constant 32.))) (trap-if (minus-fixnum obus) ubitblt-d-aligned-row-both-done)) ;aborts assign (incr-wrap-s-offset) (parallel-with-s-access bb-s-offset (assign byte-r (32- bb-s-bitpos)) (assign byte-s (1- bb-s-bitpos)) (assign bb-s-word2 (logxor bb-constant memory-data))) (assign bb-s-word (dpb bb-s-word2 byte-s byte-r bb-s-word)) (parallel (assign-vma-offset d) (call bb-word-alu-operation-dispatch)) (incr-d-offset (parallel (assign bb-s-word (rotate bb-s-word2 byte-r)) (lisp (trace-path #/.)) (jump ubitblt-d-aligned-row-both))) 4,887,235 491 492 ;;At entry, we have s-word fetched from memory like ;; <------s.bitpos------> ;;ssssssssss...................... ;;but then rotated so it looks like ;;......................ssssssssss ;;<------s.bitpos------> ;; ;;This is to be combined with d-word which looks like ;;....................dddddddddddd ;; <---width--> (defucode ubitblt-d-aligned-row-both-done (trap-no-save) (if (plus-fixnum bb-width) (sequential (assign a-temp (32- bb-s-bitpos)) (if (lesser-or-equal-fixnum bb-width a-temp) ;;we have enough s bits ;;<----s.bitpos---><----a.temp---> ;;.................sssssssssssssss ;;....................dddddddddddd ;; <---width--> (sequential (assign byte-r (b-constant 0)) (assign byte-s (1- bb-width)) (parallel (assign-vma-offset d) (lisp (trace-path #/4)) (jump bb-byte-alu-operation-dispatch))) ;jcall ;;need to get another source word ;;<----s.bitpos---><----a.temp---> ;;.................sssssssssssssss ;;............dddddddddddddddddddd ;; <-------width------> (sequential (incr-wrap-s-offset) (parallel-with-s-access bb-s-offset (assign byte-r a-temp) (assign byte-s (1- bb-s-bitpos)) (assign bb-s-word2 (logxor memory-data bb-constant))) (assign bb-s-word (dpb bb-s-word2 byte-s byte-r bb-s-word)) (assign bute-r (b-constant 0)) (assign byte-s (1- bb-width)) (parallel (assign-vma-offset d) (lisp (trace-path #/5)) (jump bb-byte-alu-operation-dispatch))))) ;jcall (parallel-with-return (lisp (trace-path #/3))))) (defucode ubitblt-long-row-source-backwards (parallel (assign b-temp bb-d-bitpos) (if (zero-fixnum bb-d-bitpos) (if (zero-fixnum bb-s-bitpos) (parallel (assign bb-s-offset (1+ bb-s-offset)) ;the loop will decr first (lisp (trace-path #/a)) (jump ubitbtt-aligned-row-source-backwards)) (sequential (parallel-with-s-access bb-s-offset (assign byte-r (32- bb-s-bitpos)) (parallel (assign bb-s-word (logxor bb-constant (rotate memory-data byte-r))) (lisp (trace-path #/c)) (jump ubitblt-d-aligned-row-source-backwards))))) (if (equal-fixnum b-temp bb-s-bitpos) (sequential (parallel-with-s-access bb-s-offset (assign byte-s (1- bb-s-bitpos)) (assign byte-r (b-constant 0)) (assign bb-s-word (logxor memory-data bb-constant))) (parallel-with-d-access bb-d-offset (decr-d-offset) (assign bb-width (- bb-width bb-s-bitpos)) (parallel (store-word (dpb bb-s-word byte-s byte-r memory-data)) (lisp (trace-path #/b)) (jump ubitblt-aligned-row-source-backwards)))) (if (greater-fixnum bb-s-bitpos b-temp) ;s > d, enough in the current word (sequential (parallel-with-s-access bb-s-offset (assign bb-width (- bb-width bb-d-bitpos)) ;has to be done somewhere (assign bb-s-word (logxor bb-constant memory-data))) (parallel-with-d-access bb-d-offset (assign byte-s (1- bb-d-bitpos))