4,887,235 313 314 (parallel-with-s-access bb-s-offset (assign bb-s-word (logxor bb-constant memory-data))) (parallel-with-d-access bb-d-offset (assign byte-r (a-constant 0)) (assign byte-s (1- bb-width)) (parallel-with-return (store-word (dpb bb-s-word byte-s byte-r memory-data)) (lisp (trace-path #/2))))) (parallel-with-return (lisp (trace-path #/1))))) ;bb-s-word2 has the partial previous source word whose address is in bb-s-offset. ;rotated into aliagnment with the destination (defucode ubitblt-d-aligned-row-source (if (greater-or-equal-fixnum bb-width (b-constant (* 8. 32.))) ;;Fetch a block of words into the buffer- (sequential (assign b-temp (+ bb-s-offset (b-constant 8.))) (if (lesser-or-equal-fixnum bb-s-row-length b-temp) (goto ubitblt-d-aligned-row-source-slow-loop) (sequential (parallel (assign-vma-offset s 1) (call ubitblt-block-read-8)) (parallel (assign-vma-offset d) (call ubitblt-d-aligned-block-write-8)) (parallel (assign bb-s-offset (+ bb-s-offset b-block-size)) (jump ubitblt-d-aligned-row-source))))) (if (greater-or-equal-fixnum bb-width (b-constant (* 4. 32.))) (sequential (assign b-temp (+ bb-s-offset (b-constant 4))) (if (lesser-or-equal-fixnum bb-s-row-length b-temp) (goto ubitblt-d-aligned-row-source-slow-loop) (sequential (parallel (assign-vma-offset s 1) (call ubitblt-block-read-4)) (parallel (assign-vma-offset d) (call ubitblt-d-aligned-block-write-4)) (parallel (assign bb-s-offset (+ bb-s-offset b-block-size)) (jump ubitblt-d-aligned-row-source))))) (goto ubitblt-d-aligned-row-source-slow-loop)))) ;;Each pass through this loop stores exactly one d word. Each time through, ;:bb-s-word2 will have the bits to use for the lower part of the d word (already ;;rotated into position), and another s wore will be fetched into bb-s-word. ;;Then s-word will get rotated when transferred into s-word2 in preporation for ;;next loop pass. (defucode ubitblt-d-aligned-row-source-slow-loop ;13 cycles per- word (incr-wrap-s-offset-ahead) ;2 (parallel-with-s-access bb-s-offset-ahead ;4 (trap-if (lesser-fixnum bb-width (b-constant 32.)) ubitblt-d-aligned-row-source-done) (assign byte-s (1- bb-s-bitpos)) (assign bb-s-word (logxor bb-constant memory-data))) (assign byte-r (- (b-constant 32.) bb-s-bitpos)) ;1 (assign-vma-offset d) ;1 (store-word (dpb bb-s-word byte-s byte-r bb-s-word2)) ;1 (assign bb-width (- bb-width (b-constant 32.))) ;1 (incr-d-offset) ;1 (assign bb-s-offset bb-s-offset-ahead) ;1 (parallel ;1 (assign bb-s-word2 (rotate bb-s-word byte-r)) (lisp (trace-path #/.)) (jump ubitblt-d-aligned-row-source))) (defucode ubitblt-d-aligned-row-source-done (if (plus-fixnum bb-width) (sequential (assign b-temp (32- bb-s-bitpos)) ;how many bits are valid in bb-s-word2 (if (lesser-or-equal-fixnum bb-width b-temp) ;;we have enough s bits (parallel-with-d-access bb-d-offset (assign byte-s (1- bb-width)) (parallel (assign byte-r (b-constant 0)) (assign bb-s-word bb-s-word2)) (parallel (lisp (trace-path #/4)) (parallel-with-return (store-word (dpb bb-s-word byte-s byte-r memory-data))))) ;;need to get another source word (sequential 4,887,235 315 316 (parallel-with-s-access bb-s-offset-ahead (assign byte-r (32- bb-s-bitpos)) (assign byte-s (1- bb-s-bitpos)) (assign bb-s-word (logxor bb-constant memory-data))) (assign bb-s-word (dpb bb-s-word byte-s byte-r bb-s-word2)) (lisp (trace-path #/5)) (parallel-with-d-access bb-d-offset (assign byte-s (1- bb-width)) (assign byte-r (a-constant 0)) (parallel-with-return (store-word (dpb bb-s-word byte-s byte-r memory-data))))))) (parallel (lisp (trace-path #/3)) (return)))) ;alu depends only on destination bits (defucode ubitblt-long-row-destination (if (plus-fixnum bb-d-bitpos) (sequential ;frob the first partial word (assign b-temp (32- bb-d-bitpos)) (parallel-with-d-access bb-d-offset (assign byte-s (1- b-temp)) (assign byte-r bb-d-bitpos) (store-word (logxor (dpb bb-constant byte-s byte-r 0) memory-data))) (incr-d-offset) (assign bb-width (- bb-width b-temp)) (parallel (assign bb-d-bitpos (b-constant 0)) (lisp (trace-path #/b)) (jump ubitblt-long-row-destination-loop))) (machine-version-case ((sim) (parallel (lisp (trace-path #/a)) (jump ubitblt-long-row-destination-loop))) (otherwise (goto ubitblt-long-row-destination-loop))))) (defucode ubitblt-long-row-destination-loop ;25 cycles per 8 words (if (greater-or-equal-fixnum bb-width (b-constant (* 8. 32.))) ;;Fetch a block of words into the buffer (sequential (parallel (assign-vma-offset d) (call ubitblt-block-read-8)) (parallel (assign-vma-offset d) (call-and-return-to ubitblt-block-write-8 ubitblt-long-row-destination-loop))) ;;Frob with what's left. Too bad dispatch blocks are expensive. (if (greater-or-equal-fixnum bb-width (b-constant (* 4 32.))) (sequential (parallel (assign-vma-offset d) (call ubitblt-block-read-4)) (parallel (assign-vma-offset d) (call-and-return-to ubitblt-block-write-4 ubitblt-long-row-destination-slow-loop))) (goto ubitblt-long-row-destination-slow-loop)))) (defucode ubitblt-long-row-destination-slow-loop ;5 cycles per word (bus interference) (parallel-with-d-access-check-write bb-d-offset (parallel (assign bb-width (- bb-width (b-constant 32.))) (trap-if (minus-fixnum obus) ubitblt-long-row-destination-done)) ;aborts the assign (parallel (lisp (trace-path #/1)) (waiting-for-memory) (incr-d-offset)) (parallel (store-word (logxor bb-constant memory-data)) (jump ubitblt-long-row-destination-slow-loop)))) (defucode ubitblt-long-row-destination-done (if (plus-fixnum bb-width) (parallel-with-d-access bb-d-offset (assign byte-s (1- bt-width)) (assign byte-r (a-constant 0)) (parallel-with-return (lisp (trace-path #/2)) (store-word (logxor (dpb bb-constant byte-s byte-r 0) memory-data)))) (parallel (lisp (trace-path #/1)) (return)))) (defmacro def-bitblt-block-read (name n) '(defucode ,name (parallel (assign a-block-size (b-constant ,n)) ;Used later to advance offsets (assign b-block-size obus) (start-memory block read)) ;start first word 4,887,235 317 318 (parallel (waiting-for-memory) ;waiting for first word (start-memory block read)) ;start second word ,@(loop for i from (- n-bitblt-buffers n) below n-bitblt-buffers collect '(abus-array-data (assign (bitblt-buffer ,1) (set-type (logxor bb-constant memory-data) dtp-fix)) ,(selectq (- n-bitblt-buffers i) (1 '(return)) (2 nil) (otherwise '(start-memory block read))))))) (def-bitblt-block-read ubitblt-block-read-8 8) ;I suppose this when interned... (def-bitblt-block-read ubitblt-block-read-4 4) ;... Will subsume this. (defmacro def-bitblt-block-write (name n) '(defucode ,name (activate-bitblt-buffer) ,@(loop for i from (- n-bitblt-buffers n) below n-bitblt-buffers collect '(parallel (store-word (bitblt-buffer ,i) block) (lisp (trace-path #/.)))) (parallel (assign bb-d-offset (+ bb-d-offset b-block-size)) (call deactivate-bitblt-buffer)) (parallel-with-return (assign bb-width (- bb-width (rotate b-block-size 5))) ;2^5 = bits-per-word ))) (def-bitblt-block-write ubitblt-block-write-8 8) (def-bitblt-block-write ubitblt-block-write-4 4) (defmacro def-d-aligned-block-write (name n) '(defucode ,name (assign byte-s (1- bb-s-bitpos)) (parallel (assign byte-r (- (b-constant 32.) bb-s-bitpos)) (call activate-bitblt-buffer)) ,@(loop for i from (- n-bitblt-buffers n) below n-bitblt-buffers append '((parallel (store-word (dpb (bitblt-buffer ,i) byte-s byte-r bb-s-word2) block) (lisp (trace-path #/.))) (assign bt-s-word2 (rotate (bitblt-buffer ,i) byte-r)))) (parallel (assign bb-d-offset (+ bb-d-offset b-block-size)) (call deactivate-bitblt-buffer)) (parallel-with-return (assign bb-width (- bb-width (rotate b-block-size 5))) ;2^5 = bits-per-word ))) (def-d-aligned-block-write ubitblt-d-aligned-block-write-8 8.) (def-d-aligned-block-write ubittlt-d-aligned-block-write-4 4.) ;;alu depends on neither source nor destination bits (defucode ubitblt-long-row-neither (if (plus-fixnum bb-d-bitpos) (sequential (assign b-temp (32- bb-d-bitpos)) (parallel-with-d-access bb-d-offset (assign byte-r bb-d-bitpos) (assign byte-S (1- b-temp)) (store-word (cpb bt-constant byte-s byte-r memory-data))) (incr-d-offset) (assign bb-width (- bb-width b-temp)) (parallel (assign bb-d-bitpos (b-constant 0)) (lisp (trace-path #/b)) (jump ubitblt-long-row-neither-loop))) (parallel (lisp (trace-path #/a)) (jump ubitblt-long-row-neither-loop)))) (defucode ubitblt-long-row-neither-loop (if (greater-or-equal-fixnum bb-width (b-constant (* 8. 32.))) (sequential (parallel (assign-vma-offset d) (call store-block-bb-constant-8)) (assign bb-d-offset (+ bb-d-offset (b-constant 8.))) (parallel (assign bb-width (- bb-width (b-constant (* 8. 32.)))) (jump ubitblt-long-row-neither-loop))) (sequential (dispatch-after-next (parallel (assign b-block-size (ldb bb-width 3 5)) (ldb bt-width 3 5)) ((7) (parallel (assign-vma-offset d) (call-and-return-to store-block-bb-constant-7 ubitblt-long-row-neither-finish))) ((6) (parallel (assign-vma-offset d) (call-and-return-to store-block-bb-constant-6 ubitblt-long-row-neither-finish))) 4,887,235 319 320 ((5) (parallel (assign-vma-offset d) (call-and-return-to store-block-bb-constant-5 ubitblt-long-row-neither-finish))) ((4) (parallel (assign-vma-offset d) (call-and-return-to store-block-bb-constant-4 ubitblt-long-row-neither-finish))) ((3) (parallel (assign-vma-offset d) (call-and-return-to store-block-bb-constant-3 ubitblt-long-row-neither-finish))) ((2) (parallel (assign-vma-offset d) (call-and-return-to store-block-bb-constant-2 ubitblt-long-row-neither-finish))) ((1) (assign-vma-offset d) (parallel (lisp (trace-path #/.)) (store-word bb-constant) (jump ubitblt-long-row-neither-finish)))) (parallel (take-dispatch) (trap-if (zero-fixnum b-block-size) ubitblt-long-row-neither-finish))))) (defucode ubitblt-long-row-neither-finish (assign bb-d-offset (+ bb-d-offset b-block-size)) (assign bb-width (logand bb-width (b-constant #o37))) (if (plus-fixnum bb-width) (parallel-with-d-access bb-d-offset (assign byte-r (a-constant 0)) (assign byte-s (1- to-width)) (parallel (lisp (trace-path #/2)) (store-word (dpb bb-constant byte-s byte-r memory-data)) (return))) (parallel (lisp (trace-path U/1)) (return)))) (defmacro store-block-bb-constant-routines (n) `(progn 'compile ,@(loop with s = "ST0RE-BLOCK-BB-CONSTANT-~d" for i from n downto 1 collect '(defucode ,(fintern s i) (parallel (store-word bb-constant block) (lisp (trace-path #/,)) ,(if (> i 1) ,(jump ,(fintern s (1- i))) '(return))))))) (store-block-bb-constant-routines 8.) ;;alu depends both source and destination bits (defucode ubitblt-long-row-both (parallel (assign b-temp bb-d-bitpos) (if (zero-fixnum bb-d-bitpos) (if (zero-fixnum bb-s-bitpos) (goto ubitblt-aligned-row-both) (parallel-with-s-access bb-s-offset ;; SSSSSSSSSSSSSSSSSSSSSSSSSSSSS.ssss ;;ddddddddddadddddddddddddddddddddd. (assign byte-r (32- bb-s-bitpos)) (parallel (assign bb-s-word (rotate memory-data byte-r)) (lisp (trace-path #/c)) (jump ubitblt-d-aligned-row-both)))) (if (equal-fixnum bb-s-bitpos b-temp) (sequential (parallel-with-s-access bb-s-offset ;;SSSSSSSSSSSSSSSSSSSSSSSSSS.ssssss ;;dddddddddddddddddddddddddd.dddddd (parallel (assign byte-r (32- bb-s-bitpos)) (assign b-temp obus)) (assign byte-s (31- bb-s-bitpos)) (assign bb-s-word (logxor bb-constant (ldb memory-data byte-s byte-r)))) (assign byte-r bb-s-bitpos) (parallel (assign-vma-offset d) ;;ssssssssssssssssssssssssss.ssssss ;;DDDDDDDDDDDDDDDDDDDDDDDDDD.dddddd (call bb-byte-alu-operation-dispatch)) ;; First partial word stored, turn into aligned case (incr-wrap-s-offset) (incr-d-offset) (assign bb-width (- bb-width b-temp)) (assign bb-s-bitpos (b-constant 0)) (parallel (assign bb-d-bitpos (b-constant 0)) (lisp (trace-path #/b)) (jump ubitblt-aliqned-row-both))) 4,887,235 321 322 (if (lesser-fixnum bb-s-bitpos b-temp) (goto ubitblt-long-row-both-s-longer) (goto ubitblt-long-row-both-s-shorter)))))) (defucode ubitblt-long-row-both-s-longer (assign b-temp (32- bb-d-bitpos)) (parallel-with-s-access bb-s-offset (assign type-r (32- bb-s-bitpos)) (assign byte-s (1- b-temp)) (assign bb-s-word2 memory-data)) ;;ssssSSSSSSSSSSSSSSSSSSSS........ ;; DDDDDDDDDDDDDDDDDDDDdddddddddddd ;; <----- b-temp -----> (assign bb-s-word (logxor bb-constant (rotate bb-s-word2 byte-r))) ;;........ssssSSSSSSSSSSSSSSSSSSSS (parallel (assign byte-r bb-d-bitpos) (assign b-temp-2 bb-d-bitpos)) (parallel (assign-vma-offset d) ;;ssssssssssssssssssssssss.ssssssss ;; DDDDDDDDDDDDDDDDDDDD.dddddddddddd (call bb-byte-alu-operation-dispatch)) (incr-d-offset) ;;Remaining are (32-(s.bitpos+(32-d.bitpos))) = d.bitpos-s.bitpos ;; <-- 32-d.bitpos ---> <-s.bitpos-> ;;SSSSssssssssssssssssssss.ssssssss ;; dddddddddddddddddddd.dddddddddddd (assign byte-r (- b-temp-2 bb-s-bitpos)) (assign bb-s-bitpos (+ bb-s-bitpos b-temp)) (assign to-s-word (rotate bb-s-word2 byte-r)) (assign bb-width (- bb-width b-temp)) (parallel (assign bt-d-bitpos (b-constant 0)) (lisp (trace-path #/d)) (jump ubitblt-d-aligned-row-both))) ;Need two S words to do the first partial S word (defucode ubitblt-long-row-both-s-shorter ;; ssssssssssssssssssssssss.ssssssss ;;dddddddddddddddddddddddddddd.dddd (parallel-with-s-access bb-s-offset (assign byte-r (32- bb-s-bitpos)) (assign byte-s (31- bb-s-bitpos)) ;; SSSSSSSSSSSSSSSSSSSSSSSS.ssssssss ;;dddddddddddddddddddddddddddd.dddd (assign bb-s-word (logxor bb-constant (ldb memory-data byte-s byte-r)))) (incr-wrap-s-offset-ahead) ;; <--> s.bitpos-d.bitpos ;;...SSSS|ssssssssssssssssssssssss.ssssssss ;;dddddddddddddddddddooddddddddddd.dddd (parallel-with-s-access bb-s-offset-ahead (assign byte-s (- bb-s-bitpos b-temp 1)) (assign byte-r (32- bb-s-bitpos)) (assign bb-s-word2 (logxor bb-constant memory-data))) ;;...SSSS|SSSSSSSSSSSSSSSSSSSSSSSS.ssssssss ;; dddd dddddddddddddddddddddddd.dddd (assign bb-s-word (dpb bb-s-word2 byte-s byte-r bb-s-word)) (assign byte-r bb-d-bitpos) (assign byte-s (31- bb-d-bitpos)) ;;...SSSS|SSSSSSSSSSSSSSSSSSSSSSSS.ssssssss ;; DDDD DDDDDDDDDDDDDDDDDDDDDDDD.dddd (parallel (assign-vma-offset d) (call bb-type-alu-operation-dispatch)) (incr-d-offset) (assign bb-s-offset bb-s-offset-ahead) ;;...SSSssss|sssssssssssssssssssssssss.ssssssss ;; dddd ddddddddddddddddddddddddd.dddd (assign byte-r (- b-temp bb-s-bitpos)) (assign bb-s-bitpos (- bb-s-bitpos t-temp)) (assign b-temp (32- bb-d-bitpos)) (assign bb-s-word (logxor (rotate bb-s-word2 byte-r) bb-constant)) (assign bb-width (- bb-width b-temp)) (parallel (assign bb-d-bitpos (b-constant 0)) (lisp (trace-path #/e)) (jump ubitblt-d-aligned-row-both))) (defucode ubitblt-aligned-row-both (if (greater-or-equal-fixnum bb-width (b-constant (* 8. 32.))) ;;Fetch a block of words into the buffer- (sequential (assign b-temp (+ bb-s-offset (b-constant 8.))) (if (lesser-fixnum bt-s-row-length t-temp) (goto ubitblt-aligned-row-both-slow-loop) (sequential (parallel 4,887,235 323 324 (assign-vma-offset s) (call ubitblt-block-read-8)) (parallel (assign-vma-offset d) (call-and-return-to ubitblt-block-alu-8 ubitblt-aligned-row-both))))) ;;Frob with what's left. Too bad dispatch blocks are expensive. (if (greater-or-equal-fixnum bb-width (b-constant (* 4 32.))) (sequential (assign b-temp (+ bb-s-offset (b-constant 4.))) (if (lesser-fixnum bb-s-row-length b-temp) (goto ubitblt-aligned-row-both-slow-loop) (sequential (parallel (assign-vma-offset s) (call ubitblt-block-read-4)) (parallel (assign-vma-offset d) (call-and-return-to ubitblt-block-alu-4 ubitblt-aligned-row-both-slow-loop))))) (goto ubitblt-aligned-row-both-slow-loop)))) (defucode ubitblt-aligned-row-both-slow-loop ;12 cycles per word (parallel-with-s-access bb-s-offset ;4 cycles (trap-if (lesser-fixnum bb-width (b-constant 32.)) ubitblt-aligned-row-both-slow-loop-done) (waiting-for-memory) (assign bb-s-word (logxor bb-constant memory-data))) (parallel (assign-vma-offset d) ;1+3 cycles (call bb-word-alu-operation-dispatch)) (assign bb-width (- bb-width (b-constant 32.))) ;1 cycle (incr-wrap-s-offset) ;2 cycles (parallel ;1 cycle (incr-d-offset) (lisp (trace-path #/.)) (jump ubitblt-aligned-row-both))) (defucode ubitblt-aligned-row-both-slow-loop-done (if (plus-fixnum bb-width) (sequential (parallel-with-s-access bb-s-offset (assign byte-r (b-constant 0)) (assign byte-s (1- bb-width)) (assign bb-s-word (logxor bb-constant memory-data))) (parallel (lisp (trace-path #/2)) (assign-vma-offset d) (jump bb-byte-alu-operation-dispatch))) ;jcall (parallel-with-return (lisp (trace-path #/1))))) (defucode ubitblt-block-alu-8 (dispatch-after-this (ldb bt-alu-operation 4 0) (parallel (assign a-block-size (a-constant 8.)) (assign b-block-size (a-constant 8.)) (start-memory block read)) ;start first word ((1 2) (goto ubitblt-block-logand-8)) ; x*y ~x*y ((4 8.) (goto ubitblt-block-andc2-8)) ; x*~y ~x*~y ((6 9.) (goto ubitblt-block-logxor-8)) ; x xor y, ~x xor y ((7 11.) (goto ubitblt-block-logior-7)) ; x+y, ~x+y ((13. 14.) (goto ubitblt-block-lognand-8)))) ; ~(~x*y), ~(x*y) (defucode ubitblt-block-alu-4 (dispatch-after-this (ldb bb-alu-operation 4 0) (parallel (assign a-block-size (a-constant 4.)) (assign b-block-size (a-constant 4.)) (start-memory block read)) ;start first word ((1 2) (goto ubitblt-block-logand-4)) ; x*y ~x*y ((4 8.) (goto ubitblt-block-andc2-4)) ; x*~y ~x*~y ((6 9.) (goto ubitblt-block-logxor-4)) ; x xor y, ~x xor y ((7 11.) (goto ubitblt-block-logior-4)) ; x+y, ~x+y ((13. 14.) (goto ubitblt-block-lognand-4)))) ; ~(~x*y), ~(x*y) (defmacro def-block-aluop (name n alu) (if (memq (get (caddr (microexpand '(,alu a-temp b-temp))) 'alu) weird-alu-functions) ;; Cannot simultaneously run ALU and store into the bitblt-buffer '(defucode ,name (parallel (waiting-for-memory) ;first word already started (declare-memory-timing active-cycle)) (loop for i from (- n-bitblt-buffers n) below n-bitblt-buffers collect '(sequential (abus-array-data (assign b-temp (,alu (bitblt-buffer ,i) memory-data)) ,(if (> (- n-bitblt-buffers i) 1) '(start-memory block read))) ;start next word 4,887,235 325 326 (parallel (assign (bitblt-buffer ,i) (set-type b-temp dtp-fix)) ,(if (= (- n-bitblt-buffers i) 1) '(jump ,(fintern "UBITBLT-BLOCK-ALU-WRITE-~d" n))))))) ;;Normal case '(defucode ,name (parallel (waiting-for-memory) ;first word already started (declare-memory-timing active-cycle) (start-memory read block)) ;start second word ,@(loop for i from (- n-bitblt-buffers n) below n-bitblt-buffers collect '(parallel (abus-array-data (assign (bitblt-buffer ,i) (set-type (,alu (bitblt-buffer ,i) memory-data) dtp-fix))) ,(selectq (- n-bitblt-buffers 1) (1 '(jump ,(fintern "UBITBLT-BLOCK-ALU-WRITE-~d" n))) (2 nil) (otherwise (start-memory block read))) ;start word after next ))))) (def-block-aluop ubitblt-block-logand-8 8 logand) (def-block-aluop ubitblt-block-logior-8 8 logior) (def-block-aluop ubitblt-block-logxor-8 8 logxor) (def-block-aluop ubitblt-block-andc2-8 8 andc2) (def-block-aluop ubitblt-block-lognand-8 8 lognand) (def-block-aluop ubitblt-block-logand-4 4 logand) (def-block-aluop ubitblt-block-logior-4 4 logior) (def-block-aluop ubitblt-block-logxor-4 4 logxor) (def-block-aluop ubitblt-block-andc2-4 4 andc2) (def-block-aluop ubitblt-block-lognand-4 4 lognand) (defmacro def-block-alu-write (name n) '(defucode ,name (parallel (assign-vma-offset d) (call activate-bitblt-buffer)) ,@(loop for i from (- n-bitblt-buffers n) below n-bitblt-buffers collect '(parallel (store-word (bitblt-buffer ,i) block) (lisp (trace-path #/.)))) (parallel (assign bb-d-offset (+ bb-d-offset b-block-size)) (call deactivate-bitblt-buffer)) (assign bb-width (- bb-width (rotate b-block-size 5))) ;2^5 = bits-per-word (parallel (assign bb-s-offset (+ bb-s-offset b-block-size)) (return)))) (def-block-alu-write ubitblt-block-alu-write-8 8) (def-block-alu-write ubitblt-block-alu-write-4 4) ;;Each time through the loop, s-word was fetched from memory like ;; <----- s.bitpos -----> ;;ssssssssss...................... ;;and then rotated so it looks like ;;......................ssssssssss ;;<----- s.bitpos -----> ;; ;:Each time, another s-word2 gets fetched and deposited into s-word like ;; |<----- s.bitpos -----> ;; |......................1111111111 ;;2222222222 2222222222222222222222 ;; ;;The rotation for the dpb equals the rotation for setup for next loop. ;bb-s-word has the partial previous source word whose address is in bb-s-offset. ;rotated into alignment with the destination, but not xored with bb-constant (defucode ubitblt-d-aligned-row-both (if (greater-or-equal-fixnum bb-width (b-constant (* 8. 32.))) ;;Fetch a block of words into tho buffer (sequential (assign b-temp (+ bb-s-offset (b-constant 8.))) (if (lesser-or-equal-fixnum bb-s-row-length b-temp) (goto ubitblt-d-aligned-row-both-slow-loop) (sequential (parallel (assign-vma-offset s 1) (call ubitblt-rotated-block-read-8)) (parallel (assign-vma-offset d) (call-and-return-to ubitblt-block-alu-8 ubitblt-d-aligned-row-both))))) ;;Frot with what's left. Too bad dispatch blocks are expensive. (if (greater-or-equal-fixnum bb-width (b-constant (* 4 32.))) (sequential (assign b-temp (+ bb-s-offset (b-constant 4.))) (if (lesser-or-equal-fixnum bb-s-row-length t-temp) (goto ubitblt-d-aligned-row-both-slow-loop) 4,887,235 327 328 (sequential (parallel (assign-vma-offset s 1) (call ubitblt-rotated-block-read-4)) (parallel (assign-vma-offset d) (call-and-return-to ubitblt-block-alu-4 ubitblt-d-aligned-row-both-slow-loop))))) (goto ubitblt-d-aligned-row-both-slow-loop)))) (defucode ubitblt-d-aligned-row-both-slow-loop ;17 cycles per word (incr-wrap-s-offset-ahead) ;2 (parallel-with-s-access bb-s-offset-ahead ;4 (trap-if (lesser-fixnum bb-width (n-constant 32.)) ubitblt-d-aligned-row-both-done) (assign byte-s (1- bb-s-bitpos)) (assign bb-s-word2 memory-data)) (assign byte-r (32- bb-s-bitpos)) ;1 (assign bb-s-word (dpb bb-s-word2 byte-s byte-r bt-s-word)) ;1 (assign bt-s-word (logxor bb-constant-a bb-s-word)) ;1 (parallel ;1+3 (assign-vma-offset d) (call bb-word-alu-operation-dispatch)) (assign bb-width (- bb-width (b-constant 32.))) ;1 (incr-d-offset) ;1 (assign bb-s-offset bb-s-offset-ahead) ;1 (parallel (assign bb-s-word (rotate bb-s-word2 byte-r)) (lisp (trace-path #/.)) (jump ubitblt-d-aligned-row-both))) ;;At entry, we have s-word fetched from memory like ;; <------s.bitpos------> ;;ssssssssss...................... ;;but then rotated so it looks like ;;......................ssssssssss ;;<------s.bitpos------> ;; ;;This is to be combined with d-word which looks like ;;....................dddddddddddd ;; <---width--> (defucode ubitblt-d-aligned-row-both-done (assign bb-s-word (logxor b-constant-a bb-s-word)) (if (plus-fixnum bb-width) (sequential (assign b-temp (32- bb-s-bitpos)) (if (lesser-or-equal-fixnum bb-width b-temp) ;;we have enouqh s bits ;;<----s.bitpos---><--a.temp---> ;;.................sssssssssssssss ;;....................dddddddddddd ;; <---width--> (sequential (assign byte-r (b-constant 0)) (assign byte-s (1- bb-width)) (parallel (assign-vma-offset d) (lisp (trace-path #/4)) (jump bb-byte-alu-operation-dispatch))) ;jcall ;;need to get another source word ;;<----s.bitpos---><----a.temp---> ;;.................sssssssssssssss ;;............dddddddddddddddddddd ;; <-------width------> (sequential (parallel-with-s-access bb-s-offset-ahead (assign byte-r b-temp) (assign byte-s (1- bb-s-bitpos)) (assign bb-s-word2 (logxor memory-data bb-constant))) (assign bb-s-word (dpb bb-s-word2 byte-s byte-r bb-s-word)) (assign byte-r (b-constant 0)) (assign byte-s (1- bb-width)) (parallel (assign-vma-offset d) (lisp (trace-path #/5)) (jump bb-byte-alu-operation-dispatch))))) ;jcall (parallel-with-return (lisp (trace-path #/3))))) ;;bb-s-word has the previous source word, rotated but not xored with bb-constant ;;3 cycles per word seems to be the best I can do (can't rotate while storing in bitblt-buffer) ;;If bb-s-word was xored already, it would take 4 cycles per word here (defmacro def-bitblt-rotated-block-read (name n) '(defucode ,name (assign byte-s (1- bb-s-bitpos)) (parallel (assign a-block-size (b-constant ,n)) ;Used later to advance offsets (assign b-block-size obus) (start-memory block read)) ;start first word 4,887,235 329 330 (parallel (waiting-for-memory) ;waiting for first word (assign byte-r (32- bb-s-bitpos))) ,@(loop for i from (- n-bitblt-buffers n) below n-bitblt-buffers append '((abus-array-data (assign bb-s-word2 (dpb memory-data byte-s byte-r bb-s-word))) (parallel (declare-memory-timing data-cycle) ;MD holds (assign bb-s-word (rotate memory-data byte-r)) ,(and (> (- n-bitblt-buffers i) 1) '(start-memory block read))) (parallel (assign (bitblt-buffer ,i) (set-type (logxor bb-constant bb-s-word2) dtp-fix)) ,(if (= (- n-bitblt-buffers i) 1) '(return))))))) (def-bitblt-rotated-block-read ubitblt-rotated-block-read-8 8) (def-bitblt-rotated-block-read ubitblt-rotated-block-read-4 4) (defucode ubitblt-long-row-source-backwards (parallel (assign b-temp bb-d-bitpos) (if (zero-fixnum bb-d-bitpos) (if (zero-fixnum bb-s-bitpos) (parallel (assign bb-s-offset (1+ bb-s-offset)) ;the loop will decr first, before pclsr (lisp (trace-path #/a)) (jump ubitblt-aligned-row-source-backwards)) (sequential (parallel-with-s-access bb-s-offset (assign byte-r (32- bb-s-bitpos)) (parallel (assign bb-s-word (logxor bb-constant (rotate memory-data byte-r))) (lisp (trace-path #/c)) (jump ubitblt-d-aligned-row-source-backwards))))) (if (equal-fixnum b-temp bt-s-bitpos) (sequential (parallel-with-s-access bb-s-offset (assign byte-s (1- bb-s-bitpos)) (assign bb-s-word (logxor memory-data bb-constant))) (parallel-with-d-access-check-write bb-d-offset (decr-d-offset) (parallel (assign byte-r (b-constant 0)) (assign bb-s-bitpos (b-constant 0))) (store-word (dpb bb-s-word byte-s byte-r memory-data))) ;; Now we can turn into the aligned case (assign bb-width (- bb-width t-temp)) (parallel (assign bb-d-bitpos (b-constant 0)) (lisp (trace-path #/b)) (jump ubitblt-aligned-row-source-backwards))) (if (greater-fixnum bb-s-bitpos t-temp) ;s > d, enough in the current word (sequential (parallel-with-s-access bb-s-offset (assign byte-s (1- bb-d-bitpos)) (assign byte-r (- b-temp bb-s-bitpos)) (assign bb-s-word (logxor bb-constant memory-data))) (parallel-with-d-access-check-write bb-d-offset (assign bb-s-bitpos (- bb-s-bitpos b-temp)) (assign bb-d-bitpos (b-constant 0)) (store-word (ldb bb-s-word byte-s byte-r memory-data))) (assign bb-s-word (rotate bb-s-word byte-r)) (assign bb-width (- bb-width b-temp)) (parallel (decr-d-offset) (lisp (trace-path #/d)) (jump ubitblt-d-aligned-row-source-backwards))) (sequential ;s < d, need to fetch another word (parallel-with-s-access bb-s-offset (parallel (assign byte-r (- b-temp bb-s-bitpos)) (assign a-temp (- b-temp bb-s-bitpos))) (assign byte-s (1- a-temp)) (assign bb-s-word (logxor bb-constant (rotate memory-data byte-r)))) (decr-wrap-s-offset-ahead) (parallel-with-s-access bb-s-offset-ahead (assign bb-s-word2 (logxor bb-constant memory-data))) (assign bb-s-word (ldb bb-s-word byte-s byte-r bb-s-word)) (parallel-with-d-access bb-d-offset (assign byte-r (b-constant 0d8)) (assign byte-s (1- bb-d-bitpos)) (store-word (ldb bb-s-word byte-s byte-r memory-data))) (assign bb-s-bitpos (32- a-temp)) (assign byte-r a-temp) (assign bb-s-word (rotate bb-s-word2 byte-r)) (assign bb-s-offset bb-s-offset-ahead) (assign bb-width (- bb-width b-temp)) (assign bb-d-bitpos (b-constant 0)) (parallel 4,887,235 331 332 (decr-d-offset) (lisp (trace-path #/e)) (jump ubitblt-d-aligned-row-source-backwards)))))))) ;bb-s-offset is 1+ the "real" value at this point (defucode ubitblt-aligned-row-source-backwards ;9 cycles per ward (decr-wrap-s-offset) ;1 (parallel-with-s-access bb-s-offset ;4 (trap-if (lesser-fixnum bb-width (b-constant 32.)) ubitblt-aligned-row-source-backwards-done) (waiting-for-memory) (assign bb-s-word (logxor bb-constant memory-data))) (assign-vma-offset d) ;1 (store-word bb-s-word) ;1 (assign bb-width (- bb-width (b-constant 32.))) ;1 (parallel ;1 (decr-d-offset) (lisp (trace-path #/,)) (jump ubitblt-aligned-row-source-backwards))) (defucode ubitblt-aligned-row-source-backwards-done (if (plus-fixnum bb-width) (sequential (parallel-with-s-accees bb-s-offset (assign byte-s (1- bb-width)) (assign byte-r bb-width) (assign bb-s-word (logxor bb-constant (ldb memory-data byte-s byte-r)))) (parallel-with-d-access bb-d-offset (assign byte-r (32- bb-width)) (parallel-with-return (store-word (dpb bb-s-word byte-s byte-r memory-data)) (lisp (trace-path #/2))))) (parallel-with-return (lisp (trace-path #/1))))) ;;each time through the loop, bb-s-word has the low part of the previous word ;;rotated to be at the high end of the word. We use it as background to LDB the ;;high part of the next word into it. ;bb-s-offset is 1+ the "real" value at this point ;could bum one cycle by moving assignment to byte-s out of loop, ;but this should use block mode anyway (defucode ubitblt-d-aligned-row-source-backwards ;11 cycles per word (decr-wrap-s-offset) ;1 (parallel-with-s-access bb-s-offset ;4 (trap-if (lesser-fixnum bb-width (b-constant 32.)) ubitblt-d-aligned-row-source-backwards-done) (assign byte-r (32- bb-s-bitpos)) (assign bb-s-word2 (logxor bb-constant memory-data))) (assign byte-s (31- bb-s-bitpos)) ;1 (assign-vma-offset d) ;1 (store-word (ldb bb-s-word2 byte-s byte-r bb-s-word)) ;1 (assign bb-width (- bb-width (b-constant 32.))) ;1 (decr-d-offset) ;1 (parallel ;1 (assign bb-s-word (rotate bb-s-word2 byte-r)) (lisp (trace-path #/.)) (jump ubitblt-d-aligned-row-source-backwards))) (defucode ubitblt-d-aligned-row-source-backwards-done (parallel (assign bb-width-b bb-width) (if (plus-fixnum bb-width) (if (greater-or-equal-fixnum bb-s-bitpos bb-width-b) (parallel-with-d-access bb-d-offset (assign byte-r (b-constant 0)) (assign byte-s (31- bb-width)) (parallel-with-return (store-word (ldb memory-data byte-s byte-r bb-s-word)) (lisp (trace-path #/4)))) (sequential (parallel-with-s-access bb-s-offset (assign byte-r bb-width) (assign bb-s-word (rotate bb-s-word byte-r)) (assign bb-s-word2 (logxor bb-constant memory-data))) (parallel (assign byte-r (- bb-width-b bb-s-bitpos)) (assign a-temp obus)) (assign byte-s (1- a-temp)) (assign bb-s-word (ldb bb-s-word2 byte-s byte-r- bb-s-word)) (parallel-with-d-access bb-d-offset (assign byte-s (1- bb-width)) (assign byte-r (32- bb-width)) (parallel-with-return (store-word (dpb bb-s-word byte-s byte-r memory-data)) (lisp (trace-path #/5)))))) (parallel-with-return (lisp (trace-path #/3))))))