I looked at some old code I did for an 8051. I did some similar division
where I divided a 32 bit number by a 16 bit number. I wrote down a
reference that I used, but alas, I don't have the book. All I have is my
code and a short description of the algorithm [see it in the code
below]. I remember the book was good would recomend it. I found this
web page that seems to use a similar algorithm:
http://www.bearcave.com/software/divide.htm
>Division is the process of repeated subtraction. Like the long division
we learned in grade school, a binary division
>algorithm works from the high order digits to the low order digits and
generates a quotient (division result) with each
>step. The division algorithm is divided into two steps:
>
> 1.Shift the upper bits of the dividend (the number we are dividing
into) into the remainder.
>
> 2.Subtract the divisor from the value in the remainder. The high
order bit of the result become a bit of the
> quotient (division result).
I have some of my old code, but since it's for the wrong type of cpu and
it's not that well commented, it's probably easier to re-create it from
scratch based on the algorithm.
My problem was to divide a fixed 32-bit number (either $28F5A079 or
$51EB40F2, set at compile time) by a variable 16 bit number to get 24
bits for a timer (delay high and low, and a fraction). Speed was of the
essence, so I also used a neat-o routine that would calculate the first
16 bits using this type of routine (it generates one output bit per
loop), and then uses a table to estimate the remaining 8 bits. [I didn't
include that routine].
; Converts a 16-bit velocity to a 24-bit timer value.
; The output is scaled to fit timer0 (H,L), plus a fraction.
;
;
;input/output variables
;
;v2t_in_l .ds 1 ; speed input
;v2t_in_h .ds 1 ; speed input
;v2t_out_f .ds 1 ; intended for FRACstep
;v2t_out_l .ds 1 ; intended for DELAY0L
;v2t_out_h .ds 1 ; intended for DELAY0H
;
;temporary value
;
;v2t_temp .ds 1 ; temporary...
#if CALCULATE_TIMER
; constant= 1000000*65535/12207*256 for full stepping
; constant= 1000000*65535/12207*256/2 for half stepping
;
; time value = $28F5A079/speed for half-stepping
; time value = $51EB40F2/speed for full-stepping
;
#if HALFSTEPPING
#define CNST3 $28
#define CNST2 $F5
#define CNST1 $A0
#define CNST0 $79
#else
#define CNST3 $51
#define CNST2 $EB
#define CNST1 $40
#define CNST0 $F2
#endif
; table time
; value value
; speed 1: D70B.C9 28F5.C9 10485.7 uS
; speed 2: EB86.E5 147A.E5 5242.8 uS
; speed 3: F259.43 DA7.43 3495.2 uS
; ...
; speed 128: FFAF.EC 51.EC 81.9 uS
; ...
; speed 254: FFD7.48 29.48 41.2 uS
; speed 255: FFD7.1F 29.1F 41.1 uS
;
;
; Reference: Numerical Methods
; (real time and embedded systems programming)
; Don Morgon
; M&T Books
; ISBN 1-55851-232-2
;
; Algorithim: 1. Init variables
; 2. Left shift dividend into the remainder
; 3. compare remainder to divisor
; 3a. if greater or equal, set low bit of dividend
; 3b. and subtract divisor from
remainder
; 4. loop steps 2-3 32 times.
;
;
;
; remainder dividend divisor: 2 bytes
; xx xx <- xx xx xx xx xx xx
; | | d0 d1 d2 d3 | \__ v2t_in_l
; | | | | | \_v2t_out_f \_____ v2t_in_h
; | \_rem0 | | \____v2t_out_l
; \____rem1 | \_______v2t_out_h
; \__________v2t_extra (should be 00 or overflowed)
;
; * 24-bit modification. 32-bit version takes about 1 msec... about 80%
; of the time. We need to improve. A 24-bit version will pre-check
; that the divisor won't cause an overflow, and then get only 24 bits
; of result (that's all we actually use). The 32-bit version is kept
; for reference.
;
; The initialiaztion is a bit different... It's like usual, except
; already shifted 8 bits. We don't use the v2t_extra, because it's
; extra shifting.
;
; This takes about 730 us (24-bit) instead of 1000 us (32-bit).
; Shifting takes about 360 us (24 shifts * 5 bytes * 3 cycles).
;
; Shifting can be reduced to ~2 cycles if the XCH instruction is used,
but
; this would complicate things and force the loop to be unrolled
because
; the bytes would be shifting places.
;
; A 16-bit version takes about 466 uS, with
; shifting taking 16 shifts * 4 bytes * 3 cyc=192 us
;
; before optimizing this, remember that 1 cycle on outisr will save 64
; cycles of time compared to this routine. (half stepping)
vel2time: mov a,v2t_in_h ; do we have an overflow? (>$28 or >$52)
jnz v2t_noover ; >= 0x0100, so can't overflow
mov a,v2t_in_l
clr c
subb a,#CNST3+1
jnc v2t_noover ;it's equal or above number, so ok.
ajmp v2t_overflow ;it's below, so overflow!
goinit24: ajmp init24 ;branch too far for below
v2t_noover: mov a,v2t_in_h
clr c
subb a,#CNST3
jc goinit24 ;divisor<CNST3:CNST2
jnz init16 ;divisor>CNST3:CNST2
mov a,v2t_in_l ;divisor=CNST3:xxxxx
clr c
subb a,#CNST2+1
jc goinit24 ;divisor<CNST3:CNST2-1
sjmp init16 ;divisor
;********************************
;**
;** 24-bit version
;**
;********************************
init24: mov rem1,#$00 ; init variables.... (STEP 1)
mov rem0,#CNST3 ; remainder=00xx
mov v2t_out_h,#CNST2
mov v2t_out_l,#CNST1
mov v2t_out_f,#CNST0
mov v2t_count,#24 ; we'll get 24 bits of output
v2t_loop24: clr C ; left shift (shift in a zero)
mov a,v2t_out_f ; (STEP 2)
rlc a
mov v2t_out_f,a
mov a,v2t_out_l
rlc a
mov v2t_out_l,a
mov a,v2t_out_h
rlc a
mov v2t_out_h,a
mov a,rem0
rlc a
mov rem0,a
mov a,rem1
rlc a
mov rem1,a
; (STEP 3) - compare remainder to divisor:
jc remain_above ;overflow into remainder, so it's larger
than divisor
subb A,v2t_in_h ;get rem1-v2t_in_h
jz v2t_c2 ;equal... must compare lsb (C=0)
jc remain_below ;borrowed... so rem1<v2t_in_h
sjmp remain_above ; rem1>v2t_in_h
v2t_c2: mov A,rem0
subb A,v2t_in_l ;get rem0-v2t_in_l
jc remain_below ; rem0<v2t_in_l
; else rem0>=v2t_in_l and fall into
remain_above
remain_above: inc v2t_out_f ;(STEP 3a) Set low bit of dividend
; (STEP 3b) subtract divisor from remainder:
clr c
mov a,rem0 ; subtract lsb
subb a,v2t_in_l
mov rem0, a
mov a,rem1 ; subtract msb
subb a,v2t_in_h
mov rem1, a ; don't care about underflow because
we've
; checked this.
remain_below: djnz v2t_count, v2t_loop24 ; (STEP 4)
;********************************
;**
;** 32-bit version (for reference only)
;**
;********************************
#if 0
vel2time: mov rem0,#$00 ; init variables.... (STEP 1)
mov rem1,#$00 ; remainder=0
mov v2t_extra,#CNST3
mov v2t_out_h,#CNST2
mov v2t_out_l,#CNST1
mov v2t_out_f,#CNST0
mov v2t_count,#32 ; we'll get 32 bits of output
v2t_loop: clr C ; left shift (shift in a zero)
mov a,v2t_out_f ; (STEP 2)
rlc a
mov v2t_out_f,a
mov a,v2t_out_l
rlc a
mov v2t_out_l,a
mov a,v2t_out_h
rlc a
mov v2t_out_h,a
mov a,v2t_extra
rlc a
mov v2t_extra,a
mov a,rem0
rlc a
mov rem0,a
mov a,rem1
rlc a
mov rem1,a
; (STEP 3) - compare remainder to divisor:
jc remain_above ;overflow into remainder, so it's larger
than divisor
subb A,v2t_in_h ;get rem1-v2t_in_h
jz v2t_c2 ;equal... must compare lsb (C=0)
jc remain_below ;borrowed... so rem1<v2t_in_h
sjmp remain_above ; rem1>v2t_in_h
v2t_c2: mov A,rem0
subb A,v2t_in_l ;get rem0-v2t_in_l
jc remain_below ; rem0<v2t_in_l
; else rem0>=v2t_in_l and fall into
remain_above
remain_above: inc v2t_out_f ;(STEP 3a) Set low bit of dividend
; (STEP 3b) subtract divisor from remainder:
clr c
mov a,rem0 ; subtract lsb
subb a,v2t_in_l
mov rem0, a
mov a,rem1 ; subtract msb
subb a,v2t_in_h
mov rem1, a ; don't care about underflow because
we've
; checked this.
remain_below: djnz v2t_count, v2t_loop ; (STEP 4)
mov a,v2t_extra ; do we have an overflow?
jnz v2t_overflow ; yes...
#endif ; end of 32-bit version kept for reference
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<