esp8266 · devyte · Mar 12, 2018 · Jan 30, 2018 · Feb 1, 2018 · Mar 8, 2018
diff --git a/cores/esp8266/core_esp8266_wiring.c b/cores/esp8266/core_esp8266_wiring.c
@@ -61,11 +61,130 @@ void micros_overflow_tick(void* arg) {
     micros_at_last_overflow_tick = m;
 }
 
-unsigned long ICACHE_RAM_ATTR millis() {
-    uint32_t m = system_get_time();
-    uint32_t c = micros_overflow_count + ((m < micros_at_last_overflow_tick) ? 1 : 0);
-    return c * 4294967 + m / 1000;
-}
+//---------------------------------------------------------------------------
+// millis() 'magic multiplier' approximation
+//
+// This function corrects the cumulative (296us / usec overflow) drift
+// seen in the original 'millis()' function.
+//
+// Input:
+//    'm' - 32-bit usec counter,           0 <= m <= 0xFFFFFFFF
+//    'c' - 32-bit usec overflow counter   0 <= c <  0x00400000
+// Output:
+//    Returns milliseconds in modulo 0x1,0000,0000 (0 to 0xFFFFFFFF)
+//
+// Notes:
+//
+// 1) This routine approximates the 64-bit integer division,
+//
+//    quotient =  ( 2^32 c + m ) / 1000,
+//
+//    through the use of 'magic' multipliers. A slow division is replaced by
+//    a faster multiply using a scaled multiplicative inverse of the divisor:
+//
+//    quotient =~ ( 2^32 c + m ) * k,  where k = Ceiling[ 2^n / 1000 ]
+//
+//    The precision difference between multiplier and divisor sets the
+//    upper-bound of the dividend which can be successfully divided.
+//
+//    For this application, n = 64, and the divisor (1000) has 10-bits of 
+//    precision. This sets the dividend upper-bound to (64 - 10) = 54 bits,
+//    and that of 'c' to (54 - 32) = 22 bits. This corresponds to a value
+//    for 'c' = 0x0040,0000 , or +570 years of usec counter overflows.
+//
+// 2) A distributed multiply with offset-summing is used to find k( 2^32 c + m ):
+//
+//      prd = (2^32 kh + kl) * ( 2^32 c + m )
+//          = 2^64 kh c + 2^32 kl c + 2^32 kh m + kl m
+//               (d)         (c)         (b)       (a)
+//
+//    Graphically, the offset-sums align in little endian like this:
+//                  LS -> MS
+//            32       64       96      128
+//    | a[-1]  |  a[0]  |  a[1]  |  a[2]  |
+//    |       m kl      |    0   |    0   |  a[-1] not needed
+//    |        |       m kh      |        |
+//    |        |       c kl      |        |  a[1] holds the result
+//    |        |        |       c kh      |  a[2] can be discarded
+//
+//    As only the high-word of 'm kl' and low-word of 'c kh' contribute to the
+//    overall result, only (2) 32-bit words are needed for the accumulator.
+//
+// 3) As C++ does not intrinsically test for addition overflows, one must
+//    code specifically to detect them. This approximation skips these
+//    overflow checks for speed, hence the sum,
+//
+//    highword( m kl ) + m kh + c kl  <  (2^64-1),  MUST NOT OVERFLOW.
+//
+//    To meet this criterion, not only do we have to pick 'k' to achieve our
+//    desired precision, we also have to split 'k' appropriately to avoid
+//    any addition overflows.
+//
+//    'k' should also be chosen to align the various products on byte
+//    boundaries to avoid any 64-bit shifts before additions, as they incur
+//    major time penalties. The 'k' chosen for this specific division by 1000
+//    was picked primarily to avoid shifts as well as for precision.
+//
+//    For the reasons listed above, this routine is NOT a general one.
+//    Changing divisors could break the overflow requirement and force
+//    picking a 'k' split which requires shifts before additions.
+//
+//              ** Test THOROUGHLY after making changes **
+//
+// 4) Results of time benchmarks run on an ESP8266 Huzzah feather are:
+//
+//         usec   x Orig   Comment
+// Orig:   3.18   1.00     Original code
+// Corr:  13.21   4.15     64-bit reference code
+// Test:   4.60   1.45     64-bit magic multiply, 4x32
+//
+// The magic multiplier routine runs ~3x faster than the reference. Execution
+// times can vary considerably with the numbers being multiplied, so one
+// should derate this factor to around 2x, worst case.
+//
+//   Reference function: corrected millis(), 64-bit arithmetic,
+//                       truncated to 32-bits by return
+//   unsigned long ICACHE_RAM_ATTR millis_corr_DEBUG( void )
+//   {
+//     // Get usec system time, usec overflow counter
+//     ......
+//     return ( (c * 4294967296 + m) / 1000 );  // 64-bit division is SLOW
+//   } //millis_corr
+//
+// 5) See this link for a good discussion on magic multipliers:
+//    http://ridiculousfish.com/blog/posts/labor-of-division-episode-i.html
+//
+
+#define  MAGIC_1E3_wLO  0x4bc6a7f0    // LS 32 bits of k = Ceiling[ 2^64 / 1000 ]
+#define  MAGIC_1E3_wHI  0x00418937    // MS 32 bits of k, magic multiplier
+
+unsigned long ICACHE_RAM_ATTR millis()
+{
+  uint32_t  a[2];  // Accumulator, little endian: a[0] = low word, a[1] = high word
+  a[1] = 0;        // Zero high-acc
+
+  // Get usec time and overflow count (+1 when m is below its value at the last tick)
+  uint32_t  m = system_get_time();
+  uint32_t  c = micros_overflow_count +
+                   ((m < micros_at_last_overflow_tick) ? 1 : 0);
+
+  // (a) Init. acc with high-word of m*kl; the 32-bit right-shift is relatively quick.
+  //     NOTE(review): a[] is accessed via uint64_t* casts - aliasing/alignment risk.
+  ((uint64_t *)(&a[0]))[0]  =
+     ( (uint64_t)( m * (uint64_t)MAGIC_1E3_wLO ) >> 32 );
+
+  ((uint64_t *)(&a[0]))[0] +=              // (b) Offset sum, m*kh into 64-bit acc
+     ( m * (uint64_t)MAGIC_1E3_wHI );
+
+  ((uint64_t *)(&a[0]))[0] +=              // (c) Offset sum, c*kl into 64-bit acc
+     ( c * (uint64_t)MAGIC_1E3_wLO );
+
+  ((uint32_t *)(&a[1]))[0] +=              // (d) Truncated sum, high-acc
+     (uint32_t)( c * (uint64_t)MAGIC_1E3_wHI );
+
+  return ( a[1] );  // Extract result (milliseconds, modulo 2^32) from high-acc
+
+} //millis
 
 unsigned long ICACHE_RAM_ATTR micros() {
     return system_get_time();