mozilla · brizental · Aug 4, 2020 · Aug 3, 2020 · Aug 3, 2020 · Aug 3, 2020
@@ -101,6 +101,7 @@ docstrings
 dokka
 dropdown
 enqueue
+enqueued
 enum
 envs
 exe

@@ -3,12 +3,17 @@
 [Full changelog](https://github.com/mozilla/glean/compare/v32.0.0...main)
 
 * General
+  * Move logic to limit the number of retries on ping uploading "recoverable failures" to glean-core. ([#1120](https://github.com/mozilla/glean/pull/1120))
+    * The functionality to limit the number of retries in these cases was introduced to the Glean SDK in `v31.1.0`. The work done now was to move that logic to the glean-core in order to avoid code duplication throughout the language bindings.
   * Update `glean_parser` to `v1.28.3`
     * BUGFIX: Generate valid C# code when using Labeled metric types.
     * BUGFIX: Support `HashSet` and `Dictionary` in the C# generated code.
 * C#
   * Add support for the String List metric type.
   * Enable generating the C# APIs using the glean_parser.
+* Python
+  * BUGFIX: Limit the number of retries for 5xx server errors on ping uploads. ([#1120](https://github.com/mozilla/glean/pull/1120))
+    * This kinds of failures yield a "recoverable error", which means the ping gets re-enqueued. That can cause infinite loops on the ping upload worker. For python we were incorrectly only limiting the number of retries for I/O errors, another type of "recoverable error".
 
 # v32.0.0 (2020-08-03)
 

@@ -117,7 +117,15 @@ internal sealed class PingUploadTask {
     object Wait : PingUploadTask()
 
     /**
-     * A flag signaling that the pending pings queue is empty and requester is done.
+     * A flag signaling that requester doesn't need to request any more upload tasks at this moment.
+     *
+     * There are two possibilities for this scenario:
+     * * Pending pings queue is empty, no more pings to request;
+     * * Requester has reported more max recoverable upload failures on the same uploading_window[1]
+     *   and should stop requesting at this moment.
+     *
+     * [1]: An "uploading window" starts when a requester gets a new `PingUploadTask::Upload(PingRequest)`
+     *      response and finishes when they finally get a `PingUploadTask::Done` or `PingUploadTask::Wait` response.
      */
     object Done : PingUploadTask()
 }
@@ -19,7 +19,6 @@ import mozilla.telemetry.glean.Glean
 import mozilla.telemetry.glean.net.FfiPingUploadTask
 import mozilla.telemetry.glean.utils.testFlushWorkManagerJob
 import mozilla.telemetry.glean.net.PingUploadTask
-import mozilla.telemetry.glean.rust.Constants
 
 /**
  * Build the constraints around which the worker can be run, such as whether network
@@ -53,8 +52,6 @@ class PingUploadWorker(context: Context, params: WorkerParameters) : Worker(cont
     companion object {
         internal const val PING_WORKER_TAG = "mozac_service_glean_ping_upload_worker"
 
-        internal const val MAX_RETRIES = 3
-
         /**
          * Function to aid in properly enqueuing the worker in [WorkManager]
          *
@@ -98,11 +95,6 @@ class PingUploadWorker(context: Context, params: WorkerParameters) : Worker(cont
      */
     @Suppress("ReturnCount")
     override fun doWork(): Result {
-        // We counte the number of failures,
-        // if we get more than three failures in upload we will stop this worker.
-        //
-        // This is a hack before a more robust mechanism is implemented in Rust.
-        var uploadFailures = 0
         do {
             // Create a slot of memory for the task: glean-core will write data into
             // the allocated memory.
@@ -121,17 +113,14 @@ class PingUploadWorker(context: Context, params: WorkerParameters) : Worker(cont
                         Glean.configuration
                     ).toFfi()
 
-                    if (result == Constants.UPLOAD_RESULT_RECOVERABLE) {
-                        uploadFailures++
-                    }
-
                     // Process the upload response
                     LibGleanFFI.INSTANCE.glean_process_ping_upload_response(incomingTask, result)
                 }
                 PingUploadTask.Wait -> return Result.retry()
                 PingUploadTask.Done -> return Result.success()
             }
-        } while (uploadFailures < MAX_RETRIES)
-        return Result.failure()
+        } while (true)
+        // Limits are enforced by glean-core to avoid an inifinite loop here.
+        // Whenever a limit is reached, this binding will receive `PingUploadTask.Done` and step out.
     }
 }
@@ -22,9 +22,6 @@ internal class BaseUploader
         // How many times to attempt waiting when told to by glean-core's upload API.
         private const int MAX_WAIT_ATTEMPTS = 3;
 
-        // Maximum number of recoverable errors allowed before aborting the ping uploader.
-        private const int MAX_RETRIES = 3;
-
         private readonly IPingUploader uploader;
 
         /// <summary>
@@ -100,9 +97,10 @@ internal void TriggerUpload(Configuration config)
             // FOR TESTING Implement the upload worker here and call this from Glean.cs
 
             int waitAttempts = 0;
-            int uploadFailures = 0;
 
-            while (uploadFailures < MAX_RETRIES)
+            // Limits are enforced by glean-core to avoid an inifinite loop here.
+            // Whenever a limit is reached, this binding will receive `UploadTaskTag.Done` and step out.
+            while (true)
             {
                 FfiUploadTask incomingTask = new FfiUploadTask();
                 LibGleanFFI.glean_get_upload_task(ref incomingTask);
@@ -123,11 +121,6 @@ internal void TriggerUpload(Configuration config)
                             // Delegate the actual upload and get its return value.
                             UploadResult result = Upload(path, body, headers, config);
 
-                            if (result is RecoverableFailure)
-                            {
-                                uploadFailures += 1;
-                            }
-
                             // Copy the `FfiUploadTask` to unmanaged memory, because
                             // `glean_process_ping_upload` assumes it has to free the memory.
                             IntPtr ptrCopy = Marshal.AllocHGlobal(Marshal.SizeOf(incomingTask));

@@ -18,9 +18,6 @@ public class HttpPingUploader {
         static let recoverableErrorStatusCode: UInt16 = 500
         // For this error, the ping data will be deleted and no retry happens
         static let unrecoverableErrorStatusCode: UInt16 = 400
-
-        // Maximum number of recoverable errors allowed before aborting the ping uploader
-        static let maxRetries = 3
     }
 
     private let logger = Logger(tag: Constants.logTag)
@@ -98,18 +95,16 @@ public class HttpPingUploader {
     /// It will report back the task status to Glean, which will take care of deleting pending ping files.
     /// It will continue upload as long as it can fetch new tasks.
     func process() {
-        var uploadFailures = 0
-        while uploadFailures < Constants.maxRetries {
+        // Limits are enforced by glean-core to avoid an inifinite loop here.
+        // Whenever a limit is reached, this binding will receive `.done` and step out.
+        while true {
             var incomingTask = FfiPingUploadTask()
             glean_get_upload_task(&incomingTask)
             let task = incomingTask.toPingUploadTask()
 
             switch task {
             case let .upload(request):
                 self.upload(path: request.path, data: request.body, headers: request.headers) { result in
-                    if case .recoverableFailure = result {
-                        uploadFailures += 1
-                    }
                     glean_process_ping_upload_response(&incomingTask, result.toFfi())
                 }
             case .wait:

@@ -17,7 +17,6 @@
 from .._glean_ffi import ffi as ffi_support  # type: ignore
 from .._dispatcher import Dispatcher
 from .._process_dispatcher import ProcessDispatcher
-from .ping_uploader import RecoverableFailure
 
 
 log = logging.getLogger(__name__)
@@ -26,9 +25,6 @@
 # How many times to attempt waiting when told to by glean-core's upload API.
 MAX_WAIT_ATTEMPTS = 3
 
-# Maximum number of recoverable errors allowed before aborting the ping uploader
-MAX_RETRIES = 3
-
 
 class PingUploadWorker:
     @classmethod
@@ -120,9 +116,9 @@ def _process(data_dir: Path, application_id: str, configuration) -> bool:
 
     wait_attempts = 0
 
-    upload_failures = 0
-
-    while upload_failures < MAX_RETRIES:
+    # Limits are enforced by glean-core to avoid an inifinite loop here.
+    # Whenever a limit is reached, this binding will receive `UploadTaskTag.DONE` and step out.
+    while True:
         incoming_task = ffi_support.new("FfiPingUploadTask *")
         _ffi.lib.glean_get_upload_task(incoming_task)
 
@@ -146,9 +142,6 @@ def _process(data_dir: Path, application_id: str, configuration) -> bool:
                 url_path, body, _parse_ping_headers(headers, doc_id), configuration
             )
 
-            if isinstance(upload_result, RecoverableFailure):
-                upload_failures = upload_failures + 1
-
             # Process the response.
             _ffi.lib.glean_process_ping_upload_response(
                 incoming_task, upload_result.to_ffi()
@@ -161,9 +154,7 @@ def _process(data_dir: Path, application_id: str, configuration) -> bool:
             else:
                 return False
         elif tag == UploadTaskTag.DONE:
-            break
-
-    return True
+            return True
 
 
 __all__ = ["PingUploadWorker"]
@@ -81,11 +81,12 @@ def test_500_error_submit(safe_httpserver, monkeypatch):
     ProcessDispatcher._wait_for_last_process()
 
     # This kind of recoverable error will be tried 10 times
-    assert 10 == len(safe_httpserver.requests)
+    # The number of retries is defined on glean-core
+    assert 3 == len(safe_httpserver.requests)
 
     metric = get_upload_failure_metric()
     assert not metric["status_code_4xx"].test_has_value()
-    assert 10 == metric["status_code_5xx"].test_get_value()
+    assert 3 == metric["status_code_5xx"].test_get_value()
 
 
 def test_500_error_submit_concurrent_writing(slow_httpserver, monkeypatch):
@@ -113,12 +114,13 @@ def test_500_error_submit_concurrent_writing(slow_httpserver, monkeypatch):
         counter.add()
         times += 1
 
-    # This kind of recoverable error will be tried 10 times
-    assert 10 == len(slow_httpserver.requests)
+    # This kind of recoverable error will be tried 3 times
+    # The number of retries is defined on glean-core
+    assert 3 == len(slow_httpserver.requests)
 
     metric = get_upload_failure_metric()
     assert not metric["status_code_4xx"].test_has_value()
-    assert 10 == metric["status_code_5xx"].test_get_value()
+    assert 3 == metric["status_code_5xx"].test_get_value()
 
     assert times > 0
     assert times == counter.test_get_value()

@@ -490,7 +490,7 @@ impl Glean {
     ///
     /// * `Wait` - which means the requester should ask again later;
     /// * `Upload(PingRequest)` - which means there is a ping to upload. This wraps the actual request object;
-    /// * `Done` - which means there are no more pings queued right now.
+    /// * `Done` - which means requester should stop asking for now.
     ///
     /// # Return value
     ///