EOFError when parsing headers (dupe of #214) #220
This is probably #214. Which, IIUC, means we just need to put in appropriate idle/connect timeouts and things should work fine. @samoconnor can you confirm? Nanosoldier is another heavy github-api hitter.
@ararslan What versions of HTTP.jl and MbedTLS.jl are you using? Are you using retry_non_idempotent=true?

@quinnj, that seems probable, but I don't think we should rush to setting a connection idle timeout. We should think of that as a performance optimisation and try to ensure that the default case is handled as smoothly as possible.

@ararslan, please keep in mind that the HTTP.jl layer cannot completely hide the fact that the network is imperfect. There are some cases where it can transparently retry requests, but there are others where it cannot (e.g. POST), so robust applications will always need application-logic-aware retry loops (even if they are as crude as crashing and restarting an entire process or container). In the case of POST requests we attempt to retry in as many cases as possible, but if the error happens (from the client's point of view) after the body of the request has been sent, we cannot retry (to avoid double execution of a non-idempotent operation).
HTTP 0.6.4 and MbedTLS 0.5.6
I don't know what that is, so probably not
Sure, of course. But it's important to note that Nanosoldier wasn't having this trouble with HTTP prior to the HTTP 0.6.x releases.

Unfortunately HTTP.jl used to retry much more aggressively than it should have, at the risk of applications potentially automatically retrying POST requests where that may not be entirely safe/wise (think financial transactions and such). We're now on the "safe/legal" side of what can be automatically retried, which is causing problems in applications that were relying on this automatic retry functionality, probably unknowingly. Luckily, it's pretty easy to manage: you can use retry_non_idempotent=true to opt back in to automatic retries of non-idempotent requests.
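A minimal sketch of what that looks like (the URL and payload here are placeholders, and the call assumes the positional headers/body form of HTTP.post):

```julia
using HTTP

# Hypothetical endpoint and payload, for illustration only.
url  = "https://api.github.com/repos/owner/repo/statuses/0123abc"
body = """{"state": "pending", "context": "example"}"""

# retry_non_idempotent=true opts back in to automatic retries even for POST;
# only use it when re-executing the request is known to be safe.
HTTP.post(url, ["Content-Type" => "application/json"], body;
          retry_non_idempotent = true)
```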
The behaviour you're seeing is a known issue with 0.6.4. Also, please try MbedTLS v0.5.7 (this includes a deadlock fix that can interact with HTTP.jl's retry-post-whenever-possible logic: JuliaLang/MbedTLS.jl@e6b366c).

Understood. As @quinnj says, we've removed a lot of catch-all exception handling in HTTP.jl and instead try to detect and handle specific situations in line with the semantics defined in the RFC. The GitHub doc says that they have been careful to use HTTP methods with semantics appropriate to the API operations. In the case of the operation in this issue, …
Thanks so much for the info and the quick responses, both of you!
Ah, Nanosoldier didn't upgrade to that automatically because it's running Julia 0.6.0 and HTTP 0.6.6 requires Julia 0.6.2 at a minimum. I had to take Nanosoldier offline, so I'll upgrade Julia while I'm at it.

No problem. Please let us know how it goes...

Looks like I can't update to MbedTLS 0.5.7 yet: JuliaLang/METADATA.jl#13725

IMHO the METADATA PR should be merged as is.

@ararslan it is still worth trying MbedTLS v0.5.6; that version has a fix for the deadlock condition, and v0.5.7 is a cleaned-up/refined version of that fix.

I merged it. I imagine we'll have an issue or two come in, but better to get things ironed out sooner rather than later w/ bindeps2.
Great, thank you both. I'll update to 0.5.7. |
Nanosoldier is running Julia 0.6.2, HTTP 0.6.4, and MbedTLS 0.5.7. It ran successfully a couple of times, but a couple of hours ago it got this same error: JuliaLang/julia#26435 (comment).
Perhaps GitHub.jl needs to be updated for the recent HTTP changes? |
This looks like the same POST new status API call as before. So, on one hand you should expect that it will fail sometimes, because it cannot be automatically retried if the request body was sent. On the other hand, I'm willing to dive down the rabbit hole in case there is something else going on here. Please post the logs: set verbose=3 and set DEBUG_LEVEL=2 in HTTP.jl/src/HTTP.jl. Note that verbose=3 inserts a debug layer into the HTTP stack, so it may change behaviour. If so, try verbose=1 or verbose=2.
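For example, a request made with the verbose option might look like this (the URL here is just an illustration):

```julia
using HTTP

# verbose=3 inserts a debug layer into the request stack and echoes the raw
# request/response traffic; as noted above it can perturb behaviour, so drop
# back to verbose=1 or verbose=2 if the problem stops reproducing.
HTTP.get("https://api.github.com/rate_limit"; verbose = 3)
```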
Sorry, just read that in more detail. Please use HTTP 0.6.6 per #220 (comment).
Sorry, I meant I was using HTTP 0.6.6, that was a typo. |
We need to see the logs with debug info for a couple of hours of use (these will include information on which requests were retried or not and why). Hopefully we would see that …
Where do I set verbose=3?
So I would need to set it in GitHub.jl, which is what's actually making the request. |
yes, and "set DEBUG_LEVEL=2 in HTTP.jl/src/HTTP.jl", |
|
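Concretely, that means editing the constant near the top of your local copy of src/HTTP.jl (assuming it is defined as a plain const, as the instruction above implies) so that it reads:

```julia
# In HTTP.jl/src/HTTP.jl (local checkout): raise the debug level so the
# package's internal debug logging is emitted.
const DEBUG_LEVEL = 2
```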
Is that the whole log for 5 hours? (I only see ~400 lines, and ~18 GitHub API requests.) There are no timestamps and very little application context in the log. Can you increase the log level in your application so that it records some information about when/what requests are being processed?

I don't see any Requests or Responses in the log above; are you sure you've enabled the verbose option? e.g.

```julia
julia> HTTP.get("http://httpbin.org/ip"); nothing

julia> HTTP.get("http://httpbin.org/ip", verbose=2); nothing
HTTP.Messages.Request:
"""
GET /ip HTTP/1.1
Host: httpbin.org
Content-Length: 0
"""
HTTP.Messages.Response:
"""
HTTP/1.1 200 OK
Connection: keep-alive
---snip---
Via: 1.1 vegur
{
"origin": "60.224.142.5"
}
"""
```

Are you still using Julia 0.6.2, HTTP 0.6.6, and MbedTLS 0.5.7 per your note above? Please double-check that your copy of HTTP.jl has the following two lines:

- Line 98 in 2f4acd2
- Line 58 in 2f4acd2
Here is the relevant piece of the log:
We can see that …

The "holding write lock" message comes from the …

So, at this point the …

It is worth considering the fact that at the time we logged "holding write lock" the socket was known to be in "closing" (🔜💀) state. The … (Lines 94 to 110 in b602c4e)

Perhaps if we query the … (Lines 518 to 529 in b602c4e)

Can you try putting …? It is also worth considering the …
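Purely as an illustration of the idea being floated here (not HTTP.jl's actual code): the client already knows locally when a socket has entered the closing state, so checking that before taking the write lock could let the pool discard an obviously dead connection instead of writing the request body into it, although it can never detect a peer that vanished silently.

```julia
# Sketch only: check the locally known state of a pooled socket before
# committing to writing a request body on it. This catches the "closing"
# state seen in the log above, but not a peer that dropped the connection
# without the client noticing.
function usable_for_write(sock::Base.TCPSocket)
    return isopen(sock)
end
```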
Another thought: it's hard to tell from these logs because there are no timestamps, but if we assume that the root cause is that we're trying to reuse a connection that the other end has given up on, then setting … See: #214 (comment)
Nanosoldier is down for some large-scale on-site maintenance project at MIT, with no estimate for when it will be back up. So for now I unfortunately can't try out any other suggestions or debugging configurations.
It is indeed. That is the contents of the log from when I started the server to when I had to shut it down. The API requests are from GitHub events on the Julia repo that Nanosoldier doesn't ignore.
Improving Nanosoldier's logging has been on my to-do list for a while, but I have no estimate for when I'll have time to work on that, as I'm no longer working full time on Julia.
All I did was change the default value of …
On one hand I'd like to dig deeper here to see if we can learn something that will benefit HTTP.jl. The logs above show a situation where it becomes clear that the connection is dead shortly after the body is sent. It is tempting to try to find a way to pre-empt this situation and avoid sending the body. But I have a feeling that whatever way we come up with to do this will in effect involve waiting for a network round-trip delay, and waiting for a round trip before sending every POST request is not desirable in general. It seems much better to buffer/send requests with the assumption that the connection is OK (which it is most of the time) and deal with the occasional state-aware retry at the application layer (where it must be done anyway, because we can never know if the connection fails for other reasons after the body transmission).

To deal with the immediate practical problem in Nanosoldier's use case, I recommend modifying GitHub.jl to: …
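A rough sketch of the kind of application-level, state-aware retry loop described above (illustrative only; gh_create_status is a placeholder, not an actual GitHub.jl function):

```julia
# Illustrative application-level retry for a call that HTTP.jl itself will not
# retry (a POST whose body has already been sent). gh_create_status stands in
# for whatever GitHub.jl call Nanosoldier actually makes.
function post_status_with_retry(args...; attempts = 3)
    for attempt in 1:attempts
        try
            return gh_create_status(args...)
        catch err
            # Only retry errors that look like a dropped or stale connection;
            # anything else is rethrown so genuine failures are not masked.
            if err isa EOFError && attempt < attempts
                sleep(2^attempt)   # simple backoff before the next attempt
                continue
            end
            rethrow()
        end
    end
end
```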
Closing. See JuliaWeb/GitHub.jl#109 (comment) |
Was removed in b602c4e:

> Remove yield() after @async writebody. This should increase the chance
> that the startread() realises that the connection is dead before the
> body is written.

This change appears to have caused problems in other situations: JuliaCloud/AWSS3.jl#26 (comment). The removal was driven by nanosoldier/GitHub.jl issues with POST requests on long-held connections: #220 (comment). GitHub.jl now handles this with the `idle_timeout=` option: JuliaWeb/GitHub.jl#109
When a connection is returned to the (read) pool, add a monitor to it for receiving unexpected data (or EOF), and kill/close the Connection object if any activity occurs before the next write (when it should have simply been waiting idle in the pool), per JuliaLang/MbedTLS.jl#145 (comment)

closes #214, closes #199, closes #220, closes JuliaWeb/GitHub.jl#106
* ConnectionPool: monitor idle connections

  When a connection is returned to the (read) pool, add a monitor to it for receiving unexpected data (or EOF), and kill/close the Connection object if any activity occurs before the next write (when it should have simply been waiting idle in the pool), per JuliaLang/MbedTLS.jl#145 (comment)

  closes #214, closes #199, closes #220, closes JuliaWeb/GitHub.jl#106

* Encapsulate read|writebusy/sequence/count logic in new isbusy function.
  Move close() on eof() || !isbusy() to new monitor_idle_connection function.
  Make monitor_idle_connection() a noop for ::Connection{SSLContext}

* require Julia 0.6.3 #236 (comment)
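A simplified sketch of what that monitoring might look like (illustrative only, not the actual ConnectionPool implementation):

```julia
# Simplified sketch of the monitoring idea above, not HTTP.jl's actual code.
# When a connection goes back into the read pool, watch it in the background:
# EOF or any unsolicited bytes while it is supposed to be idle means the peer
# has given up on it, so close it instead of reusing it. (A real implementation
# also has to stand the monitor down once the connection is checked out again
# for the next request.)
function monitor_idle_connection(io)
    @async try
        eof(io)        # blocks until the peer closes or sends unexpected data
        close(io)      # either way, the pooled connection is no longer trustworthy
    catch
        isopen(io) && close(io)
    end
end
```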
Seen on Nanosoldier: