#4685 EventQueueGet shouldn't use llcorehttp's backoff-retry logic
- Event poll already has its own backoff logic with 10 retries; layering llcorehttp's retry logic on top of it makes no sense. - Better logging coverage - Better in-code documentation - Add timing-based error detection
master
parent
e33b5c5e6c
commit
42b09b236b
|
|
@ -54,13 +54,6 @@ namespace Details
|
||||||
void stop();
|
void stop();
|
||||||
|
|
||||||
private:
|
private:
|
||||||
// We will wait RETRY_SECONDS + (errorCount * RETRY_SECONDS_INC) before retrying after an error.
|
|
||||||
// This means we attempt to recover relatively quickly but back off giving more time to recover
|
|
||||||
// until we finally give up after MAX_EVENT_POLL_HTTP_ERRORS attempts.
|
|
||||||
static const F32 EVENT_POLL_ERROR_RETRY_SECONDS;
|
|
||||||
static const F32 EVENT_POLL_ERROR_RETRY_SECONDS_INC;
|
|
||||||
static const S32 MAX_EVENT_POLL_HTTP_ERRORS;
|
|
||||||
|
|
||||||
void eventPollCoro(std::string url);
|
void eventPollCoro(std::string url);
|
||||||
|
|
||||||
void handleMessage(const LLSD &content);
|
void handleMessage(const LLSD &content);
|
||||||
|
|
@ -76,9 +69,13 @@ namespace Details
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
const F32 LLEventPollImpl::EVENT_POLL_ERROR_RETRY_SECONDS = 15.f; // ~ half of a normal timeout.
|
// We will wait RETRY_SECONDS + (errorCount * RETRY_SECONDS_INC) before retrying after an error.
|
||||||
const F32 LLEventPollImpl::EVENT_POLL_ERROR_RETRY_SECONDS_INC = 5.f; // ~ half of a normal timeout.
|
// This means we attempt to recover relatively quickly but back off giving more time to recover
|
||||||
const S32 LLEventPollImpl::MAX_EVENT_POLL_HTTP_ERRORS = 10; // ~5 minutes, by the above rules.
|
// until we finally give up after MAX_EVENT_POLL_HTTP_ERRORS attempts.
|
||||||
|
constexpr F32 EVENT_POLL_ERROR_RETRY_SECONDS = 15.f; // ~ half of a normal timeout.
|
||||||
|
constexpr F32 EVENT_POLL_ERROR_RETRY_SECONDS_INC = 5.f; // Extra seconds added to the retry wait per counted error (multiplied by errorCount).
|
||||||
|
constexpr S32 MAX_EVENT_POLL_HTTP_ERRORS = 10; // ~5 minutes, by the above rules.
|
||||||
|
constexpr F64 MIN_SECONDS_PASSED = 10.0; // Minimum time we expect the server to hold the request.
|
||||||
|
|
||||||
int LLEventPollImpl::sNextCounter = 1;
|
int LLEventPollImpl::sNextCounter = 1;
|
||||||
|
|
||||||
|
|
@ -151,11 +148,17 @@ namespace Details
|
||||||
LLSD acknowledge;
|
LLSD acknowledge;
|
||||||
int errorCount = 0;
|
int errorCount = 0;
|
||||||
int counter = mCounter; // saved on the stack for logging.
|
int counter = mCounter; // saved on the stack for logging.
|
||||||
|
LLTimer message_time;
|
||||||
|
|
||||||
LL_DEBUGS("LLEventPollImpl") << " <" << counter << "> entering coroutine." << LL_ENDL;
|
LL_DEBUGS("LLEventPollImpl") << " <" << counter << "> entering coroutine." << LL_ENDL;
|
||||||
|
|
||||||
mAdapter = httpAdapter;
|
mAdapter = httpAdapter;
|
||||||
|
|
||||||
|
// This is a loop with its own waitToRetry implementation,
|
||||||
|
// so disable retries.
|
||||||
|
LLCore::HttpOptions::ptr_t httpOpts(new LLCore::HttpOptions);
|
||||||
|
httpOpts->setRetries(0);
|
||||||
|
|
||||||
LL::WorkQueue::ptr_t main_queue = nullptr;
|
LL::WorkQueue::ptr_t main_queue = nullptr;
|
||||||
|
|
||||||
// HACK -- grab the mainloop workqueue to move execution of the handler
|
// HACK -- grab the mainloop workqueue to move execution of the handler
|
||||||
|
|
@ -172,11 +175,13 @@ namespace Details
|
||||||
request["ack"] = acknowledge;
|
request["ack"] = acknowledge;
|
||||||
request["done"] = mDone;
|
request["done"] = mDone;
|
||||||
|
|
||||||
|
message_time.reset();
|
||||||
|
|
||||||
// LL_DEBUGS("LLEventPollImpl::eventPollCoro") << "<" << counter << "> request = "
|
// LL_DEBUGS("LLEventPollImpl::eventPollCoro") << "<" << counter << "> request = "
|
||||||
// << LLSDXMLStreamer(request) << LL_ENDL;
|
// << LLSDXMLStreamer(request) << LL_ENDL;
|
||||||
|
|
||||||
LL_DEBUGS("LLEventPollImpl") << " <" << counter << "> posting and yielding." << LL_ENDL;
|
LL_DEBUGS("LLEventPollImpl") << " <" << counter << "> posting and yielding." << LL_ENDL;
|
||||||
LLSD result = httpAdapter->postAndSuspend(mHttpRequest, url, request);
|
LLSD result = httpAdapter->postAndSuspend(mHttpRequest, url, request, httpOpts);
|
||||||
|
|
||||||
// LL_DEBUGS("LLEventPollImpl::eventPollCoro") << "<" << counter << "> result = "
|
// LL_DEBUGS("LLEventPollImpl::eventPollCoro") << "<" << counter << "> result = "
|
||||||
// << LLSDXMLStreamer(result) << LL_ENDL;
|
// << LLSDXMLStreamer(result) << LL_ENDL;
|
||||||
|
|
@ -194,11 +199,30 @@ namespace Details
|
||||||
|
|
||||||
if (!status)
|
if (!status)
|
||||||
{
|
{
|
||||||
if (status == LLCore::HttpStatus(LLCore::HttpStatus::EXT_CURL_EASY, CURLE_OPERATION_TIMEDOUT))
|
if (status == LLCore::HttpStatus(LLCore::HttpStatus::EXT_CURL_EASY, CURLE_OPERATION_TIMEDOUT) // A standard timeout, no events.
|
||||||
{ // A standard timeout response we get this when there are no events.
|
|| status == LLCore::HttpStatus(HTTP_BAD_GATEWAY) // An expected 'No events' case.
|
||||||
LL_DEBUGS("LLEventPollImpl") << "All is very quiet on target server. It may have gone idle?" << LL_ENDL;
|
|| status == LLCore::HttpStatus(HTTP_INTERNAL_ERROR)
|
||||||
errorCount = 0;
|
|| status == LLCore::HttpStatus(HTTP_SERVICE_UNAVAILABLE)
|
||||||
continue;
|
|| status == LLCore::HttpStatus(HTTP_GATEWAY_TIME_OUT))
|
||||||
|
{
|
||||||
|
if (message_time.getElapsedSeconds() < MIN_SECONDS_PASSED)
|
||||||
|
{
|
||||||
|
// Server is supposed to hold request for 20 to 30 seconds.
|
||||||
|
// If it didn't hold the request at least for 10s, treat as an error.
|
||||||
|
LL_WARNS("LLEventPollImpl") << "Response arrived too early, status: " << status.toTerseString()
|
||||||
|
<< ", time passed: " << message_time.getElapsedSeconds() << LL_ENDL;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// Timeout, expected and means 'no events'. Request is to be re-issued immediately.
|
||||||
|
// Current definition of a timeout is any of :
|
||||||
|
// - libcurl easy 28 status code
|
||||||
|
// - Linden 499 special http status code
|
||||||
|
// - RFC-standard 502-504 HTTP status codes
|
||||||
|
LL_DEBUGS("LLEventPollImpl") << "No events, from: " << mSenderIp <<" status: " << (S32)status.getStatus() << LL_ENDL;
|
||||||
|
errorCount = 0;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else if ((status == LLCore::HttpStatus(LLCore::HttpStatus::LLCORE, LLCore::HE_OP_CANCELED)) ||
|
else if ((status == LLCore::HttpStatus(LLCore::HttpStatus::LLCORE, LLCore::HE_OP_CANCELED)) ||
|
||||||
(status == LLCore::HttpStatus(HTTP_NOT_FOUND)))
|
(status == LLCore::HttpStatus(HTTP_NOT_FOUND)))
|
||||||
|
|
@ -206,13 +230,13 @@ namespace Details
|
||||||
// some cases the server gets ahead of the viewer and will
|
// some cases the server gets ahead of the viewer and will
|
||||||
// return a 404 error (Not Found) before the cancel event
|
// return a 404 error (Not Found) before the cancel event
|
||||||
// comes back in the queue
|
// comes back in the queue
|
||||||
LL_WARNS("LLEventPollImpl") << "Canceling coroutine" << LL_ENDL;
|
LL_WARNS("LLEventPollImpl") << "<" << counter << "> Canceling coroutine, status: " << status.toTerseString() << LL_ENDL;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
else if (!status.isHttpStatus())
|
else if (!status.isHttpStatus())
|
||||||
{
|
{
|
||||||
/// Some LLCore or LIBCurl error was returned. This is unlikely to be recoverable
|
/// Some LLCore or LIBCurl error was returned. This is unlikely to be recoverable
|
||||||
LL_WARNS("LLEventPollImpl") << "Critical error from poll request returned from libraries. Canceling coroutine." << LL_ENDL;
|
LL_WARNS("LLEventPollImpl") << "<" << counter << "> Critical error from poll request returned from libraries. Canceling coroutine." << LL_ENDL;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
LL_WARNS("LLEventPollImpl") << "<" << counter << "> Error result from LLCoreHttpUtil::HttpCoroHandler. Code "
|
LL_WARNS("LLEventPollImpl") << "<" << counter << "> Error result from LLCoreHttpUtil::HttpCoroHandler. Code "
|
||||||
|
|
@ -255,6 +279,10 @@ namespace Details
|
||||||
LL_WARNS("LLEventPollImpl") << "< " << counter << "> Forcing disconnect due to stalled main region event poll." << LL_ENDL;
|
LL_WARNS("LLEventPollImpl") << "< " << counter << "> Forcing disconnect due to stalled main region event poll." << LL_ENDL;
|
||||||
LLAppViewer::instance()->forceDisconnect(LLTrans::getString("AgentLostConnection"));
|
LLAppViewer::instance()->forceDisconnect(LLTrans::getString("AgentLostConnection"));
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
LL_WARNS("LLEventPollImpl") << "< " << counter << "> Stopping event poll for " << mSenderIp << " due to failures." << LL_ENDL;
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -40,7 +40,30 @@ namespace Details
|
||||||
|
|
||||||
|
|
||||||
class LLEventPoll
|
class LLEventPoll
|
||||||
///< implements the viewer side of server-to-viewer pushed events.
|
///< Implements the viewer side of server-to-viewer pushed events.
|
||||||
|
///
|
||||||
|
/// This class implements the sole consumer of the EventQueueGet capability
|
||||||
|
/// and delivers data, including llsd-encoded llmessage payloads, from
|
||||||
|
/// simulator to viewer.
|
||||||
|
///
|
||||||
|
/// https://wiki.secondlife.com/wiki/EventQueueGet
|
||||||
|
/// The wiki page is neither complete nor entirely correct. Request timeouts
|
||||||
|
/// became the de facto method of returning an empty event set to the viewer.
|
||||||
|
/// But the timeout behavior was never defined. It was simply whatever
|
||||||
|
/// behavior a given grid implementation implemented.
|
||||||
|
///
|
||||||
|
/// In SL's case, the path may include reverse proxies, http caches, http and
|
||||||
|
/// socks proxies, transparent hijacking, and other horrors. A pitfall for
|
||||||
|
/// implementors.
|
||||||
|
///
|
||||||
|
/// Current definition of a timeout is any of :
|
||||||
|
/// - libcurl easy 28 status code
|
||||||
|
/// - Linden 499 special http status code
|
||||||
|
/// - RFC-standard 502-504 HTTP status codes
|
||||||
|
/// If requests are failing too quickly with the above errors, they are treated
|
||||||
|
/// as actual errors and not an empty payload. These will count towards a final
|
||||||
|
/// error declaration and can lead to disconnection from a simulator or the
|
||||||
|
/// entire grid.
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
LLEventPoll(const std::string& pollURL, const LLHost& sender);
|
LLEventPoll(const std::string& pollURL, const LLHost& sender);
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue