-
Notifications
You must be signed in to change notification settings - Fork 212
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
PROTON-2818: Move epoll proactor connection logic to a task thread. #427
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -52,6 +52,8 @@ struct praw_connection_t { | |
bool disconnected; | ||
bool hup_detected; | ||
bool read_check; | ||
bool first_schedule; | ||
char *taddr; | ||
}; | ||
|
||
static void psocket_error(praw_connection_t *rc, int err, const char* msg) { | ||
|
@@ -145,6 +147,8 @@ static void praw_connection_init(praw_connection_t *prc, pn_proactor_t *p, pn_ra | |
|
||
prc->connected = false; | ||
prc->disconnected = false; | ||
prc->first_schedule = false; | ||
prc->taddr = NULL; | ||
prc->batch.next_event = pni_raw_batch_next; | ||
|
||
pmutex_init(&prc->rearm_mutex); | ||
|
@@ -163,6 +167,7 @@ static void praw_connection_cleanup(praw_connection_t *prc) { | |
task_finalize(&prc->task); | ||
if (prc->addrinfo) | ||
freeaddrinfo(prc->addrinfo); | ||
free(prc->taddr); | ||
free(prc); | ||
} | ||
// else proactor_disconnect logic owns prc and its final free | ||
|
@@ -177,39 +182,43 @@ pn_raw_connection_t *pn_raw_connection(void) { | |
return &conn->raw_connection; | ||
} | ||
|
||
void pn_proactor_raw_connect(pn_proactor_t *p, pn_raw_connection_t *rc, const char *addr) { | ||
assert(rc); | ||
praw_connection_t *prc = containerof(rc, praw_connection_t, raw_connection); | ||
praw_connection_init(prc, p, rc); | ||
// TODO: check case of proactor shutting down | ||
|
||
lock(&prc->task.mutex); | ||
proactor_add(&prc->task); | ||
|
||
bool notify = false; | ||
|
||
// Call from pconnection_process with task lock held. | ||
static void praw_connection_first_connect_lh(praw_connection_t *prc) { | ||
const char *host; | ||
const char *port; | ||
size_t addrlen = strlen(addr); | ||
char *addr_buf = (char*) alloca(addrlen+1); | ||
pni_parse_addr(addr, addr_buf, addrlen+1, &host, &port); | ||
|
||
unlock(&prc->task.mutex); | ||
size_t addrlen = strlen(prc->taddr); | ||
char *addr_buf = (char*) alloca(addrlen+1); | ||
pni_parse_addr(prc->taddr, addr_buf, addrlen+1, &host, &port); | ||
int gai_error = pgetaddrinfo(host, port, 0, &prc->addrinfo); | ||
lock(&prc->task.mutex); | ||
|
||
if (!gai_error) { | ||
prc->ai = prc->addrinfo; | ||
praw_connection_maybe_connect_lh(prc); /* Start connection attempts */ | ||
if (prc->disconnected) notify = schedule(&prc->task); | ||
} else { | ||
psocket_gai_error(prc, gai_error, "connect to ", addr); | ||
prc->disconnected = true; | ||
notify = schedule(&prc->task); | ||
lock(&p->task.mutex); | ||
notify |= schedule_if_inactive(p); | ||
unlock(&p->task.mutex); | ||
psocket_gai_error(prc, gai_error, "connect to ", prc->taddr); | ||
} | ||
} | ||
|
||
void pn_proactor_raw_connect(pn_proactor_t *p, pn_raw_connection_t *rc, const char *addr) { | ||
// Called from an arbitrary thread. Do setup prior to getaddrinfo, then switch to a worker thread. | ||
assert(rc); | ||
praw_connection_t *prc = containerof(rc, praw_connection_t, raw_connection); | ||
praw_connection_init(prc, p, rc); | ||
// TODO: check case of proactor shutting down | ||
|
||
/* We need to issue INACTIVE on immediate failure */ | ||
lock(&prc->task.mutex); | ||
size_t addrlen = strlen(addr); | ||
prc->taddr = (char*) malloc(addrlen+1); | ||
assert(prc->taddr); // TODO: memory safety | ||
memcpy(prc->taddr, addr, addrlen+1); | ||
prc->first_schedule = true; // Resume connection setup when next scheduled. | ||
proactor_add(&prc->task); | ||
bool notify = schedule(&prc->task); | ||
unlock(&prc->task.mutex); | ||
|
||
if (notify) notify_poller(p); | ||
} | ||
|
||
|
@@ -413,6 +422,19 @@ pn_event_batch_t *pni_raw_connection_process(task_t *t, uint32_t io_events, bool | |
} | ||
if (events & EPOLLOUT) | ||
praw_connection_connected_lh(rc); | ||
if (rc->first_schedule) { | ||
// Normal case: resumed logic from pn_proactor_raw_connect. | ||
// But possible tie: pn_raw_connection_wake() | ||
// Defer wake check until getaddrinfo is done. | ||
rc->first_schedule = false; | ||
assert(!events); // No socket yet. | ||
praw_connection_first_connect_lh(rc); // Drops and reacquires lock. | ||
if (rc->psocket.epoll_io.fd != -1 && !pni_task_wake_pending(&rc->task)) { | ||
unlock(&rc->task.mutex); | ||
return NULL; | ||
} | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. I wonder if this code should be the first piece of code in the possibilities, as it is the first that should happen; currently the logic to kick off the connect is first; but now the logic to do the lookup must be earlier in the lifecycle of the connection so for clarity it should be the first condition in the sequence (unless the semantics mean that this doesn't work for some reason). There was a problem hiding this comment. Choose a reason for hiding this comment The reason will be displayed to describe this comment to others. Learn more. Hopefully addressed in the reworked version. Agreed about the lifecycle state issue. This proposed fix strives for the minimal code logic changes to move the blocking activity to a different thread. The subsequent "real fix" for the parent JIRA will necessarily introduce a new state (presumably with early cancel option compared to the current blocked-until-done). The initiating of the getaddrinfo call will also presumably be sensibly moved back to the pn_xxx_connect call to avoid a pointless thread switch and the first_call boolean will have no purpose. |
||
|
||
unlock(&rc->task.mutex); | ||
return &rc->batch; | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
typo? "possibility of"?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not a typo but obviously not clear. While the original logic is preserved as much as possible, the dropping of the task lock and context switch allows competitor threads that were not possible prior to this change. Either of those two calls is possible from an arbitrary thread between the setting of first_schedule and arriving at this code.
A comment which doesn't make sense on its own is obviously not helpful. I will try to rework the comments and code structure for clarity on their own.