2011-10-23 21:21:39 +01:00
|
|
|
/* A simple test of the rx event layer */
|
|
|
|
|
|
|
|
#include <afsconfig.h>
|
|
|
|
#include <afs/param.h>
|
|
|
|
|
|
|
|
#include <roken.h>
|
|
|
|
#include <pthread.h>
|
|
|
|
|
2012-05-07 21:52:16 +01:00
|
|
|
#include <tests/tap/basic.h>
|
2011-10-23 21:21:39 +01:00
|
|
|
|
|
|
|
#include "rx/rx_event.h"
|
|
|
|
#include "rx/rx_clock.h"
|
|
|
|
|
|
|
|
#define NUMEVENTS 10000
|
|
|
|
|
|
|
|
/* Mutexes and condvars for the scheduler */
|
|
|
|
static int rescheduled = 0;
|
|
|
|
static pthread_mutex_t eventMutex;
|
|
|
|
static pthread_cond_t eventCond;
|
|
|
|
|
|
|
|
/* Mutexes and condvars for the event list */
|
|
|
|
|
|
|
|
static pthread_mutex_t eventListMutex;
|
|
|
|
struct testEvent {
|
|
|
|
struct rxevent *event;
|
|
|
|
int fired;
|
|
|
|
int cancelled;
|
|
|
|
};
|
|
|
|
|
|
|
|
static struct testEvent events[NUMEVENTS];
|
|
|
|
|
|
|
|
static void
|
|
|
|
reschedule(void)
|
|
|
|
{
|
|
|
|
pthread_mutex_lock(&eventMutex);
|
|
|
|
pthread_cond_signal(&eventCond);
|
|
|
|
rescheduled = 1;
|
|
|
|
pthread_mutex_unlock(&eventMutex);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
eventSub(struct rxevent *event, void *arg, void *arg1, int arg2)
|
|
|
|
{
|
|
|
|
struct testEvent *evrecord = arg;
|
|
|
|
|
Standardize rx_event usage
Go over all consumers of the rx event framework and normalize its usage
according to the following principles:
rxevent_Post() is used to create an event, and it returns an event
handle (with a reference on the event structure) that can be used
to cancel the event before its timeout fires. (There is also an
additional reference on the event held by the global event tree.)
In all(*) usage within the tree, that event handle is stored within
either an rx_connection or an rx_call. Reads/writes to the member variable
that holds the event handle require either the conn_data_lock or call
lock, respectively -- that means that in most cases, callers of
rxevent_Post() and rxevent_Cancel() will be holding one of those
aforementioned locks. The event handlers themselves will need to
modify the call/connection object according to the nature of the
event, which requires holding those same locks, and also a guarantee
that the call/connection is still a live object and has not been
deallocated! Whether or not rxevent_Cancel() succeeds in cancelling
the event before it fires, whenever passed a non-NULL event structure
it will NULL out the supplied pointer and drop a reference on the
event structure. This is the correct behavior, since the caller
has asked to cancel the event and has no further use for the event
handle or its reference on the event structure. The caller of
rxevent_Cancel() must check its return value to know whether or
not the event was cancelled before its handler was able to run.
The interaction window between the call/connection lock and the lock
protecting the red/black tree of pending events opens up a somewhat
problematic race window. Because the application thread is expected
to hold the call/connection lock around rxevent_Cancel() (to protect
the write to the field in the call/connection structure that holds
an event handle), and rxevent_Cancel() must take the lock protecting
the red/black tree of events, this establishes a lock order with the
call/connection lock taken before the eventTree lock. This is in
conflict with the event handler thread, which must take the eventTree
lock first, in order to select an event to run (and thus know what
additional lock would need to be taken, by virtue of what handler
function is to be run). The conflict is easy to resolve in the
standard way, by having a local pointer to the event that is obtained
while the event is removed from the red/black tree under the eventTree
lock, and then the eventTree lock can be dropped and the event run
based on the local variable referring to it. The race window occurs
when the caller of rxevent_Cancel() holds the call/connection lock,
and rxevent_Cancel() obtains the eventTree lock just after the event
handler thread drops it in order to run the event. The event handler
function begins to execute, and immediately blocks trying to obtain
the call/connection lock. Now that rxevent_Cancel() has the eventTree
lock it can proceed to search the tree, fail to find the indicated event
in the tree, clear out the event pointer from the call/connection
data structure, drop its caller's reference to the event structure,
and return failure (the event was not cancelled). Only then does the
caller of rxevent_Cancel() drop the call/connection lock and allow
the event handler to make progress.
This race is not necessarily problematic if appropriate care is taken,
but in the previous code such was not the case. In particular, it
is a common idiom for the firing event to call rxevent_Put() on itself,
to release the handle stored in the call/connection that could have
been used to cancel the event before it fired. Failing to do so would
result in a memory leak of event structures; however, rxevent_Put() does
not check for a NULL argument, so a segfault (NULL dereference) was
observed in the test suite when the race occurred and the event handler
tried to rxevent_Put() the reference that had already been released by
the unsuccessful rxevent_Cancel() call. Upon inspection, many (but not
all) of the uses in rx.c were susceptible to a similar race condition
and crash.
The test suite also papers over a related issue in that the event handler
in the test suite always knows that the data structure containing the
event handle will remain live, since it is a global array that is allocated
for the entire scope of the test. In rx.c, events are associated with
calls and connections that have a finite lifetime, so we need to take care
to ensure that the call/connection pointer stored in the event remains
valid for the duration of the event's lifecycle. In particular, even an
attempt to take the call/connection lock to check whether the corresponding
event field is NULL is fraught with risk, as it could crash if the lock
(and containing call/connection) has already been destroyed! There are
several potential ways to ensure the liveness of the associated
call/connection while the event handler runs, most notably to take care
in the call/connection destruction path to ensure that all associated
events are either successfully cancelled or run to completion before
tearing down the call/connection structure, and to give the pending event
its own reference on the associated call/connection. Here, we opt for
the latter, acknowledging that this may result in the event handler thread
doing the full call/connection teardown and delay the firing of subsequent
events. This is deemed acceptable, as pending events are for intentionally
delayed tasks, and some extra delay is probably acceptable. (The various
keepalive events and the challenge event could delay the user experience
and/or security properties if significantly delayed, but I do not believe
that this change admits completely unbounded delay in the event handler
thread, so the practical risk seems minimal.)
Accordingly, this commit attempts to ensure that:
* Each event holds a formal reference on its associated call/connection.
* The appropriate lock is held for all accesses to event pointers in
call/connection structures.
* Each event handler (after taking the appropriate lock) checks whether
it raced with rxevent_Cancel() and only drops the call/connection's
reference to the event if the race did not occur.
* Each event handler drops its reference to the associated call/connection
*after* doing any actions that might access/modify the call/connection.
* The per-event reference on the associated call/connection is dropped by
the thread that removes the event from the red/black tree. That is,
the event handler function if the event runs, or by the caller of
rxevent_Cancel() when the cancellation succeed.
* No non-NULL event handles remain in a call/connection being destroyed,
which would indicate a refcounting error.
(*) There is an additional event used in practice, to reap old connections,
but it is effectively a background task that reschedules itself
periodically, with no handle to the event retained so as to be able
to cancel it. As such, it is unaffected by the concerns raised here.
While here, standardize on the rx_GetConnection() function for incrementing
the reference count on a connection object, instead of inlining the
corresponding mutex lock/unlock and variable access.
Also enable refcount checking unconditionally on unix, as this is a
rather invasive change late in the 1.8.0 release process and we want
to get as much sanity checking coverage as possible.
Change-Id: I27bcb932ec200ff20364fb1b83ea811221f9871c
Reviewed-on: https://gerrit.openafs.org/12756
Reviewed-by: Mark Vitale <mvitale@sinenomine.net>
Reviewed-by: Michael Meffie <mmeffie@sinenomine.net>
Tested-by: BuildBot <buildbot@rampaginggeek.com>
Reviewed-by: Benjamin Kaduk <kaduk@mit.edu>
2017-10-08 04:42:38 +01:00
|
|
|
/*
|
|
|
|
* The eventListMutex protects the contents of fields in the global
|
|
|
|
* 'events' array, including reading/writing evrecord->event.
|
|
|
|
* However, in this test code, we have an additional guarantee that
|
|
|
|
* the events array will remain allocated for the duration of the test,
|
|
|
|
* and as such that it is safe to dereference |evrecord| at all. In real
|
|
|
|
* application code where the passed args are pointers to allocated data
|
|
|
|
* structures with finite lifetime, the programmer must ensure that the
|
|
|
|
* firing event can safely access these fields (i.e., that the object
|
|
|
|
* lifetime does not permit the object to be destroyed while an event
|
|
|
|
* pointing to it is outstanding or in progress). The simplest way to
|
|
|
|
* do this (for reference counted objects) is to have the pending event
|
|
|
|
* hold a reference on the pointed-to object. This reference should be
|
|
|
|
* dropped at the end of the event handler or if the event is
|
|
|
|
* (successfully!) cancelled before it fires. Other strategies are also
|
|
|
|
* possible, such as deferring object destruction until after all pending
|
|
|
|
* events have run or gotten cancelled, noting that the calling code must
|
|
|
|
* take care to allow the event handler to obtain any needed locks and
|
|
|
|
* avoid deadlock.
|
|
|
|
*/
|
2011-10-23 21:21:39 +01:00
|
|
|
pthread_mutex_lock(&eventListMutex);
|
Standardize rx_event usage
Go over all consumers of the rx event framework and normalize its usage
according to the following principles:
rxevent_Post() is used to create an event, and it returns an event
handle (with a reference on the event structure) that can be used
to cancel the event before its timeout fires. (There is also an
additional reference on the event held by the global event tree.)
In all(*) usage within the tree, that event handle is stored within
either an rx_connection or an rx_call. Reads/writes to the member variable
that holds the event handle require either the conn_data_lock or call
lock, respectively -- that means that in most cases, callers of
rxevent_Post() and rxevent_Cancel() will be holding one of those
aforementioned locks. The event handlers themselves will need to
modify the call/connection object according to the nature of the
event, which requires holding those same locks, and also a guarantee
that the call/connection is still a live object and has not been
deallocated! Whether or not rxevent_Cancel() succeeds in cancelling
the event before it fires, whenever passed a non-NULL event structure
it will NULL out the supplied pointer and drop a reference on the
event structure. This is the correct behavior, since the caller
has asked to cancel the event and has no further use for the event
handle or its reference on the event structure. The caller of
rxevent_Cancel() must check its return value to know whether or
not the event was cancelled before its handler was able to run.
The interaction window between the call/connection lock and the lock
protecting the red/black tree of pending events opens up a somewhat
problematic race window. Because the application thread is expected
to hold the call/connection lock around rxevent_Cancel() (to protect
the write to the field in the call/connection structure that holds
an event handle), and rxevent_Cancel() must take the lock protecting
the red/black tree of events, this establishes a lock order with the
call/connection lock taken before the eventTree lock. This is in
conflict with the event handler thread, which must take the eventTree
lock first, in order to select an event to run (and thus know what
additional lock would need to be taken, by virtue of what handler
function is to be run). The conflict is easy to resolve in the
standard way, by having a local pointer to the event that is obtained
while the event is removed from the red/black tree under the eventTree
lock, and then the eventTree lock can be dropped and the event run
based on the local variable referring to it. The race window occurs
when the caller of rxevent_Cancel() holds the call/connection lock,
and rxevent_Cancel() obtains the eventTree lock just after the event
handler thread drops it in order to run the event. The event handler
function begins to execute, and immediately blocks trying to obtain
the call/connection lock. Now that rxevent_Cancel() has the eventTree
lock it can proceed to search the tree, fail to find the indicated event
in the tree, clear out the event pointer from the call/connection
data structure, drop its caller's reference to the event structure,
and return failure (the event was not cancelled). Only then does the
caller of rxevent_Cancel() drop the call/connection lock and allow
the event handler to make progress.
This race is not necessarily problematic if appropriate care is taken,
but in the previous code such was not the case. In particular, it
is a common idiom for the firing event to call rxevent_Put() on itself,
to release the handle stored in the call/connection that could have
been used to cancel the event before it fired. Failing to do so would
result in a memory leak of event structures; however, rxevent_Put() does
not check for a NULL argument, so a segfault (NULL dereference) was
observed in the test suite when the race occurred and the event handler
tried to rxevent_Put() the reference that had already been released by
the unsuccessful rxevent_Cancel() call. Upon inspection, many (but not
all) of the uses in rx.c were susceptible to a similar race condition
and crash.
The test suite also papers over a related issue in that the event handler
in the test suite always knows that the data structure containing the
event handle will remain live, since it is a global array that is allocated
for the entire scope of the test. In rx.c, events are associated with
calls and connections that have a finite lifetime, so we need to take care
to ensure that the call/connection pointer stored in the event remains
valid for the duration of the event's lifecycle. In particular, even an
attempt to take the call/connection lock to check whether the corresponding
event field is NULL is fraught with risk, as it could crash if the lock
(and containing call/connection) has already been destroyed! There are
several potential ways to ensure the liveness of the associated
call/connection while the event handler runs, most notably to take care
in the call/connection destruction path to ensure that all associated
events are either successfully cancelled or run to completion before
tearing down the call/connection structure, and to give the pending event
its own reference on the associated call/connection. Here, we opt for
the latter, acknowledging that this may result in the event handler thread
doing the full call/connection teardown and delay the firing of subsequent
events. This is deemed acceptable, as pending events are for intentionally
delayed tasks, and some extra delay is probably acceptable. (The various
keepalive events and the challenge event could delay the user experience
and/or security properties if significantly delayed, but I do not believe
that this change admits completely unbounded delay in the event handler
thread, so the practical risk seems minimal.)
Accordingly, this commit attempts to ensure that:
* Each event holds a formal reference on its associated call/connection.
* The appropriate lock is held for all accesses to event pointers in
call/connection structures.
* Each event handler (after taking the appropriate lock) checks whether
it raced with rxevent_Cancel() and only drops the call/connection's
reference to the event if the race did not occur.
* Each event handler drops its reference to the associated call/connection
*after* doing any actions that might access/modify the call/connection.
* The per-event reference on the associated call/connection is dropped by
the thread that removes the event from the red/black tree. That is,
the event handler function if the event runs, or by the caller of
rxevent_Cancel() when the cancellation succeed.
* No non-NULL event handles remain in a call/connection being destroyed,
which would indicate a refcounting error.
(*) There is an additional event used in practice, to reap old connections,
but it is effectively a background task that reschedules itself
periodically, with no handle to the event retained so as to be able
to cancel it. As such, it is unaffected by the concerns raised here.
While here, standardize on the rx_GetConnection() function for incrementing
the reference count on a connection object, instead of inlining the
corresponding mutex lock/unlock and variable access.
Also enable refcount checking unconditionally on unix, as this is a
rather invasive change late in the 1.8.0 release process and we want
to get as much sanity checking coverage as possible.
Change-Id: I27bcb932ec200ff20364fb1b83ea811221f9871c
Reviewed-on: https://gerrit.openafs.org/12756
Reviewed-by: Mark Vitale <mvitale@sinenomine.net>
Reviewed-by: Michael Meffie <mmeffie@sinenomine.net>
Tested-by: BuildBot <buildbot@rampaginggeek.com>
Reviewed-by: Benjamin Kaduk <kaduk@mit.edu>
2017-10-08 04:42:38 +01:00
|
|
|
if (evrecord->event != NULL)
|
|
|
|
rxevent_Put(&evrecord->event);
|
2011-10-23 21:21:39 +01:00
|
|
|
evrecord->event = NULL;
|
|
|
|
evrecord->fired = 1;
|
|
|
|
pthread_mutex_unlock(&eventListMutex);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
reportSub(struct rxevent *event, void *arg, void *arg1, int arg2)
|
|
|
|
{
|
|
|
|
printf("Event fired\n");
|
|
|
|
}
|
|
|
|
|
|
|
|
static void *
|
|
|
|
eventHandler(void *dummy) {
|
|
|
|
struct timespec nextEvent;
|
|
|
|
struct clock cv;
|
|
|
|
struct clock next;
|
|
|
|
|
|
|
|
pthread_mutex_lock(&eventMutex);
|
|
|
|
while (1) {
|
|
|
|
pthread_mutex_unlock(&eventMutex);
|
|
|
|
|
|
|
|
next.sec = 30;
|
|
|
|
next.usec = 0;
|
|
|
|
clock_GetTime(&cv);
|
|
|
|
rxevent_RaiseEvents(&next);
|
|
|
|
|
|
|
|
pthread_mutex_lock(&eventMutex);
|
|
|
|
|
|
|
|
/* If we were rescheduled whilst running the event queue,
|
|
|
|
* process the queue again */
|
|
|
|
if (rescheduled) {
|
|
|
|
rescheduled = 0;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
clock_Add(&cv, &next);
|
|
|
|
nextEvent.tv_sec = cv.sec;
|
|
|
|
nextEvent.tv_nsec = cv.usec * 1000;
|
|
|
|
pthread_cond_timedwait(&eventCond, &eventMutex, &nextEvent);
|
|
|
|
}
|
|
|
|
pthread_mutex_unlock(&eventMutex);
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
int
|
|
|
|
main(void)
|
|
|
|
{
|
|
|
|
int when, counter, fail, fired, cancelled;
|
|
|
|
struct clock now, eventTime;
|
|
|
|
struct rxevent *event;
|
|
|
|
pthread_t handler;
|
|
|
|
|
|
|
|
plan(8);
|
|
|
|
|
|
|
|
pthread_mutex_init(&eventMutex, NULL);
|
|
|
|
pthread_cond_init(&eventCond, NULL);
|
|
|
|
|
2014-09-08 18:40:48 +01:00
|
|
|
memset(events, 0, sizeof(events));
|
2011-10-23 21:21:39 +01:00
|
|
|
pthread_mutex_init(&eventListMutex, NULL);
|
|
|
|
|
|
|
|
/* Start up the event system */
|
|
|
|
rxevent_Init(20, reschedule);
|
|
|
|
ok(1, "Started event subsystem");
|
|
|
|
|
|
|
|
clock_GetTime(&now);
|
|
|
|
/* Test for a problem when there is only a single event in the tree */
|
|
|
|
event = rxevent_Post(&now, &now, reportSub, NULL, NULL, 0);
|
|
|
|
ok(event != NULL, "Created a single event");
|
Standardize rx_event usage
Go over all consumers of the rx event framework and normalize its usage
according to the following principles:
rxevent_Post() is used to create an event, and it returns an event
handle (with a reference on the event structure) that can be used
to cancel the event before its timeout fires. (There is also an
additional reference on the event held by the global event tree.)
In all(*) usage within the tree, that event handle is stored within
either an rx_connection or an rx_call. Reads/writes to the member variable
that holds the event handle require either the conn_data_lock or call
lock, respectively -- that means that in most cases, callers of
rxevent_Post() and rxevent_Cancel() will be holding one of those
aforementioned locks. The event handlers themselves will need to
modify the call/connection object according to the nature of the
event, which requires holding those same locks, and also a guarantee
that the call/connection is still a live object and has not been
deallocated! Whether or not rxevent_Cancel() succeeds in cancelling
the event before it fires, whenever passed a non-NULL event structure
it will NULL out the supplied pointer and drop a reference on the
event structure. This is the correct behavior, since the caller
has asked to cancel the event and has no further use for the event
handle or its reference on the event structure. The caller of
rxevent_Cancel() must check its return value to know whether or
not the event was cancelled before its handler was able to run.
The interaction window between the call/connection lock and the lock
protecting the red/black tree of pending events opens up a somewhat
problematic race window. Because the application thread is expected
to hold the call/connection lock around rxevent_Cancel() (to protect
the write to the field in the call/connection structure that holds
an event handle), and rxevent_Cancel() must take the lock protecting
the red/black tree of events, this establishes a lock order with the
call/connection lock taken before the eventTree lock. This is in
conflict with the event handler thread, which must take the eventTree
lock first, in order to select an event to run (and thus know what
additional lock would need to be taken, by virtue of what handler
function is to be run). The conflict is easy to resolve in the
standard way, by having a local pointer to the event that is obtained
while the event is removed from the red/black tree under the eventTree
lock, and then the eventTree lock can be dropped and the event run
based on the local variable referring to it. The race window occurs
when the caller of rxevent_Cancel() holds the call/connection lock,
and rxevent_Cancel() obtains the eventTree lock just after the event
handler thread drops it in order to run the event. The event handler
function begins to execute, and immediately blocks trying to obtain
the call/connection lock. Now that rxevent_Cancel() has the eventTree
lock it can proceed to search the tree, fail to find the indicated event
in the tree, clear out the event pointer from the call/connection
data structure, drop its caller's reference to the event structure,
and return failure (the event was not cancelled). Only then does the
caller of rxevent_Cancel() drop the call/connection lock and allow
the event handler to make progress.
This race is not necessarily problematic if appropriate care is taken,
but in the previous code such was not the case. In particular, it
is a common idiom for the firing event to call rxevent_Put() on itself,
to release the handle stored in the call/connection that could have
been used to cancel the event before it fired. Failing to do so would
result in a memory leak of event structures; however, rxevent_Put() does
not check for a NULL argument, so a segfault (NULL dereference) was
observed in the test suite when the race occurred and the event handler
tried to rxevent_Put() the reference that had already been released by
the unsuccessful rxevent_Cancel() call. Upon inspection, many (but not
all) of the uses in rx.c were susceptible to a similar race condition
and crash.
The test suite also papers over a related issue in that the event handler
in the test suite always knows that the data structure containing the
event handle will remain live, since it is a global array that is allocated
for the entire scope of the test. In rx.c, events are associated with
calls and connections that have a finite lifetime, so we need to take care
to ensure that the call/connection pointer stored in the event remains
valid for the duration of the event's lifecycle. In particular, even an
attempt to take the call/connection lock to check whether the corresponding
event field is NULL is fraught with risk, as it could crash if the lock
(and containing call/connection) has already been destroyed! There are
several potential ways to ensure the liveness of the associated
call/connection while the event handler runs, most notably to take care
in the call/connection destruction path to ensure that all associated
events are either successfully cancelled or run to completion before
tearing down the call/connection structure, and to give the pending event
its own reference on the associated call/connection. Here, we opt for
the latter, acknowledging that this may result in the event handler thread
doing the full call/connection teardown and delay the firing of subsequent
events. This is deemed acceptable, as pending events are for intentionally
delayed tasks, and some extra delay is probably acceptable. (The various
keepalive events and the challenge event could delay the user experience
and/or security properties if significantly delayed, but I do not believe
that this change admits completely unbounded delay in the event handler
thread, so the practical risk seems minimal.)
Accordingly, this commit attempts to ensure that:
* Each event holds a formal reference on its associated call/connection.
* The appropriate lock is held for all accesses to event pointers in
call/connection structures.
* Each event handler (after taking the appropriate lock) checks whether
it raced with rxevent_Cancel() and only drops the call/connection's
reference to the event if the race did not occur.
* Each event handler drops its reference to the associated call/connection
*after* doing any actions that might access/modify the call/connection.
* The per-event reference on the associated call/connection is dropped by
the thread that removes the event from the red/black tree. That is,
the event handler function if the event runs, or by the caller of
rxevent_Cancel() when the cancellation succeed.
* No non-NULL event handles remain in a call/connection being destroyed,
which would indicate a refcounting error.
(*) There is an additional event used in practice, to reap old connections,
but it is effectively a background task that reschedules itself
periodically, with no handle to the event retained so as to be able
to cancel it. As such, it is unaffected by the concerns raised here.
While here, standardize on the rx_GetConnection() function for incrementing
the reference count on a connection object, instead of inlining the
corresponding mutex lock/unlock and variable access.
Also enable refcount checking unconditionally on unix, as this is a
rather invasive change late in the 1.8.0 release process and we want
to get as much sanity checking coverage as possible.
Change-Id: I27bcb932ec200ff20364fb1b83ea811221f9871c
Reviewed-on: https://gerrit.openafs.org/12756
Reviewed-by: Mark Vitale <mvitale@sinenomine.net>
Reviewed-by: Michael Meffie <mmeffie@sinenomine.net>
Tested-by: BuildBot <buildbot@rampaginggeek.com>
Reviewed-by: Benjamin Kaduk <kaduk@mit.edu>
2017-10-08 04:42:38 +01:00
|
|
|
ok(rxevent_Cancel(&event), "Cancelled a single event");
|
2011-10-23 21:21:39 +01:00
|
|
|
rxevent_RaiseEvents(&now);
|
|
|
|
ok(1, "RaiseEvents happened without error");
|
|
|
|
|
|
|
|
ok(pthread_create(&handler, NULL, eventHandler, NULL) == 0,
|
|
|
|
"Created handler thread");
|
|
|
|
|
2017-11-06 22:37:46 +00:00
|
|
|
/* Add a number of random events to fire over the next 3 seconds, but front-loaded
|
2017-10-05 05:03:44 +01:00
|
|
|
* a bit so that we can exercise the cancel/fire race path. */
|
2011-10-23 21:21:39 +01:00
|
|
|
|
|
|
|
for (counter = 0; counter < NUMEVENTS; counter++) {
|
2017-10-05 05:03:44 +01:00
|
|
|
when = random() % 4000;
|
|
|
|
/* Put 1/4 of events "right away" so we cancel them as they fire */
|
|
|
|
if (when >= 3000)
|
|
|
|
when = random() % 5;
|
2011-10-23 21:21:39 +01:00
|
|
|
clock_GetTime(&now);
|
|
|
|
eventTime = now;
|
|
|
|
clock_Addmsec(&eventTime, when);
|
|
|
|
pthread_mutex_lock(&eventListMutex);
|
|
|
|
events[counter].event
|
|
|
|
= rxevent_Post(&eventTime, &now, eventSub, &events[counter], NULL, 0);
|
|
|
|
|
|
|
|
/* A 10% chance that we will schedule another event at the same time */
|
2017-11-06 22:37:46 +00:00
|
|
|
if (counter < (NUMEVENTS - 1) && random() % 10 == 0) {
|
2011-10-23 21:21:39 +01:00
|
|
|
counter++;
|
|
|
|
events[counter].event
|
|
|
|
= rxevent_Post(&eventTime, &now, eventSub, &events[counter],
|
|
|
|
NULL, 0);
|
|
|
|
}
|
2021-01-22 19:48:21 +00:00
|
|
|
/*
|
|
|
|
* A 25% chance that we will cancel some event.
|
|
|
|
* Randomly pick any event that was scheduled before the current event.
|
|
|
|
*/
|
|
|
|
if (counter > 0 && (random() % 4 == 0)) {
|
2011-10-23 21:21:39 +01:00
|
|
|
int victim = random() % counter;
|
|
|
|
|
Standardize rx_event usage
Go over all consumers of the rx event framework and normalize its usage
according to the following principles:
rxevent_Post() is used to create an event, and it returns an event
handle (with a reference on the event structure) that can be used
to cancel the event before its timeout fires. (There is also an
additional reference on the event held by the global event tree.)
In all(*) usage within the tree, that event handle is stored within
either an rx_connection or an rx_call. Reads/writes to the member variable
that holds the event handle require either the conn_data_lock or call
lock, respectively -- that means that in most cases, callers of
rxevent_Post() and rxevent_Cancel() will be holding one of those
aforementioned locks. The event handlers themselves will need to
modify the call/connection object according to the nature of the
event, which requires holding those same locks, and also a guarantee
that the call/connection is still a live object and has not been
deallocated! Whether or not rxevent_Cancel() succeeds in cancelling
the event before it fires, whenever passed a non-NULL event structure
it will NULL out the supplied pointer and drop a reference on the
event structure. This is the correct behavior, since the caller
has asked to cancel the event and has no further use for the event
handle or its reference on the event structure. The caller of
rxevent_Cancel() must check its return value to know whether or
not the event was cancelled before its handler was able to run.
The interaction window between the call/connection lock and the lock
protecting the red/black tree of pending events opens up a somewhat
problematic race window. Because the application thread is expected
to hold the call/connection lock around rxevent_Cancel() (to protect
the write to the field in the call/connection structure that holds
an event handle), and rxevent_Cancel() must take the lock protecting
the red/black tree of events, this establishes a lock order with the
call/connection lock taken before the eventTree lock. This is in
conflict with the event handler thread, which must take the eventTree
lock first, in order to select an event to run (and thus know what
additional lock would need to be taken, by virtue of what handler
function is to be run). The conflict is easy to resolve in the
standard way, by having a local pointer to the event that is obtained
while the event is removed from the red/black tree under the eventTree
lock, and then the eventTree lock can be dropped and the event run
based on the local variable referring to it. The race window occurs
when the caller of rxevent_Cancel() holds the call/connection lock,
and rxevent_Cancel() obtains the eventTree lock just after the event
handler thread drops it in order to run the event. The event handler
function begins to execute, and immediately blocks trying to obtain
the call/connection lock. Now that rxevent_Cancel() has the eventTree
lock it can proceed to search the tree, fail to find the indicated event
in the tree, clear out the event pointer from the call/connection
data structure, drop its caller's reference to the event structure,
and return failure (the event was not cancelled). Only then does the
caller of rxevent_Cancel() drop the call/connection lock and allow
the event handler to make progress.
This race is not necessarily problematic if appropriate care is taken,
but in the previous code such was not the case. In particular, it
is a common idiom for the firing event to call rxevent_Put() on itself,
to release the handle stored in the call/connection that could have
been used to cancel the event before it fired. Failing to do so would
result in a memory leak of event structures; however, rxevent_Put() does
not check for a NULL argument, so a segfault (NULL dereference) was
observed in the test suite when the race occurred and the event handler
tried to rxevent_Put() the reference that had already been released by
the unsuccessful rxevent_Cancel() call. Upon inspection, many (but not
all) of the uses in rx.c were susceptible to a similar race condition
and crash.
The test suite also papers over a related issue in that the event handler
in the test suite always knows that the data structure containing the
event handle will remain live, since it is a global array that is allocated
for the entire scope of the test. In rx.c, events are associated with
calls and connections that have a finite lifetime, so we need to take care
to ensure that the call/connection pointer stored in the event remains
valid for the duration of the event's lifecycle. In particular, even an
attempt to take the call/connection lock to check whether the corresponding
event field is NULL is fraught with risk, as it could crash if the lock
(and containing call/connection) has already been destroyed! There are
several potential ways to ensure the liveness of the associated
call/connection while the event handler runs, most notably to take care
in the call/connection destruction path to ensure that all associated
events are either successfully cancelled or run to completion before
tearing down the call/connection structure, and to give the pending event
its own reference on the associated call/connection. Here, we opt for
the latter, acknowledging that this may result in the event handler thread
doing the full call/connection teardown and delay the firing of subsequent
events. This is deemed acceptable, as pending events are for intentionally
delayed tasks, and some extra delay is probably acceptable. (The various
keepalive events and the challenge event could delay the user experience
and/or security properties if significantly delayed, but I do not believe
that this change admits completely unbounded delay in the event handler
thread, so the practical risk seems minimal.)
Accordingly, this commit attempts to ensure that:
* Each event holds a formal reference on its associated call/connection.
* The appropriate lock is held for all accesses to event pointers in
call/connection structures.
* Each event handler (after taking the appropriate lock) checks whether
it raced with rxevent_Cancel() and only drops the call/connection's
reference to the event if the race did not occur.
* Each event handler drops its reference to the associated call/connection
*after* doing any actions that might access/modify the call/connection.
* The per-event reference on the associated call/connection is dropped by
the thread that removes the event from the red/black tree. That is,
the event handler function if the event runs, or by the caller of
rxevent_Cancel() when the cancellation succeed.
* No non-NULL event handles remain in a call/connection being destroyed,
which would indicate a refcounting error.
(*) There is an additional event used in practice, to reap old connections,
but it is effectively a background task that reschedules itself
periodically, with no handle to the event retained so as to be able
to cancel it. As such, it is unaffected by the concerns raised here.
While here, standardize on the rx_GetConnection() function for incrementing
the reference count on a connection object, instead of inlining the
corresponding mutex lock/unlock and variable access.
Also enable refcount checking unconditionally on unix, as this is a
rather invasive change late in the 1.8.0 release process and we want
to get as much sanity checking coverage as possible.
Change-Id: I27bcb932ec200ff20364fb1b83ea811221f9871c
Reviewed-on: https://gerrit.openafs.org/12756
Reviewed-by: Mark Vitale <mvitale@sinenomine.net>
Reviewed-by: Michael Meffie <mmeffie@sinenomine.net>
Tested-by: BuildBot <buildbot@rampaginggeek.com>
Reviewed-by: Benjamin Kaduk <kaduk@mit.edu>
2017-10-08 04:42:38 +01:00
|
|
|
if (rxevent_Cancel(&events[victim].event))
|
2011-10-23 21:21:39 +01:00
|
|
|
events[victim].cancelled = 1;
|
|
|
|
}
|
|
|
|
pthread_mutex_unlock(&eventListMutex);
|
|
|
|
}
|
|
|
|
|
|
|
|
ok(1, "Added %d events", NUMEVENTS);
|
|
|
|
|
2020-04-26 00:21:10 +01:00
|
|
|
sleep(4);
|
2011-10-23 21:21:39 +01:00
|
|
|
|
|
|
|
fired = 0;
|
|
|
|
cancelled = 0;
|
|
|
|
fail = 0;
|
|
|
|
for (counter = 0; counter < NUMEVENTS; counter++) {
|
|
|
|
if (events[counter].fired)
|
|
|
|
fired++;
|
|
|
|
if (events[counter].cancelled)
|
|
|
|
cancelled++;
|
|
|
|
if (events[counter].cancelled && events[counter].fired)
|
|
|
|
fail = 1;
|
|
|
|
}
|
|
|
|
ok(!fail, "Didn't fire any cancelled events");
|
2020-04-26 00:21:10 +01:00
|
|
|
diag("fired %d/%d events", fired, NUMEVENTS);
|
|
|
|
diag("cancelled %d/%d events", cancelled, NUMEVENTS);
|
|
|
|
is_int(NUMEVENTS, fired+cancelled,
|
2011-10-23 21:21:39 +01:00
|
|
|
"Number of fired and cancelled events sum to correct total");
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|