Windows: Redesign of server preferences

Server rankings should be composed from three values:

 1. administrative preferences

 2. network address locality

 3. peer performance (rtt and congestion window)

This ensures that local subnet servers are always
preferred.

Add a new rank for down servers so they are always
sorted last in cm_serverRef lists.

Change-Id: Id00e151fc1acd65db558571e6a3a0e7cfbf3c6ca
Reviewed-on: http://gerrit.openafs.org/7757
Tested-by: BuildBot <buildbot@rampaginggeek.com>
Reviewed-by: Jeffrey Altman <jaltman@your-file-system.com>
This commit is contained in:
Jeffrey Altman 2012-07-09 09:34:22 -04:00
parent 65a5e3ce92
commit a9be0735b5
8 changed files with 114 additions and 72 deletions

View File

@ -312,9 +312,9 @@ static void afsd_InitServerPreferences(void)
if ( tsp ) /* an existing server - ref count increased */
{
lock_ObtainMutex(&tsp->mx);
tsp->ipRank = (USHORT)dwRank;
tsp->adminRank = (USHORT)dwRank;
_InterlockedOr(&tsp->flags, CM_SERVERFLAG_PREF_SET);
tsp->adminRank = tsp->ipRank;
cm_RankServer(tsp);
lock_ReleaseMutex(&tsp->mx);
/* set preferences for an existing vlserver */
@ -325,9 +325,9 @@ static void afsd_InitServerPreferences(void)
{
tsp = cm_NewServer(&saddr, CM_SERVER_VLDB, NULL, NULL, CM_FLAG_NOPROBE); /* refcount = 1 */
lock_ObtainMutex(&tsp->mx);
tsp->ipRank = (USHORT)dwRank;
tsp->adminRank = (USHORT)dwRank;
_InterlockedOr(&tsp->flags, CM_SERVERFLAG_PREF_SET);
tsp->adminRank = tsp->ipRank;
cm_RankServer(tsp);
lock_ReleaseMutex(&tsp->mx);
}
}
@ -388,9 +388,9 @@ static void afsd_InitServerPreferences(void)
if ( tsp ) /* an existing server - ref count increased */
{
lock_ObtainMutex(&tsp->mx);
tsp->ipRank = (USHORT)dwRank;
tsp->adminRank = (USHORT)dwRank;
_InterlockedOr(&tsp->flags, CM_SERVERFLAG_PREF_SET);
tsp->adminRank = tsp->ipRank;
cm_RankServer(tsp);
lock_ReleaseMutex(&tsp->mx);
/* find volumes which might have RO copy
@ -404,9 +404,9 @@ static void afsd_InitServerPreferences(void)
{
tsp = cm_NewServer(&saddr, CM_SERVER_FILE, NULL, NULL, CM_FLAG_NOPROBE); /* refcount = 1 */
lock_ObtainMutex(&tsp->mx);
tsp->ipRank = (USHORT)dwRank;
tsp->adminRank = (USHORT)dwRank;
_InterlockedOr(&tsp->flags, CM_SERVERFLAG_PREF_SET);
tsp->adminRank = tsp->ipRank;
cm_RankServer(tsp);
lock_ReleaseMutex(&tsp->mx);
}
}

View File

@ -33,7 +33,7 @@ osi_rwlock_t cm_cellLock;
*
* At the present time the return value is ignored by the caller.
*/
long cm_AddCellProc(void *rockp, struct sockaddr_in *addrp, char *hostnamep, unsigned short ipRank)
long cm_AddCellProc(void *rockp, struct sockaddr_in *addrp, char *hostnamep, unsigned short adminRank)
{
cm_server_t *tsp;
cm_serverRef_t *tsrp;
@ -59,8 +59,8 @@ long cm_AddCellProc(void *rockp, struct sockaddr_in *addrp, char *hostnamep, uns
else
tsp = cm_NewServer(addrp, CM_SERVER_VLDB, cellp, NULL, probe ? 0 : CM_FLAG_NOPROBE);
if (ipRank)
tsp->ipRank = ipRank;
if (adminRank)
tsp->adminRank = adminRank;
/* Insert the vlserver into a sorted list, sorted by server rank */
tsrp = cm_NewServerRef(tsp, 0);

View File

@ -478,7 +478,7 @@ long cm_SearchCellRegistry(afs_uint32 client,
DWORD dwType, dwSize;
DWORD dwCells, dwServers, dwForceDNS;
DWORD dwIndex, dwRank, dwPort;
unsigned short ipRank;
unsigned short adminRank;
unsigned short vlPort;
LONG code;
FILETIME ftLastWriteTime;
@ -659,9 +659,9 @@ long cm_SearchCellRegistry(afs_uint32 client,
code = RegQueryValueEx(hkServerName, "Rank", NULL, &dwType,
(BYTE *) &dwRank, &dwSize);
if (code == ERROR_SUCCESS && dwType == REG_DWORD) {
ipRank = (unsigned short)(dwRank <= 65535 ? dwRank : 65535);
adminRank = (unsigned short)(dwRank <= 65535 ? dwRank : 65535);
} else {
ipRank = 0;
adminRank = 0;
}
dwSize = sizeof(szAddr);
@ -690,7 +690,7 @@ long cm_SearchCellRegistry(afs_uint32 client,
vlSockAddr.sin_family = AF_INET;
/* sin_port supplied by connection code */
if (procp)
(*procp)(rockp, &vlSockAddr, s, ipRank);
(*procp)(rockp, &vlSockAddr, s, adminRank);
} else if (szAddr[0]) {
afs_uint32 ip_addr;
unsigned int c1, c2, c3, c4;
@ -713,7 +713,7 @@ long cm_SearchCellRegistry(afs_uint32 client,
vlSockAddr.sin_family = AF_INET;
/* sin_port supplied by connection code */
if (procp)
(*procp)(rockp, &vlSockAddr, s, ipRank);
(*procp)(rockp, &vlSockAddr, s, adminRank);
}
}
@ -896,7 +896,7 @@ long cm_SearchCellByDNS(char *cellNamep, char *newCellNamep, int *ttl,
int rc;
int cellHostAddrs[AFSMAXCELLHOSTS];
char cellHostNames[AFSMAXCELLHOSTS][MAXHOSTCHARS];
unsigned short ipRanks[AFSMAXCELLHOSTS];
unsigned short adminRanks[AFSMAXCELLHOSTS];
unsigned short ports[AFSMAXCELLHOSTS]; /* network byte order */
int numServers;
int i;
@ -918,7 +918,7 @@ long cm_SearchCellByDNS(char *cellNamep, char *newCellNamep, int *ttl,
return -1;
rc = getAFSServer("afs3-vlserver", "udp", cellNamep, htons(7003),
cellHostAddrs, cellHostNames, ports, ipRanks, &numServers, ttl);
cellHostAddrs, cellHostNames, ports, adminRanks, &numServers, ttl);
if (rc == 0 && numServers > 0) { /* found the cell */
for (i = 0; i < numServers; i++) {
memcpy(&vlSockAddr.sin_addr.s_addr, &cellHostAddrs[i],
@ -926,7 +926,7 @@ long cm_SearchCellByDNS(char *cellNamep, char *newCellNamep, int *ttl,
vlSockAddr.sin_port = ports[i];
vlSockAddr.sin_family = AF_INET;
if (procp)
(*procp)(rockp, &vlSockAddr, cellHostNames[i], ipRanks[i]);
(*procp)(rockp, &vlSockAddr, cellHostNames[i], adminRanks[i]);
}
if (newCellNamep) {
if(FAILED(StringCchCopy(newCellNamep, CELL_MAXNAMELEN, cellNamep)))

View File

@ -485,7 +485,7 @@ void printReplyBuffer_AFSDB(PDNS_HDR replyBuff)
};
void processReplyBuffer_AFSDB(SOCKET commSock, PDNS_HDR replyBuff, int *cellHostAddrs, char cellHostNames[][MAXHOSTCHARS],
unsigned short ports[], unsigned short ipRanks[], int *numServers, int *ttl)
unsigned short ports[], unsigned short adminRanks[], int *numServers, int *ttl)
/*PAFS_SRV_LIST (srvList)*/
{
u_char *ptr = (u_char *) replyBuff;
@ -538,7 +538,7 @@ void processReplyBuffer_AFSDB(SOCKET commSock, PDNS_HDR replyBuff, int *cellHost
memcpy(&cellHostAddrs[srvCount], &addr.s_addr, sizeof(addr.s_addr));
strncpy(cellHostNames[srvCount], hostName, CELL_MAXNAMELEN);
cellHostNames[srvCount][CELL_MAXNAMELEN-1] = '\0';
ipRanks[srvCount] = 0;
adminRanks[srvCount] = 0;
ports[srvCount] = htons(7003);
srvCount++;
}
@ -629,7 +629,7 @@ int getAFSServer(const char *service, const char *protocol, const char *cellName
unsigned short afsdbPort, /* network byte order */
int *cellHostAddrs, char cellHostNames[][MAXHOSTCHARS],
unsigned short ports[], /* network byte order */
unsigned short ipRanks[],
unsigned short adminRanks[],
int *numServers, int *ttl)
{
#ifndef DNSAPI_ENV
@ -688,7 +688,7 @@ int getAFSServer(const char *service, const char *protocol, const char *cellName
/*printReplyBuffer_AFSDB(pDNShdr);*/
if (pDNShdr)
processReplyBuffer_AFSDB(commSock, pDNShdr, cellHostAddrs, cellHostNames, ports, ipRanks, numServers, ttl);
processReplyBuffer_AFSDB(commSock, pDNShdr, cellHostAddrs, cellHostNames, ports, adminRanks, numServers, ttl);
closesocket(commSock);
if (*numServers == 0)
@ -724,7 +724,7 @@ int getAFSServer(const char *service, const char *protocol, const char *cellName
if (pDnsIter->wType == DNS_TYPE_SRV) {
StringCbCopyA(cellHostNames[*numServers], sizeof(cellHostNames[*numServers]),
pDnsIter->Data.SRV.pNameTarget);
ipRanks[*numServers] = pDnsIter->Data.SRV.wPriority;
adminRanks[*numServers] = pDnsIter->Data.SRV.wPriority;
ports[*numServers] = htons(pDnsIter->Data.SRV.wPort);
(*numServers)++;
@ -792,7 +792,7 @@ int getAFSServer(const char *service, const char *protocol, const char *cellName
if (pDnsIter->wType == DNS_TYPE_AFSDB && pDnsIter->Data.Afsdb.wPreference == 1) {
StringCbCopyA(cellHostNames[*numServers], sizeof(cellHostNames[*numServers]),
pDnsIter->Data.Afsdb.pNameExchange);
ipRanks[*numServers] = 0;
adminRanks[*numServers] = 0;
ports[*numServers] = afsdbPort;
(*numServers)++;
@ -860,7 +860,7 @@ int getAFSServerW(const cm_unichar_t *service, const cm_unichar_t *protocol, con
int *cellHostAddrs,
cm_unichar_t cellHostNames[][MAXHOSTCHARS],
unsigned short ports[], /* network byte order */
unsigned short ipRanks[],
unsigned short adminRanks[],
int *numServers, int *ttl)
{
#ifdef DNSAPI_ENV
@ -893,7 +893,7 @@ int getAFSServerW(const cm_unichar_t *service, const cm_unichar_t *protocol, con
if (pDnsIter->wType == DNS_TYPE_SRV) {
StringCbCopyW(cellHostNames[*numServers], sizeof(cellHostNames[*numServers]),
pDnsIter->Data.SRV.pNameTarget);
ipRanks[*numServers] = pDnsIter->Data.SRV.wPriority;
adminRanks[*numServers] = pDnsIter->Data.SRV.wPriority;
ports[*numServers] = htons(pDnsIter->Data.SRV.wPort);
(*numServers)++;
@ -963,7 +963,7 @@ int getAFSServerW(const cm_unichar_t *service, const cm_unichar_t *protocol, con
if (pDnsIter->wType == DNS_TYPE_AFSDB && pDnsIter->Data.Afsdb.wPreference == 1) {
StringCbCopyW(cellHostNames[*numServers], sizeof(cellHostNames[*numServers]),
pDnsIter->Data.Afsdb.pNameExchange);
ipRanks[*numServers] = 0;
adminRanks[*numServers] = 0;
ports[*numServers] = afsdbPort;
(*numServers)++;

View File

@ -1948,9 +1948,8 @@ cm_IoctlSetSPrefs(struct cm_ioctl *ioctlp, struct cm_user *userp)
if ( tsp ) /* an existing server - ref count increased */
{
lock_ObtainMutex(&tsp->mx);
tsp->ipRank = rank;
tsp->adminRank = rank;
_InterlockedOr(&tsp->flags, CM_SERVERFLAG_PREF_SET);
tsp->adminRank = tsp->ipRank;
lock_ReleaseMutex(&tsp->mx);
switch (type) {
@ -1972,11 +1971,9 @@ cm_IoctlSetSPrefs(struct cm_ioctl *ioctlp, struct cm_user *userp)
{
tsp = cm_NewServer(&tmp, type, NULL, NULL, CM_FLAG_NOPROBE); /* refcount = 1 */
lock_ObtainMutex(&tsp->mx);
tsp->ipRank = rank;
tsp->adminRank = rank;
_InterlockedOr(&tsp->flags, CM_SERVERFLAG_PREF_SET);
tsp->adminRank = tsp->ipRank;
lock_ReleaseMutex(&tsp->mx);
tsp->ipRank = rank;
}
cm_PutServer(tsp); /* decrease refcount */
}
@ -2017,7 +2014,7 @@ cm_IoctlGetSPrefs(struct cm_ioctl *ioctlp, struct cm_user *userp)
continue; /* ignore vlservers */
srvout->host = tsp->addr.sin_addr;
srvout->rank = tsp->ipRank;
srvout->rank = tsp->activeRank;
srvout++;
spout->num_servers++;
noServers--;

View File

@ -60,7 +60,11 @@ cm_RankServer(cm_server_t * tsp)
afs_int32 code = 0; /* start with "success" */
struct rx_debugPeer tpeer;
afs_uint16 port;
afs_uint16 newRank;
afs_uint64 newRank;
afs_uint64 perfRank = 0;
afs_uint64 rtt;
double log_rtt;
int isDown = (tsp->flags & CM_SERVERFLAG_DOWN);
switch(tsp->type) {
case CM_SERVER_VLDB:
@ -73,23 +77,61 @@ cm_RankServer(cm_server_t * tsp)
return -1;
}
code = rx_GetLocalPeers(tsp->addr.sin_addr.s_addr, port, &tpeer);
cm_SetServerIPRank(tsp);
/*check if rx_GetLocalPeers succeeded and if there is data for tsp */
if(code == 0 && (tpeer.rtt == 0 && tpeer.rtt_dev == 0))
code = -1;
if (isDown) {
newRank = 0xFFFF;
} else {
/*
* There are three potential components to the ranking:
* 1. Any administratively set preference, whether it be
* via "fs setserverprefs", registry or dns.
*
* 2. Network subnet mask comparison.
*
* 3. Performance data.
*
* If there is an administrative rank, that is the
* primary factor. If not, the primary factor
* is the network ranking.
*/
if(code == 0) {
if((tsp->flags & CM_SERVERFLAG_PREF_SET))
newRank = tsp->adminRank +
((int)(623 * log(tpeer.rtt) / 10) * 10 + 5);
else /* rank has not been set by admin, derive rank from rtt */
newRank = (int)(7200 * log(tpeer.rtt) / 5000) * 5000 + 5000;
code = rx_GetLocalPeers(tsp->addr.sin_addr.s_addr, port, &tpeer);
if (code == 0) {
if (tpeer.rtt) {
/* rtt is ms/8 */
rtt = tpeer.rtt;
log_rtt = log(tpeer.rtt);
perfRank += (6000 * log_rtt / 5000) * 5000;
newRank += (rand() & 0x000f); /* randomize */
if (tsp->type == CM_SERVER_FILE) {
/* give an edge to servers with high congestion windows */
perfRank -= (tpeer.cwind - 1)* 15;
}
}
}
if (abs(newRank - tsp->ipRank) > 0xf) {
tsp->ipRank = newRank;
if (tsp->adminRank) {
newRank = tsp->adminRank * 0.8;
newRank += tsp->ipRank * 0.2;
} else {
newRank = tsp->ipRank;
}
if (perfRank) {
newRank *= 0.9;
newRank += perfRank * 0.1;
}
newRank += (rand() & 0x000f); /* randomize */
if (newRank > 0xFFFF)
osi_Log1(afsd_logp, "new server rank %I64u exceeds 0xFFFF", newRank);
/*
* If the ranking changes by more than the randomization
* factor, update the server reference lists.
*/
if (abs(newRank - tsp->activeRank) > 0xf) {
tsp->activeRank = newRank;
lock_ReleaseMutex(&tsp->mx);
switch (tsp->type) {
@ -765,19 +807,17 @@ void cm_SetServerNoInlineBulk(cm_server_t * serverp, int no)
lock_ReleaseMutex(&serverp->mx);
}
void cm_SetServerPrefs(cm_server_t * serverp)
void cm_SetServerIPRank(cm_server_t * serverp)
{
unsigned long serverAddr; /* in host byte order */
unsigned long myAddr, myNet, mySubnet;/* in host byte order */
unsigned long netMask;
int i;
long code;
int writeLock = 0;
lock_ObtainRead(&cm_syscfgLock);
if (cm_LanAdapterChangeDetected) {
lock_ConvertRToW(&cm_syscfgLock);
writeLock = 1;
if (cm_LanAdapterChangeDetected) {
/* get network related info */
cm_noIPAddr = CM_MAXINTERFACE_ADDR;
@ -814,19 +854,18 @@ void cm_SetServerPrefs(cm_server_t * serverp)
{
if ( (serverAddr & cm_SubnetMask[i]) == mySubnet)
{
if ( serverAddr == myAddr )
if ( serverAddr == myAddr ) {
serverp->ipRank = min(serverp->ipRank,
CM_IPRANK_TOP);/* same machine */
else serverp->ipRank = min(serverp->ipRank,
CM_IPRANK_HI); /* same subnet */
}
else serverp->ipRank = min(serverp->ipRank,CM_IPRANK_MED);
/* same net */
} else {
serverp->ipRank = min(serverp->ipRank,
CM_IPRANK_HI); /* same subnet */
}
} else {
serverp->ipRank = min(serverp->ipRank, CM_IPRANK_MED); /* same net */
}
}
} /* end of for loop */
/* random between 0..15*/
serverp->ipRank += (rand() % 0x000f);
lock_ReleaseRead(&cm_syscfgLock);
}
@ -862,7 +901,7 @@ cm_server_t *cm_NewServer(struct sockaddr_in *socketp, int type, cm_cell_t *cell
lock_InitializeMutex(&tsp->mx, "cm_server_t mutex", LOCK_HIERARCHY_SERVER);
tsp->addr = *socketp;
cm_SetServerPrefs(tsp);
cm_SetServerIPRank(tsp);
tsp->allNextp = cm_allServersp;
cm_allServersp = tsp;
@ -1074,7 +1113,7 @@ LONG_PTR cm_ChecksumServerList(cm_serverRef_t *serversp)
void cm_InsertServerList(cm_serverRef_t** list, cm_serverRef_t* element)
{
cm_serverRef_t *current;
unsigned short ipRank;
unsigned short rank;
lock_ObtainWrite(&cm_serverLock);
/*
@ -1142,10 +1181,10 @@ void cm_InsertServerList(cm_serverRef_t** list, cm_serverRef_t* element)
goto done;
}
ipRank = element->server->ipRank;
rank = element->server->activeRank;
/* insertion at the beginning of the list */
if ((*list)->server->ipRank > ipRank)
if ((*list)->server->activeRank > rank)
{
element->next = *list;
*list = element;
@ -1155,7 +1194,7 @@ void cm_InsertServerList(cm_serverRef_t** list, cm_serverRef_t* element)
/* find appropriate place to insert */
for ( current = *list; current->next; current = current->next)
{
if ( current->next->server->ipRank > ipRank )
if ( current->next->server->activeRank > rank )
break;
}
element->next = current->next;
@ -1226,10 +1265,10 @@ void cm_RandomizeServer(cm_serverRef_t** list)
}
/* count the number of servers with the lowest rank */
lowestRank = tsrp->server->ipRank;
lowestRank = tsrp->server->activeRank;
for ( count=1, tsrp=tsrp->next; tsrp; tsrp=tsrp->next)
{
if ( tsrp->server->ipRank != lowestRank)
if ( tsrp->server->activeRank != lowestRank)
break;
else
count++;
@ -1451,7 +1490,7 @@ int cm_DumpServers(FILE *outputFile, char *cookie, int lock)
"flags=0x%x waitCount=%u rank=%u downTime=\"%s\" refCount=%u\r\n",
cookie, tsp, tsp->cellp ? tsp->cellp->name : "", hoststr,
ntohs(tsp->addr.sin_port), uuidstr, type,
tsp->capabilities, tsp->flags, tsp->waitCount, tsp->ipRank,
tsp->capabilities, tsp->flags, tsp->waitCount, tsp->activeRank,
(tsp->flags & CM_SERVERFLAG_DOWN) ? down : "up",
tsp->refCount);
WriteFile(outputFile, output, (DWORD)strlen(output), &zilch, NULL);

View File

@ -34,11 +34,16 @@ typedef struct cm_server {
struct cm_cell *cellp; /* cell containing this server */
afs_int32 refCount; /* Interlocked with cm_serverLock */
osi_mutex_t mx;
unsigned short ipRank; /* server priority */
unsigned short ipRank; /* network class rank */
unsigned short adminRank; /* set if admin sets a rank
* (fs setserverpref or registry or dns)
*/
unsigned short activeRank; /* Computed rank combining ipRank, adminRank,
* and performance data.
*/
cm_server_vols_t * vols; /* by mx */
time_t downTime; /* by mx */
afsUUID uuid; /* by mx */
unsigned short adminRank; /* only set if admin sets a rank */
} cm_server_t;
enum repstate {srv_not_busy, srv_busy, srv_offline, srv_deleted};
@ -75,6 +80,7 @@ typedef struct cm_serverRef {
#define CM_IPRANK_HI 20000 /* on same subnet */
#define CM_IPRANK_MED 30000 /* on same network */
#define CM_IPRANK_LOW 40000 /* on different networks */
#define CM_IPRANK_DOWN 65535 /* unavailable */
/* the maximum number of network interfaces that this client has */
@ -115,7 +121,7 @@ extern afs_int32 cm_RankServer(cm_server_t * server);
extern void cm_RankUpServers();
extern void cm_SetServerPrefs(cm_server_t * serverp);
extern void cm_SetServerIPRank(cm_server_t * serverp);
extern void cm_InsertServerList(cm_serverRef_t** list,cm_serverRef_t* element);

View File

@ -653,7 +653,7 @@ long cm_UpdateVolumeLocation(struct cm_cell *cellp, cm_user_t *userp, cm_req_t *
/*
* Randomize RO list
*
* If the first n servers have the same ipRank, then we
* If the first n servers have the same rank, then we
* randomly pick one among them and move it to the beginning.
* We don't bother to re-order the whole list because
* the rest of the list is used only if the first server is