bug fixed: two tracker leaders may occur in rare cases

pull/48/head
yuqing 2014-09-13 20:11:59 +08:00
parent 5fcbffbf7a
commit fb7ae7d29a
9 changed files with 222 additions and 88 deletions

View File

@ -1,5 +1,5 @@
Version 5.04 2014-08-24
Version 5.04 2014-09-13
* add fastdfs.spec for build RPM on Linux
* depend on libfastcommon
* in multi tracker servers case, when receive higher status like
@ -7,6 +7,7 @@ Version 5.04 2014-08-24
the tracker adjust storage status to newer, and the storage rejoin
to the tracker server
* fdfs_monitor support delete empty group
* bug fixed: two tracker leaders may occur in rare cases
Version 5.03 2014-08-10
* network send and recv retry when error EINTR happen

View File

@ -1009,6 +1009,94 @@ static int tracker_merge_servers(ConnectionInfo *pTrackerServer, \
diffServers, pDiffServer - diffServers);
}
/**
 * Send a body-less TRACKER_PROTO_CMD_TRACKER_NOTIFY_RESELECT_LEADER request
 * over an already-open connection and validate the reply header.
 *
 * @param pTrackerServer  connection to use; its sock field is written to and
 *                        it is passed to fdfs_recv_header() for the reply
 * @return 0 on success; the non-zero result of tcpsenddata_nb() or
 *         fdfs_recv_header() when either fails; EINVAL when the reply
 *         announces a non-empty body
 */
static int _notify_reselect_tleader(ConnectionInfo *pTrackerServer)
{
    TrackerHeader request;
    int64_t body_len;
    int ret;

    /* the request is a zeroed header carrying only the command code */
    memset(&request, 0, sizeof(request));
    request.cmd = TRACKER_PROTO_CMD_TRACKER_NOTIFY_RESELECT_LEADER;

    ret = tcpsenddata_nb(pTrackerServer->sock, &request,
            sizeof(request), g_fdfs_network_timeout);
    if (ret != 0)
    {
        logError("file: "__FILE__", line: %d, "
            "tracker server %s:%d, send data fail, "
            "errno: %d, error info: %s.",
            __LINE__, pTrackerServer->ip_addr,
            pTrackerServer->port,
            ret, STRERROR(ret));
        return ret;
    }

    ret = fdfs_recv_header(pTrackerServer, &body_len);
    if (ret != 0)
    {
        return ret;
    }

    /* the reply must be header-only as well */
    if (body_len == 0)
    {
        return 0;
    }

    logError("file: "__FILE__", line: %d, "
        "tracker server %s:%d, recv body length: "
        "%"PRId64" != 0", __LINE__, pTrackerServer->ip_addr,
        pTrackerServer->port, body_len);
    return EINVAL;
}
/**
 * Connect to a tracker server, ask it to re-select the tracker leader and
 * release the connection again.
 *
 * @param pTrackerServer  tracker to notify; its sock field is reset to -1
 *                        before connecting
 * @return 0 on success; the error reported by tracker_connect_server()
 *         when the connection cannot be established; otherwise the result
 *         of _notify_reselect_tleader()
 */
static int notify_reselect_tracker_leader(ConnectionInfo *pTrackerServer)
{
    ConnectionInfo *conn;
    int result;

    pTrackerServer->sock = -1;
    if ((conn=tracker_connect_server(pTrackerServer, &result)) == NULL)
    {
        return result;
    }

    /*
     * Talk over the connection that tracker_connect_server() returned,
     * not over pTrackerServer whose sock was just reset to -1.
     */
    result = _notify_reselect_tleader(conn);

    /* second argument is non-zero when the exchange failed
     * (presumably forces the connection to be closed -- confirm) */
    tracker_disconnect_server_ex(conn, result != 0);
    return result;
}
/**
 * Record a new tracker leader index in g_tracker_group, guarding against
 * two trackers both claiming leadership.
 *
 * When a different leader was already recorded, that old leader is queried
 * with fdfs_get_tracker_status(). If it still reports itself as leader,
 * both the old and the new leader are notified to re-select, and
 * g_tracker_group.leader_index is reset to -1 instead of being updated.
 * In every other case (no previous leader, same leader, status query
 * failed, old leader no longer leader) leader_index is simply stored.
 *
 * @param leader_index  index into g_tracker_group.servers of the leader
 *                      that was just announced
 */
static void set_tracker_leader(const int leader_index)
{
    const int prev_index = g_tracker_group.leader_index;
    TrackerRunningStatus prev_status;
    ConnectionInfo prev_leader;
    ConnectionInfo next_leader;

    /* nothing to cross-check: no previous leader, or it did not change */
    if (prev_index < 0 || prev_index == leader_index)
    {
        g_tracker_group.leader_index = leader_index;
        return;
    }

    /* work on a private copy so the shared entry is not modified */
    memcpy(&prev_leader, &g_tracker_group.servers[prev_index],
        sizeof(prev_leader));
    if (fdfs_get_tracker_status(&prev_leader, &prev_status) != 0 ||
        !prev_status.if_leader)
    {
        g_tracker_group.leader_index = leader_index;
        return;
    }

    /* the previous leader still claims leadership: two leaders exist */
    memcpy(&next_leader, &g_tracker_group.servers[leader_index],
        sizeof(next_leader));
    logWarning("file: "__FILE__", line: %d, "
        "two tracker leaders occur, old leader is %s:%d, "
        "new leader is %s:%d, notify to re-select "
        "tracker leader", __LINE__,
        prev_leader.ip_addr, prev_leader.port,
        next_leader.ip_addr, next_leader.port);

    /* notification results are not checked */
    notify_reselect_tracker_leader(&prev_leader);
    notify_reselect_tracker_leader(&next_leader);
    g_tracker_group.leader_index = -1;
}
static int tracker_check_response(ConnectionInfo *pTrackerServer, \
bool *bServerPortChanged)
{
@ -1146,7 +1234,9 @@ static int tracker_check_response(ConnectionInfo *pTrackerServer, \
pTrackerServer->ip_addr, pTrackerServer->port,\
tracker_leader_ip, tracker_leader_port);
g_tracker_group.leader_index = leader_index;
pthread_mutex_lock(&reporter_thread_lock);
set_tracker_leader(leader_index);
pthread_mutex_unlock(&reporter_thread_lock);
}
}

View File

@ -3740,72 +3740,6 @@ static int _tracker_mem_add_storage(FDFSGroupInfo *pGroup, \
return result;
}
/**
 * Query the running status of a tracker server.
 *
 * Opens a connection to pTrackerServer, sends a header-only
 * TRACKER_PROTO_CMD_TRACKER_GET_STATUS request, reads the fixed-size
 * response and decodes it into pStatus.
 *
 * @param pTrackerServer  tracker to query; its sock field is reset to -1
 *                        before connecting
 * @param pStatus         receives if_leader, running_time and
 *                        restart_interval on success; left untouched on
 *                        failure
 * @return 0 on success; the connect error when the connection fails;
 *         the send error (with ENOENT mapped to EACCES); the result of
 *         fdfs_recv_response() when it fails; EINVAL when the response
 *         length differs from the expected size
 */
int tracker_mem_get_status(ConnectionInfo *pTrackerServer, \
TrackerRunningStatus *pStatus)
{
/* response body as parsed below: 1 byte leader flag followed by two
 * fields of FDFS_PROTO_PKG_LEN_SIZE bytes each */
char in_buff[1 + 2 * FDFS_PROTO_PKG_LEN_SIZE];
TrackerHeader header;
char *pInBuff;
ConnectionInfo *conn;
int64_t in_bytes;
int result;
pTrackerServer->sock = -1;
if ((conn=tracker_connect_server(pTrackerServer, &result)) == NULL)
{
return result;
}
/* single-pass loop: every `break` jumps to the shared disconnect below */
do
{
/* the request is a zeroed header carrying only the command code */
memset(&header, 0, sizeof(header));
header.cmd = TRACKER_PROTO_CMD_TRACKER_GET_STATUS;
if ((result=tcpsenddata_nb(conn->sock, &header, \
sizeof(header), g_fdfs_network_timeout)) != 0)
{
logError("file: "__FILE__", line: %d, " \
"send data to tracker server %s:%d fail, " \
"errno: %d, error info: %s", __LINE__, \
pTrackerServer->ip_addr, \
pTrackerServer->port, \
result, STRERROR(result));
/* callers never see ENOENT from a failed send; it becomes EACCES */
result = (result == ENOENT ? EACCES : result);
break;
}
/* pass the stack buffer by pointer-to-pointer together with its size */
pInBuff = in_buff;
result = fdfs_recv_response(conn, &pInBuff, \
sizeof(in_buff), &in_bytes);
if (result != 0)
{
break;
}
/* only a response of exactly the expected size is accepted */
if (in_bytes != sizeof(in_buff))
{
logError("file: "__FILE__", line: %d, " \
"tracker server %s:%d response data " \
"length: %"PRId64" is invalid, " \
"expect length: %d.", __LINE__, \
pTrackerServer->ip_addr, pTrackerServer->port, \
in_bytes, (int)sizeof(in_buff));
result = EINVAL;
break;
}
/* byte 0: leader flag; then running_time; then restart_interval */
pStatus->if_leader = *in_buff;
pStatus->running_time = buff2long(in_buff + 1);
pStatus->restart_interval = buff2long(in_buff + 1 + \
FDFS_PROTO_PKG_LEN_SIZE);
} while (0);
/* second argument is non-zero when the exchange failed
 * (presumably forces the connection to be closed -- confirm) */
tracker_disconnect_server_ex(conn, result != 0);
return result;
}
void tracker_calc_running_times(TrackerRunningStatus *pStatus)
{
pStatus->running_time = g_current_time - g_up_time;
@ -4186,7 +4120,7 @@ static int tracker_mem_get_tracker_server(FDFSStorageJoinBody *pJoinBody, \
}
pStatus->pTrackerServer = pTrackerServer;
r = tracker_mem_get_status(pTrackerServer, pStatus);
r = fdfs_get_tracker_status(pTrackerServer, pStatus);
if (r == 0)
{
pStatus++;
@ -5422,6 +5356,7 @@ void tracker_mem_find_trunk_servers()
tracker_mem_find_trunk_server(*ppGroup, true);
}
}
g_trunk_server_chg_count++;
pthread_mutex_unlock(&mem_thread_lock);
}

View File

@ -25,13 +25,6 @@
#define TRUNK_SERVER_CHANGELOG_FILENAME "trunk_server_change.log"
#define STORAGE_DATA_FIELD_SEPERATOR ','
/* running status of one tracker server, as filled in by a status query */
typedef struct {
ConnectionInfo *pTrackerServer; //the tracker server this status belongs to
int running_time; //seconds the tracker has been running; a larger value means higher weight
int restart_interval; //restart interval; a smaller value means higher weight
bool if_leader; //whether the tracker reports itself as the leader
} TrackerRunningStatus;
#ifdef __cplusplus
extern "C" {
#endif
@ -124,9 +117,6 @@ int tracker_mem_get_storage_index(FDFSGroupInfo *pGroup, \
void tracker_calc_running_times(TrackerRunningStatus *pStatus);
int tracker_mem_get_status(ConnectionInfo *pTrackerServer, \
TrackerRunningStatus *pStatus);
int tracker_save_groups();
void tracker_mem_find_trunk_servers();

View File

@ -617,3 +617,69 @@ int fdfs_get_ini_context_from_tracker(TrackerServerGroup *pTrackerGroup, \
return EINTR;
}
/**
 * Run one TRACKER_PROTO_CMD_TRACKER_GET_STATUS exchange over an open
 * connection and decode the reply into pStatus.
 *
 * @param conn            open connection used for sending and receiving
 * @param pTrackerServer  only used for the address shown in log messages
 * @param pStatus         filled in on success, untouched on failure
 * @return 0 on success, otherwise an errno-style code
 */
static int fdfs_query_tracker_status(ConnectionInfo *conn,
        ConnectionInfo *pTrackerServer, TrackerRunningStatus *pStatus)
{
    /* reply body: 1 byte leader flag + two FDFS_PROTO_PKG_LEN_SIZE fields */
    char response[1 + 2 * FDFS_PROTO_PKG_LEN_SIZE];
    TrackerHeader request;
    char *recv_ptr;
    int64_t response_len;
    int ret;

    /* the request is a zeroed header carrying only the command code */
    memset(&request, 0, sizeof(request));
    request.cmd = TRACKER_PROTO_CMD_TRACKER_GET_STATUS;

    ret = tcpsenddata_nb(conn->sock, &request,
            sizeof(request), g_fdfs_network_timeout);
    if (ret != 0)
    {
        logError("file: "__FILE__", line: %d, "
            "send data to tracker server %s:%d fail, "
            "errno: %d, error info: %s", __LINE__,
            pTrackerServer->ip_addr,
            pTrackerServer->port,
            ret, STRERROR(ret));
        /* a failed send never surfaces as ENOENT; it becomes EACCES */
        return (ret == ENOENT) ? EACCES : ret;
    }

    recv_ptr = response;
    ret = fdfs_recv_response(conn, &recv_ptr,
            sizeof(response), &response_len);
    if (ret != 0)
    {
        return ret;
    }

    if (response_len != sizeof(response))
    {
        logError("file: "__FILE__", line: %d, "
            "tracker server %s:%d response data "
            "length: %"PRId64" is invalid, "
            "expect length: %d.", __LINE__,
            pTrackerServer->ip_addr, pTrackerServer->port,
            response_len, (int)sizeof(response));
        return EINVAL;
    }

    pStatus->if_leader = response[0];
    pStatus->running_time = buff2long(response + 1);
    pStatus->restart_interval = buff2long(response + 1 +
            FDFS_PROTO_PKG_LEN_SIZE);
    return 0;
}

/**
 * Query the running status of a tracker server.
 *
 * Connects to pTrackerServer, performs the status exchange and releases
 * the connection again.
 *
 * @param pTrackerServer  tracker to query; its sock field is reset to -1
 *                        before connecting
 * @param pStatus         receives if_leader, running_time and
 *                        restart_interval on success
 * @return 0 on success; the connect error when the connection fails;
 *         the send error (ENOENT mapped to EACCES); the receive error;
 *         EINVAL when the response length is not the expected size
 */
int fdfs_get_tracker_status(ConnectionInfo *pTrackerServer,
        TrackerRunningStatus *pStatus)
{
    ConnectionInfo *conn;
    int result;

    pTrackerServer->sock = -1;
    conn = tracker_connect_server(pTrackerServer, &result);
    if (conn == NULL)
    {
        return result;
    }

    result = fdfs_query_tracker_status(conn, pTrackerServer, pStatus);

    /* second argument is non-zero when the exchange failed */
    tracker_disconnect_server_ex(conn, result != 0);
    return result;
}

View File

@ -36,13 +36,14 @@
#define TRACKER_PROTO_CMD_STORAGE_GET_SERVER_ID 70 //get storage server id from tracker
#define TRACKER_PROTO_CMD_STORAGE_FETCH_STORAGE_IDS 69 //get all storage ids from tracker
#define TRACKER_PROTO_CMD_TRACKER_GET_SYS_FILES_START 61 //start of tracker get system data files
#define TRACKER_PROTO_CMD_TRACKER_GET_SYS_FILES_END 62 //end of tracker get system data files
#define TRACKER_PROTO_CMD_TRACKER_GET_ONE_SYS_FILE 63 //tracker get a system data file
#define TRACKER_PROTO_CMD_TRACKER_GET_STATUS 64 //tracker get status of other tracker
#define TRACKER_PROTO_CMD_TRACKER_PING_LEADER 65 //tracker ping leader
#define TRACKER_PROTO_CMD_TRACKER_NOTIFY_NEXT_LEADER 66 //notify next leader to other trackers
#define TRACKER_PROTO_CMD_TRACKER_COMMIT_NEXT_LEADER 67 //commit next leader to other trackers
#define TRACKER_PROTO_CMD_TRACKER_GET_SYS_FILES_START 61 //start of tracker get system data files
#define TRACKER_PROTO_CMD_TRACKER_GET_SYS_FILES_END 62 //end of tracker get system data files
#define TRACKER_PROTO_CMD_TRACKER_GET_ONE_SYS_FILE 63 //tracker get a system data file
#define TRACKER_PROTO_CMD_TRACKER_GET_STATUS 64 //tracker get status of other tracker
#define TRACKER_PROTO_CMD_TRACKER_PING_LEADER 65 //tracker ping leader
#define TRACKER_PROTO_CMD_TRACKER_NOTIFY_NEXT_LEADER 66 //notify next leader to other trackers
#define TRACKER_PROTO_CMD_TRACKER_COMMIT_NEXT_LEADER 67 //commit next leader to other trackers
#define TRACKER_PROTO_CMD_TRACKER_NOTIFY_RESELECT_LEADER 68 //storage notify reselect leader when split-brain
#define TRACKER_PROTO_CMD_SERVER_LIST_ONE_GROUP 90
#define TRACKER_PROTO_CMD_SERVER_LIST_ALL_GROUPS 91
@ -276,6 +277,9 @@ int fdfs_get_ini_context_from_tracker(TrackerServerGroup *pTrackerGroup, \
IniContext *iniContext, bool * volatile continue_flag, \
const bool client_bind_addr, const char *bind_addr);
int fdfs_get_tracker_status(ConnectionInfo *pTrackerServer, \
TrackerRunningStatus *pStatus);
#ifdef __cplusplus
}
#endif

View File

@ -191,7 +191,7 @@ static int relationship_get_tracker_leader(TrackerRunningStatus *pTrackerStatus)
pTrackerServer<pTrackerEnd; pTrackerServer++)
{
pStatus->pTrackerServer = pTrackerServer;
r = tracker_mem_get_status(pTrackerServer, pStatus);
r = fdfs_get_tracker_status(pTrackerServer, pStatus);
if (r == 0)
{
pStatus++;

View File

@ -879,7 +879,7 @@ static int tracker_deal_notify_next_leader(struct fast_task_info *pTask)
g_tracker_leader_chg_count++;
logError("file: "__FILE__", line: %d, " \
"client ip: %s, two leader occur, " \
"client ip: %s, two leaders occur, " \
"new leader is %s:%d", \
__LINE__, pTask->client_ip, \
leader.ip_addr, leader.port);
@ -1644,6 +1644,44 @@ static int tracker_deal_ping_leader(struct fast_task_info *pTask)
return 0;
}
/**
 * Handle TRACKER_PROTO_CMD_TRACKER_NOTIFY_RESELECT_LEADER: a peer reports
 * that two tracker leaders exist and asks this tracker to step down.
 *
 * On success the tracker clears g_if_leader_self, resets
 * g_tracker_servers.leader_index to -1 and bumps
 * g_tracker_leader_chg_count. pTask->length is set to
 * sizeof(TrackerHeader) on every path (presumably the size of the
 * header-only response -- confirm against the send path).
 *
 * @param pTask  the request task; the request must carry no body
 * @return 0 on success; EINVAL when the request has a body;
 *         EOPNOTSUPP when this tracker is not currently the leader
 */
static int tracker_deal_reselect_leader(struct fast_task_info *pTask)
{
    if (pTask->length - sizeof(TrackerHeader) != 0)
    {
        logError("file: "__FILE__", line: %d, " \
            "cmd=%d, client ip: %s, package size " \
            PKG_LEN_PRINTF_FORMAT" is not correct, " \
            "expect length 0", __LINE__, \
            TRACKER_PROTO_CMD_TRACKER_NOTIFY_RESELECT_LEADER, \
            pTask->client_ip, \
            pTask->length - (int)sizeof(TrackerHeader));
        pTask->length = sizeof(TrackerHeader);
        return EINVAL;
    }

    pTask->length = sizeof(TrackerHeader);

    /* only a tracker that believes it is the leader can step down */
    if (!g_if_leader_self)
    {
        logError("file: "__FILE__", line: %d, " \
            "cmd=%d, client ip: %s, i am not the leader!", \
            __LINE__, TRACKER_PROTO_CMD_TRACKER_NOTIFY_RESELECT_LEADER, \
            pTask->client_ip);
        return EOPNOTSUPP;
    }

    /* give up leadership and forget the recorded leader */
    g_if_leader_self = false;
    g_tracker_servers.leader_index = -1;
    g_tracker_leader_chg_count++;

    logWarning("file: "__FILE__", line: %d, " \
        "client ip: %s, i be notified that two leaders occur, " \
        "should re-select leader", __LINE__, pTask->client_ip);
    return 0;
}
static int tracker_unlock_by_client(struct fast_task_info *pTask)
{
if (lock_by_client_count <= 0 || pTask->finish_callback == NULL)
@ -3730,6 +3768,9 @@ int tracker_deal_task(struct fast_task_info *pTask)
case TRACKER_PROTO_CMD_TRACKER_COMMIT_NEXT_LEADER:
result = tracker_deal_commit_next_leader(pTask);
break;
case TRACKER_PROTO_CMD_TRACKER_NOTIFY_RESELECT_LEADER:
result = tracker_deal_reselect_leader(pTask);
break;
default:
logError("file: "__FILE__", line: %d, " \
"client ip: %s, unkown cmd: %d", \

View File

@ -433,5 +433,12 @@ typedef struct {
char **paths; //file store paths
} FDFSStorePaths;
/* running status of one tracker server, as filled in by a status query */
typedef struct {
ConnectionInfo *pTrackerServer; //the tracker server this status belongs to
int running_time; //seconds the tracker has been running; a larger value means higher weight
int restart_interval; //restart interval; a smaller value means higher weight
bool if_leader; //whether the tracker reports itself as the leader
} TrackerRunningStatus;
#endif