From fb7ae7d29a700dd80cca9275a3b42b06fe0730f0 Mon Sep 17 00:00:00 2001 From: yuqing Date: Sat, 13 Sep 2014 20:11:59 +0800 Subject: [PATCH] bug fixed: two tracker leaders occur in rare case --- HISTORY | 3 +- storage/tracker_client_thread.c | 95 ++++++++++++++++++++++++++++++++- tracker/tracker_mem.c | 69 +------------------------ tracker/tracker_mem.h | 10 ---- tracker/tracker_proto.c | 66 +++++++++++++++++++++++ tracker/tracker_proto.h | 18 ++++--- tracker/tracker_relationship.c | 2 +- tracker/tracker_service.c | 43 ++++++++++++++- tracker/tracker_types.h | 7 +++ 9 files changed, 225 insertions(+), 88 deletions(-) diff --git a/HISTORY b/HISTORY index e2f5cdb..7b05eef 100644 --- a/HISTORY +++ b/HISTORY @@ -1,5 +1,5 @@ -Version 5.04 2014-08-24 +Version 5.04 2014-09-13 * add fastdfs.spec for build RPM on Linux * depend on libfastcommon * in multi tracker servers case, when receive higher status like @@ -7,6 +7,7 @@ Version 5.04 2014-08-24 the tracker adjust storage status to newer, and the storage rejoin to the tracker server * fdfs_monitor support delete empty group + * bug fixed: two tracker leaders occur in rare case Version 5.03 2014-08-10 * network send and recv retry when error EINTR happen diff --git a/storage/tracker_client_thread.c b/storage/tracker_client_thread.c index 503e507..4bfd133 100644 --- a/storage/tracker_client_thread.c +++ b/storage/tracker_client_thread.c @@ -1009,6 +1009,97 @@ static int tracker_merge_servers(ConnectionInfo *pTrackerServer, \ diffServers, pDiffServer - diffServers); } +static int _notify_reselect_tleader(ConnectionInfo *pTrackerServer) +{ + char out_buff[sizeof(TrackerHeader)]; + TrackerHeader *pHeader; + int64_t in_bytes; + int result; + + pHeader = (TrackerHeader *)out_buff; + memset(out_buff, 0, sizeof(out_buff)); + pHeader->cmd = TRACKER_PROTO_CMD_TRACKER_NOTIFY_RESELECT_LEADER; + if ((result=tcpsenddata_nb(pTrackerServer->sock, out_buff, \ + sizeof(out_buff), g_fdfs_network_timeout)) != 0) + { + logError("file: 
"__FILE__", line: %d, " \ + "tracker server %s:%d, send data fail, " \ + "errno: %d, error info: %s.", \ + __LINE__, pTrackerServer->ip_addr, \ + pTrackerServer->port, \ + result, STRERROR(result)); + return result; + } + + if ((result=fdfs_recv_header(pTrackerServer, &in_bytes)) != 0) + { + return result; + } + + if (in_bytes != 0) + { + logError("file: "__FILE__", line: %d, " \ + "tracker server %s:%d, recv body length: " \ + "%"PRId64" != 0", __LINE__, pTrackerServer->ip_addr, \ + pTrackerServer->port, in_bytes); + return EINVAL; + } + + return 0; +} + +/* connect to the tracker, send the re-select leader notify, then disconnect */ +static int notify_reselect_tracker_leader(ConnectionInfo *pTrackerServer) +{ + int result; + ConnectionInfo *conn; + + pTrackerServer->sock = -1; + if ((conn=tracker_connect_server(pTrackerServer, &result)) == NULL) + { + return result; + } + + /* send on conn, the connection returned by tracker_connect_server */ + result = _notify_reselect_tleader(conn); + tracker_disconnect_server_ex(conn, result != 0); + return result; +} + +static void set_tracker_leader(const int leader_index) +{ + int old_index; + old_index = g_tracker_group.leader_index; + if (old_index >= 0 && old_index != leader_index) + { + TrackerRunningStatus tracker_status; + ConnectionInfo old_leader_server; + memcpy(&old_leader_server, g_tracker_group.servers + old_index, + sizeof(ConnectionInfo)); + if (fdfs_get_tracker_status(&old_leader_server, &tracker_status) == 0) + { + if (tracker_status.if_leader) + { + ConnectionInfo new_leader_server; + memcpy(&new_leader_server, g_tracker_group.servers + leader_index, + sizeof(ConnectionInfo)); + logWarning("file: "__FILE__", line: %d, " + "two tracker leaders occur, old leader is %s:%d, " + "new leader is %s:%d, notify to re-select " + "tracker leader", __LINE__, + old_leader_server.ip_addr, old_leader_server.port, + new_leader_server.ip_addr, new_leader_server.port); + + notify_reselect_tracker_leader(&old_leader_server); + 
notify_reselect_tracker_leader(&new_leader_server); + g_tracker_group.leader_index = -1; + return; + } + } + } + g_tracker_group.leader_index = leader_index; +} +
static int tracker_check_response(ConnectionInfo *pTrackerServer, \ bool *bServerPortChanged) { @@ -1146,7 +1237,9 @@ static int tracker_check_response(ConnectionInfo *pTrackerServer, \ pTrackerServer->ip_addr, pTrackerServer->port,\ tracker_leader_ip, tracker_leader_port); - g_tracker_group.leader_index = leader_index; + pthread_mutex_lock(&reporter_thread_lock); + set_tracker_leader(leader_index); + pthread_mutex_unlock(&reporter_thread_lock); } } diff --git a/tracker/tracker_mem.c b/tracker/tracker_mem.c index 05716d8..78e8e37 100644 --- a/tracker/tracker_mem.c +++ b/tracker/tracker_mem.c @@ -3740,72 +3740,6 @@ static int _tracker_mem_add_storage(FDFSGroupInfo *pGroup, \ return result; } -int tracker_mem_get_status(ConnectionInfo *pTrackerServer, \ - TrackerRunningStatus *pStatus) -{ - char in_buff[1 + 2 * FDFS_PROTO_PKG_LEN_SIZE]; - TrackerHeader header; - char *pInBuff; - ConnectionInfo *conn; - int64_t in_bytes; - int result; - - pTrackerServer->sock = -1; - if ((conn=tracker_connect_server(pTrackerServer, &result)) == NULL) - { - return result; - } - - do - { - memset(&header, 0, sizeof(header)); - header.cmd = TRACKER_PROTO_CMD_TRACKER_GET_STATUS; - if ((result=tcpsenddata_nb(conn->sock, &header, \ - sizeof(header), g_fdfs_network_timeout)) != 0) - { - logError("file: "__FILE__", line: %d, " \ - "send data to tracker server %s:%d fail, " \ - "errno: %d, error info: %s", __LINE__, \ - pTrackerServer->ip_addr, \ - pTrackerServer->port, \ - result, STRERROR(result)); - - result = (result == ENOENT ? 
EACCES : result); - break; - } - - pInBuff = in_buff; - result = fdfs_recv_response(conn, &pInBuff, \ - sizeof(in_buff), &in_bytes); - if (result != 0) - { - break; - } - - if (in_bytes != sizeof(in_buff)) - { - logError("file: "__FILE__", line: %d, " \ - "tracker server %s:%d response data " \ - "length: %"PRId64" is invalid, " \ - "expect length: %d.", __LINE__, \ - pTrackerServer->ip_addr, pTrackerServer->port, \ - in_bytes, (int)sizeof(in_buff)); - result = EINVAL; - break; - } - - pStatus->if_leader = *in_buff; - pStatus->running_time = buff2long(in_buff + 1); - pStatus->restart_interval = buff2long(in_buff + 1 + \ - FDFS_PROTO_PKG_LEN_SIZE); - - } while (0); - - tracker_disconnect_server_ex(conn, result != 0); - - return result; -} - void tracker_calc_running_times(TrackerRunningStatus *pStatus) { pStatus->running_time = g_current_time - g_up_time; @@ -4186,7 +4120,7 @@ static int tracker_mem_get_tracker_server(FDFSStorageJoinBody *pJoinBody, \ } pStatus->pTrackerServer = pTrackerServer; - r = tracker_mem_get_status(pTrackerServer, pStatus); + r = fdfs_get_tracker_status(pTrackerServer, pStatus); if (r == 0) { pStatus++; @@ -5422,6 +5356,7 @@ void tracker_mem_find_trunk_servers() tracker_mem_find_trunk_server(*ppGroup, true); } } + g_trunk_server_chg_count++; pthread_mutex_unlock(&mem_thread_lock); } diff --git a/tracker/tracker_mem.h b/tracker/tracker_mem.h index 61073bf..9e414ea 100644 --- a/tracker/tracker_mem.h +++ b/tracker/tracker_mem.h @@ -25,13 +25,6 @@ #define TRUNK_SERVER_CHANGELOG_FILENAME "trunk_server_change.log" #define STORAGE_DATA_FIELD_SEPERATOR ',' -typedef struct { - ConnectionInfo *pTrackerServer; - int running_time; //running seconds, more means higher weight - int restart_interval; //restart interval, less mean higher weight - bool if_leader; //if leader -} TrackerRunningStatus; - #ifdef __cplusplus extern "C" { #endif @@ -124,9 +117,6 @@ int tracker_mem_get_storage_index(FDFSGroupInfo *pGroup, \ void 
tracker_calc_running_times(TrackerRunningStatus *pStatus); -int tracker_mem_get_status(ConnectionInfo *pTrackerServer, \ - TrackerRunningStatus *pStatus); - int tracker_save_groups(); void tracker_mem_find_trunk_servers(); diff --git a/tracker/tracker_proto.c b/tracker/tracker_proto.c index 85f3e73..d3a6667 100644 --- a/tracker/tracker_proto.c +++ b/tracker/tracker_proto.c @@ -617,3 +617,69 @@ int fdfs_get_ini_context_from_tracker(TrackerServerGroup *pTrackerGroup, \ return EINTR; } +int fdfs_get_tracker_status(ConnectionInfo *pTrackerServer, \ + TrackerRunningStatus *pStatus) +{ + char in_buff[1 + 2 * FDFS_PROTO_PKG_LEN_SIZE]; + TrackerHeader header; + char *pInBuff; + ConnectionInfo *conn; + int64_t in_bytes; + int result; + + pTrackerServer->sock = -1; + if ((conn=tracker_connect_server(pTrackerServer, &result)) == NULL) + { + return result; + } + + do + { + memset(&header, 0, sizeof(header)); + header.cmd = TRACKER_PROTO_CMD_TRACKER_GET_STATUS; + if ((result=tcpsenddata_nb(conn->sock, &header, \ + sizeof(header), g_fdfs_network_timeout)) != 0) + { + logError("file: "__FILE__", line: %d, " \ + "send data to tracker server %s:%d fail, " \ + "errno: %d, error info: %s", __LINE__, \ + pTrackerServer->ip_addr, \ + pTrackerServer->port, \ + result, STRERROR(result)); + + result = (result == ENOENT ? 
EACCES : result); + break; + } + + pInBuff = in_buff; + result = fdfs_recv_response(conn, &pInBuff, \ + sizeof(in_buff), &in_bytes); + if (result != 0) + { + break; + } + + if (in_bytes != sizeof(in_buff)) + { + logError("file: "__FILE__", line: %d, " \ + "tracker server %s:%d response data " \ + "length: %"PRId64" is invalid, " \ + "expect length: %d.", __LINE__, \ + pTrackerServer->ip_addr, pTrackerServer->port, \ + in_bytes, (int)sizeof(in_buff)); + result = EINVAL; + break; + } + + pStatus->if_leader = *in_buff; + pStatus->running_time = buff2long(in_buff + 1); + pStatus->restart_interval = buff2long(in_buff + 1 + \ + FDFS_PROTO_PKG_LEN_SIZE); + + } while (0); + + tracker_disconnect_server_ex(conn, result != 0); + + return result; +} + diff --git a/tracker/tracker_proto.h b/tracker/tracker_proto.h index cf293b7..3384f8c 100644 --- a/tracker/tracker_proto.h +++ b/tracker/tracker_proto.h @@ -36,13 +36,14 @@ #define TRACKER_PROTO_CMD_STORAGE_GET_SERVER_ID 70 //get storage server id from tracker #define TRACKER_PROTO_CMD_STORAGE_FETCH_STORAGE_IDS 69 //get all storage ids from tracker -#define TRACKER_PROTO_CMD_TRACKER_GET_SYS_FILES_START 61 //start of tracker get system data files -#define TRACKER_PROTO_CMD_TRACKER_GET_SYS_FILES_END 62 //end of tracker get system data files -#define TRACKER_PROTO_CMD_TRACKER_GET_ONE_SYS_FILE 63 //tracker get a system data file -#define TRACKER_PROTO_CMD_TRACKER_GET_STATUS 64 //tracker get status of other tracker -#define TRACKER_PROTO_CMD_TRACKER_PING_LEADER 65 //tracker ping leader -#define TRACKER_PROTO_CMD_TRACKER_NOTIFY_NEXT_LEADER 66 //notify next leader to other trackers -#define TRACKER_PROTO_CMD_TRACKER_COMMIT_NEXT_LEADER 67 //commit next leader to other trackers +#define TRACKER_PROTO_CMD_TRACKER_GET_SYS_FILES_START 61 //start of tracker get system data files +#define TRACKER_PROTO_CMD_TRACKER_GET_SYS_FILES_END 62 //end of tracker get system data files +#define TRACKER_PROTO_CMD_TRACKER_GET_ONE_SYS_FILE 63 //tracker get a 
system data file +#define TRACKER_PROTO_CMD_TRACKER_GET_STATUS 64 //tracker get status of other tracker +#define TRACKER_PROTO_CMD_TRACKER_PING_LEADER 65 //tracker ping leader +#define TRACKER_PROTO_CMD_TRACKER_NOTIFY_NEXT_LEADER 66 //notify next leader to other trackers +#define TRACKER_PROTO_CMD_TRACKER_COMMIT_NEXT_LEADER 67 //commit next leader to other trackers +#define TRACKER_PROTO_CMD_TRACKER_NOTIFY_RESELECT_LEADER 68 //storage notify reselect leader when split-brain #define TRACKER_PROTO_CMD_SERVER_LIST_ONE_GROUP 90 #define TRACKER_PROTO_CMD_SERVER_LIST_ALL_GROUPS 91 @@ -276,6 +277,9 @@ int fdfs_get_ini_context_from_tracker(TrackerServerGroup *pTrackerGroup, \ IniContext *iniContext, bool * volatile continue_flag, \ const bool client_bind_addr, const char *bind_addr); +int fdfs_get_tracker_status(ConnectionInfo *pTrackerServer, \ + TrackerRunningStatus *pStatus); + #ifdef __cplusplus } #endif diff --git a/tracker/tracker_relationship.c b/tracker/tracker_relationship.c index bd48ce8..7eb6e88 100644 --- a/tracker/tracker_relationship.c +++ b/tracker/tracker_relationship.c @@ -191,7 +191,7 @@ static int relationship_get_tracker_leader(TrackerRunningStatus *pTrackerStatus) pTrackerServer<pTrackerEnd; pTrackerServer++) { pStatus->pTrackerServer = pTrackerServer; - r = tracker_mem_get_status(pTrackerServer, pStatus); + r = fdfs_get_tracker_status(pTrackerServer, pStatus); if (r == 0) { pStatus++; diff --git a/tracker/tracker_service.c b/tracker/tracker_service.c index 3dfc584..252b826 100644 --- a/tracker/tracker_service.c +++ b/tracker/tracker_service.c @@ -879,7 +879,7 @@ static int tracker_deal_notify_next_leader(struct fast_task_info *pTask) g_tracker_leader_chg_count++; logError("file: "__FILE__", line: %d, " \ - "client ip: %s, two leader occur, " \ + "client ip: %s, two leaders occur, " \ "new leader is %s:%d", \ __LINE__, pTask->client_ip, \ leader.ip_addr, leader.port); @@ -1644,6 +1644,44 @@ static int tracker_deal_ping_leader(struct fast_task_info *pTask) return 0; } +static int 
tracker_deal_reselect_leader(struct fast_task_info *pTask) +{ + TrackerClientInfo *pClientInfo; + + pClientInfo = (TrackerClientInfo *)pTask->arg; + if (pTask->length - sizeof(TrackerHeader) != 0) + { + logError("file: "__FILE__", line: %d, " \ + "cmd=%d, client ip: %s, package size " \ + PKG_LEN_PRINTF_FORMAT" is not correct, " \ + "expect length 0", __LINE__, \ + TRACKER_PROTO_CMD_TRACKER_NOTIFY_RESELECT_LEADER, \ + pTask->client_ip, \ + pTask->length - (int)sizeof(TrackerHeader)); + pTask->length = sizeof(TrackerHeader); + return EINVAL; + } + + pTask->length = sizeof(TrackerHeader); + if (!g_if_leader_self) + { + logError("file: "__FILE__", line: %d, " \ + "cmd=%d, client ip: %s, i am not the leader!", \ + __LINE__, TRACKER_PROTO_CMD_TRACKER_NOTIFY_RESELECT_LEADER, \ + pTask->client_ip); + return EOPNOTSUPP; + } + + g_if_leader_self = false; + g_tracker_servers.leader_index = -1; + g_tracker_leader_chg_count++; + + logWarning("file: "__FILE__", line: %d, " \ + "client ip: %s, i be notified that two leaders occur, " \ + "should re-select leader", __LINE__, pTask->client_ip); + return 0; +} + static int tracker_unlock_by_client(struct fast_task_info *pTask) { if (lock_by_client_count <= 0 || pTask->finish_callback == NULL) @@ -3730,6 +3768,9 @@ int tracker_deal_task(struct fast_task_info *pTask) case TRACKER_PROTO_CMD_TRACKER_COMMIT_NEXT_LEADER: result = tracker_deal_commit_next_leader(pTask); break; + case TRACKER_PROTO_CMD_TRACKER_NOTIFY_RESELECT_LEADER: + result = tracker_deal_reselect_leader(pTask); + break; default: logError("file: "__FILE__", line: %d, " \ "client ip: %s, unkown cmd: %d", \ diff --git a/tracker/tracker_types.h b/tracker/tracker_types.h index 8e3b710..fdd116f 100644 --- a/tracker/tracker_types.h +++ b/tracker/tracker_types.h @@ -433,5 +433,12 @@ typedef struct { char **paths; //file store paths } FDFSStorePaths; +typedef struct { + ConnectionInfo *pTrackerServer; + int running_time; //running seconds, more means higher weight + int 
restart_interval; //restart interval, less mean higher weight + bool if_leader; //if leader +} TrackerRunningStatus; + #endif