Browse Source

Zone watchdog logic -- attempt to force the shutdown of a hung/frozen zone

After a zone process has been hung for 30 seconds, attempt a normal shutdown. If it is still hung at 60 seconds, force the SpawnProcess/ZoneProcess threads to cancel (only Linux supports this 60s thread-cancel feature). On Windows, the 60s+ stage is only a notification and does not attempt to cancel threads.

The second stage on Linux may or may not succeed (it either shuts the zone down or crashes the world). The idea is that this is a worst-case attempt to recover and to make the server owner aware that the zone is hung.
image 4 years ago
parent
commit
dc98a6035e

+ 41 - 2
EQ2/source/WorldServer/World.cpp

@@ -87,7 +87,7 @@ extern LoginServer loginserver;
 extern World world;
 extern RuleManager rule_manager;
 
-World::World() : save_time_timer(300000), time_tick_timer(3000), vitality_timer(3600000), player_stats_timer(60000), server_stats_timer(60000), /*remove_grouped_player(30000),*/ guilds_timer(60000), lotto_players_timer(500) {
+World::World() : save_time_timer(300000), time_tick_timer(3000), vitality_timer(3600000), player_stats_timer(60000), server_stats_timer(60000), /*remove_grouped_player(30000),*/ guilds_timer(60000), lotto_players_timer(500), watchdog_timer(10000) {
 	save_time_timer.Start();
 	time_tick_timer.Start();
 	vitality_timer.Start();
@@ -96,6 +96,7 @@ World::World() : save_time_timer(300000), time_tick_timer(3000), vitality_timer(
 	//remove_grouped_player.Start();
 	guilds_timer.Start();
 	lotto_players_timer.Start();
+	watchdog_timer.Start();
 	xp_rate = -1;
 	ts_xp_rate = -1;
 	vitality_frequency = 0xFFFFFFFF;
@@ -268,6 +269,8 @@ void World::Process(){
 		SaveGuilds();
 	if (lotto_players_timer.Check())
 		CheckLottoPlayers();
+	if(watchdog_timer.Check())
+		zone_list.WatchdogHeartbeat();
 }
 
 vector<Variable*>* World::GetClientVariables(){
@@ -2302,4 +2305,40 @@ void World::PurgeStartingLists()
 	starting_spells.clear();
 
 	MStartingLists.releasewritelock();
-}
+}
+
+/**
+ * Watchdog sweep over every zone in zlist; invoked from World::Process each
+ * time watchdog_timer fires.
+ *
+ * For each zone the age of its watchdog timestamp (refreshed by
+ * ZoneServer::Process) is computed and handled in two stages:
+ *   - age > 30000 and the zone is not already shutting down: log the hang and
+ *     request a normal Shutdown().
+ *   - age > 60000: log the hang, call CancelThreads(), drop the zone from
+ *     zlist and delete it. Only one zone is torn down per sweep.
+ *
+ * The thresholds are presumably milliseconds (30s / 60s per the commit
+ * description) -- the unit of Timer::GetCurrentTime2() is not visible here.
+ */
+void ZoneList::WatchdogHeartbeat()
+{
+	ZoneServer* hung_zone = 0;
+
+	MZoneList.writelock(__FUNCTION__, __LINE__);
+	for (list<ZoneServer*>::iterator zone_iter = zlist.begin(); zone_iter != zlist.end(); ++zone_iter)
+	{
+		ZoneServer* zone = *zone_iter;
+		if (!zone)
+			continue;
+
+		// Widen both timestamps before subtracting so the age is a signed 64-bit value.
+		sint64 diff = (sint64)Timer::GetCurrentTime2() - (sint64)zone->GetWatchdogTime();
+		if (diff > 60000)
+		{
+			zone->SetWatchdogTime(Timer::GetCurrentTime2()); // reset so we don't continuously flood this heartbeat
+			// diff is 64-bit: it must be printed with a matching conversion, not %i.
+			LogWrite(WORLD__ERROR, 1, "World", "Zone %s is hung for %lld.. attempting to cancel threads...", zone->GetZoneName(), (long long)diff);
+			zone->CancelThreads();
+			// Remove the entry while the list lock is still held; otherwise zlist keeps a
+			// pointer to the zone deleted below and the next sweep would dereference it.
+			// NOTE(review): if ~ZoneServer already unregisters itself this erase is redundant but harmless -- confirm.
+			zlist.erase(zone_iter);
+			hung_zone = zone;
+			break; // zone_iter is invalid after erase; stop this sweep
+		}
+		else if (diff > 30000 && !zone->isZoneShuttingDown())
+		{
+			LogWrite(WORLD__ERROR, 1, "World", "Zone %s is hung for %lld.. attempting shutdown", zone->GetZoneName(), (long long)diff);
+			zone->Shutdown();
+		}
+	}
+	MZoneList.releasewritelock(__FUNCTION__, __LINE__);
+
+	// Delete only after the list lock is released, matching the original ordering
+	// (presumably so teardown can take MZoneList itself -- TODO confirm).
+	// NOTE(review): on WIN32 CancelThreads() only logs, so the zone threads may still be
+	// running when the object is deleted here -- confirm the destructor copes with that.
+	safe_delete(hung_zone);
+}
+
+

+ 4 - 0
EQ2/source/WorldServer/World.h

@@ -484,6 +484,8 @@ class ZoneList {
 	void ShutDownZones();
 	void ReloadMail();
 	void ReloadSpawns();
+
+	void WatchdogHeartbeat();
 private:
 	Mutex				MClientList;
 	Mutex				MZoneList;
@@ -650,6 +652,8 @@ private:
 	Timer lotto_players_timer;
 	Timer group_buff_updates;
 
+	Timer watchdog_timer;
+
 	map<int32, HouseZone*> m_houseZones;
 	// Map <house id, map<char id, player house>>
 	map<int32, map<int32, PlayerHouse*> > m_playerHouses;

+ 21 - 6
EQ2/source/WorldServer/zoneserver.cpp

@@ -161,6 +161,7 @@ ZoneServer::ZoneServer(const char* name) {
 	strcpy(zonesky_file,"");
 	
 	reloading = true;
+	watchdogTimestamp = Timer::GetCurrentTime2();
 }
 
 ZoneServer::~ZoneServer() {
@@ -309,12 +310,19 @@ void ZoneServer::Init()
 	_beginthread(ZoneLoop, 0, this);
 	_beginthread(SpawnLoop, 0, this);
 #else
-	pthread_t thread;
-	pthread_create(&thread, NULL, ZoneLoop, this);
-	pthread_detach(thread);
-	pthread_t thread2;
-	pthread_create(&thread2, NULL, SpawnLoop, this);
-	pthread_detach(thread2);
+	pthread_create(&ZoneThread, NULL, ZoneLoop, this);
+	pthread_detach(ZoneThread);
+	pthread_create(&SpawnThread, NULL, SpawnLoop, this);
+	pthread_detach(SpawnThread);
+#endif
+}
+
+/**
+ * Last-resort watchdog action: ask the zone's worker threads to stop.
+ *
+ * On WIN32 nothing is cancelled; the condition is only logged.
+ * Elsewhere a cancellation request is sent to ZoneThread and SpawnThread, the
+ * threads started (and detached) in ZoneServer::Init. A non-zero return from
+ * pthread_cancel is logged so a failed request is visible to the operator.
+ */
+void ZoneServer::CancelThreads() {
+#ifdef WIN32
+	LogWrite(WORLD__ERROR, 1, "World", "Zone %s is hung, however CancelThreads is unsupported for WIN32.", GetZoneName());
+#else
+	// Both requests are always issued, even if the first one fails.
+	int zone_rc = pthread_cancel(ZoneThread);
+	int spawn_rc = pthread_cancel(SpawnThread);
+	if (zone_rc != 0 || spawn_rc != 0)
+		LogWrite(WORLD__ERROR, 1, "World", "Zone %s: pthread_cancel failed (zone thread rc=%i, spawn thread rc=%i).", GetZoneName(), zone_rc, spawn_rc);
+#endif
+}
 
@@ -1249,11 +1257,13 @@ void ZoneServer::RemoveDamagedSpawn(Spawn* spawn){
 bool ZoneServer::Process()
 {
 	MMasterZoneLock->lock(); //Changing this back to a recursive lock to fix a possible /reload spells crash with multiple zones running - Foof
+	SetWatchdogTime(Timer::GetCurrentTime2());
 #ifndef NO_CATCH
 	try
 	{
 #endif
 			while (zoneID == 0) { //this is loaded by world
+				SetWatchdogTime(Timer::GetCurrentTime2());
 				Sleep(10);
 			}
 
@@ -1324,6 +1334,7 @@ bool ZoneServer::Process()
 
 			while (zonemap != nullptr && zonemap->IsMapLoading())
 			{
+				SetWatchdogTime(Timer::GetCurrentTime2());
 				// Client loop
 				ClientProcess();
 				Sleep(10);
@@ -6593,13 +6604,17 @@ ThreadReturnType SpawnLoop(void* tmp) {
 #ifndef NO_CATCH
 	}
 	catch(...) {
+		zs->spawnthread_active = false;
+		zs->initial_spawn_threads_active = 0;
 		LogWrite(ZONE__ERROR, 0, "Zone",  "Error Processing SpawnLoop, shutting down zone '%s'...", zs->GetZoneName());
 		try{
 			zs->Shutdown();
 		}
 		catch(...){
 			LogWrite(ZONE__ERROR, 0, "Zone",  "Error Processing SpawnLoop while shutting down zone '%s'...", zs->GetZoneName());
+			throw;
 		}
+		throw;
 	}
 #endif
 	THREAD_RETURN(NULL);

+ 10 - 0
EQ2/source/WorldServer/zoneserver.h

@@ -641,7 +641,16 @@ public:
 	void SendHouseItems(Client* client);
 
 	MutexMap<int32, int32>							house_object_database_lookup;						// 1st int32 = model type, 2nd int32 = spawn id
+
+	int32 GetWatchdogTime() { return watchdogTimestamp; } // Returns the most recently stored watchdog timestamp.
+	void SetWatchdogTime(int32 time) { watchdogTimestamp = time; } // Stores the given watchdog timestamp; callers in this change pass Timer::GetCurrentTime2().
+	void CancelThreads();
 private:
+#ifndef WIN32
+	pthread_t ZoneThread;
+	pthread_t SpawnThread;
+#endif
+
 	/* Private Functions */
 	void	AddTransporter(LocationTransportDestination* loc);
 	void	CheckDeadSpawnRemoval();
@@ -912,6 +921,7 @@ private:
 	// Map <transport if, map name>
 	map<int32, string> m_transportMaps;
 	
+	int32 watchdogTimestamp;
 public:
 	Spawn*				GetSpawn(int32 id);