Moved tsv handling to doreah

Krateng 2019-03-29 20:23:32 +01:00
parent 5765687f9d
commit 2f30157b04
3 changed files with 132 additions and 123 deletions

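For orientation, here is a rough mapping of the old utilities.py helpers to the doreah.tsv calls as they appear in this diff; it assumes the new module mirrors the old signatures, with a comments= keyword taking over the role of escape=.

    # Old helper (utilities.py)              ->  New call (doreah.tsv) as used in this commit
    # parseTSV(file, *types, escape=...)     ->  tsv.parse(file, *types)
    # parseAllTSV(dir, *types, escape=...)   ->  tsv.parse_all(dir, *types, comments=...)
    # createTSV(file)                        ->  tsv.create(file)
    # addEntry(file, entry, escape=...)      ->  tsv.add_entry(file, entry)
    # addEntries(file, entries, escape=...)  ->  tsv.add_entries(file, entries, comments=...)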
View File

@@ -1,41 +1,42 @@
import re
import utilities
from doreah import tsv
# need to do this as a class so it can retain loaded settings from file
# apparently this is not true
# I'm dumb
class CleanerAgent:
def __init__(self):
self.updateRules()
def updateRules(self):
raw = utilities.parseAllTSV("rules","string","string","string")
raw = tsv.parse_all("rules","string","string","string")
self.rules_belongtogether = [b for [a,b,c] in raw if a=="belongtogether"]
self.rules_notanartist = [b for [a,b,c] in raw if a=="notanartist"]
self.rules_replacetitle = {b:c for [a,b,c] in raw if a=="replacetitle"}
self.rules_replaceartist = {b:c for [a,b,c] in raw if a=="replaceartist"}
# we always need to be able to tell if our current database is made with the current rules
self.checksums = utilities.checksumTSV("rules")
def fullclean(self,artist,title):
artists = self.parseArtists(self.removespecial(artist))
title = self.parseTitle(self.removespecial(title))
(title,moreartists) = self.parseTitleForArtists(title)
artists += moreartists
artists = list(set(artists))
artists.sort()
return (artists,title)
def removespecial(self,s):
s = s.replace("\t","").replace("","").replace("\n","")
s = re.sub(" +"," ",s)
return s
# if an artist appears in any created rule, we can assume that artist is meant to exist and be spelled like that
def confirmedReal(self,a):
@@ -51,54 +52,54 @@ class CleanerAgent:
if a.strip() == "":
return []
if a.strip() in self.rules_notanartist:
return []
if " performing " in a.lower():
return self.parseArtists(re.split(" [Pp]erforming",a)[0])
if a.strip() in self.rules_belongtogether:
return [a.strip()]
if a.strip() in self.rules_replaceartist:
return self.rules_replaceartist[a.strip()].split("")
for d in self.delimiters_feat:
if re.match(r"(.*) \(" + d + " (.*)\)",a) is not None:
return self.parseArtists(re.sub(r"(.*) \(" + d + " (.*)\)",r"\1",a)) + self.parseArtists(re.sub(r"(.*) \(" + d + " (.*)\)",r"\2",a))
for d in self.delimiters_formal:
if (d in a):
ls = []
for i in a.split(d):
ls += self.parseArtists(i)
return ls
for d in (self.delimiters_feat + self.delimiters):
if ((" " + d + " ") in a):
ls = []
for i in a.split(" " + d + " "):
ls += self.parseArtists(i)
return ls
return [a.strip()]
def parseTitle(self,t):
if t.strip() in self.rules_replacetitle:
return self.rules_replacetitle[t.strip()]
t = t.replace("[","(").replace("]",")")
t = re.sub(r" \(as made famous by .*?\)","",t)
t = re.sub(r" \(originally by .*?\)","",t)
t = re.sub(r" \(.*?Remaster.*?\)","",t)
return t.strip()
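A rough usage sketch of the cleaning path above; the inputs are invented, and it assumes "feat." is among delimiters_feat (the delimiter lists themselves are outside this hunk).

    ca = CleanerAgent()
    artists, title = ca.fullclean("Artist A feat. Artist B", "Some Song [Remastered 2011]")
    # parseArtists splits on the feature delimiter, parseTitle normalises the
    # brackets and strips the "(... Remaster ...)" suffix:
    # artists == ["Artist A", "Artist B"], title == "Some Song"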
def parseTitleForArtists(self,t):
@@ -115,30 +116,30 @@ class CleanerAgent:
(title,artists) = self.parseTitleForArtists(re.sub(r"(.*) " + d + " (.*)",r"\1",t))
artists += self.parseArtists(re.sub(r"(.*) " + d + " (.*).*",r"\2",t))
return (title,artists)
return (t,[])
#this is for all the runtime changes (counting Trouble Maker as HyunA for charts etc)
class CollectorAgent:
def __init__(self):
self.updateRules()
def updateRules(self):
raw = utilities.parseAllTSV("rules","string","string","string")
raw = tsv.parse_all("rules","string","string","string")
self.rules_countas = {b:c for [a,b,c] in raw if a=="countas"}
self.rules_include = {} #Twice the memory, double the performance! (Yes, we're saving redundant information here, but it's not inelegant if it's within a closed object!)
for a in self.rules_countas:
self.rules_include[self.rules_countas[a]] = self.rules_include.setdefault(self.rules_countas[a],[]) + [a]
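A tiny worked example of the inversion performed above, using the Trouble Maker / HyunA case mentioned in the comment; the loop is the same as in updateRules.

    rules_countas = {"Trouble Maker": "HyunA"}   # scrobbles of the key are credited to the value
    rules_include = {}
    for a in rules_countas:
        rules_include[rules_countas[a]] = rules_include.setdefault(rules_countas[a], []) + [a]
    # rules_include == {"HyunA": ["Trouble Maker"]}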
# this agent needs to be aware of the current id assignment in the main program. inelegant, but the best way I can think of
def updateIDs(self,artistlist):
self.rules_countas_id = {artistlist.index(a):artistlist.index(self.rules_countas[a]) for a in self.rules_countas}
#self.rules_include_id = {artistlist.index(a):artistlist.index(self.rules_include[a]) for a in self.rules_include}
#this needs to take lists into account
def getCredited(self,artist):
if artist in self.rules_countas_id:
return self.rules_countas_id[artist]
@@ -146,36 +147,36 @@ class CollectorAgent:
return self.rules_countas[artist]
else:
return artist
def getCreditedList(self,artists):
updatedArtists = []
for artist in artists:
updatedArtists.append(self.getCredited(artist))
return list(set(updatedArtists))
def getAllAssociated(self,artist):
return self.rules_include.get(artist,[])
# this function is there to check for artists that we should include in the database even though they never have any scrobble. important to avoid bugs when
# countas rules are declared preemptively
def getAllArtists(self):
return list(set([a for a in self.rules_countas] + [self.rules_countas[a] for a in self.rules_countas]))
def flatten(lis):
newlist = []
for l in lis:
if isinstance(l, str):
newlist.append(l)
else:
newlist = newlist + l
return list(set(newlist))
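A short usage sketch for the CollectorAgent lookups and the flatten helper, continuing the invented countas rule from above; only the rule-based branches are exercised here (the id-based ones require updateIDs to have run).

    col = CollectorAgent()          # assuming the rules contain:  countas  Trouble Maker  HyunA
    col.getAllAssociated("HyunA")   # -> ["Trouble Maker"]
    col.getAllArtists()             # -> ["HyunA", "Trouble Maker"] in some order
    flatten(["HyunA", ["Artist A", "Artist B"]])   # -> all three names, deduplicated, order not guaranteed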

View File

@@ -7,6 +7,7 @@ import datetime
from cleanup import *
from utilities import *
from doreah.logging import log
from doreah import tsv
from malojatime import *
import sys
import unicodedata
@@ -39,8 +40,10 @@ db_rulestate = False
### symmetric keys are fine for now since we hopefully use HTTPS
def loadAPIkeys():
global clients
createTSV("clients/authenticated_machines.tsv")
clients = parseTSV("clients/authenticated_machines.tsv","string","string")
tsv.create("clients/authenticated_machines.tsv")
#createTSV("clients/authenticated_machines.tsv")
clients = tsv.parse("clients/authenticated_machines.tsv","string","string")
#clients = parseTSV("clients/authenticated_machines.tsv","string","string")
log("Authenticated Machines: " + ", ".join([m[1] for m in clients]))
def checkAPIkey(k):
@@ -550,7 +553,8 @@ def newrule():
keys = FormsDict.decode(request.forms)
apikey = keys.pop("key",None)
if (checkAPIkey(apikey)):
addEntry("rules/webmade.tsv",[k for k in keys])
tsv.add_entry("rules/webmade.tsv",[k for k in keys])
#addEntry("rules/webmade.tsv",[k for k in keys])
global db_rulestate
db_rulestate = False
@@ -742,7 +746,8 @@ def build_db():
# parse files
db = parseAllTSV("scrobbles","int","string","string",escape=False)
db = tsv.parse_all("scrobbles","int","string","string",comments=False)
#db = parseAllTSV("scrobbles","int","string","string",escape=False)
for sc in db:
artists = sc[1].split("")
title = sc[2]
@@ -803,7 +808,8 @@ def sync():
SCROBBLES[idx] = (SCROBBLES[idx][0],SCROBBLES[idx][1],True)
for e in entries:
addEntries("scrobbles/" + e + ".tsv",entries[e],escape=False)
tsv.add_entries("scrobbles/" + e + ".tsv",entries[e],comments=False)
#addEntries("scrobbles/" + e + ".tsv",entries[e],escape=False)
combineChecksums("scrobbles/" + e + ".tsv",cla.checksums)

View File

@@ -6,48 +6,49 @@ import pickle
import urllib
import datetime
from doreah import settings
from doreah.logging import log
### TSV files
def parseTSV(filename,*args,escape=True):
f = open(filename)
#def parseTSV(filename,*args,escape=True):
# f = open(filename)
#
# result = []
# for l in [l for l in f if (not l.startswith("#")) and (not l.strip()=="")]:
#
# l = l.replace("\n","")
# if escape:
# l = l.split("#")[0]
# l = l.replace(r"\num","#") # translate escape sequences even if we don't support comments in the file and they are not actually necessary (they might still be used for some reason)
# data = list(filter(None,l.split("\t"))) # Multiple tabs are okay, we don't accept empty fields unless trailing
# entry = [] * len(args)
# for i in range(len(args)):
# if args[i]=="list":
# try:
# entry.append(data[i].split("␟"))
# except:
# entry.append([])
# elif args[i]=="string":
# try:
# entry.append(data[i])
# except:
# entry.append("")
# elif args[i]=="int":
# try:
# entry.append(int(data[i]))
# except:
# entry.append(0)
# elif args[i]=="bool":
# try:
# entry.append((data[i].lower() in ["true","yes","1","y"]))
# except:
# entry.append(False)
#
# result.append(entry)
result = []
for l in [l for l in f if (not l.startswith("#")) and (not l.strip()=="")]:
l = l.replace("\n","")
if escape:
l = l.split("#")[0]
l = l.replace(r"\num","#") # translate escape sequences even if we don't support comments in the file and they are not actually necessary (they might still be used for some reason)
data = list(filter(None,l.split("\t"))) # Multiple tabs are okay, we don't accept empty fields unless trailing
entry = [] * len(args)
for i in range(len(args)):
if args[i]=="list":
try:
entry.append(data[i].split(""))
except:
entry.append([])
elif args[i]=="string":
try:
entry.append(data[i])
except:
entry.append("")
elif args[i]=="int":
try:
entry.append(int(data[i]))
except:
entry.append(0)
elif args[i]=="bool":
try:
entry.append((data[i].lower() in ["true","yes","1","y"]))
except:
entry.append(False)
result.append(entry)
f.close()
return result
# f.close()
# return result
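A worked example of the comment and escape handling in the parser above (which the doreah.tsv replacement is presumably expected to replicate); the rule line is invented.

    # how the parser treats a hypothetical rules line:
    line = "replacetitle\tSong \\num2\tSong No. 2\t# added by hand\n"
    line = line.replace("\n", "")
    line = line.split("#")[0]              # comment support: everything after '#' is dropped
    line = line.replace(r"\num", "#")      # the \num escape becomes a literal '#' again
    data = list(filter(None, line.split("\t")))
    # data == ["replacetitle", "Song #2", "Song No. 2"]
    # the *args types then coerce each field; missing trailing fields fall back to "" / 0 / False / []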
def checksumTSV(folder):
@@ -110,40 +111,40 @@ def consistentRulestate(folder,checksums):
return True
def parseAllTSV(path,*args,escape=True):
#def parseAllTSV(path,*args,escape=True):
#
#
# result = []
# for f in os.listdir(path + "/"):
#
# if (f.endswith(".tsv")):
#
# result += parseTSV(path + "/" + f,*args,escape=escape)
#
# return result
#def createTSV(filename):
#
# if not os.path.exists(filename):
# open(filename,"w").close()
result = []
for f in os.listdir(path + "/"):
if (f.endswith(".tsv")):
result += parseTSV(path + "/" + f,*args,escape=escape)
return result
def createTSV(filename):
if not os.path.exists(filename):
open(filename,"w").close()
def addEntry(filename,a,escape=True):
createTSV(filename)
line = "\t".join(a)
if escape: line = line.replace("#",r"\num")
with open(filename,"a") as f:
f.write(line + "\n")
def addEntries(filename,al,escape=True):
with open(filename,"a") as f:
for a in al:
line = "\t".join(a)
if escape: line = line.replace("#",r"\num")
f.write(line + "\n")
#def addEntry(filename,a,escape=True):
#
# createTSV(filename)
#
# line = "\t".join(a)
# if escape: line = line.replace("#",r"\num")
# with open(filename,"a") as f:
# f.write(line + "\n")
#def addEntries(filename,al,escape=True):
#
# with open(filename,"a") as f:
# for a in al:
# line = "\t".join(a)
# if escape: line = line.replace("#",r"\num")
# f.write(line + "\n")
#
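The write side mirrors the escape logic: fields are tab-joined and literal '#' is stored as \num. A minimal sketch of the line addEntry produces (and which tsv.add_entry is presumably expected to produce as well), with an invented rule:

    entry = ["replacetitle", "Song #2", "Song No. 2"]
    line = "\t".join(entry).replace("#", r"\num")
    # line == "replacetitle\tSong \num2\tSong No. 2"
    with open("rules/webmade.tsv", "a") as f:
        f.write(line + "\n")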
### Useful functions
@@ -273,6 +274,7 @@ def cache_track(artists,title,result):
day = datetime.date.today().toordinal()
cachedTracksDays[(frozenset(artists),title)] = day
def cache_artist(artist,result):
if result is None: log("Caching None for " + artist,module="debug")
cachedArtists[artist] = result
day = datetime.date.today().toordinal()
cachedArtistsDays[artist] = day
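A hedged sketch of how the stored ordinal day could be used to expire cached artist info; the actual expiry check is outside this diff, and the threshold below is invented. It relies on the module's cachedArtistsDays dict filled in by cache_artist above.

    import datetime

    CACHE_MAX_AGE_DAYS = 30   # invented threshold

    def cache_is_stale(artist):
        day_cached = cachedArtistsDays.get(artist)
        if day_cached is None:
            return True
        return datetime.date.today().toordinal() - day_cached > CACHE_MAX_AGE_DAYS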