From 2f30157b045d8c747714f64b5da4ee1bcbf3e093 Mon Sep 17 00:00:00 2001 From: Krateng Date: Fri, 29 Mar 2019 20:23:32 +0100 Subject: [PATCH] Moved tsv handling to doreah --- cleanup.py | 101 ++++++++++++++++++------------------- database.py | 16 ++++-- utilities.py | 138 ++++++++++++++++++++++++++------------------------- 3 files changed, 132 insertions(+), 123 deletions(-) diff --git a/cleanup.py b/cleanup.py index 662cea5..25f88bd 100644 --- a/cleanup.py +++ b/cleanup.py @@ -1,41 +1,42 @@ import re import utilities +from doreah import tsv # need to do this as a class so it can retain loaded settings from file # apparently this is not true # I'm dumb class CleanerAgent: - + def __init__(self): self.updateRules() - + def updateRules(self): - raw = utilities.parseAllTSV("rules","string","string","string") + raw = tsv.parse_all("rules","string","string","string") self.rules_belongtogether = [b for [a,b,c] in raw if a=="belongtogether"] self.rules_notanartist = [b for [a,b,c] in raw if a=="notanartist"] self.rules_replacetitle = {b:c for [a,b,c] in raw if a=="replacetitle"} self.rules_replaceartist = {b:c for [a,b,c] in raw if a=="replaceartist"} - + # we always need to be able to tell if our current database is made with the current rules self.checksums = utilities.checksumTSV("rules") - - - + + + def fullclean(self,artist,title): artists = self.parseArtists(self.removespecial(artist)) title = self.parseTitle(self.removespecial(title)) (title,moreartists) = self.parseTitleForArtists(title) - artists += moreartists + artists += moreartists artists = list(set(artists)) artists.sort() - + return (artists,title) def removespecial(self,s): s = s.replace("\t","").replace("␟","").replace("\n","") s = re.sub(" +"," ",s) return s - + # if an artist appears in any created rule, we can assume that artist is meant to exist and be spelled like that def confirmedReal(self,a): @@ -51,54 +52,54 @@ class CleanerAgent: if a.strip() == "": return [] - + if a.strip() in self.rules_notanartist: return [] - + if " performing " in a.lower(): return self.parseArtists(re.split(" [Pp]erforming",a)[0]) - + if a.strip() in self.rules_belongtogether: return [a.strip()] if a.strip() in self.rules_replaceartist: return self.rules_replaceartist[a.strip()].split("␟") - - - + + + for d in self.delimiters_feat: if re.match(r"(.*) \(" + d + " (.*)\)",a) is not None: return self.parseArtists(re.sub(r"(.*) \(" + d + " (.*)\)",r"\1",a)) + self.parseArtists(re.sub(r"(.*) \(" + d + " (.*)\)",r"\2",a)) - + for d in self.delimiters_formal: if (d in a): ls = [] for i in a.split(d): ls += self.parseArtists(i) return ls - + for d in (self.delimiters_feat + self.delimiters): if ((" " + d + " ") in a): ls = [] for i in a.split(" " + d + " "): ls += self.parseArtists(i) return ls - - - - - + + + + + return [a.strip()] def parseTitle(self,t): if t.strip() in self.rules_replacetitle: return self.rules_replacetitle[t.strip()] - + t = t.replace("[","(").replace("]",")") - + t = re.sub(r" \(as made famous by .*?\)","",t) t = re.sub(r" \(originally by .*?\)","",t) t = re.sub(r" \(.*?Remaster.*?\)","",t) - + return t.strip() def parseTitleForArtists(self,t): @@ -115,30 +116,30 @@ class CleanerAgent: (title,artists) = self.parseTitleForArtists(re.sub(r"(.*) " + d + " (.*)",r"\1",t)) artists += self.parseArtists(re.sub(r"(.*) " + d + " (.*).*",r"\2",t)) return (title,artists) - - return (t,[]) - - -#this is for all the runtime changes (counting Trouble Maker as HyunA for charts etc) + return (t,[]) + + + +#this is for all the runtime changes (counting Trouble Maker as HyunA for charts etc) class CollectorAgent: - + def __init__(self): self.updateRules() - + def updateRules(self): - raw = utilities.parseAllTSV("rules","string","string","string") + raw = tsv.parse_all("rules","string","string","string") self.rules_countas = {b:c for [a,b,c] in raw if a=="countas"} self.rules_include = {} #Twice the memory, double the performance! (Yes, we're saving redundant information here, but it's not unelegant if it's within a closed object!) for a in self.rules_countas: self.rules_include[self.rules_countas[a]] = self.rules_include.setdefault(self.rules_countas[a],[]) + [a] - - # this agent needs to be aware of the current id assignment in the main program. unelegant, but the best way i can think of + + # this agent needs to be aware of the current id assignment in the main program. unelegant, but the best way i can think of def updateIDs(self,artistlist): self.rules_countas_id = {artistlist.index(a):artistlist.index(self.rules_countas[a]) for a in self.rules_countas} #self.rules_include_id = {artistlist.index(a):artistlist.index(self.rules_include[a]) for a in self.rules_include} #this needs to take lists into account - + def getCredited(self,artist): if artist in self.rules_countas_id: return self.rules_countas_id[artist] @@ -146,36 +147,36 @@ class CollectorAgent: return self.rules_countas[artist] else: return artist - - + + def getCreditedList(self,artists): updatedArtists = [] for artist in artists: updatedArtists.append(self.getCredited(artist)) return list(set(updatedArtists)) - + def getAllAssociated(self,artist): return self.rules_include.get(artist,[]) - + # this function is there to check for artists that we should include in the database even though they never have any scrobble. important to avoid bugs when # countas rules are declared preemptively def getAllArtists(self): return list(set([a for a in self.rules_countas] + [self.rules_countas[a] for a in self.rules_countas])) - - - - - - - + + + + + + + def flatten(lis): newlist = [] - + for l in lis: if isinstance(l, str): newlist.append(l) else: newlist = newlist + l - + return list(set(newlist)) diff --git a/database.py b/database.py index 3d51ba7..fdc276b 100644 --- a/database.py +++ b/database.py @@ -7,6 +7,7 @@ import datetime from cleanup import * from utilities import * from doreah.logging import log +from doreah import tsv from malojatime import * import sys import unicodedata @@ -39,8 +40,10 @@ db_rulestate = False ### symmetric keys are fine for now since we hopefully use HTTPS def loadAPIkeys(): global clients - createTSV("clients/authenticated_machines.tsv") - clients = parseTSV("clients/authenticated_machines.tsv","string","string") + tsv.create("clients/authenticated_machines.tsv") + #createTSV("clients/authenticated_machines.tsv") + clients = tsv.parse("clients/authenticated_machines.tsv","string","string") + #clients = parseTSV("clients/authenticated_machines.tsv","string","string") log("Authenticated Machines: " + ", ".join([m[1] for m in clients])) def checkAPIkey(k): @@ -550,7 +553,8 @@ def newrule(): keys = FormsDict.decode(request.forms) apikey = keys.pop("key",None) if (checkAPIkey(apikey)): - addEntry("rules/webmade.tsv",[k for k in keys]) + tsv.add_entry("rules/webmade.tsv",[k for k in keys]) + #addEntry("rules/webmade.tsv",[k for k in keys]) global db_rulestate db_rulestate = False @@ -742,7 +746,8 @@ def build_db(): # parse files - db = parseAllTSV("scrobbles","int","string","string",escape=False) + db = tsv.parse_all("scrobbles","int","string","string",comments=False) + #db = parseAllTSV("scrobbles","int","string","string",escape=False) for sc in db: artists = sc[1].split("␟") title = sc[2] @@ -803,7 +808,8 @@ def sync(): SCROBBLES[idx] = (SCROBBLES[idx][0],SCROBBLES[idx][1],True) for e in entries: - addEntries("scrobbles/" + e + ".tsv",entries[e],escape=False) + tsv.add_entries("scrobbles/" + e + ".tsv",entries[e],comments=False) + #addEntries("scrobbles/" + e + ".tsv",entries[e],escape=False) combineChecksums("scrobbles/" + e + ".tsv",cla.checksums) diff --git a/utilities.py b/utilities.py index ab858fc..72a2899 100644 --- a/utilities.py +++ b/utilities.py @@ -6,48 +6,49 @@ import pickle import urllib import datetime from doreah import settings +from doreah.logging import log ### TSV files -def parseTSV(filename,*args,escape=True): - f = open(filename) +#def parseTSV(filename,*args,escape=True): +# f = open(filename) +# +# result = [] +# for l in [l for l in f if (not l.startswith("#")) and (not l.strip()=="")]: +# +# l = l.replace("\n","") +# if escape: +# l = l.split("#")[0] +# l = l.replace(r"\num","#") # translate escape sequences even if we don't support comments in the file and they are not actually necessary (they might still be used for some reason) +# data = list(filter(None,l.split("\t"))) # Multiple tabs are okay, we don't accept empty fields unless trailing +# entry = [] * len(args) +# for i in range(len(args)): +# if args[i]=="list": +# try: +# entry.append(data[i].split("␟")) +# except: +# entry.append([]) +# elif args[i]=="string": +# try: +# entry.append(data[i]) +# except: +# entry.append("") +# elif args[i]=="int": +# try: +# entry.append(int(data[i])) +# except: +# entry.append(0) +# elif args[i]=="bool": +# try: +# entry.append((data[i].lower() in ["true","yes","1","y"])) +# except: +# entry.append(False) +# +# result.append(entry) - result = [] - for l in [l for l in f if (not l.startswith("#")) and (not l.strip()=="")]: - - l = l.replace("\n","") - if escape: - l = l.split("#")[0] - l = l.replace(r"\num","#") # translate escape sequences even if we don't support comments in the file and they are not actually necessary (they might still be used for some reason) - data = list(filter(None,l.split("\t"))) # Multiple tabs are okay, we don't accept empty fields unless trailing - entry = [] * len(args) - for i in range(len(args)): - if args[i]=="list": - try: - entry.append(data[i].split("␟")) - except: - entry.append([]) - elif args[i]=="string": - try: - entry.append(data[i]) - except: - entry.append("") - elif args[i]=="int": - try: - entry.append(int(data[i])) - except: - entry.append(0) - elif args[i]=="bool": - try: - entry.append((data[i].lower() in ["true","yes","1","y"])) - except: - entry.append(False) - - result.append(entry) - - f.close() - return result +# f.close() +# return result def checksumTSV(folder): @@ -110,40 +111,40 @@ def consistentRulestate(folder,checksums): return True -def parseAllTSV(path,*args,escape=True): +#def parseAllTSV(path,*args,escape=True): +# +# +# result = [] +# for f in os.listdir(path + "/"): +# +# if (f.endswith(".tsv")): +# +# result += parseTSV(path + "/" + f,*args,escape=escape) +# +# return result +#def createTSV(filename): +# +# if not os.path.exists(filename): +# open(filename,"w").close() - result = [] - for f in os.listdir(path + "/"): - - if (f.endswith(".tsv")): - - result += parseTSV(path + "/" + f,*args,escape=escape) - - return result - -def createTSV(filename): - - if not os.path.exists(filename): - open(filename,"w").close() - -def addEntry(filename,a,escape=True): - - createTSV(filename) - - line = "\t".join(a) - if escape: line = line.replace("#",r"\num") - with open(filename,"a") as f: - f.write(line + "\n") - -def addEntries(filename,al,escape=True): - - with open(filename,"a") as f: - for a in al: - line = "\t".join(a) - if escape: line = line.replace("#",r"\num") - f.write(line + "\n") +#def addEntry(filename,a,escape=True): +# +# createTSV(filename) +# +# line = "\t".join(a) +# if escape: line = line.replace("#",r"\num") +# with open(filename,"a") as f: +# f.write(line + "\n") +#def addEntries(filename,al,escape=True): +# +# with open(filename,"a") as f: +# for a in al: +# line = "\t".join(a) +# if escape: line = line.replace("#",r"\num") +# f.write(line + "\n") +# ### Useful functions @@ -273,6 +274,7 @@ def cache_track(artists,title,result): day = datetime.date.today().toordinal() cachedTracksDays[(frozenset(artists),title)] = day def cache_artist(artist,result): + if result is None: log("Caching None for " + artist,module="debug") cachedArtists[artist] = result day = datetime.date.today().toordinal() cachedArtistsDays[artist] = day