From 0a11464ce60a2e9d2be585541d24661a7dc0a713 Mon Sep 17 00:00:00 2001
From: Zergling_man
Date: Wed, 17 Nov 2021 22:44:26 +1100
Subject: [PATCH] initcommit, I'll add .gitignore later

---
 README.md                    |  1 +
 config.json.example          |  4 ++
 getlyrics.py                 | 62 +++++++++++++++++++++++++++++++
 sources/README.md            | 14 +++++++
 sources/com_bandcamp.py      | 72 ++++++++++++++++++++++++++++++++++++
 sources/com_fandom_lyrics.py | 25 +++++++++++++
 6 files changed, 178 insertions(+)
 create mode 100644 README.md
 create mode 100644 config.json.example
 create mode 100644 getlyrics.py
 create mode 100644 sources/README.md
 create mode 100644 sources/com_bandcamp.py
 create mode 100644 sources/com_fandom_lyrics.py

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..d0abd9e
--- /dev/null
+++ b/README.md
@@ -0,0 +1 @@
+A lyrics fetching tool, written in python. Similarly designed to a package manager. Manages indexing itself, sources don't need to do that.
diff --git a/config.json.example b/config.json.example
new file mode 100644
index 0000000..04c9cbb
--- /dev/null
+++ b/config.json.example
@@ -0,0 +1,4 @@
+{
+"lyrics":"/path/to/lyrics/files",
+"indexcache":"~/.cache/known_songs"
+}
\ No newline at end of file
diff --git a/getlyrics.py b/getlyrics.py
new file mode 100644
index 0000000..331e634
--- /dev/null
+++ b/getlyrics.py
@@ -0,0 +1,62 @@
+import requests as r
+import sys
+import os
+import json as j
+
+os.chdir('/home/wisknort/lyricise') # Do this first so that the following work.
+with open('config.json') as b: conf=j.load(b)
+import importlib as il
+sources=[il.import_module(f'sources.{n[:-3]}') for n in os.listdir('sources') if n.endswith('.py')]
+sources=list(filter(lambda x:x.enabled,sources))
+class FlashyNotMountedError(Exception): """Raised by local() when the directory named by conf['lyrics'] does not exist."""
+
+def init(args):
+    """Split the argument words at the first '-' into capitalised (band, song) strings."""
+    if '-' not in args: sys.exit('usage: getlyrics.py <band words> - <song words>') # .index() would otherwise die with a bare ValueError
+    split=args.index('-')
+    band=' '.join([n.capitalize() for n in args[:split]])
+    song=' '.join([n.capitalize() for n in args[split+1:]])
+    return band,song
+
+def local(band,song):
+    """Return the text of '<song>_<band>.lyric' inside conf['lyrics'] ('' if that file is absent); chdirs there as a side effect and raises FlashyNotMountedError if the directory is missing."""
+    #All lyrics that are fetched from remote will automatically get saved in here for later reference
+    #Also, any time the existence of a song on a remote is proven (eg. the remote offers an album list for a band, which in turn offers a track list, and this needs to be parsed to get the correct URL), that will be indexed locally for future reference, in ./.indices
+    #Deleting that folder will reset that, and should always be safe.
+    #Indices will be one file per source, so deleting individual files will be part of uninstalling a source.
+    try:
+        os.chdir(conf['lyrics'])
+        prep=lambda x:x.lower().replace(' ','_')
+        try:
+            with open(f'{prep(song)}_{prep(band)}.lyric') as b:
+                lyrics=b.read()
+        except FileNotFoundError: lyrics=''
+        return lyrics
+    except FileNotFoundError:
+        raise FlashyNotMountedError
+
+def remote(band,song):
+    """Call each enabled source's lyrics(song,band) in turn and return the first non-empty result ('' if none)."""
+    #Will search all local indices first, and offer which repository to fetch from if multiple found
+    #But will *not* poll all remote repositories; if not found in local indices, will begin polling sources in whatever order the underlying (他框)walk decides, and will stop as soon as an exact match is found.
+    #If a partial match is found, will offer to user whether it's close enough.
+    if not sources:
+        print('No sources configured! Failed to fetch lyrics from remote!')
+        #TODO
+    lyrics=''
+    for source in sources:
+        try: lyrics=source.lyrics(song,band)
+        except Exception: raise # This should actually catch errors later
+        if lyrics: break
+    return lyrics
+
+if __name__=='__main__':
+    band,song=init(sys.argv[1:])
+    try: lyrics=local(band,song)
+    except FlashyNotMountedError:
+        if input("Flashy not mounted, proceed? (y/n)")!='y': sys.exit()
+        lyrics=''
+    if not lyrics:
+        print("Lyrics not found locally, searching for remote...")
+        lyrics=remote(band,song) or ''
+    if lyrics: print(lyrics)
diff --git a/sources/README.md b/sources/README.md
new file mode 100644
index 0000000..c62fa8e
--- /dev/null
+++ b/sources/README.md
@@ -0,0 +1,14 @@
+Not actually markdown, haha tricked ya!
+
+Documentation for files in this folder.
+
+1) Every file must be a python script
+2) It must be named with the domain it serves as the filename (starting from TLD and narrowing down; dots and slashes should be converted to underscores - the fetcher won't actually try to restore it and load the URL).
+3) It must expose an "enabled" global boolean that, if False, will cause the fetcher to ignore its existence entirely. This is useful for sources that are temporarily not working, or for sources that are pending proper updates to the source's changes, etc.
+4) It must expose a lyrics(song,band='',album='') function that returns song lyrics as plaintext (str), or as arranged lyrics (dict\[sections (list),arrangement (list)\]). It's recommended that if neither band nor album are given, the function should automatically return a failure, or explicitly recommend a search, rather than attempting to work with it.
+5) It may expose other functions, such as:
+- search(song,band='',album=''), which, as above, is recommended to automatically fail if only song is given
+- index(band,album=''), which should return a list of the band's tracks (on that album), for populating the local index
+- massfetch(band,album=''), which should fetch lyrics to all tracks of the band (on that album) and return them in a list
+- This will, at some point, become a definitive list of optional functions that the fetcher supports; it will always accept recommendations for change.
+6) Input formats: All names will be passed in exactly as the user presents the information, it is the source's job to wrangle it correctly into URLs. For example, "wither." is an album name that may be given to the source. It should generally assume that words will be separated with spaces, but there's no guarantee. Also it may occasionally get bands and songs passed in as each other. It isn't expected to figure that out.
diff --git a/sources/com_bandcamp.py b/sources/com_bandcamp.py
new file mode 100644
index 0000000..220769c
--- /dev/null
+++ b/sources/com_bandcamp.py
@@ -0,0 +1,72 @@
+enabled=True
+
+import requests as r
+
+def processneedle(gunk,start,needle):
+    """Find the text between needle[0] and needle[1] in gunk at or after start; return (text, resume_offset), or (None, None) if either marker is missing."""
+    try: start+=gunk[start:].index(needle[0])+len(needle[0]); end=gunk[start+1:].index(needle[1])+1 # end marker now covered by the same except
+    except ValueError: print('failed needle:',needle); return None,None
+    return gunk[start:start+end],start+end+len(needle[1])
+
+def stringiter(gunk:str,needle:tuple=('',''),needles:list=[]):
+    """Generator over gunk: with needles, yield one dict per pass keyed by each needle's third element (unnamed matches collect in a list under ''); otherwise yield each match of the single needle. Stops at the first needle that fails."""
+    start=0
+    if needles:
+        # Non-interactive mode. You have all the info upfront.
+        while True:
+            out={}
+            for needle in needles:
+                res,start=processneedle(gunk,start,needle)
+                if res is None: return
+                try: out[needle[2]]=res
+                except IndexError:
+                    try: out[''].append(res)
+                    except KeyError: out['']=[res]
+            yield out
+    # Interactive mode. I guess don't use this as much.
+    if needle==('',''): needle=yield None # No needle given: wait for the caller to send() one
+    while True:
+        res,start=processneedle(gunk,start,needle)
+        if res is None: return
+        yield res
+
+def bandget(band): # Scrape the band's front page into a list of dicts with urlid, coverurl and title
+    peg=r.get(f'https://{band}.bandcamp.com/').content.decode('utf-8') # Just assume success,
+    # I'll put error handling in later
+    # I'll also demand a real API later
+    albums=[]
+    needles=[('href="/album/','"','urlid'),
+('img src="','"','coverurl'),
+('class="title">\n ','\n','title')]
+    for needle in stringiter(peg,needles=needles):
+        albums.append(needle)
+    return albums
+
+def albumget(band,album,mode=0):
+    """Return the album page's tracks as a list of dicts with num, urlid, title and duration. mode is accepted but not read by this code yet."""
+    peg=r.get(f'https://{band}.bandcamp.com/album/{album}').content.decode('utf-8')
+    tracks=[]
+    needles=[('rel="tracknum=','"','num'),
+('a href="/track/','">','urlid'),
+('span class="track-title">','</span>','title'), # NOTE(review): end marker was empty in this copy (tag stripped); '</span>' restored - confirm
+('\n \n ','\n','duration')]
+    # Despite the lyrics being in the pages, it's not actually safe to get them with this system because any track without lyrics will quietly delete all tracks after it, up to and including the next one with lyrics.
+    for needle in stringiter(peg,needles=needles):
+        tracks.append(needle)
+    return tracks
+
+def index(band,album=''): # Track list of one album, or {album urlid: track list} for every album bandget() finds
+    if album:
+        return albumget(band,album,0)
+    albums=bandget(band)
+    return {x:albumget(band,x,0) for x in map(lambda x:x['urlid'],albums)}
+
+def lyrics(song,band='',album=''): # Fetch the track page and return its lyrics as plain text ('' when no lyrics block is found)
+    if not (band):
+        return "Bandcamp does not currently support song search (due to being a webscraper). You must specify the band that performed the song."
+    band=band.replace(' ',''); song=song.replace(' ','-')
+    peg=r.get(f'https://{band}.bandcamp.com/track/{song}').content.decode('utf-8')
+    needle=('<div class="tralbumData lyricsText">','</div>','lyrics') # NOTE(review): both markers were stripped from this copy; restored from Bandcamp's track-page markup - confirm
+    lyrics=next(stringiter(peg,needle=needle),'') # Default '' so a page without lyrics no longer raises StopIteration
+    lyrics=lyrics.replace('\r','').replace('\n','').replace('<br>','\n') # NOTE(review): '<br>' was stripped from this copy; restored - confirm
+    return lyrics
diff --git a/sources/com_fandom_lyrics.py b/sources/com_fandom_lyrics.py
new file mode 100644
index 0000000..dd9c8df
--- /dev/null
+++ b/sources/com_fandom_lyrics.py
@@ -0,0 +1,25 @@
+enabled=False
+"""
+Lyrics fandom has been deleted with no notice and no reason given, even when asked directly.
+It will never be reinstated, this file will never work again. It can be safely deleted.
+It's still included as a reminder of one of the nice things we just can't have thanks to, probably, copyright.
+"""
+import requests as r # r is used below but was never imported in this file
+def lyrics(song,band='',album=''):
+    """Fetch the wikitext of page '<band>:<song>' from the lyrics.fandom.com API, follow #redirect pages, and return the text between the <lyrics> tags."""
+    p={'format':'json', 'action':'parse', 'prop':'wikitext', 'page':f'{band}:{song}'}
+    lyrics=r.get('https://lyrics.fandom.com/api.php',params=p)
+    try:
+        lyrics=lyrics.json()['parse']['wikitext']['*']
+    except KeyError: return "Couldn't find it boi"
+    #Follow redirects
+    while '#redirect' in lyrics.lower():
+        p['page']=lyrics[lyrics.index('[[')+2:lyrics.index(']]')]
+        lyrics=r.get('https://lyrics.fandom.com/api.php',params=p).json()['parse']['wikitext']['*']
+    out=lyrics[lyrics.index('<lyrics>')+8:lyrics.index('</lyrics>')] # +8 == len('<lyrics>')
+    return out
+
+def url(band,song): # Returns (wiki page URL, API URL) for the same page
+    p={'format':'json', 'action':'parse', 'prop':'wikitext', 'page':f'{band}:{song}'}
+    p="&".join([f'{k}={v}' for k,v in p.items()])
+    return f'https://lyrics.fandom.com/wiki/{band}:{song}',f'https://lyrics.fandom.com/api.php?{p}'