Extract song info from odesli (song.link)

2023-04-22 08:50:17 +02:00
parent 45eeb550b3
commit b62936ed54
6 changed files with 309 additions and 13 deletions
--- a/src/lib/server/timeline.ts
+++ b/src/lib/server/timeline.ts
@ -1,13 +1,16 @@
 import {
  HASHTAG_FILTER,
  MASTODON_INSTANCE,
+  ODESLI_API_KEY,
  URL_FILTER,
  YOUTUBE_API_KEY,
  YOUTUBE_DISABLE
 } from '$env/static/private';
 import type { Post, Tag, TimelineEvent } from '$lib/mastodon/response';
+import type { OdesliResponse, Platform, SongInfo } from '$lib/odesliResponse';
 import { getPosts, savePost } from '$lib/server/db';
 import { createFeed, saveAtomFeed } from '$lib/server/rss';
+import { sleep } from '$lib/sleep';
 import { isTruthy } from '$lib/truthyString';
 import { WebSocket } from 'ws';

@ -15,11 +18,13 @@ const YOUTUBE_REGEX = new RegExp(
  /https?:\/\/(www\.)?youtu((be.com\/.*?v=)|(\.be\/))(?<videoId>[a-zA-Z_0-9-]+)/gm
 );

+// Captures, as the named group `postUrl`, the value of every `href="…"` attribute that is directly
+// followed by ` target="_blank"` in a post's HTML content.
+const URL_REGEX = /href="(?<postUrl>[^>]+?)" target="_blank"/gm;
+
 export class TimelineReader {
  private static _instance: TimelineReader;

  private static async isMusicVideo(videoId: string) {
-    if (YOUTUBE_API_KEY === undefined) {
+    if (!YOUTUBE_API_KEY || YOUTUBE_API_KEY === 'CHANGE_ME') {
      // Assume that it *is* a music link when no YT API key is provided
      // If it should instead be assumed not to be one, YOUTUBE_DISABLE needs to be set to something truthy
      return true;
@ -56,9 +61,9 @@ export class TimelineReader {
    return categoryTitle === 'Music';
  }

-  private static async checkYoutubeMatches(postContent: string): Promise<boolean> {
+  private static async checkYoutubeMatches(postContent: string): Promise<string | null> {
    if (isTruthy(YOUTUBE_DISABLE)) {
-      return false;
+      return null;
    }
    const matches = postContent.matchAll(YOUTUBE_REGEX);
    for (const match of matches) {
@ -69,18 +74,93 @@ export class TimelineReader {
      try {
        const isMusic = await TimelineReader.isMusicVideo(videoId);
        if (isMusic) {
-          return true;
+          return match[0];
        }
      } catch (e) {
        console.error('Could not check if', videoId, 'is a music video', e);
      }
    }
-    return false;
+    return null;
+  }
+
+  /**
+   * Looks up metadata for a song link through the Odesli (song.link) API.
+   *
+   * When the API answers with HTTP 429 the lookup is repeated after a 10 second pause, until
+   * `remainingTries` is used up.
+   *
+   * @param url Link to look up.
+   * @param remainingTries Number of attempts left, including this one.
+   * @returns The entity Odesli matched for `url`, extended with `pageUrl` and (when present)
+   *   `youtubeUrl`; `null` if `url` cannot be parsed, points to songwhip.com, the request fails,
+   *   or no tries remain.
+   */
+  private static async getSongInfo(
+    url: string,
+    remainingTries: number = 6
+  ): Promise<SongInfo | null> {
+    // `<=` instead of `===` so a negative argument cannot cause endless retries
+    if (remainingTries <= 0) {
+      console.error('No tries remaining. Lookup failed!');
+      return null;
+    }
+    let hostname: string;
+    try {
+      hostname = new URL(url).hostname;
+    } catch (e) {
+      console.error(`Could not construct URL ${url}`, e);
+      return null;
+    }
+    if (hostname === 'songwhip.com') {
+      // song.link doesn't support songwhip links and songwhip themselves will provide metadata if you pass in a
+      // Apple Music/Spotify/etc link, but won't when provided with their own link, so no way to extract song info
+      // except maybe scraping their HTML
+      return null;
+    }
+
+    const odesliParams = new URLSearchParams();
+    odesliParams.append('url', url);
+    odesliParams.append('userCountry', 'DE');
+    odesliParams.append('songIfSingle', 'true');
+    if (ODESLI_API_KEY && ODESLI_API_KEY !== 'CHANGE_ME') {
+      odesliParams.append('key', ODESLI_API_KEY);
+    }
+    const odesliApiUrl = `https://api.song.link/v1-alpha.1/links?${odesliParams}`;
+    try {
+      // Await inside the try block: returning the un-awaited promise would hand rejections straight to
+      // the caller, so neither the retry nor the error logging below would ever run.
+      const response = await fetch(odesliApiUrl);
+      if (response.status === 429) {
+        console.warn('song.link rate limit reached. Trying again in 10 seconds');
+        await sleep(10_000);
+        return await TimelineReader.getSongInfo(url, remainingTries - 1);
+      }
+      if (!response.ok) {
+        // Error responses do not carry the entity maps read below, so stop before parsing them
+        console.error(`song.link responded with HTTP ${response.status} for ${url}`);
+        return null;
+      }
+      const odesliInfo = (await response.json()) as OdesliResponse;
+      const info = odesliInfo.entitiesByUniqueId[odesliInfo.entityUniqueId];
+      const platform: Platform = 'youtube';
+      return {
+        ...info,
+        pageUrl: odesliInfo.pageUrl,
+        youtubeUrl: odesliInfo.linksByPlatform[platform]?.url
+      } as SongInfo;
+    } catch (e) {
+      console.error(`Failed to load ${url} info from song.link`, e);
+      return null;
+    }
+  }
+
+  /**
+   * Placeholder for reading a post's link from its preview card.
+   *
+   * @param post The post whose preview card would be inspected (currently unused).
+   * @returns Always `undefined` for now; the intended implementation is kept below, commented out.
+   */
+  private static async getUrlFromPreviewCard(post: Post): Promise<string | undefined> {
+    // Currently disabled, because it seems to always be null, even after re-fetching the post from Mastodon
+    /*
+    if (post.card) {
+      return post.card?.url;
+    }
+    try {
+      const status: Post = await (
+        await fetch(`https://${MASTODON_INSTANCE}/api/v1/statuses/${post.id}`)
+      ).json();
+      return status.card?.url;
+    } catch (e) {
+      console.error(`Could not fetch status ${post.url}`, e);
+    }
+    */
+    return undefined;
  }

  private startWebsocket() {
    const socket = new WebSocket(`wss://${MASTODON_INSTANCE}/api/v1/streaming`);
    socket.onopen = () => {
+      console.log('Connected to WS');
      socket.send('{ "type": "subscribe", "stream": "public:local"}');
    };
    socket.onmessage = async (event) => {
@ -95,17 +175,69 @@ export class TimelineReader {

        const urls: string[] = URL_FILTER.split(',');
        const found_urls = urls.filter((t) => post.content.includes(t));
-
+        const urlsToCheck: string[] = [];
        // If we don't have any tags or non-youtube urls, check youtube
        // YT is handled separately, because it requires an API call and therefore is slower
-        if (
-          found_urls.length === 0 &&
-          found_tags.length === 0 &&
-          !(await TimelineReader.checkYoutubeMatches(post.content))
-        ) {
-          return;
+        if (found_urls.length === 0 && found_tags.length === 0) {
+          const youtubeUrl = await TimelineReader.checkYoutubeMatches(post.content);
+          if (youtubeUrl === null) {
+            console.log('Ignoring post', post.url);
+            return;
+          }
+          urlsToCheck.push(youtubeUrl);
+          console.log('Found YT URL', youtubeUrl, found_urls, found_urls.length);
        }
+
+        // TODO: Change URL detection above to use this regex.
+        // Looks like we're stuck with regex for now instead of using preview cards.
+        // Might as well use it to find URLs. Could also use this for YouTube: If Odesli finds something, it's a song,
+        // if not, ignore it. No need to consult the YT API and give those links a special handling
+        const musicUrls: string[] = [];
+        const musicUrl = await TimelineReader.getUrlFromPreviewCard(post);
+        if (musicUrl) {
+          musicUrls.push(musicUrl);
+        } else {
+          const urlMatches = post.content.matchAll(URL_REGEX);
+          for (const match of urlMatches) {
+            if (match === undefined || match.groups === undefined) {
+              continue;
+            }
+            const urlMatch = match.groups.postUrl.toString();
+            const musicUrl = urls.find((u) => urlMatch.includes(u));
+            if (musicUrl) {
+              musicUrls.push(urlMatch);
+            }
+          }
+        }
+
+        for (const url of musicUrls) {
+          let hostname: string | null = null;
+          try {
+            hostname = new URL(url).hostname;
+          } catch (e) {
+            console.error(`Could not check hostname for URL ${url}`, e);
+          }
+          if (hostname === 'songwhip.com') {
+            // TODO: Implement checking the songwhip API
+            continue;
+          }
+          const info = await TimelineReader.getSongInfo(url);
+          if (info) {
+            console.info(
+              'Got song info for',
+              post.url,
+              url,
+              info.artistName,
+              info.title,
+              info.thumbnailUrl,
+              info.pageUrl,
+              info.youtubeUrl
+            );
+          }
+        }
+
        await savePost(post);
+
        const posts = await getPosts(null, null, 100);
        await saveAtomFeed(createFeed(posts));
      } catch (e) {