Fix #24, refactor URL detection

2023-04-24 19:38:13 +02:00 · 2023-04-24 19:38:13 +02:00 · 68aade4f1f
commit 68aade4f1f
parent 9bbcc843c2
6 changed files with 696 additions and 428 deletions
--- a/.env.EXAMPLE
+++ b/.env.EXAMPLE
@ -1,7 +1,5 @@
 HASHTAG_FILTER = ichlausche,music,musik,nowplaying,tunetuesday,nowlistening
-URL_FILTER = song.link,album.link,spotify.com,music.apple.com,bandcamp.com,songwhip.com
 YOUTUBE_API_KEY = CHANGE_ME
-YOUTUBE_DISABLE = false
 ODESLI_API_KEY = CHANGE_ME
 MASTODON_INSTANCE = 'metalhead.club'
 BASE_URL = 'https://moshingmammut.phlaym.net'
--- a/README.md
+++ b/README.md
@ -11,8 +11,8 @@ Having a quick overview over what is being posted can be a great way to discover

 This is fairly simple from a technical point of view! metalhead.club's local timeline is being watched using the
 Mastodon Streaming API over a Websocket. Every time a new post arrives, it is checked if it contains any music by
-checking included hashtags and URLs. A list of tags and URLs can be found in [the configuration](.env.EXAMPLE).
-Additionally, lins to YouTube are queried, if they are music or other videos using the YouTube API.
+checking included hashtags and URLs. A list of tags can be found in [the configuration](.env.EXAMPLE).
+Additionally, every link in a post is checked for music by asking https://song.link whether it has information about it.

 If a post passes this check it is saved to a SQLite database.

@ -93,11 +93,12 @@ and set your `User`, `Group`, `ExecStart` and `WorkingDirectory` accordingly.

 #### On your development machine

-Copy `.env.EXAMPLE` to `.env` and add your `YOUTUBE_API_KEY`.
+Copy `.env.EXAMPLE` to `.env` and add your `YOUTUBE_API_KEY` and `ODESLI_API_KEY`.
 To obtain a YouTube API key, follow [YouTube's guide](https://developers.google.com/youtube/registering_an_application) to create an
 _API key_.
-If `YOUTUBE_API_KEY` is unset, all YouTube videos will be assumed to contain music links.
-If this is unwanted, set `YOUTUBE_DISABLE` to `true`).
+If `YOUTUBE_API_KEY` is unset, no playlist will be updated.
+
+If `ODESLI_API_KEY` is unset, your rate limit to the song.link API will be lower.

 Run `npm run build` and copy the output folder, usually `build`, to `$APP_DIR` on your server.

--- a/package-lock.json
+++ b/package-lock.json
--- a/package.json
+++ b/package.json
@ -1,6 +1,6 @@
 {
  "name": "moshing-mammut",
-  "version": "1.1.0",
+  "version": "1.3.0",
  "private": true,
  "license": "LGPL-3.0-or-later",
  "scripts": {
--- a/src/lib/server/timeline.ts
+++ b/src/lib/server/timeline.ts
@ -1,89 +1,17 @@
-import {
-  HASHTAG_FILTER,
-  MASTODON_INSTANCE,
-  ODESLI_API_KEY,
-  URL_FILTER,
-  YOUTUBE_API_KEY,
-  YOUTUBE_DISABLE
-} from '$env/static/private';
+import { HASHTAG_FILTER, MASTODON_INSTANCE, ODESLI_API_KEY } from '$env/static/private';
 import { log } from '$lib/log';
 import type { Post, Tag, TimelineEvent } from '$lib/mastodon/response';
 import type { OdesliResponse, Platform, SongInfo } from '$lib/odesliResponse';
 import { getPosts, savePost } from '$lib/server/db';
 import { createFeed, saveAtomFeed } from '$lib/server/rss';
 import { sleep } from '$lib/sleep';
-import { isTruthy } from '$lib/truthyString';
 import { WebSocket } from 'ws';

-const YOUTUBE_REGEX = new RegExp(
-  /https?:\/\/(www\.)?youtu((be.com\/.*?v=)|(\.be\/))(?<videoId>[a-zA-Z_0-9-]+)/gm
-);
-
 const URL_REGEX = new RegExp(/href="(?<postUrl>[^>]+?)" target="_blank"/gm);

 export class TimelineReader {
  private static _instance: TimelineReader;

-  private static async isMusicVideo(videoId: string) {
-    if (!YOUTUBE_API_KEY || YOUTUBE_API_KEY === 'CHANGE_ME') {
-      // Assume that it *is* a music link when no YT API key is provided
-      // If it should assumed to not be YOUTUBE_DISABLE needs to be set to something truthy
-      return true;
-    }
-    const searchParams = new URLSearchParams([
-      ['part', 'snippet'],
-      ['id', videoId],
-      ['key', YOUTUBE_API_KEY]
-    ]);
-    const youtubeVideoUrl = new URL(`https://www.googleapis.com/youtube/v3/videos?${searchParams}`);
-    const resp = await fetch(youtubeVideoUrl);
-    const respObj = await resp.json();
-    if (!respObj.items.length) {
-      log.warn('Could not find video with id', videoId);
-      return false;
-    }
-
-    const item = respObj.items[0];
-    if (item.tags?.includes('music')) {
-      return true;
-    }
-
-    const categorySearchParams = new URLSearchParams([
-      ['part', 'snippet'],
-      ['id', item.categoryId],
-      ['key', YOUTUBE_API_KEY]
-    ]);
-    const youtubeCategoryUrl = new URL(
-      `https://www.googleapis.com/youtube/v3/videoCategories?${categorySearchParams}`
-    );
-    const categoryTitle: string = await fetch(youtubeCategoryUrl)
-      .then((r) => r.json())
-      .then((r) => r.items[0]?.title);
-    return categoryTitle === 'Music';
-  }
-
-  private static async checkYoutubeMatches(postContent: string): Promise<string | null> {
-    if (isTruthy(YOUTUBE_DISABLE)) {
-      return null;
-    }
-    const matches = postContent.matchAll(YOUTUBE_REGEX);
-    for (const match of matches) {
-      if (match === undefined || match.groups === undefined) {
-        continue;
-      }
-      const videoId = match.groups.videoId.toString();
-      try {
-        const isMusic = await TimelineReader.isMusicVideo(videoId);
-        if (isMusic) {
-          return match[0];
-        }
-      } catch (e) {
-        log.error('Could not check if', videoId, 'is a music video', e);
-      }
-    }
-    return null;
-  }
-
  private static async getSongInfo(url: URL, remainingTries = 6): Promise<SongInfo | null> {
    if (remainingTries === 0) {
      log.error('No tries remaining. Lookup failed!');
@ -109,7 +37,10 @@ export class TimelineReader {
        if (response.status === 429) {
          throw new Error('Rate limit reached', { cause: 429 });
        }
-        return response.json().then((odesliInfo: OdesliResponse) => {
+        const odesliInfo: OdesliResponse = await response.json();
+        if (!odesliInfo || !odesliInfo.entitiesByUniqueId || !odesliInfo.entityUniqueId) {
+          return null;
+        }
        const info = odesliInfo.entitiesByUniqueId[odesliInfo.entityUniqueId];
        const platform: Platform = 'youtube';
        return {
@ -119,7 +50,6 @@ export class TimelineReader {
          postedUrl: url.toString()
        } as SongInfo;
      });
-      });
    } catch (e) {
      if (e instanceof Error && e.cause === 429) {
        log.warn('song.link rate limit reached. Trying again in 10 seconds');
@ -131,24 +61,6 @@ export class TimelineReader {
    }
  }

-  private static async getUrlFromPreviewCard(post: Post): Promise<string | undefined> {
-    return undefined;
-    // Currently disabled, because it seems to always be null, even after re-fetching the post from Mastodon
-    /*
-    if (post.card) {
-      return post.card?.url;
-    }
-    try {
-      const status: Post = await (
-        await fetch(`https://${MASTODON_INSTANCE}/api/v1/statuses/${post.id}`)
-      ).json();
-      return status.card?.url;
-    } catch (e) {
-      log.error(`Could not fetch status ${post.url}`, e);
-    }
-    */
-  }
-
  private startWebsocket() {
    const socket = new WebSocket(`wss://${MASTODON_INSTANCE}/api/v1/streaming`);
    socket.onopen = () => {
@ -165,42 +77,11 @@ export class TimelineReader {
        const hashttags: string[] = HASHTAG_FILTER.split(',');
        const found_tags: Tag[] = post.tags.filter((t: Tag) => hashttags.includes(t.name));

-        const urls: string[] = URL_FILTER.split(',');
-        const found_urls = urls.filter((t) => post.content.includes(t));
-        // If we don't have any tags or non-youtube urls, check youtube
-        // YT is handled separately, because it requires an API call and therefore is slower
-        if (found_urls.length === 0 && found_tags.length === 0) {
-          const youtubeUrl = await TimelineReader.checkYoutubeMatches(post.content);
-          if (youtubeUrl === null) {
-            log.log('Ignoring post', post.url);
-            return;
-          }
-          log.debug('Found YT URL', youtubeUrl, found_urls, found_urls.length);
-        } else {
-          log.debug('Found URLs and/or tags:', found_urls, found_tags);
-        }
-
-        // TODO: Change URL detection above to use this regex.
-        // Looks like we're stuck with regex for now instead of using preview cards.
-        // Might as well use it to find URLs. Could also use this for YouTube: If Odesli finds something, it's a song,
-        // if not, ignore it. No need to consult the YT API and give those links a special handling
-        const musicUrls: URL[] = [];
-        const musicUrl = await TimelineReader.getUrlFromPreviewCard(post);
-        if (musicUrl) {
-          try {
-            musicUrls.push(new URL(musicUrl));
-          } catch (e) {
-            log.error(
-              'URL received from preview card does not seem to be a valid URL',
-              musicUrl,
-              e
-            );
-          }
-        } else {
        const urlMatches = post.content.matchAll(URL_REGEX);
+        const songs: SongInfo[] = [];
        for (const match of urlMatches) {
          if (match === undefined || match.groups === undefined) {
-              console.warn(
+            log.warn(
              'Match listed in allMatches, but either it or its groups are undefined',
              match
            );
@ -216,23 +97,7 @@ export class TimelineReader {
          }

          // Check *all* found url and let odesli determine if it is music or not
-            musicUrls.push(url);
-          }
-        }
-
-        const songs: SongInfo[] = [];
-        log.debug(`Checking ${musicUrls.length} URLs if they contain song data`);
-        for (const url of musicUrls) {
-          let hostname: string | null = null;
-          try {
-            hostname = new URL(url).hostname;
-          } catch (e) {
-            log.error(`Could not check hostname for URL ${url}`, e);
-          }
-          if (hostname === 'songwhip.com') {
-            // TODO: Implement checking the songwhip API
-            continue;
-          }
+          log.debug(`Checking ${url} if it contains song data`);
          const info = await TimelineReader.getSongInfo(url);
          log.debug(`Found song info for ${url}?`, info);
          if (info) {
@ -240,6 +105,13 @@ export class TimelineReader {
          }
        }

+        // If the post has neither a matching hashtag nor any link for which
+        // song info was found, it is not considered music-related: skip it
+        if (songs.length === 0 && found_tags.length === 0) {
+          log.log('Ignoring post', post.url);
+          return;
+        }
+
        await savePost(post, songs);
        log.debug('Saved post', post.url);

--- a/src/routes/+page.svelte
+++ b/src/routes/+page.svelte
@ -187,7 +187,7 @@
  }
  .post {
    width: 100%;
-    max-width: 600px;
+    max-width: min(800px, 80vw);
    margin-bottom: 1em;
    border-bottom: 1px solid var(--color-border);
    padding: 1em;