Fix #24, refactor URL detection

2023-04-24 19:38:13 +02:00 · 2023-04-24 19:38:13 +02:00 · 68aade4f1f
commit 68aade4f1f
parent 9bbcc843c2
6 changed files with 696 additions and 428 deletions
--- a/.env.EXAMPLE
+++ b/.env.EXAMPLE
@ -1,7 +1,5 @@
 HASHTAG_FILTER = ichlausche,music,musik,nowplaying,tunetuesday,nowlistening
 URL_FILTER = song.link,album.link,spotify.com,music.apple.com,bandcamp.com,songwhip.com
 YOUTUBE_API_KEY = CHANGE_ME
 YOUTUBE_DISABLE = false
 ODESLI_API_KEY = CHANGE_ME
 MASTODON_INSTANCE = 'metalhead.club'
 BASE_URL = 'https://moshingmammut.phlaym.net'
--- a/README.md
+++ b/README.md
@ -11,8 +11,8 @@ Having a quick overview over what is being posted can be a great way to discover
 This is fairly simple from a technical point of view! metalhead.club's local timeline is being watched using the
 Mastodon Streaming API over a Websocket. Every time a new post arrives, it is checked if it contains any music by
-checking included hashtags and URLs. A list of tags and URLs can be found in [the configuration](.env.EXAMPLE).
+checking included hashtags and URLs. A list of tags can be found in [the configuration](.env.EXAMPLE).
-Additionally, lins to YouTube are queried, if they are music or other videos using the YouTube API.
+Additionally, links are checked for music by asking https://song.link whether it finds any information on them.
 If a post passes this check it is saved to a SQLite database.
@ -93,11 +93,12 @@ and set your `User`, `Group`, `ExecStart` and `WorkingDirectory` accordingly.
 #### On your development machine
-Copy `.env.EXAMPLE` to `.env` and add your `YOUTUBE_API_KEY`.
+Copy `.env.EXAMPLE` to `.env` and add your `YOUTUBE_API_KEY` and `ODESLI_API_KEY`.
 To obtain a YouTube API key, follow [YouTube's guide](https://developers.google.com/youtube/registering_an_application) to create an
 _API key_.
-If `YOUTUBE_API_KEY` is unset, all YouTube videos will be assumed to contain music links.
+If `YOUTUBE_API_KEY` is unset, no playlist will be updated.
-If this is unwanted, set `YOUTUBE_DISABLE` to `true`).
+
 If `ODESLI_API_KEY` is unset, your rate limit to the song.link API will be lower.
 Run `npm run build` and copy the output folder, usually `build` to `$APP_DIR` on your server.
--- a/package-lock.json
+++ b/package-lock.json
--- a/package.json
+++ b/package.json
@ -1,6 +1,6 @@
 {
  "name": "moshing-mammut",
-  "version": "1.1.0",
+  "version": "1.3.0",
  "private": true,
  "license": "LGPL-3.0-or-later",
  "scripts": {
--- a/src/lib/server/timeline.ts
+++ b/src/lib/server/timeline.ts
@ -1,89 +1,17 @@
-import {
+import { HASHTAG_FILTER, MASTODON_INSTANCE, ODESLI_API_KEY } from '$env/static/private';
  HASHTAG_FILTER,
  MASTODON_INSTANCE,
  ODESLI_API_KEY,
  URL_FILTER,
  YOUTUBE_API_KEY,
  YOUTUBE_DISABLE
 } from '$env/static/private';
 import { log } from '$lib/log';
 import type { Post, Tag, TimelineEvent } from '$lib/mastodon/response';
 import type { OdesliResponse, Platform, SongInfo } from '$lib/odesliResponse';
 import { getPosts, savePost } from '$lib/server/db';
 import { createFeed, saveAtomFeed } from '$lib/server/rss';
 import { sleep } from '$lib/sleep';
 import { isTruthy } from '$lib/truthyString';
 import { WebSocket } from 'ws';
 /**
  * Matches http(s) links to YouTube videos, either `youtube.com/...v=<id>` or
  * `youtu.be/<id>`, and captures the video id in the named group `videoId`.
  * The dot in `youtube.com` is escaped so that only a literal `.` is accepted.
  * The `g` flag is required because the pattern is used with `String.prototype.matchAll`.
  */
 const YOUTUBE_REGEX =
  /https?:\/\/(www\.)?youtu((be\.com\/.*?v=)|(\.be\/))(?<videoId>[a-zA-Z_0-9-]+)/gm;
 /**
  * Captures, in the named group `postUrl`, the value of an `href` attribute that is
  * directly followed by `target="_blank"`.
  */
 const URL_REGEX = /href="(?<postUrl>[^>]+?)" target="_blank"/gm;
 export class TimelineReader {
  private static _instance: TimelineReader;
  /**
   * Decides whether a YouTube video is music by querying the YouTube Data API.
   *
   * A video counts as music when its snippet tags include `music`, or when the
   * title of its video category is `Music`.
   *
   * @param videoId The id of the YouTube video to check
   * @returns `true` if the video is considered music or if no API key is configured,
   * `false` otherwise (including when the video or its category cannot be found)
   */
  private static async isMusicVideo(videoId: string): Promise<boolean> {
    if (!YOUTUBE_API_KEY || YOUTUBE_API_KEY === 'CHANGE_ME') {
      // Assume that it *is* a music link when no YT API key is provided.
      // To have YouTube links ignored instead, YOUTUBE_DISABLE needs to be set to something truthy
      return true;
    }
    const searchParams = new URLSearchParams([
      ['part', 'snippet'],
      ['id', videoId],
      ['key', YOUTUBE_API_KEY]
    ]);
    const youtubeVideoUrl = new URL(`https://www.googleapis.com/youtube/v3/videos?${searchParams}`);
    const resp = await fetch(youtubeVideoUrl);
    const respObj = await resp.json();
    // Error payloads carry no `items`; treat them like an unknown video instead of throwing
    if (!respObj?.items?.length) {
      log.warn('Could not find video with id', videoId);
      return false;
    }
    // With `part=snippet` the tags and the category id are nested below `snippet`
    const snippet = respObj.items[0]?.snippet;
    if (snippet?.tags?.includes('music')) {
      return true;
    }
    if (!snippet?.categoryId) {
      // Without a category id there is nothing left to look up
      return false;
    }
    const categorySearchParams = new URLSearchParams([
      ['part', 'snippet'],
      ['id', snippet.categoryId],
      ['key', YOUTUBE_API_KEY]
    ]);
    const youtubeCategoryUrl = new URL(
      `https://www.googleapis.com/youtube/v3/videoCategories?${categorySearchParams}`
    );
    const categoryResp = await fetch(youtubeCategoryUrl);
    const categoryObj = await categoryResp.json();
    // The category title is nested below `snippet` as well
    const categoryTitle: string | undefined = categoryObj?.items?.[0]?.snippet?.title;
    return categoryTitle === 'Music';
  }
  /**
   * Searches the post content for YouTube links and returns the first one
   * which `isMusicVideo` classifies as music.
   *
   * @param postContent The content of the post to search
   * @returns The first matching YouTube link text, or `null` if YouTube checks
   * are disabled or none of the links is a music video
   */
  private static async checkYoutubeMatches(postContent: string): Promise<string | null> {
    if (!isTruthy(YOUTUBE_DISABLE)) {
      for (const candidate of postContent.matchAll(YOUTUBE_REGEX)) {
        const groups = candidate?.groups;
        if (groups === undefined) {
          continue;
        }
        const id = groups.videoId.toString();
        try {
          // Links are checked one after another so the first music video wins
          if (await TimelineReader.isMusicVideo(id)) {
            return candidate[0];
          }
        } catch (err) {
          log.error('Could not check if', id, 'is a music video', err);
        }
      }
    }
    return null;
  }
  private static async getSongInfo(url: URL, remainingTries = 6): Promise<SongInfo | null> {
    if (remainingTries === 0) {
      log.error('No tries remaining. Lookup failed!');
@ -109,16 +37,18 @@ export class TimelineReader {
        if (response.status === 429) {
          throw new Error('Rate limit reached', { cause: 429 });
        }
-        return response.json().then((odesliInfo: OdesliResponse) => {
+        const odesliInfo: OdesliResponse = await response.json();
-          const info = odesliInfo.entitiesByUniqueId[odesliInfo.entityUniqueId];
+        if (!odesliInfo || !odesliInfo.entitiesByUniqueId || !odesliInfo.entityUniqueId) {
-          const platform: Platform = 'youtube';
+          return null;
-          return {
+        }
-            ...info,
+        const info = odesliInfo.entitiesByUniqueId[odesliInfo.entityUniqueId];
-            pageUrl: odesliInfo.pageUrl,
+        const platform: Platform = 'youtube';
-            youtubeUrl: odesliInfo.linksByPlatform[platform]?.url,
+        return {
-            postedUrl: url.toString()
+          ...info,
-          } as SongInfo;
+          pageUrl: odesliInfo.pageUrl,
-        });
+          youtubeUrl: odesliInfo.linksByPlatform[platform]?.url,
          postedUrl: url.toString()
        } as SongInfo;
      });
    } catch (e) {
      if (e instanceof Error && e.cause === 429) {
@ -131,24 +61,6 @@ export class TimelineReader {
    }
  }
  /**
   * Placeholder for reading a URL from the post's preview card.
   *
   * The lookup is currently disabled, so this always resolves to `undefined`
   * and the `post` argument is not used. The previous implementation is kept
   * in the comment inside the body.
   *
   * @param post The post whose preview card would be inspected
   * @returns Always `undefined` at the moment
   */
  private static async getUrlFromPreviewCard(post: Post): Promise<string | undefined> {
    return undefined;
    // Currently disabled, because it seems to always be null, even after re-fetching the post from Mastodon
    /*
    if (post.card) {
      return post.card?.url;
    }
    try {
      const status: Post = await (
        await fetch(`https://${MASTODON_INSTANCE}/api/v1/statuses/${post.id}`)
      ).json();
      return status.card?.url;
    } catch (e) {
      log.error(`Could not fetch status ${post.url}`, e);
    }
    */
  }
  private startWebsocket() {
    const socket = new WebSocket(`wss://${MASTODON_INSTANCE}/api/v1/streaming`);
    socket.onopen = () => {
@ -165,74 +77,27 @@ export class TimelineReader {
        const hashttags: string[] = HASHTAG_FILTER.split(',');
        const found_tags: Tag[] = post.tags.filter((t: Tag) => hashttags.includes(t.name));
-        const urls: string[] = URL_FILTER.split(',');
+        const urlMatches = post.content.matchAll(URL_REGEX);
        const found_urls = urls.filter((t) => post.content.includes(t));
        // If we don't have any tags or non-youtube urls, check youtube
        // YT is handled separately, because it requires an API call and therefore is slower
        if (found_urls.length === 0 && found_tags.length === 0) {
          const youtubeUrl = await TimelineReader.checkYoutubeMatches(post.content);
          if (youtubeUrl === null) {
            log.log('Ignoring post', post.url);
            return;
          }
          log.debug('Found YT URL', youtubeUrl, found_urls, found_urls.length);
        } else {
          log.debug('Found URLs and/or tags:', found_urls, found_tags);
        }
        // TODO: Change URL detection above to use this regex.
        // Looks like we're stuck with regex for now instead of using preview cards.
        // Might as well use it to find URLs. Could also use this for YouTube: If Odesli finds something, it's a song,
        // if not, ignore it. No need to consult the YT API and give those links a special handling
        const musicUrls: URL[] = [];
        const musicUrl = await TimelineReader.getUrlFromPreviewCard(post);
        if (musicUrl) {
          try {
            musicUrls.push(new URL(musicUrl));
          } catch (e) {
            log.error(
              'URL received from preview card does not seem to be a valid URL',
              musicUrl,
              e
            );
          }
        } else {
          const urlMatches = post.content.matchAll(URL_REGEX);
          for (const match of urlMatches) {
            if (match === undefined || match.groups === undefined) {
              console.warn(
                'Match listed in allMatches, but either it or its groups are undefined',
                match
              );
              continue;
            }
            const urlMatch = match.groups.postUrl.toString();
            let url: URL;
            try {
              url = new URL(urlMatch);
            } catch (e) {
              log.error('URL found via Regex does not seem to be a valud url', urlMatch, e);
              continue;
            }
            // Check *all* found url and let odesli determine if it is music or not
            musicUrls.push(url);
          }
        }
        const songs: SongInfo[] = [];
-        log.debug(`Checking ${musicUrls.length} URLs if they contain song data`);
+        for (const match of urlMatches) {
-        for (const url of musicUrls) {
+          if (match === undefined || match.groups === undefined) {
-          let hostname: string | null = null;
+            log.warn(
-          try {
+              'Match listed in allMatches, but either it or its groups are undefined',
-            hostname = new URL(url).hostname;
+              match
-          } catch (e) {
+            );
            log.error(`Could not check hostname for URL ${url}`, e);
          }
          if (hostname === 'songwhip.com') {
            // TODO: Implement checking the songwhip API
            continue;
          }
          const urlMatch = match.groups.postUrl.toString();
          let url: URL;
          try {
            url = new URL(urlMatch);
          } catch (e) {
            log.error('URL found via Regex does not seem to be a valud url', urlMatch, e);
            continue;
          }
          // Check *all* found url and let odesli determine if it is music or not
          log.debug(`Checking ${url} if it contains song data`);
          const info = await TimelineReader.getSongInfo(url);
          log.debug(`Found song info for ${url}?`, info);
          if (info) {
@ -240,6 +105,13 @@ export class TimelineReader {
          }
        }
        // Ignore posts for which no song info was found for any of their links
        // and which also carry none of the filtered hashtags
        if (songs.length === 0 && found_tags.length === 0) {
          log.log('Ignoring post', post.url);
          return;
        }
        await savePost(post, songs);
        log.debug('Saved post', post.url);
--- a/src/routes/+page.svelte
+++ b/src/routes/+page.svelte
@ -187,7 +187,7 @@
  }
  .post {
    width: 100%;
-    max-width: 600px;
+    max-width: min(800px, 80vw);
    margin-bottom: 1em;
    border-bottom: 1px solid var(--color-border);
    padding: 1em;