parsing video divs

2022-01-11 15:17:33 +01:00 · 2022-01-11 15:17:33 +01:00 · eae1b8e1cf
commit eae1b8e1cf
parent d2bf250fbf
1 changed file with 24 additions and 7 deletions
--- a/nebula_rss/nebula_loader.py
+++ b/nebula_rss/nebula_loader.py
@ -1,7 +1,8 @@
+import datetime
 import os
 import re
 import time
-from typing import Optional
+from typing import List, Optional

 from selenium import webdriver
 from selenium.webdriver.common.by import By
@ -30,9 +31,25 @@ class NebulaLoader:
        options.headless = True
        self.driver = webdriver.Firefox(service=service, options=options)
        self.driver.implicitly_wait(10)  # seconds
-        #NebulaVideo('', '', '', None)

-    def load(self):
+    @staticmethod
+    def _parse_anchor(anchor) -> NebulaVideo:
+        info_div = anchor.next_sibling
+        details_anchor = info_div.find_all('a')[1]
+        divs = details_anchor.find_all('div')
+        title_div = divs[0]
+        details_div = divs[1]
+        creator = details_div.find('span').string
+        release_text = details_div.find('time').get('datetime')
+        release_date = datetime.datetime.fromisoformat(release_text.replace('Z', '+00:00'))
+        return NebulaVideo(
+            title=title_div.string,
+            creator=creator,
+            url='https://nebula.app' + anchor.get('href'),
+            release_at=release_date
+        )
+
+    def load(self) -> List[NebulaVideo]:
        self.driver.get('https://nebula.app/login')

        username_input = '//*[@name="email"]'
@ -55,7 +72,7 @@ class NebulaLoader:
        follower_error_re = re.compile("You aren't following any creators yet.*")
        while not video_links and count_remaining > 0:
            time.sleep(2)
-            soup = BeautifulSoup(self.driver.page_source)
+            soup = BeautifulSoup(self.driver.page_source, features="lxml")
            follower_error = False
            follower_error = [p for p in soup.find_all('p') if p.find(string=follower_error_re)]
            if follower_error:
@ -63,7 +80,7 @@ class NebulaLoader:
                self.driver.refresh()
                count_remaining = 5
            else:
-                video_links = [a for a in soup.find_all('a') if a.get('href').startswith('/videos/')]
+                all_anchors = soup.find_all('a')
+                video_links = [a for a in all_anchors if a.get('href').startswith('/videos/') and a.get('aria-hidden')]
            count_remaining -= 1
-        #v = nebula_video.NebulaVideo('', '', '', None)
-        print(video_links)
+        return [NebulaLoader._parse_anchor(v) for v in video_links]