From eae1b8e1cf33fe58ce29d32d7e3cfff98f4be621 Mon Sep 17 00:00:00 2001 From: Max Nuding Date: Tue, 11 Jan 2022 15:17:33 +0100 Subject: [PATCH] parsing video divs --- nebula_rss/nebula_loader.py | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/nebula_rss/nebula_loader.py b/nebula_rss/nebula_loader.py index 273170d..2abc5ac 100755 --- a/nebula_rss/nebula_loader.py +++ b/nebula_rss/nebula_loader.py @@ -1,7 +1,8 @@ +import datetime import os import re import time -from typing import Optional +from typing import List, Optional from selenium import webdriver from selenium.webdriver.common.by import By @@ -30,9 +31,25 @@ class NebulaLoader: options.headless = True self.driver = webdriver.Firefox(service=service, options=options) self.driver.implicitly_wait(10) # seconds - #NebulaVideo('', '', '', None) - def load(self): + @staticmethod + def _parse_anchor(anchor) -> NebulaVideo: + info_div = anchor.next_sibling + details_anchor = info_div.find_all('a')[1] + divs = details_anchor.find_all('div') + title_div = divs[0] + details_div = divs[1] + creator = details_div.find('span').string + release_text = details_div.find('time').get('datetime') + release_date = datetime.datetime.fromisoformat(release_text.replace('Z', '+00:00')) + return NebulaVideo( + title=title_div.string, + creator=creator, + url='https://nebula.app' + anchor.get('href'), + release_at=release_date + ) + + def load(self) -> List[NebulaVideo]: self.driver.get('https://nebula.app/login') username_input = '//*[@name="email"]' @@ -55,7 +72,7 @@ class NebulaLoader: follower_error_re = re.compile("You aren't following any creators yet.*") while not video_links and count_remaining > 0: time.sleep(2) - soup = BeautifulSoup(self.driver.page_source) + soup = BeautifulSoup(self.driver.page_source, features="lxml") follower_error = False follower_error = [p for p in soup.find_all('p') if p.find(string=follower_error_re)] if follower_error: @@ -63,7 +80,7 @@ class NebulaLoader: self.driver.refresh() count_remaining = 5 else: - video_links = [a for a in soup.find_all('a') if a.get('href').startswith('/videos/')] + all_anchors = soup.find_all('a') + video_links = [a for a in all_anchors if a.get('href').startswith('/videos/') and a.get('aria-hidden')] count_remaining -= 1 - #v = nebula_video.NebulaVideo('', '', '', None) - print(video_links) + return [NebulaLoader._parse_anchor(v) for v in video_links]