parsing video divs

This commit is contained in:
Max Nuding 2022-01-11 15:17:33 +01:00
parent d2bf250fbf
commit eae1b8e1cf
Signed by: phlaym
GPG Key ID: A06651BAB6777237

View File

@ -1,7 +1,8 @@
import datetime
import os import os
import re import re
import time import time
from typing import Optional from typing import List, Optional
from selenium import webdriver from selenium import webdriver
from selenium.webdriver.common.by import By from selenium.webdriver.common.by import By
@ -30,9 +31,25 @@ class NebulaLoader:
options.headless = True options.headless = True
self.driver = webdriver.Firefox(service=service, options=options) self.driver = webdriver.Firefox(service=service, options=options)
self.driver.implicitly_wait(10) # seconds self.driver.implicitly_wait(10) # seconds
#NebulaVideo('', '', '', None)
def load(self): @staticmethod
def _parse_anchor(anchor) -> NebulaVideo:
    """Build a NebulaVideo from one video link element.

    The element following ``anchor`` is expected to hold at least two
    ``<a>`` tags.  The second of them must contain at least two ``<div>``
    tags: the first carries the title text, the second a ``<span>`` with
    the creator name and a ``<time>`` tag whose ``datetime`` attribute is
    the release timestamp.

    Args:
        anchor: Parsed ``<a>`` element pointing at a video; its ``href``
            is appended to ``https://nebula.app`` to form the video URL.
            (Presumably a BeautifulSoup tag -- confirm against the caller.)

    Returns:
        A NebulaVideo populated with title, creator, url and release_at.
    """
    sibling = anchor.next_sibling
    links = sibling.find_all('a')
    blocks = links[1].find_all('div')
    heading, meta = blocks[0], blocks[1]

    author = meta.find('span').string
    stamp = meta.find('time').get('datetime')
    # A trailing 'Z' is rewritten as an explicit UTC offset before parsing;
    # older Python versions' fromisoformat() does not accept the 'Z' suffix.
    published = datetime.datetime.fromisoformat(stamp.replace('Z', '+00:00'))

    headline = heading.string
    link = 'https://nebula.app' + anchor.get('href')
    return NebulaVideo(
        title=headline,
        creator=author,
        url=link,
        release_at=published,
    )
def load(self) -> List[NebulaVideo]:
self.driver.get('https://nebula.app/login') self.driver.get('https://nebula.app/login')
username_input = '//*[@name="email"]' username_input = '//*[@name="email"]'
@ -55,7 +72,7 @@ class NebulaLoader:
follower_error_re = re.compile("You aren't following any creators yet.*") follower_error_re = re.compile("You aren't following any creators yet.*")
while not video_links and count_remaining > 0: while not video_links and count_remaining > 0:
time.sleep(2) time.sleep(2)
soup = BeautifulSoup(self.driver.page_source) soup = BeautifulSoup(self.driver.page_source, features="lxml")
follower_error = False follower_error = False
follower_error = [p for p in soup.find_all('p') if p.find(string=follower_error_re)] follower_error = [p for p in soup.find_all('p') if p.find(string=follower_error_re)]
if follower_error: if follower_error:
@ -63,7 +80,7 @@ class NebulaLoader:
self.driver.refresh() self.driver.refresh()
count_remaining = 5 count_remaining = 5
else: else:
video_links = [a for a in soup.find_all('a') if a.get('href').startswith('/videos/')] all_anchors = soup.find_all('a')
video_links = [a for a in all_anchors if a.get('href').startswith('/videos/') and a.get('aria-hidden')]
count_remaining -= 1 count_remaining -= 1
#v = nebula_video.NebulaVideo('', '', '', None) return [NebulaLoader._parse_anchor(v) for v in video_links]
print(video_links)