"""Scrapy spider that scrapes the newest event thread of an unloze forum section."""
import sys
import traceback
from pprint import pprint

import scrapy

from scrapy_settings import EXT_SETTINGS

# Site root that the relative thread links found on the listing page are appended to.
_BASE_URL = "https://unloze.com"

# Listing entries whose link contains one of these markers are not event threads.
_SKIPPED_THREAD_MARKERS = ("poll", "nomination-thread")

# Layout 1: the "TL;DR" label is inside a <span>.
_SPAN_TEXT_XPATH = "//span[contains(text(),'TL;DR')]/../../../text()"
_SPAN_MAPS_XPATH = "//span[contains(text(),'TL;DR')]/../../../a/text()"

# Layout 2: the "TL;DR" label is inside a <b>.
_BOLD_TEXT_XPATH = "//b[contains(text(),'TL;DR')]/../../../span//text()"
_BOLD_MAPS_XPATH = "//b[contains(text(),'TL;DR')]/../../../a//text()"


class unloze_spider(scrapy.Spider):
    """
    Main unloze event scraper.

    Starts at ``item["url"]``, follows the first listed thread that is neither
    a poll nor a nomination thread, and fills ``item`` with the event details
    read from that thread's TL;DR section.
    """

    # Scrapy documents ``name`` as a required spider attribute.
    name = "unloze_spider"
    custom_settings = EXT_SETTINGS

    def __init__(self, item, **kwargs):
        """
        Store the item to fill in and the listing URL to start from.

        Args:
            item: Mapping with at least a ``"url"`` key; the ``event_*`` keys
                are written into it by ``parse2``.
            **kwargs: Forwarded unchanged to ``scrapy.Spider.__init__``.
        """
        super().__init__(**kwargs)
        self.url = item["url"]
        self.item = item

    def start_requests(self):
        """Yield the single request for the events listing page."""
        yield scrapy.Request(url=self.url, callback=self.parse)

    def parse(self, response):
        """
        Parsing content in the events sections.

        Picks the first listed thread whose link contains none of the skipped
        markers and requests it, with ``parse2`` as the callback. Prints a
        message and exits with status 1 when no such thread is listed.
        """
        threads = response.xpath("//div[@class='structItem-title']/@uix-href").extract()
        newest_thread = next(
            (
                thread
                for thread in threads
                if not any(marker in thread.lower() for marker in _SKIPPED_THREAD_MARKERS)
            ),
            None,
        )
        if newest_thread is None:
            print("no thread found. url: ", response.url)
            # NOTE(review): sys.exit() raises SystemExit inside a Scrapy
            # callback; confirm this really ends the process with status 1.
            # scrapy.exceptions.CloseSpider is the documented way to stop a crawl.
            sys.exit(1)
        yield scrapy.Request(url=_BASE_URL + newest_thread, callback=self.parse2)

    def parse2(self, response):
        """
        Parsing content on the actual newest event thread.

        Two thread layouts are handled: the TL;DR label inside a ``<span>``,
        and, when that yields no usable text, the label inside a ``<b>``.
        Fields are picked by the position of each usable text node.

        Returns:
            ``self.item`` with ``event_title``, ``event_date``, ``event_time``,
            ``event_server``, ``event_maps``, ``event_reward`` and
            ``event_url`` set.

        Any failure, including a thread that lacks one of the positional
        fields, is printed with its traceback and the URL, after which the
        process is asked to exit with status 1.
        """
        try:
            # Last path segment of the URL, with everything after the final "." dropped.
            event_title = response.url.rsplit(".", 1)[0].rsplit("/", 1)[1]
            event_server = ""
            # None marks "not found", so a short thread is reported by field
            # name below instead of failing with an UnboundLocalError.
            event_date = event_time = event_reward = None

            # several event managers do the threads differently in terms of
            # highlighting and marks, they dont use standardization
            index = 0
            for text in response.xpath(_SPAN_TEXT_XPATH).extract():
                # Nodes containing a newline or shorter than 4 characters are
                # skipped and do not advance the position counter.
                if "\n" in text or len(text) < 4:
                    continue
                if index < 2:
                    event_server += text
                elif index == 2:
                    event_date = text
                elif index == 3:
                    event_time = text
                # just skipping the leader part on ze: when the server text
                # contains '27015' the reward is read one position later
                # (presumably an extra leader line -- verify against a live thread).
                elif index == 4 and "27015" not in event_server:
                    event_reward = text
                elif index == 5 and "27015" in event_server:
                    event_reward = text
                index += 1

            event_maps = "".join(
                f"{map_name} " for map_name in response.xpath(_SPAN_MAPS_XPATH).extract()
            )

            if not index:
                # The <span> layout produced no usable text: try the <b> layout.
                # NOTE(review): the source's indentation was lost; both loops
                # below are assumed to belong to this fallback branch.
                tldr_count = 0
                for text in response.xpath(_BOLD_TEXT_XPATH).extract():
                    if "\n" in text or len(text) < 4:
                        continue
                    if "TL;DR" in text:
                        tldr_count += 1
                    # Positions are counted from the second "TL;DR" node on
                    # (that node itself is position 0).
                    if tldr_count < 2:
                        continue
                    if index in (2, 4):
                        event_server += text
                    elif index == 7:
                        event_date = text
                    elif index == 9:
                        event_time = text
                    elif index == 13:
                        event_reward = text
                    index += 1
                event_maps += "".join(
                    f"{map_name} " for map_name in response.xpath(_BOLD_MAPS_XPATH).extract()
                )

            required = {
                "event_date": event_date,
                "event_time": event_time,
                "event_reward": event_reward,
            }
            missing = [field for field, value in required.items() if value is None]
            if missing:
                raise ValueError(
                    f"could not extract {', '.join(missing)} from the TL;DR section"
                )

            self.item["event_title"] = event_title
            self.item["event_date"] = event_date
            self.item["event_time"] = event_time
            self.item["event_server"] = event_server
            self.item["event_maps"] = event_maps
            self.item["event_reward"] = event_reward
            self.item["event_url"] = response.url
        except Exception:
            # Deliberate catch-all at the scraper boundary: report and stop.
            error_msg = traceback.format_exc()
            print("traceback msg: ", error_msg)
            print("url: ", response.url)
            # NOTE(review): see parse() -- confirm sys.exit() terminates the
            # process when raised from inside a Scrapy callback.
            sys.exit(1)
        # pprint(self.item)  # debugging aid
        return self.item