"""Scrapy spider that scrapes the newest event thread from an unloze forum section."""

import sys
import traceback
from pprint import pprint  # noqa: F401  (kept for ad-hoc debugging of the item)

import scrapy

from scrapy_settings import EXT_SETTINGS


class unloze_spider(scrapy.Spider):
    """
    Main unloze event scraper.

    Starting from the section URL stored in ``item["url"]``, the spider picks
    the first listed thread that is neither a poll nor a nomination thread,
    opens it, and fills ``item`` with the event details found around the
    post's ``TL;DR`` marker.
    """

    custom_settings = EXT_SETTINGS

    def __init__(self, item):
        """
        Store the item to be filled in.

        :param item: mapping with at least a ``"url"`` key (the section page
            to scrape); the ``event_*`` keys are written into it by
            :meth:`parse2`.
        """
        # NOTE(review): scrapy.Spider.__init__ is not called and no ``name``
        # attribute is defined on the class -- confirm this is intentional.
        self.url = item["url"]
        self.item = item

    def start_requests(self):
        """Yield the single request for the section page."""
        yield scrapy.Request(url=self.url, callback=self.parse)

    def parse(self, response):
        """
        Parse the event section page and follow the newest event thread.

        Threads whose link contains ``poll`` or ``nomination-thread``
        (case-insensitive) are ignored. If no other thread is listed, the
        problem is printed and ``sys.exit(1)`` is called.
        """
        thread_links = response.xpath(
            "//div[@class='structItem-title']/@uix-href"
        ).extract()

        # First link on the page that is an actual event thread.
        newest_thread = next(
            (
                link
                for link in thread_links
                if "poll" not in link.lower()
                and "nomination-thread" not in link.lower()
            ),
            None,
        )

        if newest_thread is None:
            print("no thread found. url: ", response.url)
            sys.exit(1)

        yield scrapy.Request(
            url="https://unloze.com" + newest_thread,
            callback=self.parse2,
        )

    @staticmethod
    def _collect_event_fields(fragments, marker_count):
        """
        Extract the event fields from the text fragments of a thread post.

        The post is read as a sequence of "label" fragments (containing
        ``server``, ``maps``, ``date``, ``time`` or ``rewards``), each followed
        by one or more "value" fragments.

        :param fragments: iterable of text-node strings, in document order.
        :param marker_count: number of exact ``TL;DR`` fragments that must be
            seen before parsing starts; everything up to then is ignored.
        :return: dict with the keys ``event_server``, ``event_maps``,
            ``event_date``, ``event_time`` and ``event_reward``. A field whose
            label never occurs is left as an empty string.
        """
        fields = {
            "event_server": "",
            "event_maps": "",
            "event_date": "",
            "event_time": "",
            "event_reward": "",
        }
        skipping = marker_count
        expect_server = False
        expect_maps = False
        expect_date = False
        expect_time = False
        expect_reward = False

        for fragment in fragments:
            lowered = fragment.lower()

            # Ignore whitespace/very short fragments, and everything that
            # precedes the last TL;DR marker.
            if (
                "\n" in fragment
                or len(fragment) < 4
                or "\t" in fragment
                or skipping > 0
            ):
                if lowered == "tl;dr":
                    skipping -= 1
                continue

            # The order of the checks below matters: a value fragment that
            # was just consumed still falls through to the later label checks.
            if "server" in lowered and "time" not in lowered:
                expect_server = True
                continue
            if expect_server:
                fields["event_server"] += fragment
                expect_server = False

            if "maps" in lowered and "rewards" not in lowered:
                expect_maps = True
                continue
            if "date" in lowered:
                # The date label terminates the (multi-fragment) map list.
                expect_maps = False
                expect_date = True
                continue
            if expect_maps:
                fields["event_maps"] += f"{fragment} "
            if expect_date:
                expect_date = False
                fields["event_date"] = fragment

            if "time" in lowered and "server" not in lowered:
                expect_time = True
                continue
            if expect_time:
                fields["event_time"] = fragment
                expect_time = False

            if "rewards" in lowered:
                expect_reward = True
                continue
            if expect_reward:
                fields["event_reward"] = fragment
                expect_reward = False

        return fields

    def parse2(self, response):
        """
        Parse the newest event thread and return the filled-in item.

        The event title is taken from the last path segment of the thread URL
        (text after the final ``/`` and before the final ``.``). The remaining
        fields come from the text around the post's ``TL;DR`` marker.

        Fields whose label is missing from the post are stored as empty
        strings. Any unexpected exception is printed together with the URL
        and the process is terminated via ``sys.exit(1)``.

        :return: ``self.item`` with the ``event_*`` keys set.
        """
        try:
            event_title = response.url.rsplit(".", 1)[0].rsplit("/", 1)[1]

            # Event managers format their threads differently (highlighting,
            # markup), so parsing relies only on the literal "TL;DR" marker
            # being present in the post.
            marker_count = len(
                response.xpath("//*[contains(text(),'TL;DR')]").extract()
            )
            fragments = response.xpath(
                "//*[contains(text(),'TL;DR')]/../../..//text()"
            ).extract()
            fields = self._collect_event_fields(fragments, marker_count)

            self.item["event_title"] = event_title
            self.item["event_date"] = fields["event_date"]
            self.item["event_time"] = fields["event_time"]
            self.item["event_server"] = fields["event_server"]
            self.item["event_maps"] = fields["event_maps"]
            self.item["event_reward"] = fields["event_reward"]
            self.item["event_url"] = response.url
        except Exception:
            error_msg = traceback.format_exc()
            print("traceback msg: ", error_msg)
            print("url: ", response.url)
            sys.exit(1)
        return self.item