120 lines
		
	
	
		
			4.0 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			120 lines
		
	
	
		
			4.0 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
import sys
import traceback
from pprint import pprint

import scrapy

from scrapy_settings import EXT_SETTINGS


class unloze_spider(scrapy.Spider):
    """
    Main unloze event scraper.

    Starting from the listing URL stored in ``item["url"]``, the spider
    follows the first thread link that is neither a poll nor a nomination
    thread and copies the fields found in that thread's TL;DR summary
    into ``item``.
    """

    # scrapy.Spider requires every spider to have a name.
    name = "unloze_spider"
    custom_settings = EXT_SETTINGS

    # Prefix that the thread links taken from the listing page are appended to.
    BASE_URL = "https://unloze.com"

    # Substrings (matched case-insensitively) of thread links to skip.
    IGNORED_THREAD_MARKERS = ("poll", "nomination-thread")

    def __init__(self, item, **kwargs):
        """
        Store the item to fill in and the listing URL to start from.

        :param item: mapping that provides ``"url"`` and receives the
            ``event_*`` keys written by :meth:`parse2`.
        :param kwargs: forwarded unchanged to ``scrapy.Spider.__init__``.
        """
        super().__init__(**kwargs)
        self.url = item["url"]
        self.item = item

    def start_requests(self):
        """Request the listing page and hand the response to :meth:`parse`."""
        yield scrapy.Request(url=self.url, callback=self.parse)

    def parse(self, response):
        """
        Parsing content in the events sections.

        Picks the first thread link that does not contain any of
        ``IGNORED_THREAD_MARKERS`` and requests it with :meth:`parse2` as
        callback. Prints a message and exits with status 1 when no link
        qualifies.
        """
        links = response.xpath("//div[@class='structItem-title']/@uix-href").extract()

        newest_thread = None
        for link in links:
            lowered = link.lower()
            if any(marker in lowered for marker in self.IGNORED_THREAD_MARKERS):
                continue
            newest_thread = link
            break

        if newest_thread is None:
            print("no thread found. url: ", response.url)
            sys.exit(1)

        yield scrapy.Request(
            url=self.BASE_URL + newest_thread,
            callback=self.parse2,
        )

    def parse2(self, response):
        """
        Parsing content on the actual newest event thread.

        Fills ``self.item`` with ``event_title``, ``event_date``,
        ``event_time``, ``event_server``, ``event_maps``, ``event_reward``
        and ``event_url`` and returns it. Any exception raised while
        parsing, including a missing date, time or reward, is printed
        together with the thread URL and the process exits with status 1.
        """
        try:
            # The title is the last path segment of the URL, cut at the last
            # "." in the URL.
            event_title = response.url.rsplit(".", 1)[0].rsplit("/", 1)[1]

            # Event managers format their threads differently, so the only
            # anchor used is the "TL;DR" marker. This keeps working as long
            # as "TL;DR" is not used elsewhere in the post.
            skipping = len(response.xpath("//*[contains(text(),'TL;DR')]").extract())
            text_nodes = response.xpath(
                "//*[contains(text(),'TL;DR')]/../../..//text()"
            ).extract()

            fields = self._scan_summary(text_nodes, skipping)

            # Fail with a readable message instead of an UnboundLocalError
            # when the summary lacks one of the single-value fields.
            missing = [
                key
                for key in ("event_date", "event_time", "event_reward")
                if fields[key] is None
            ]
            if missing:
                raise ValueError(
                    "fields not found in TL;DR summary: " + ", ".join(missing)
                )

            self.item["event_title"] = event_title
            self.item["event_date"] = fields["event_date"]
            self.item["event_time"] = fields["event_time"]
            self.item["event_server"] = fields["event_server"]
            self.item["event_maps"] = fields["event_maps"]
            self.item["event_reward"] = fields["event_reward"]
            self.item["event_url"] = response.url

        except Exception:
            error_msg = traceback.format_exc()
            print("traceback msg: ", error_msg)
            print("url: ", response.url)
            sys.exit(1)

        return self.item

    @staticmethod
    def _scan_summary(text_nodes, skipping):
        """
        Walk the text nodes around the TL;DR marker and collect event fields.

        A node naming a field ("server", "maps", "date", "time", "rewards")
        arms a flag; the next accepted node is stored as that field's value.
        Maps keep accumulating (space separated) until a "date" label is
        seen. The checks are intentionally independent ``if`` statements,
        not ``elif``: a node stored as one value still runs through the
        later label tests.

        :param text_nodes: iterable of text strings in document order.
        :param skipping: number of nodes equal to "tl;dr" (case-insensitive)
            that must be passed before any node is accepted.
        :return: dict with ``event_server`` and ``event_maps`` (strings,
            empty when absent) and ``event_date``, ``event_time``,
            ``event_reward`` (``None`` when absent).
        """
        event_server = ""
        event_maps = ""
        event_date = None
        event_time = None
        event_reward = None

        expect_server = False
        expect_maps = False
        expect_date = False
        expect_time = False
        expect_reward = False

        for text in text_nodes:
            lowered = text.lower()

            # Ignore whitespace-bearing or very short nodes, and everything
            # before the last expected TL;DR marker has been passed.
            if "\n" in text or len(text) < 4 or "\t" in text or skipping > 0:
                if lowered == "tl;dr":
                    skipping -= 1
                continue

            if "server" in lowered and "time" not in lowered:
                expect_server = True
                continue
            if expect_server:
                event_server += text
                expect_server = False

            if "maps" in lowered and "rewards" not in lowered:
                expect_maps = True
                continue

            if "date" in lowered:
                # The date label ends the list of maps.
                expect_maps = False
                expect_date = True
                continue

            if expect_maps:
                event_maps += f"{text} "

            if expect_date:
                expect_date = False
                event_date = text

            if "time" in lowered and "server" not in lowered:
                expect_time = True
                continue
            if expect_time:
                event_time = text
                expect_time = False

            if "rewards" in lowered:
                expect_reward = True
                continue
            if expect_reward:
                event_reward = text
                expect_reward = False

        return {
            "event_server": event_server,
            "event_maps": event_maps,
            "event_date": event_date,
            "event_time": event_time,
            "event_reward": event_reward,
        }