import sys
import traceback
from pprint import pprint

import scrapy

from scrapy_settings import EXT_SETTINGS


class unloze_spider(scrapy.Spider):
    """
    Main unloze event scraper.
    """

    name = "unloze_spider"  # spider name required by Scrapy; value assumed from the class name
    custom_settings = EXT_SETTINGS

    def __init__(self, item):
        super().__init__()
        self.url = item["url"]
        self.item = item

    def start_requests(self):
        yield scrapy.Request(url=self.url, callback=self.parse)

    def parse(self, response):
        """
        Parse the events section listing and pick the newest event thread.
        """
        newest_thread = None
        threads = response.xpath("//div[@class='structItem-title']/@uix-href").extract()
        for thread in threads:
            # Skip poll and nomination threads; the first remaining entry is the newest event.
            if "poll" in thread.lower() or "nomination-thread" in thread.lower():
                continue
            newest_thread = thread
            break

        if newest_thread is None:
            print("no thread found. url: ", response.url)
            sys.exit(1)

        yield scrapy.Request(
            url="https://unloze.com" + newest_thread,
            callback=self.parse2,
        )

    def parse2(self, response):
        """
        Parse the content of the newest event thread itself.
        """
        try:
            event_title = response.url.rsplit(".", 1)[0].rsplit("/", 1)[1]
            event_server = ""
            # Event managers format their threads inconsistently (different highlighting
            # and markup), so two thread layouts are handled below.
            index = 0
            for r in response.xpath("//span[contains(text(),'TL;DR')]/../../../text()").extract():
                if "\n" in r or len(r) < 4:
                    continue
                if index < 2:
                    event_server += r
                if index == 2:
                    event_date = r
                if index == 3:
                    event_time = r[:-1]
                if index == 4:
                    event_reward = r
                index += 1

            event_maps = ""
            for r in response.xpath("//span[contains(text(),'TL;DR')]/../../../a/text()").extract():
                event_maps += f"{r} "

            if not index:
                # Fallback layout: the TL;DR marker sits in a <b> tag and the fields are
                # <span> texts at fixed positions after the second TL;DR occurrence.
                tldr_count = 0
                for r in response.xpath("//b[contains(text(),'TL;DR')]/../../../span//text()").extract():
                    if "\n" in r or len(r) < 4:
                        continue
                    if "TL;DR" in r:
                        tldr_count += 1
                    if tldr_count < 2:
                        continue
                    if index == 2 or index == 4:
                        event_server += r
                    if index == 7:
                        event_date = r
                    if index == 9:
                        event_time = r
                    if index == 13:
                        event_reward = r
                    index += 1
                for r in response.xpath("//b[contains(text(),'TL;DR')]/../../../a//text()").extract():
                    event_maps += f"{r} "

            self.item["event_title"] = event_title
            self.item["event_date"] = event_date
            self.item["event_time"] = event_time
            self.item["event_server"] = event_server
            self.item["event_maps"] = event_maps
            self.item["event_reward"] = event_reward
            self.item["event_url"] = response.url

        except Exception:
            error_msg = traceback.format_exc()
            print("traceback msg: ", error_msg)
            print("url: ", response.url)
            sys.exit(1)

        # pprint(self.item)
        return self.item
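
# A minimal usage sketch (not part of the original module): one way to drive this
# spider from a plain script with Scrapy's CrawlerProcess. The seed item shape
# ({"url": ...}) and the events-section URL below are assumptions for illustration
# only; EXT_SETTINGS is already applied through the spider's custom_settings.
if __name__ == "__main__":
    from scrapy.crawler import CrawlerProcess

    seed_item = {"url": "https://unloze.com/forums/events/"}  # hypothetical section URL
    process = CrawlerProcess()
    process.crawl(unloze_spider, item=seed_item)
    process.start()  # blocks until the crawl finishes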