projects-jenz/event_notification/python/scrape_event.py

131 lines
4.6 KiB
Python
Raw Normal View History

2022-06-19 13:40:27 +02:00
import scrapy
import traceback
2024-04-24 13:35:52 +02:00
import warnings
warnings.filterwarnings("ignore", category=scrapy.exceptions.ScrapyDeprecationWarning)
2022-06-19 13:40:27 +02:00
from scrapy_settings import EXT_SETTINGS
from pprint import pprint
class unloze_spider(scrapy.Spider):
    """
    Main unloze event scraper.

    Starts from the URL stored in ``item["url"]``, picks the first listed
    thread that is neither a poll nor a nomination thread, and fills ``item``
    with the event details found in that thread's "TL;DR" section.
    """
    custom_settings = EXT_SETTINGS

    # Map-name prefixes that identify a text node as a map entry.
    _MAP_PREFIXES = ("ze_", "mg_", "de_", "zr_")
    # Server port generally; the node containing it ends the server capture.
    _SERVER_PORT_MARKER = ":270"

    def __init__(self, item):
        """
        Store the item to populate and the URL to start crawling from.

        :param item: mapping that provides ``"url"`` and receives the
            ``event_*`` keys written by :meth:`parse2`.
        """
        # NOTE(review): scrapy.Spider.__init__ is not invoked here and the
        # class defines no ``name`` -- confirm this is intentional.
        self.url = item["url"]
        self.item = item

    def start_requests(self):
        """
        Yield the single initial request for the events section.
        """
        yield scrapy.Request(
            url=self.url,
            callback=self.parse,
        )

    def parse(self, response):
        """
        Parsing content in the events sections.

        Selects the first thread link whose path contains neither "poll" nor
        "nomination-thread" and yields a request for it, handled by
        :meth:`parse2`. Exits the process with status 1 if no such thread
        is found.
        """
        threads = response.xpath("//div[@class='structItem-title']/@uix-href").extract()
        newest_thread = next(
            (
                thread
                for thread in threads
                if "poll" not in thread.lower()
                and "nomination-thread" not in thread.lower()
            ),
            None,
        )

        if newest_thread is None:
            print("no thread found. url: ", response.url)
            import sys
            sys.exit(1)

        yield scrapy.Request(
            url="https://unloze.com" + newest_thread,
            callback=self.parse2,
        )

    @classmethod
    def _parse_summary(cls, text_nodes):
        """
        Extract the event fields from the text nodes around the TL;DR marker.

        The nodes are scanned in order. A node acting as a label ("server",
        "maps", "date", "time", "rewards") arms the capture of the matching
        field, and the value is taken from the node(s) that follow.

        :param text_nodes: iterable of text strings in document order.
        :return: dict with the keys ``event_date``, ``event_time``,
            ``event_server``, ``event_maps`` and ``event_reward``.
            ``event_date`` is ``None`` and the others are ``""`` when the
            corresponding value was not found.
        """
        event_server = ""
        event_maps = ""
        event_date = None
        event_reward = ""
        event_time = ""

        marker_seen = False
        # Tri-state: False = not capturing yet, True = capturing,
        # None = capture finished and must not be re-armed.
        next_event_server = False
        next_event_maps = False
        next_event_date = False
        next_event_time = False
        next_event_reward = False

        for node in text_nodes:
            # Whitespace-only/very short nodes are noise, and everything in
            # front of the "TL;DR" marker is not part of the summary.
            if "\n" in node or "\t" in node or len(node) < 4 or not marker_seen:
                if node.lower() == "tl;dr":
                    marker_seen = True
                continue

            lowered = node.lower()

            if "server" in lowered and "time" not in lowered and next_event_server is not None:
                next_event_server = True
                continue
            if next_event_server:
                event_server += node
                if cls._SERVER_PORT_MARKER in node:
                    # The address is complete once the port has been appended.
                    next_event_server = None

            if "maps" in lowered and "rewards" not in lowered:
                next_event_maps = True
                continue
            if "date" in lowered:
                # The date label terminates the list of maps.
                next_event_maps = False
                next_event_date = True
                continue
            if next_event_maps and node.startswith(cls._MAP_PREFIXES):
                event_maps += f"{node} "
            if next_event_date:
                next_event_date = False
                event_date = node

            if "time" in lowered and "server" not in lowered:
                next_event_time = True
                continue
            if next_event_time and event_time == "":
                # Only the first value after a time label is kept.
                event_time = node
                next_event_time = False

            if "rewards" in lowered:
                next_event_reward = True
                continue
            if next_event_reward and event_reward == "":
                # Only the first value after a rewards label is kept.
                event_reward = node
                next_event_reward = False

        return {
            "event_date": event_date,
            "event_time": event_time,
            "event_server": event_server,
            "event_maps": event_maps,
            "event_reward": event_reward,
        }

    def parse2(self, response):
        """
        Parsing content on the actual newest event thread.

        Writes ``event_title``, ``event_date``, ``event_time``,
        ``event_server``, ``event_maps``, ``event_reward`` and ``event_url``
        into ``self.item`` and returns it. On any exception the traceback and
        the URL are printed and the process exits with status 1.
        """
        try:
            # The title is the last path segment of the URL, without the
            # part after its final dot.
            event_title = response.url.rsplit(".", 1)[0].rsplit("/", 1)[1]

            # Event managers format their threads differently (highlighting,
            # markup), so there is no standard layout to rely on. Parsing is
            # anchored on the "TL;DR" marker instead, which works as long as
            # that text only appears in the event summary.
            text_nodes = response.xpath("//*[contains(text(),'TL;DR')]/../../..//text()").extract()
            fields = self._parse_summary(text_nodes)

            self.item["event_title"] = event_title
            for key, value in fields.items():
                self.item[key] = value
            self.item["event_url"] = response.url
        except Exception:
            error_msg = traceback.format_exc()
            print("traceback msg: ", error_msg)
            print("url: ", response.url)
            import sys
            sys.exit(1)
        return self.item