Code:
#!/usr/bin/python
# -*- coding:utf-8 -*-
import tkinter as tk
from tkinter import scrolledtext, messagebox, IntVar
import requests
from bs4 import BeautifulSoup
import time
import datetime
class Stunde4StundeScraperGUI:
    """Tkinter front end that scrapes one forum thread and tallies posts.

    The window collects a start page, a first post number and an optional
    last post number. Pressing the button (or Return in the last entry)
    downloads the thread pages one after another, counts the posts per
    author, and awards one point per clock hour to the author of the last
    processed post of that hour. Results are written to the scrolled text
    widget at the bottom of the window.
    """

    # Thread URL prefix; the page number is appended directly to it.
    THREAD_URL = 'https://www.forumla.de/f-fun-spiele-raetsel-39/t-stunde-fuer-stunde-a-its-time-for-return-236511/page'
    # Number of posts the page arithmetic assumes per thread page.
    POSTS_PER_PAGE = 20
    # Exclusive upper bound for the page loop (safety net only).
    MAX_PAGE = 10000
    # Pause before every HTTP request, in seconds.
    REQUEST_DELAY_SECONDS = 0.5
    # Give up on a request after this many seconds instead of hanging the GUI.
    REQUEST_TIMEOUT_SECONDS = 30

    def __init__(self, master):
        """Build all widgets inside *master* (the Tk root or a toplevel)."""
        self.master = master
        master.title("Stunde4Stunde-Scraper (by Scrypton)")

        # State of the output check boxes (1 = section is printed).
        self.show_post_counts_var = IntVar(value=1)
        self.show_hourly_points_var = IntVar(value=0)
        self.show_total_points_var = IntVar(value=1)
        self.show_thread_content_var = IntVar(value=0)

        # Input fields; Return moves the focus to the next field and
        # finally starts the scraper.
        self.page_label, self.page_entry = self._add_labeled_entry("Ab welcher Seite?")
        self.page_entry.focus_set()
        self.page_entry.bind('<Return>', lambda event: self.start_post_entry.focus_set())

        self.start_post_label, self.start_post_entry = self._add_labeled_entry(
            "Start-Nummer von Beitrag:")
        self.start_post_entry.bind('<Return>', lambda event: self.end_post_entry.focus_set())

        self.end_post_label, self.end_post_entry = self._add_labeled_entry(
            "Bis einschließlich Beitrag-Nr (oder Enter wenn bis zum aller letzten):")
        self.end_post_entry.bind('<Return>', self.start_scraper)

        # Check boxes selecting which report sections are printed.
        self.show_post_counts_checkbox = self._add_checkbox(
            "Beitragszählung pro Autor", self.show_post_counts_var)
        self.show_hourly_points_checkbox = self._add_checkbox(
            "Punktestände pro Stunde", self.show_hourly_points_var)
        self.show_total_points_checkbox = self._add_checkbox(
            "Punktestände pro Autor insgesamt", self.show_total_points_var)
        self.show_thread_content_checkbox = self._add_checkbox(
            "Details der Beiträge", self.show_thread_content_var)

        # Button that starts the scraper.
        self.start_button = tk.Button(master, text="Start Scraper", command=self.start_scraper)
        self.start_button.pack()

        # Text area used as console output.
        self.console_output = scrolledtext.ScrolledText(master, wrap=tk.WORD)
        self.console_output.pack(expand=True, fill='both')

    def _add_labeled_entry(self, label_text):
        """Pack a label followed by an entry and return both widgets."""
        label = tk.Label(self.master, text=label_text)
        label.pack()
        entry = tk.Entry(self.master)
        entry.pack()
        return label, entry

    def _add_checkbox(self, text, variable):
        """Pack a check button bound to *variable* and return it."""
        checkbox = tk.Checkbutton(self.master, text=text, variable=variable)
        checkbox.pack()
        return checkbox

    def _log(self, text):
        """Append *text* to the console, scroll to the end and redraw."""
        self.console_output.insert(tk.END, text)
        self.console_output.yview(tk.END)
        self.master.update_idletasks()

    def start_scraper(self, event=None):
        """Read the input fields, scrape the requested pages and print the report.

        Args:
            event: Tk event when triggered by a key binding; unused.

        Shows an error dialog and returns without scraping when a field
        does not contain an integer. An empty end field means "no upper
        post limit".
        """
        # int() itself raises on bad input, so validation has to happen
        # here rather than in a type check afterwards.
        try:
            page = int(self.page_entry.get())
            start_post = int(self.start_post_entry.get())
            end_post_input = self.end_post_entry.get().strip()
            end_post = int(end_post_input) if end_post_input else None
        except ValueError:
            messagebox.showerror("Fehler", "Bitte geben Sie nur Ziffern ein.")
            return

        post_counts = {}
        hourly_points = {}
        self._scrape_pages(page, start_post, end_post, post_counts, hourly_points)
        self._print_report(post_counts, hourly_points)

    def _scrape_pages(self, page, start_post, end_post, post_counts, hourly_points):
        """Walk the thread pages from *page* on and fill both result dicts."""
        per_page = self.POSTS_PER_PAGE
        for lauf in range(page, self.MAX_PAGE):
            url = f"{self.THREAD_URL}{lauf}"
            self._log('\n\n####################################\n')
            self._log(f'Seite {lauf} von ...\n')
            self._log('URL: ' + url + '\n')

            first_on_page = (lauf - 1) * per_page + 1
            last_on_page = lauf * per_page

            if end_post is not None and first_on_page > end_post:
                self._log('End-Nummer erreicht oder überschritten. Beende die Schleife.\n')
                break

            # A page that ends before the start number cannot contain a
            # wanted post, so it is not downloaded at all.
            if last_on_page < start_post:
                self._log('Seite liegt vor der Start-Nummer und wird übersprungen.\n')
                continue

            last_wanted = last_on_page if end_post is None else min(last_on_page, end_post)
            try:
                highest_seen = self.get_threadpage_content(
                    url, start_post, last_wanted, post_counts, hourly_points)
            except requests.RequestException as exc:
                self._log(f'Fehler beim Abrufen der Seite: {exc}\n')
                break

            # No post belonging to this page number was found: the thread
            # has no such page, so stop instead of requesting further pages.
            if highest_seen < first_on_page:
                self._log('Letzte Seite des Threads erreicht. Beende die Schleife.\n')
                break

    def _print_report(self, post_counts, hourly_points):
        """Print the report sections enabled via the check boxes."""
        if self.show_post_counts_var.get():
            self._log('\nBeitragszählung pro Autor:\n')
            for author, count in post_counts.items():
                self._log(f'{author}: {count} Beiträge\n')

        if self.show_hourly_points_var.get():
            self._log('\nPunktestände pro Stunde:\n')
            for hour_key, data in hourly_points.items():
                self._log(f"{data['author']} hat um {data['time'].strftime('%H:%M')} den Beitrag "
                          f"{data['post_number']} geschrieben und erhält einen Punkt für die Stunde "
                          f"{hour_key}.\n")

        # One point per recorded hour for the author stored for that hour.
        total_points = {}
        for data in hourly_points.values():
            author = data['author']
            total_points[author] = total_points.get(author, 0) + 1

        if self.show_total_points_var.get():
            self._log('\nPunktestände pro Autor insgesamt:\n')
            for author, points in total_points.items():
                self._log(f'{author}: {points} Punkte\n')

    def get_threadpage_content(self, url, start_post, end_post, post_counts, hourly_points):
        """Download one thread page and record its posts in the result dicts.

        Args:
            url: Address of the thread page.
            start_post: Lowest post number to record (inclusive).
            end_post: Highest post number to record (inclusive).
            post_counts: Dict author -> number of posts; updated in place.
            hourly_points: Dict "YYYY-MM-DD HH" -> dict with the keys
                'time', 'author' and 'post_number'; updated in place.

        Returns:
            The highest post number found on the page, whether or not it
            was inside the requested range; 0 when the page has no
            numbered posts.
        """
        content = self.get_url_content(url)
        soup = BeautifulSoup(content, "html.parser")
        highest_seen = 0
        for post in soup.find_all('li', {'class': 'postbitim'}):
            counter_tag = post.find('a', class_="postcounter")
            if counter_tag is None:
                # List items without a post counter are not real posts
                # (the original code treated them as advertising).
                continue
            post_number = int(counter_tag.text.replace('#', ''))
            highest_seen = max(highest_seen, post_number)
            if not start_post <= post_number <= end_post:
                continue

            date_tag = post.find('span', class_="date")
            author_tag = post.find('span', class_="username")
            if date_tag is None or author_tag is None:
                # Without timestamp or author the post cannot be scored.
                continue
            post_author = author_tag.text
            post_counts[post_author] = post_counts.get(post_author, 0) + 1
            post_datetime = self.extract_datetime(date_tag.text)

            if self.show_thread_content_var.get():
                self._log('Beitragsnummer: %s\n' % post_number)
                self._log('Datum des Posts: %s\n' % post_datetime.date())
                self._log('Uhrzeit des Posts: %s\n' % post_datetime.strftime("%H:%M"))
                self._log('Autor: %s\n' % post_author)

            # A later post within the same clock hour replaces the earlier
            # entry, so the hour's point goes to the last processed post.
            hour_key = post_datetime.strftime("%Y-%m-%d %H")
            hourly_points[hour_key] = {'time': post_datetime, 'author': post_author,
                                       'post_number': post_number}
        return highest_seen

    def get_url_content(self, url):
        """Return the body text of *url* after a short politeness delay.

        Raises:
            requests.RequestException: on connection problems or when the
                server does not answer within the timeout.
        """
        # Wait between requests so the forum is not hammered.
        time.sleep(self.REQUEST_DELAY_SECONDS)
        return requests.get(url, timeout=self.REQUEST_TIMEOUT_SECONDS).text

    def extract_datetime(self, time_str):
        """Parse a forum timestamp into a ``datetime.datetime``.

        Accepted forms are ``"Heute, HH:MM"``, ``"Gestern, HH:MM"`` and
        ``"DD.MM.YYYY, HH:MM"``. Commas, non-breaking spaces and any
        surrounding or repeated whitespace are ignored.

        Raises:
            ValueError: if the text matches none of these forms.
        """
        # str.split() without arguments also splits on non-breaking spaces
        # and drops leading/trailing whitespace.
        parts = time_str.replace(',', ' ').split()
        if not parts:
            raise ValueError(f"empty timestamp: {time_str!r}")

        if parts[0] in ('Heute', 'Gestern'):
            if len(parts) < 2:
                raise ValueError(f"timestamp without time of day: {time_str!r}")
            day = self.convert_relative_date(parts[0])
            clock = datetime.datetime.strptime(parts[1], "%H:%M").time()
            return datetime.datetime.combine(day, clock)

        return datetime.datetime.strptime(' '.join(parts), "%d.%m.%Y %H:%M")

    def convert_relative_date(self, date_str):
        """Turn ``"Heute"``, ``"Gestern"`` or ``"DD.MM.YYYY"`` into a date.

        The relative words are resolved against the local system clock.

        Raises:
            ValueError: if *date_str* is neither a relative word nor a
                date in ``DD.MM.YYYY`` form.
        """
        today = datetime.datetime.now().date()
        if date_str == 'Heute':
            return today
        if date_str == 'Gestern':
            return today - datetime.timedelta(days=1)
        return datetime.datetime.strptime(date_str, "%d.%m.%Y").date()
# Start the main program: create the Tk root window, build the scraper GUI
# inside it, and enter the Tk event loop.
root = tk.Tk()
app = Stunde4StundeScraperGUI(root)
root.mainloop()
Die vielen Gegentore gab's halt in der 1. und nicht in der 2. Woche. Der Tipp ging aufs Haus
Zockerspiel CXXXVII