using System.Globalization;
using HtmlAgilityPack;
using System.Web;
using OpenQA.Selenium.Chrome;
using OpenQA.Selenium.Support.UI;
using UWLib;
using OpenQA.Selenium;
using Microsoft.EntityFrameworkCore;
using System.Text;
using System.IO;
using System.Text.RegularExpressions;
using Microsoft.EntityFrameworkCore.Design;
using System.Reflection;
using System.ComponentModel;
namespace UWScraper
{
internal static class Scraper
{
/// <summary>
/// Progress snapshot passed to the <c>Init</c> and <c>Scraped</c> events.
/// </summary>
public class ScrapedEventArgs : EventArgs
{
/// <summary>Number of URLs in the checked list at the time the event is raised.</summary>
public int Scraped { get; set; }
/// <summary>Number of URLs still waiting in the work queue at the time the event is raised.</summary>
public int ToScrape { get; set; }
}
// URLs that count as handled: recently scraped ones loaded from the database
// (when not refreshing) plus every URL processed during this run.
static readonly List<string> checkedUrls = [];
// Work queue of URLs still to visit; Scrape always processes the first entry next.
static readonly List<string> urlsToCheck = [];
// When true, only the URLs of already stored lectures are revisited and no new links are collected.
static bool refreshLectures = false;
// Seconds to wait for a page to show one of the recognized elements.
static uint timeout = 2;
// Semester filter; the value "all" disables the filter.
static string semester = "";
// Database context; Scrape replaces it with a context for the configured database path.
static LectureContext db = new("lecture.db");
// Browser session; assigned once by the static constructor.
static readonly ChromeDriver? driver = null;
/// <summary>Signature of the progress notifications raised while scraping.</summary>
public delegate void ScrapedEventHandler(ScrapedEventArgs e);
/// <summary>Parameterless handler signature; no event declared in this class uses it.</summary>
public delegate void InitEventHandler();
/// <summary>Raised after each processed URL with the current progress counters.</summary>
public static event ScrapedEventHandler? Scraped;
/// <summary>Raised once before the scraping loop starts, with the initial progress counters.</summary>
public static event ScrapedEventHandler? Init;
/// <summary>
/// Quits the browser session if one exists; does nothing when the driver is null.
/// </summary>
static internal void QuitDriver() => driver?.Quit();
/// <summary>
/// Creates the Chrome session used for all page loads, with the driver's
/// command prompt window hidden.
/// </summary>
static Scraper()
{
    ChromeDriverService chromeService = ChromeDriverService.CreateDefaultService();
    chromeService.HideCommandPromptWindow = true;

    ChromeOptions chromeOptions = new();
    // Headless mode is currently disabled:
    // chromeOptions.AddArgument("--headless");

    driver = new ChromeDriver(chromeService, chromeOptions);
}
/// <summary>
/// Runs a scraping session: applies the settings, fills the work queue and
/// processes it until it is empty, raising <c>Init</c> once before the loop
/// and <c>Scraped</c> after every processed URL.
/// </summary>
/// <param name="settings">
/// Command settings. Read here: DatabasePath, Timeout, RefreshLectures,
/// ClearLinksToScrape, Semester, RescrapeHours and StartUrl.
/// </param>
static internal void Scrape(ScrapeCommand.Settings settings)
{
    // NOTE(review): the previously assigned context is replaced without being
    // disposed — confirm whether LectureContext needs explicit disposal here.
    db = new LectureContext(settings.DatabasePath);
    timeout = settings.Timeout;
    refreshLectures = settings.RefreshLectures;

    if (settings.ClearLinksToScrape)
    {
        db.LinksToScrape.RemoveRange(db.LinksToScrape.ToList());
        db.SaveChanges();
    }

    if (settings.Semester != null)
    {
        semester = settings.Semester;
    }

    if (refreshLectures)
    {
        // Refresh mode: revisit only the pages of lectures that are already stored.
        urlsToCheck.AddRange([.. db.Lectures.Select(x => x.Url)]);
    }
    else
    {
        // Resume with the links persisted by an earlier run, lowest Sort first.
        urlsToCheck.AddRange([.. db.LinksToScrape.OrderBy(x => x.Sort).Select(x => x.Url)]);

        // RescrapeHours is a number of hours; the earlier AddDays call treated
        // it as days and kept links "fresh" 24 times longer than configured.
        DateTime cutoff = DateTime.Now.AddHours(-settings.RescrapeHours);
        checkedUrls.AddRange([.. db.ScrapedLinks.Where(x => x.LastScrape > cutoff).Select(x => x.Url)]);

        // Drop everything that was scraped recently enough; a set lookup avoids
        // a linear search of the queue for every checked URL.
        var recentlyChecked = new HashSet<string>(checkedUrls);
        urlsToCheck.RemoveAll(recentlyChecked.Contains);

        if (urlsToCheck.Count == 0)
        {
            // Nothing pending: start from the configured entry page.
            urlsToCheck.Add(BuildStartUrl(settings));
        }
    }

    Console.CursorVisible = false;
    Init?.Invoke(new ScrapedEventArgs() { Scraped = checkedUrls.Count, ToScrape = urlsToCheck.Count });

    // ScrapeUrl removes the processed URL from the queue, so the loop always
    // takes the current head until nothing is left.
    while (urlsToCheck.Count > 0)
    {
        ScrapeUrl(urlsToCheck[0]);
        Scraped?.Invoke(new ScrapedEventArgs() { Scraped = checkedUrls.Count, ToScrape = urlsToCheck.Count });
    }
}

/// <summary>
/// Returns the URL the crawl starts from. When StartUrl still equals the value
/// of the DefaultValue attribute declared on that property and a concrete
/// semester is selected, the semester is appended as a query parameter;
/// otherwise StartUrl is returned unchanged.
/// </summary>
private static string BuildStartUrl(ScrapeCommand.Settings settings)
{
    // The attribute is looked up on the StartUrl property; a lookup on the
    // settings type itself would only find an attribute applied to the class.
    string? defaultStartUrl = settings.GetType()
        .GetProperty(nameof(settings.StartUrl))
        ?.GetCustomAttribute<DefaultValueAttribute>()
        ?.Value?.ToString();

    bool isDefaultStart = defaultStartUrl != null && settings.StartUrl == defaultStartUrl;
    // "all" and the empty string do not name a single semester, so there is nothing to append.
    bool hasConcreteSemester = !string.IsNullOrEmpty(semester) && semester != "all";
    if (!isDefaultStart || !hasConcreteSemester)
    {
        return settings.StartUrl;
    }

    char separator = settings.StartUrl.Contains('?') ? '&' : '?';
    return $"{settings.StartUrl}{separator}semester={Uri.EscapeDataString(semester)}";
}
/// <summary>
/// Resolves <paramref name="relativeUrl"/> against <paramref name="baseUrl"/>,
/// HTML-decodes the resulting absolute URI and returns it in sanitized form.
/// </summary>
/// <param name="relativeUrl">Link target as found in the page.</param>
/// <param name="baseUrl">Absolute URL of the page that contains the link.</param>
/// <returns>The sanitized absolute URL.</returns>
static string GetAbsoluteUrl(string relativeUrl, string baseUrl)
{
    Uri resolved = new(new Uri(baseUrl), relativeUrl);
    string decoded = HttpUtility.HtmlDecode(resolved.AbsoluteUri);
    return SanitizeUrl(decoded);
}
/// <summary>
/// Brings a URL into a canonical form: the fragment is dropped, the query
/// parameters "from", "to" and "details" are removed, and the remaining
/// parameters are sorted by name.
/// </summary>
/// <param name="url">Absolute URL to canonicalize.</param>
/// <returns>
/// The URL up to and including its path, followed by the filtered and sorted
/// query string; no "?" is appended when no parameter remains.
/// </returns>
/// <exception cref="UriFormatException"><paramref name="url"/> is not a valid absolute URI.</exception>
public static string SanitizeUrl(string url)
{
    string[] filter = ["from", "to", "details"];
    Uri uri = new(url);
    url = uri.GetLeftPart(UriPartial.Path);
    if (string.IsNullOrWhiteSpace(uri.Query))
    {
        return url;
    }

    var query = HttpUtility.ParseQueryString(uri.Query);
    StringBuilder builder = new();
    foreach (var key in query.AllKeys.Where(x => !filter.Contains(x)).OrderBy(x => x))
    {
        // GetValues keeps repeated parameters apart; the string indexer would
        // merge "a=1&a=2" into the single value "1,2".
        foreach (var value in query.GetValues(key) ?? Array.Empty<string>())
        {
            builder.Append(builder.Length == 0 ? '?' : '&');
            // A null key stands for a token without '=' (e.g. "?flag"); it is
            // written back without a leading '='.
            if (key != null)
            {
                builder.Append(Uri.EscapeDataString(key)).Append('=');
            }
            // ParseQueryString decodes names and values, so they must be escaped
            // again; otherwise an encoded '&' or '=' would change the query.
            builder.Append(Uri.EscapeDataString(value));
        }
    }

    return url + builder.ToString();
}
/// <summary>
/// Processes one URL: visits it, records it as scraped, removes it from the
/// pending-links table, saves the database changes and moves the URL from the
/// work queue to the checked list.
/// </summary>
/// <param name="source">Absolute URL to process.</param>
private static void ScrapeUrl(string source)
{
// Navigate catches its own exceptions, so the bookkeeping below runs whether or not the page yielded anything.
Navigate(source);
RefreshScrapedLink(source);
RemoveLinkToScrape(source);
// Persists the link bookkeeping together with whatever Navigate added to the context.
db.SaveChanges();
checkedUrls.Add(source);
// Removing the URL from the queue is what lets the loop in Scrape advance.
urlsToCheck.Remove(source);
}
/// <summary>
/// Loads <paramref name="source"/> in the browser, waits until the page shows
/// one of the recognized elements, and then extracts a lecture and/or further
/// links from it. Pages that belong to a semester other than the selected one
/// are skipped.
/// </summary>
/// <param name="source">Absolute URL to load.</param>
/// <remarks>
/// Failures (including the wait timing out) are not rethrown so that a single
/// bad page cannot stop the crawl; they are reported through
/// <see cref="System.Diagnostics.Trace"/> instead of being dropped silently.
/// </remarks>
private static void Navigate(string source)
{
    try
    {
        // Cheap check first: skip URLs that explicitly name another semester.
        var s = GetSemesterOfUrl(source);
        if (!string.IsNullOrEmpty(s) && semester != "all" && semester != s)
        {
            return;
        }

        if (driver is not { } browser)
        {
            return;
        }

        browser.Navigate().GoToUrl(source);

        // Wait until a sub-path link, a course link or a lecture title is present.
        WebDriverWait wait = new(browser, TimeSpan.FromSeconds(timeout));
        wait.Until(d => d.FindElements(By.XPath("//a[starts-with(@href, 'vvz_sub.html')]")).Count > 0 ||
                        d.FindElements(By.XPath("//a[starts-with(@href, 'course.html')]")).Count > 0 ||
                        d.FindElements(By.XPath("//h1/*/*[@class='what']")).Count > 0);

        HtmlDocument doc = new();
        doc.LoadHtml(browser.PageSource);
        var root = doc.DocumentNode;

        // Second check: the semester the loaded page itself reports as current.
        var semesterNode = root.SelectSingleNode("/html/body/main/nav/span[@class='current']");
        if (semesterNode != null)
        {
            s = semesterNode.InnerText;
            if (!string.IsNullOrEmpty(s) && semester != "all" && semester != s)
            {
                return;
            }
        }

        if (source.Contains("course.html"))
        {
            CreateLecture(source, root);
        }

        // In refresh mode only known lecture pages are revisited; no new links are collected.
        if (!refreshLectures)
        {
            FindPathLinks(source, root);
            FindCourseLinks(source, root);
        }
    }
    catch (Exception ex)
    {
        // Best effort: keep crawling, but leave a trace of what went wrong.
        System.Diagnostics.Trace.TraceWarning($"Scraping '{source}' failed: {ex.GetType().Name}: {ex.Message}");
    }
}
/// <summary>
/// Reads the value of the "semester" query parameter from a URL.
/// </summary>
/// <param name="source">Absolute URL to inspect.</param>
/// <returns>The parameter value, or an empty string when the URL has no such parameter.</returns>
private static string GetSemesterOfUrl(string source)
{
    string rawQuery = new Uri(source).Query;
    if (!rawQuery.Contains("semester"))
    {
        // Nothing in the raw query mentions the parameter; skip parsing.
        return string.Empty;
    }

    var parameters = HttpUtility.ParseQueryString(rawQuery);
    return parameters.AllKeys.Contains("semester")
        ? parameters["semester"] ?? string.Empty
        : string.Empty;
}
/// <summary>
/// Stamps the scraped-link record for <paramref name="source"/> with the
/// current local time, adding a new record to the context first when none
/// exists. The change is not saved here.
/// </summary>
/// <param name="source">URL whose record is updated.</param>
private static void RefreshScrapedLink(string source)
{
    if (db.ScrapedLinks.Find(source) is not { } record)
    {
        record = new ScrapedLink { Url = source };
        db.ScrapedLinks.Add(record);
    }

    record.LastScrape = DateTime.Now;
}
/// <summary>
/// Collects links to course pages from the document. Every URL that is
/// neither checked nor queued is registered as pending with sort value 0 and
/// placed at the front of the work queue.
/// </summary>
/// <param name="source">URL of the page, used to resolve relative links.</param>
/// <param name="root">Root node of the parsed page.</param>
private static void FindCourseLinks(string source, HtmlNode root)
{
    var containers = root.SelectNodes("//a[starts-with(@href, 'course.html')]/..");
    if (containers == null)
    {
        return;
    }

    foreach (HtmlNode container in containers)
    {
        // NOTE(review): only the first matching anchor of each parent element
        // is used — confirm that further course links under the same parent
        // are meant to be ignored.
        HtmlNode anchor = container.SelectSingleNode("a[contains(@href, 'course.html')]");
        string href = anchor.GetAttributeValue("href", "");
        string target = GetAbsoluteUrl(href, source);

        bool alreadyKnown = checkedUrls.Contains(target) || urlsToCheck.Contains(target);
        if (alreadyKnown)
        {
            continue;
        }

        AddLinkToScrape(target, 0);
        urlsToCheck.Insert(0, target);
    }
}
/// <summary>
/// Collects links to sub-path pages from the document. Every URL that is
/// neither checked nor queued is registered as pending with sort value 1 and
/// appended to the end of the work queue.
/// </summary>
/// <param name="source">URL of the page, used to resolve relative links.</param>
/// <param name="root">Root node of the parsed page.</param>
private static void FindPathLinks(string source, HtmlNode root)
{
    var anchors = root.SelectNodes("//a[starts-with(@href, 'vvz_sub.html')]");
    if (anchors == null)
    {
        return;
    }

    foreach (HtmlNode anchor in anchors)
    {
        string href = anchor.GetAttributeValue("href", "");
        string target = GetAbsoluteUrl(href, source);

        bool alreadyKnown = checkedUrls.Contains(target) || urlsToCheck.Contains(target);
        if (alreadyKnown)
        {
            continue;
        }

        AddLinkToScrape(target, 1);
        urlsToCheck.Add(target);
    }
}
/// <summary>
/// Marks the pending-link record for <paramref name="url"/> for removal, if
/// such a record exists. The change is not saved here.
/// </summary>
/// <param name="url">URL whose pending record is removed.</param>
private static void RemoveLinkToScrape(string url)
{
    if (db.LinksToScrape.Find(url) is { } pending)
    {
        db.LinksToScrape.Remove(pending);
    }
}
/// <summary>
/// Adds a pending-link record for <paramref name="url"/> to the context unless
/// one already exists. The change is not saved here.
/// </summary>
/// <param name="url">URL to remember for a later visit.</param>
/// <param name="sort">Sort value stored with the record.</param>
private static void AddLinkToScrape(string url, int sort)
{
    if (db.LinksToScrape.Find(url) != null)
    {
        return;
    }

    db.LinksToScrape.Add(new LinkToScrape { Url = url, Sort = sort });
}
/// <summary>
/// Creates or updates the lecture described by a course page. The lecture is
/// identified by its id and semester; url, branch, title, description, type
/// and events are taken from the page where the corresponding nodes exist.
/// The changes are not saved here.
/// </summary>
/// <param name="source">URL of the course page.</param>
/// <param name="root">Root node of the parsed page.</param>
private static void CreateLecture(string source, HtmlNode root)
{
    var branch = root.SelectSingleNode("//*[@class='spl']");
    var number = root.SelectSingleNode("//*[@class='title']//*[@class='number']");
    var type = root.SelectSingleNode("//*[@class='title']//*[@class='type']");
    var what = root.SelectSingleNode("//*[@class='title']//*[@class='what']");
    var when = root.SelectSingleNode("//*[@class='title']//*[@class='when']");
    var info = root.SelectSingleNode("//*[@class='info list']");
    var events = root.SelectNodes("//ul[@class='classes events list']/li");
    var query = HttpUtility.ParseQueryString(new Uri(source).Query);

    // The "lv" query parameter takes precedence over the number shown on the
    // page, as before. Both are parsed defensively so that an unreadable page
    // number cannot discard a lecture whose URL carries a valid id.
    int id = 0;
    bool hasId = query.AllKeys.Contains("lv")
        && int.TryParse(query["lv"], NumberStyles.Integer, CultureInfo.InvariantCulture, out id);
    if (!hasId && number != null)
    {
        hasId = int.TryParse(number.InnerText, NumberStyles.Integer, CultureInfo.InvariantCulture, out id);
    }
    if (!hasId)
    {
        // Without an id the lecture cannot be told apart from others; storing
        // it under id 0 would merge unrelated pages into one record.
        return;
    }

    // The semester printed on the page wins over the one in the URL.
    string lectureSemester = string.Empty;
    if (when != null)
    {
        lectureSemester = when.InnerText;
    }
    else if (query.AllKeys.Contains("semester"))
    {
        lectureSemester = query["semester"] ?? "";
    }

    Lecture? lecture = db.Lectures
        .Include(x => x.Events)
        .FirstOrDefault(x => x.Id == id && x.Semester == lectureSemester);
    if (lecture == null)
    {
        lecture = new Lecture
        {
            Id = id,
            Semester = lectureSemester
        };
        db.Lectures.Add(lecture);
    }

    lecture.Url = source;
    if (branch != null)
    {
        lecture.Branch = branch.InnerText;
    }
    if (what != null)
    {
        lecture.Title = what.InnerText;
    }
    if (info != null)
    {
        // The description keeps its markup.
        lecture.Description = info.InnerHtml;
    }
    if (type != null)
    {
        // The type is read from the node's title attribute, not its text.
        lecture.Type = type.GetAttributeValue("title", "");
    }
    if (events != null)
    {
        CreateLectureEvents(events, lecture);
    }
}
/// <summary>
/// Replaces the events of <paramref name="lecture"/> with the ones listed on
/// the page. Each list item may contain a date, a time range ("hh:mm - hh:mm")
/// and a room. The year of every date is taken from the first four characters
/// of the lecture's semester.
/// </summary>
/// <param name="events">List items describing the individual events.</param>
/// <param name="lecture">Lecture whose events are replaced.</param>
/// <remarks>
/// Items whose date or time cannot be parsed are skipped, so one malformed
/// entry no longer aborts the update after the old events were already removed.
/// NOTE(review): the year always comes from the semester text — confirm how
/// events that fall into the following calendar year should be dated.
/// </remarks>
private static void CreateLectureEvents(HtmlNodeCollection events, Lecture lecture)
{
    // Without a usable year no date can be built; leave the existing events untouched.
    if (lecture.Semester is not { Length: >= 4 } semesterText
        || !int.TryParse(semesterText[..4], NumberStyles.Integer, CultureInfo.InvariantCulture, out int year))
    {
        return;
    }

    string yearText = year.ToString(CultureInfo.InvariantCulture);

    db.RemoveRange(lecture.Events);
    lecture.Events.Clear();

    foreach (var item in events)
    {
        LectureEvent lectureEvent = new() { Lecture = lecture };
        var day = item.SelectSingleNode("*[@class='date']");
        var time = item.SelectSingleNode("*[@class='time']");
        var room = item.SelectSingleNode("*[@class='room']");

        // Stays at DateTime's default value when the item has no date node.
        DateTime date = new();
        if (day != null
            && !DateTime.TryParseExact(day.InnerText.Trim() + yearText, "dd.MM.yyyy",
                CultureInfo.InvariantCulture, DateTimeStyles.None, out date))
        {
            continue;
        }

        if (time != null)
        {
            var times = time.InnerText.Split(" - ");
            if (times.Length < 2
                || !TimeSpan.TryParseExact(times[0].Trim(), "hh\\:mm", CultureInfo.InvariantCulture, out TimeSpan start)
                || !TimeSpan.TryParseExact(times[1].Trim(), "hh\\:mm", CultureInfo.InvariantCulture, out TimeSpan end))
            {
                continue;
            }

            lectureEvent.From = date.Add(start);
            lectureEvent.To = date.Add(end);
        }

        if (room != null)
        {
            lectureEvent.Location = room.InnerText;
        }

        lecture.Events.Add(lectureEvent);
    }
}
}
}