using System.ComponentModel;
using System.Globalization;
using System.IO;
using System.Reflection;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
using HtmlAgilityPack;
using Microsoft.EntityFrameworkCore;
using Microsoft.EntityFrameworkCore.Design;
using OpenQA.Selenium;
using OpenQA.Selenium.Chrome;
using OpenQA.Selenium.Support.UI;
using UWLib;

namespace UWScraper
{
    /// <summary>
    /// Crawls pages with a Selenium <see cref="ChromeDriver"/>, following
    /// <c>vvz_sub.html</c> (path) and <c>course.html</c> (course) links, and stores the
    /// lectures and lecture events it finds through a <see cref="LectureContext"/>.
    /// </summary>
    /// <remarks>
    /// All state is static and unsynchronized; the class is written for a single caller.
    /// </remarks>
    internal static class Scraper
    {
        /// <summary>Progress snapshot passed to <see cref="Init"/> and <see cref="Scraped"/>.</summary>
        public class ScrapedEventArgs : EventArgs
        {
            /// <summary>Number of URLs that count as already checked.</summary>
            public int Scraped { get; set; }

            /// <summary>Number of URLs still waiting in the work queue.</summary>
            public int ToScrape { get; set; }
        }

        /// <summary>Query parameters dropped by <see cref="SanitizeUrl"/>.</summary>
        static readonly string[] ignoredQueryKeys = ["from", "to", "details"];

        // Only membership tests and Count are needed, so a set avoids a linear scan per discovered link.
        static readonly HashSet<string> checkedUrls = new(StringComparer.Ordinal);

        // Work queue: course links are inserted at the front, path links appended at the end.
        static readonly List<string> urlsToCheck = [];

        static bool refreshLectures = false;
        static uint timeout = 2;
        static string semester = "";
        static LectureContext db = new("lecture.db");

        // Created lazily so that merely touching the class (e.g. calling SanitizeUrl or
        // subscribing to an event) neither launches a browser nor risks a TypeInitializationException.
        static readonly Lazy<ChromeDriver> driver = new(CreateDriver);

        public delegate void ScrapedEventHandler(ScrapedEventArgs e);

        // NOTE(review): not referenced in this file; Init below is declared with ScrapedEventHandler.
        public delegate void InitEventHandler();

        /// <summary>Raised after every processed URL with the current progress.</summary>
        public static event ScrapedEventHandler? Scraped;

        /// <summary>Raised once, right before the scrape loop starts, with the initial progress.</summary>
        public static event ScrapedEventHandler? Init;

        /// <summary>Shuts the browser down if one has been started; otherwise does nothing.</summary>
        internal static void QuitDriver()
        {
            if (driver.IsValueCreated)
            {
                driver.Value.Quit();
            }
        }

        /// <summary>Starts the Chrome driver with its command prompt window hidden.</summary>
        private static ChromeDriver CreateDriver()
        {
            var service = ChromeDriverService.CreateDefaultService();
            service.HideCommandPromptWindow = true;

            var options = new ChromeOptions();
            // Headless mode is currently switched off: options.AddArgument("--headless");
            return new ChromeDriver(service, options);
        }

        /// <summary>
        /// Runs a scrape: opens the database at <c>settings.DatabasePath</c>, builds the work queue
        /// and processes it until it is empty, raising <see cref="Init"/> once and
        /// <see cref="Scraped"/> after every URL.
        /// </summary>
        /// <param name="settings">Command settings controlling database, timeout, semester and start URL.</param>
        internal static void Scrape(ScrapeCommand.Settings settings)
        {
            // Release the context that is about to be replaced instead of leaking it.
            db.Dispose();
            db = new LectureContext(settings.DatabasePath);
            timeout = settings.Timeout;
            refreshLectures = settings.RefreshLectures;

            // Start from a clean slate so a second call does not inherit the previous run's state.
            checkedUrls.Clear();
            urlsToCheck.Clear();

            if (settings.ClearLinksToScrape)
            {
                db.LinksToScrape.RemoveRange(db.LinksToScrape.ToList());
                db.SaveChanges();
            }

            if (settings.Semester != null)
            {
                semester = settings.Semester;
            }

            if (refreshLectures)
            {
                // Refresh mode: revisit exactly the lectures that are already stored.
                urlsToCheck.AddRange(db.Lectures.Select(x => x.Url).ToList());
            }
            else
            {
                LoadPendingLinks(settings);
            }

            // Fail fast: Navigate deliberately survives per-page errors, so a browser that cannot
            // start must surface here rather than be logged once per URL.
            _ = driver.Value;

            Console.CursorVisible = false;
            Init?.Invoke(CurrentProgress());

            while (urlsToCheck.Count > 0)
            {
                ScrapeUrl(urlsToCheck[0]);
                Scraped?.Invoke(CurrentProgress());
            }
        }

        /// <summary>Builds the event payload from the current queue state.</summary>
        private static ScrapedEventArgs CurrentProgress() =>
            new() { Scraped = checkedUrls.Count, ToScrape = urlsToCheck.Count };

        /// <summary>
        /// Fills the queue from the persisted links, drops everything scraped within the last
        /// <c>settings.RescrapeHours</c> hours and falls back to the start URL when nothing is left.
        /// </summary>
        private static void LoadPendingLinks(ScrapeCommand.Settings settings)
        {
            urlsToCheck.AddRange(db.LinksToScrape.OrderBy(x => x.Sort).Select(x => x.Url).ToList());

            // The setting is expressed in hours; the previous AddDays call widened the window 24-fold.
            DateTime threshold = DateTime.Now.AddHours(-settings.RescrapeHours);
            checkedUrls.UnionWith(db.ScrapedLinks.Where(x => x.LastScrape > threshold).Select(x => x.Url).ToList());

            urlsToCheck.RemoveAll(checkedUrls.Contains);

            if (urlsToCheck.Count == 0)
            {
                // Nothing pending: start at the entry page (course catalogue).
                urlsToCheck.Add(BuildStartUrl(settings));
            }
        }

        /// <summary>
        /// Returns the URL to start from. When the start URL is still the declared default and a
        /// semester is configured, the semester is appended as a query parameter.
        /// </summary>
        private static string BuildStartUrl(ScrapeCommand.Settings settings)
        {
            Type settingsType = settings.GetType();

            // NOTE(review): the default is presumably declared on the StartUrl property; the type-level
            // lookup is kept as a fallback because that is where the code looked before.
            DefaultValueAttribute? defaultValueAttr =
                settingsType.GetProperty(nameof(settings.StartUrl))?.GetCustomAttribute<DefaultValueAttribute>()
                ?? settingsType.GetCustomAttribute<DefaultValueAttribute>();

            bool isDefaultStartUrl = defaultValueAttr != null
                && settings.StartUrl == defaultValueAttr.Value?.ToString();

            if (isDefaultStartUrl && !string.IsNullOrEmpty(semester))
            {
                // The former "+?semester=" glued a literal '+' onto the path.
                return $"{settings.StartUrl}?semester={Uri.EscapeDataString(semester)}";
            }

            return settings.StartUrl;
        }

        /// <summary>Resolves <paramref name="relativeUrl"/> against <paramref name="baseUrl"/> and sanitizes the result.</summary>
        static string GetAbsoluteUrl(string relativeUrl, string baseUrl)
        {
            var resolved = new Uri(new Uri(baseUrl), relativeUrl);

            // href values are taken from HTML, so entities such as &amp; are decoded here.
            return SanitizeUrl(HttpUtility.HtmlDecode(resolved.AbsoluteUri));
        }

        /// <summary>
        /// Brings a URL into a canonical form: fragment removed, the query parameters
        /// <c>from</c>, <c>to</c> and <c>details</c> dropped, the remaining ones sorted by key.
        /// </summary>
        /// <param name="url">Absolute URL.</param>
        /// <returns>The canonical URL; without a <c>?</c> when no parameter remains.</returns>
        /// <exception cref="UriFormatException"><paramref name="url"/> is not a valid absolute URI.</exception>
        public static string SanitizeUrl(string url)
        {
            Uri uri = new(url);
            string result = uri.GetLeftPart(UriPartial.Path);

            if (string.IsNullOrWhiteSpace(uri.Query))
            {
                return result;
            }

            var query = HttpUtility.ParseQueryString(uri.Query);
            List<string> parts = [];

            // Ordinal ordering keeps the canonical form independent of the machine's culture.
            foreach (string? key in query.AllKeys
                         .Where(k => !ignoredQueryKeys.Contains(k))
                         .OrderBy(k => k, StringComparer.Ordinal))
            {
                foreach (string value in query.GetValues(key) ?? Array.Empty<string>())
                {
                    // ParseQueryString decodes, so values are escaped again on the way out.
                    // A null key is a bare token ("?flag"), which is written back without '='.
                    parts.Add(key is null
                        ? Uri.EscapeDataString(value)
                        : $"{Uri.EscapeDataString(key)}={Uri.EscapeDataString(value)}");
                }
            }

            return parts.Count == 0 ? result : $"{result}?{string.Join('&', parts)}";
        }

        /// <summary>
        /// Processes one URL and records it as scraped. The URL is marked as done even when
        /// <see cref="Navigate"/> skipped it or failed, so the queue always shrinks.
        /// </summary>
        private static void ScrapeUrl(string source)
        {
            Navigate(source);
            RefreshScrapedLink(source);
            RemoveLinkToScrape(source);
            db.SaveChanges();

            checkedUrls.Add(source);
            urlsToCheck.Remove(source);
        }

        /// <summary>True when <paramref name="candidate"/> names a semester other than the configured one.</summary>
        private static bool IsOtherSemester(string? candidate) =>
            !string.IsNullOrEmpty(candidate) && semester != "all" && semester != candidate;

        /// <summary>
        /// Loads <paramref name="source"/> in the browser, creates or updates the lecture when it is
        /// a course page and, unless refreshing, collects further links. Best effort: any failure is
        /// logged through <see cref="System.Diagnostics.Trace"/> and the page is skipped.
        /// </summary>
        private static void Navigate(string source)
        {
            try
            {
                if (IsOtherSemester(GetSemesterOfUrl(source)))
                {
                    return;
                }

                ChromeDriver browser = driver.Value;
                browser.Navigate().GoToUrl(source);

                // Wait until the page shows at least one of the elements the scraper works with.
                WebDriverWait wait = new(browser, TimeSpan.FromSeconds(timeout));
                wait.Until(d =>
                    d.FindElements(By.XPath("//a[starts-with(@href, 'vvz_sub.html')]")).Count > 0
                    || d.FindElements(By.XPath("//a[starts-with(@href, 'course.html')]")).Count > 0
                    || d.FindElements(By.XPath("//h1/*/*[@class='what']")).Count > 0);

                HtmlDocument doc = new();
                doc.LoadHtml(browser.PageSource);
                var root = doc.DocumentNode;

                // The page may state its semester itself; skip it when it is not the requested one.
                var semesterNode = root.SelectSingleNode("/html/body/main/nav/span[@class='current']");
                if (semesterNode != null && IsOtherSemester(semesterNode.InnerText))
                {
                    return;
                }

                if (source.Contains("course.html"))
                {
                    CreateLecture(source, root);
                }

                if (!refreshLectures)
                {
                    FindPathLinks(source, root);
                    FindCourseLinks(source, root);
                }
            }
            catch (Exception ex)
            {
                // Deliberately non-fatal (timeouts and unparsable pages are expected), but no longer silent.
                System.Diagnostics.Trace.TraceWarning(
                    $"Scraping '{source}' failed: {ex.GetType().Name}: {ex.Message}");
            }
        }

        /// <summary>Returns the value of the <c>semester</c> query parameter, or an empty string when absent.</summary>
        private static string GetSemesterOfUrl(string source)
        {
            var query = HttpUtility.ParseQueryString(new Uri(source).Query);
            return query.AllKeys.Contains("semester")
                ? query["semester"] ?? string.Empty
                : string.Empty;
        }

        /// <summary>Sets the last-scrape timestamp of <paramref name="source"/> to now, creating the record when missing.</summary>
        private static void RefreshScrapedLink(string source)
        {
            var scrapedLink = db.ScrapedLinks.Find(source);
            if (scrapedLink == null)
            {
                scrapedLink = new ScrapedLink { Url = source };
                db.ScrapedLinks.Add(scrapedLink);
            }

            scrapedLink.LastScrape = DateTime.Now;
        }

        /// <summary>
        /// Queues <paramref name="url"/> (persisted with <paramref name="sort"/>) unless it is already
        /// checked or queued.
        /// </summary>
        /// <param name="atFront">True to process the URL next, false to append it to the queue.</param>
        private static void Enqueue(string url, int sort, bool atFront)
        {
            if (checkedUrls.Contains(url) || urlsToCheck.Contains(url))
            {
                return;
            }

            AddLinkToScrape(url, sort);

            if (atFront)
            {
                urlsToCheck.Insert(0, url);
            }
            else
            {
                urlsToCheck.Add(url);
            }
        }

        /// <summary>Queues course links (<c>course.html</c>) at the front of the queue with sort value 0.</summary>
        private static void FindCourseLinks(string source, HtmlNode root)
        {
            var courseLinkParents = root.SelectNodes("//a[starts-with(@href, 'course.html')]/..");
            if (courseLinkParents == null)
            {
                return;
            }

            foreach (var parent in courseLinkParents)
            {
                // NOTE(review): only the first matching link of each parent element is followed.
                var link = parent.SelectSingleNode("a[contains(@href, 'course.html')]");
                Enqueue(GetAbsoluteUrl(link.GetAttributeValue("href", ""), source), 0, atFront: true);
            }
        }

        /// <summary>Queues path links (<c>vvz_sub.html</c>) at the end of the queue with sort value 1.</summary>
        private static void FindPathLinks(string source, HtmlNode root)
        {
            var pathLinks = root.SelectNodes("//a[starts-with(@href, 'vvz_sub.html')]");
            if (pathLinks == null)
            {
                return;
            }

            foreach (var link in pathLinks)
            {
                Enqueue(GetAbsoluteUrl(link.GetAttributeValue("href", ""), source), 1, atFront: false);
            }
        }

        /// <summary>Deletes the persisted pending-link record of <paramref name="url"/>, if there is one.</summary>
        private static void RemoveLinkToScrape(string url)
        {
            var link = db.LinksToScrape.Find(url);
            if (link != null)
            {
                db.LinksToScrape.Remove(link);
            }
        }

        /// <summary>Persists <paramref name="url"/> as pending unless a record for it already exists.</summary>
        private static void AddLinkToScrape(string url, int sort)
        {
            if (db.LinksToScrape.Find(url) == null)
            {
                db.LinksToScrape.Add(new LinkToScrape { Url = url, Sort = sort });
            }
        }

        /// <summary>
        /// Creates or updates the lecture shown on a course page. The lecture is identified by its id
        /// (the <c>lv</c> query parameter, otherwise the number printed in the title) and its semester
        /// (the title's "when" element, otherwise the <c>semester</c> query parameter).
        /// </summary>
        /// <exception cref="FormatException">The id or, via the events, a date/time cannot be parsed.</exception>
        private static void CreateLecture(string source, HtmlNode root)
        {
            // NOTE(review): the original carried an "ingoreLectureTypes" marker here; no filtering by
            // lecture type is implemented.
            var branch = root.SelectSingleNode("//*[@class='spl']");
            var number = root.SelectSingleNode("//*[@class='title']//*[@class='number']");
            var type = root.SelectSingleNode("//*[@class='title']//*[@class='type']");
            var what = root.SelectSingleNode("//*[@class='title']//*[@class='what']");
            var when = root.SelectSingleNode("//*[@class='title']//*[@class='when']");
            var info = root.SelectSingleNode("//*[@class='info list']");
            var events = root.SelectNodes("//ul[@class='classes events list']/li");

            var query = HttpUtility.ParseQueryString(new Uri(source).Query);

            // The query parameter has always taken precedence; checking it first means an unparsable
            // on-page number no longer aborts a lecture whose id is known from the URL.
            int id = 0;
            if (query.AllKeys.Contains("lv"))
            {
                id = int.Parse(query["lv"] ?? "0", CultureInfo.InvariantCulture);
            }
            else if (number != null)
            {
                id = int.Parse(number.InnerText, CultureInfo.InvariantCulture);
            }

            string lectureSemester = string.Empty;
            if (when != null)
            {
                lectureSemester = when.InnerText;
            }
            else if (query.AllKeys.Contains("semester"))
            {
                lectureSemester = query["semester"] ?? "";
            }

            Lecture? lecture = db.Lectures
                .Include(x => x.Events)
                .FirstOrDefault(x => x.Id == id && x.Semester == lectureSemester);

            if (lecture == null)
            {
                lecture = new Lecture { Id = id, Semester = lectureSemester };
                db.Lectures.Add(lecture);
            }

            lecture.Url = source;

            if (branch != null)
            {
                lecture.Branch = branch.InnerText;
            }

            if (what != null)
            {
                lecture.Title = what.InnerText;
            }

            if (info != null)
            {
                lecture.Description = info.InnerHtml;
            }

            if (type != null)
            {
                lecture.Type = type.GetAttributeValue("title", "");
            }

            if (events != null)
            {
                CreateLectureEvents(events, lecture);
            }
        }

        /// <summary>
        /// Replaces the events of <paramref name="lecture"/> with the ones parsed from
        /// <paramref name="events"/>. Everything is parsed before the stored events are touched, so a
        /// malformed entry leaves the existing events intact.
        /// </summary>
        /// <exception cref="FormatException">The semester does not start with a year, or a date/time is malformed.</exception>
        private static void CreateLectureEvents(HtmlNodeCollection events, Lecture lecture)
        {
            // The first four characters of the semester are taken as the calendar year of the dates.
            if (lecture.Semester.Length < 4
                || !int.TryParse(lecture.Semester[..4], NumberStyles.Integer, CultureInfo.InvariantCulture, out int year))
            {
                throw new FormatException($"Semester '{lecture.Semester}' does not start with a four-digit year.");
            }

            string yearText = year.ToString("D4", CultureInfo.InvariantCulture);
            List<LectureEvent> parsed = new(events.Count);

            foreach (var item in events)
            {
                LectureEvent lectureEvent = new() { Lecture = lecture };

                var day = item.SelectSingleNode("*[@class='date']");
                var time = item.SelectSingleNode("*[@class='time']");
                var room = item.SelectSingleNode("*[@class='room']");

                DateTime date = new();
                if (day != null)
                {
                    // NOTE(review): every date gets the semester's first year; if a semester can span a
                    // year change, dates after the turn of the year end up one year too early - confirm.
                    date = DateTime.ParseExact(day.InnerText.Trim() + yearText, "dd.MM.yyyy", CultureInfo.InvariantCulture);
                }

                if (time != null)
                {
                    var times = time.InnerText.Split(" - ", StringSplitOptions.TrimEntries);
                    if (times.Length < 2)
                    {
                        throw new FormatException($"Time range '{time.InnerText}' is not of the form 'hh:mm - hh:mm'.");
                    }

                    var from = TimeSpan.ParseExact(times[0], "hh\\:mm", CultureInfo.InvariantCulture);
                    var to = TimeSpan.ParseExact(times[1], "hh\\:mm", CultureInfo.InvariantCulture);
                    lectureEvent.From = date.Add(from);
                    lectureEvent.To = date.Add(to);
                }

                if (room != null)
                {
                    lectureEvent.Location = room.InnerText;
                }

                parsed.Add(lectureEvent);
            }

            // Only now, with all entries parsed successfully, replace what is stored.
            db.RemoveRange(lecture.Events);
            lecture.Events.Clear();
            foreach (var lectureEvent in parsed)
            {
                lecture.Events.Add(lectureEvent);
            }
        }
    }
}