using System.Collections.Generic;
using System.Globalization;
using System.Net.NetworkInformation;
using System.Text.Encodings.Web;
using System.Web;
using System.Xml.Linq;
using HtmlAgilityPack;
using OpenQA.Selenium;
using OpenQA.Selenium.BiDi.Modules.Script;
using OpenQA.Selenium.Chrome;
using OpenQA.Selenium.DevTools.V127.Target;
using OpenQA.Selenium.Support.UI;
using UWLib;
using static Microsoft.EntityFrameworkCore.DbLoggerCategory;

/// <summary>
/// Console crawler that drives a Chrome instance through the course directory at
/// ufind.univie.ac.at, follows <c>vvz_sub.html</c> and <c>course.html</c> links, and stores
/// lectures and their events through <see cref="LectureContext"/>.
/// </summary>
internal class Program
{
    /// <summary>Entry page used when the database holds no pending links.</summary>
    private const string StartUrl = "https://ufind.univie.ac.at/de/vvz.html";

    /// <summary>URLs already processed (or scraped within the last day); these are never queued again.</summary>
    private static readonly HashSet<string> checkedUrls = [];

    /// <summary>Work queue of URLs still to visit; the element at index 0 is processed next.</summary>
    private static readonly List<string> urlsToCheck = [];

    /// <summary>When true, only known lecture pages are re-scraped and no new links are collected.</summary>
    private static bool refreshLectures = false;

    private static readonly LectureContext db = new();
    private static ChromeDriver? driver = null;

    /// <summary>
    /// Starts Chrome, builds the initial work queue and processes it until it is empty or a page fails.
    /// </summary>
    /// <param name="args">
    /// <c>-r</c> / <c>--refresh-lectures</c> re-scrapes every stored lecture URL. With no arguments at all,
    /// the queue is seeded from the pending links stored in the database.
    /// </param>
    private static void Main(string[] args)
    {
        // Done() shuts the browser down and waits for a key press, also after an error exit.
        AppDomain.CurrentDomain.ProcessExit += (s, e) => { Done(); };

        var service = ChromeDriverService.CreateDefaultService();
        service.HideCommandPromptWindow = true;
        var options = new ChromeOptions();
        // Enable the next line to run Chrome without a visible window.
        //options.AddArgument("--headless");
        driver = new ChromeDriver(service, options);

        if (args.Any(x => x == "-r" || x == "--refresh-lectures"))
        {
            refreshLectures = true;
            Console.WriteLine("Refreshing lectures");
        }

        if (args.Length == 0)
        {
            urlsToCheck.AddRange(db.LinksToScrape.Select(x => x.Url).ToList());

            // Everything scraped within the last 24 hours counts as already checked ...
            checkedUrls.UnionWith(
                db.ScrapedLinks
                    .Where(x => x.LastScrape > DateTime.Now.AddDays(-1))
                    .Select(x => x.Url)
                    .ToList());

            // ... and is therefore dropped from the queue.
            urlsToCheck.RemoveAll(checkedUrls.Contains);

            if (urlsToCheck.Count == 0)
            {
                // Nothing pending: start at the course directory root (Vorlesungsverzeichnis).
                urlsToCheck.Add(StartUrl);
            }
        }

        if (refreshLectures)
        {
            urlsToCheck.AddRange(db.Lectures.Select(x => x.Url).ToList());
        }

        Console.CursorVisible = false;
        int top = Console.CursorTop;

        while (urlsToCheck.Count > 0)
        {
            var url = urlsToCheck[0];
            try
            {
                FindUrls(url);
            }
            catch (Exception e)
            {
                // Any failure aborts the run; the offending URL is shown with the exception.
                Console.Clear();
                Console.WriteLine($"Url: {url}");
                Console.WriteLine();
                Console.WriteLine(e);
                return;
            }

            // Rewrite the two progress lines in place.
            Console.CursorLeft = 0;
            Console.CursorTop = top;
            Console.WriteLine($"Urls checked: {checkedUrls.Count}");
            Console.WriteLine($"Urls to check: {urlsToCheck.Count}");
        }

        Console.WriteLine("Done");
    }

    /// <summary>
    /// Process-exit hook: quits the browser, restores the cursor and blocks on
    /// <see cref="Console.ReadLine()"/> until a line is entered.
    /// </summary>
    private static void Done()
    {
        driver?.Quit();
        Console.CursorVisible = true;
        Console.ReadLine();
    }

    /// <summary>
    /// Resolves <paramref name="relativeUrl"/> against <paramref name="baseUrl"/> and returns the
    /// HTML-decoded absolute URL.
    /// </summary>
    /// <param name="relativeUrl">The href value as read from the page.</param>
    /// <param name="baseUrl">Absolute URL of the page the link was found on.</param>
    /// <returns>The absolute URL with HTML entities decoded.</returns>
    private static string GetAbsoluteUrl(string relativeUrl, string baseUrl)
    {
        var resolved = new Uri(new Uri(baseUrl), relativeUrl);
        return HttpUtility.HtmlDecode(resolved.AbsoluteUri);
    }

    /// <summary>
    /// Loads <paramref name="source"/> in the browser, extracts lecture data and follow-up links,
    /// persists the changes and marks the URL as checked.
    /// </summary>
    /// <param name="source">Absolute URL of the page to scrape.</param>
    /// <exception cref="InvalidOperationException">The Chrome driver has not been created yet.</exception>
    private static void FindUrls(string source)
    {
        var browser = driver
            ?? throw new InvalidOperationException("The Chrome driver has not been initialised.");

        browser.Navigate().GoToUrl(source);

        // Wait (up to 2 seconds) until one of the expected elements is present.
        WebDriverWait wait = new(browser, TimeSpan.FromSeconds(2));
        wait.Until(d =>
            d.FindElements(By.XPath("//a[starts-with(@href, 'vvz_sub.html')]")).Count > 0 ||
            d.FindElements(By.XPath("//a[starts-with(@href, 'course.html')]")).Count > 0 ||
            d.FindElements(By.XPath("//h1/*/*[@class='what']")).Count > 0
        );

        HtmlDocument doc = new();
        doc.LoadHtml(browser.PageSource);
        var root = doc.DocumentNode;

        if (source.Contains("course.html"))
        {
            CreateLecture(source, root);
        }

        if (!refreshLectures)
        {
            FindPathLinks(source, root);
            FindCourseLinks(source, root);
        }

        RefreshScrapedLink(source);
        RemoveLinkToScrape(source);
        db.SaveChanges();

        checkedUrls.Add(source);
        // RemoveAll rather than Remove so that no stray duplicate causes a second visit.
        urlsToCheck.RemoveAll(queued => queued == source);
    }

    /// <summary>
    /// Sets the <c>LastScrape</c> timestamp of <paramref name="source"/> to now, creating the
    /// <see cref="ScrapedLink"/> record if it does not exist.
    /// </summary>
    private static void RefreshScrapedLink(string source)
    {
        var scrapedLink = db.ScrapedLinks.Find(source);
        if (scrapedLink == null)
        {
            scrapedLink = new ScrapedLink { Url = source };
            db.ScrapedLinks.Add(scrapedLink);
        }

        scrapedLink.LastScrape = DateTime.Now;
    }

    /// <summary>
    /// Queues every <c>course.html</c> link whose parent element also has an <c>abbr</c> child with
    /// a title containing "Vorlesung". These links go to the front of the queue.
    /// </summary>
    /// <param name="source">URL of the current page, used to resolve relative links.</param>
    /// <param name="root">Document node of the current page.</param>
    private static void FindCourseLinks(string source, HtmlNode root)
    {
        var courseLinkParents = root.SelectNodes("//a[starts-with(@href, 'course.html')]/..");
        if (courseLinkParents == null)
        {
            return;
        }

        foreach (var parent in courseLinkParents)
        {
            if (parent.SelectSingleNode("abbr[contains(@title,'Vorlesung')]") == null)
            {
                continue;
            }

            var link = parent.SelectSingleNode("a[contains(@href, 'course.html')]");
            var url = GetAbsoluteUrl(link.GetAttributeValue("href", ""), source);
            if (checkedUrls.Contains(url))
            {
                continue;
            }

            // Use the shared helper so a URL that is already tracked or stored is not added twice.
            AddLinkToScrape(url, 0);

            // Move to the front without leaving a second copy further back in the queue.
            urlsToCheck.Remove(url);
            urlsToCheck.Insert(0, url);
        }
    }

    /// <summary>
    /// Queues every <c>vvz_sub.html</c> link on the page that has not been checked yet,
    /// appending it to the end of the queue.
    /// </summary>
    /// <param name="source">URL of the current page, used to resolve relative links.</param>
    /// <param name="root">Document node of the current page.</param>
    private static void FindPathLinks(string source, HtmlNode root)
    {
        var pathLinks = root.SelectNodes("//a[starts-with(@href, 'vvz_sub.html')]");
        if (pathLinks == null)
        {
            return;
        }

        foreach (var link in pathLinks)
        {
            var url = GetAbsoluteUrl(link.GetAttributeValue("href", ""), source);
            if (checkedUrls.Contains(url))
            {
                continue;
            }

            AddLinkToScrape(url, 1);

            // Skip URLs that are already waiting; a duplicate would be visited twice.
            if (!urlsToCheck.Contains(url))
            {
                urlsToCheck.Add(url);
            }
        }
    }

    /// <summary>Removes the pending-link record for <paramref name="url"/>, if one exists.</summary>
    private static void RemoveLinkToScrape(string url)
    {
        var link = db.LinksToScrape.Find(url);
        if (link != null)
        {
            db.LinksToScrape.Remove(link);
        }
    }

    /// <summary>
    /// Adds a pending-link record for <paramref name="url"/> unless <c>Find</c> already returns one.
    /// </summary>
    /// <param name="url">Absolute URL to remember.</param>
    /// <param name="sort">Value stored in the record's <c>Sort</c> property.</param>
    private static void AddLinkToScrape(string url, int sort)
    {
        if (db.LinksToScrape.Find(url) == null)
        {
            db.LinksToScrape.Add(new LinkToScrape { Url = url, Sort = sort });
        }
    }

    /// <summary>
    /// Creates or updates the <see cref="Lecture"/> stored for <paramref name="source"/> from the page content.
    /// </summary>
    /// <remarks>
    /// <c>Id</c> (from the <c>lv</c> query parameter), <c>Semester</c> and <c>Url</c> are only set when the
    /// lecture is new; branch, title, description and events are overwritten whenever the corresponding
    /// node is present.
    /// </remarks>
    /// <param name="source">Absolute URL of the course page.</param>
    /// <param name="root">Document node of the course page.</param>
    private static void CreateLecture(string source, HtmlNode root)
    {
        var branch = root.SelectSingleNode("/html/body/main/div[1]/div[1]/a");
        var what = root.SelectSingleNode("//h1/*/*[@class='what']");
        var when = root.SelectSingleNode("//h1/*/*[@class='when']");
        var info = root.SelectSingleNode("//*[@class='info list']");
        var events = root.SelectNodes("//ul[@class='classes events list']/li");

        var query = HttpUtility.ParseQueryString(new Uri(source).Query);

        Lecture? lecture = db.Lectures.FirstOrDefault(x => x.Url == source);
        if (lecture == null)
        {
            lecture = new Lecture();
            if (query.AllKeys.Contains("lv"))
            {
                lecture.Id = int.Parse(query["lv"] ?? "0", CultureInfo.InvariantCulture);
            }

            if (when != null)
            {
                lecture.Semester = when.InnerText;
            }

            lecture.Url = source;
            db.Lectures.Add(lecture);
        }

        if (branch != null)
        {
            lecture.Branch = branch.InnerText;
        }

        if (what != null)
        {
            lecture.Title = what.InnerText;
        }

        if (info != null)
        {
            lecture.Description = info.InnerHtml;
        }

        if (events != null)
        {
            CreateLectureEvents(events, lecture);
        }
    }

    /// <summary>
    /// Replaces the events of <paramref name="lecture"/> with the ones listed in <paramref name="events"/>.
    /// </summary>
    /// <param name="events">The <c>li</c> nodes of the page's event list.</param>
    /// <param name="lecture">Lecture whose <c>Semester</c> starts with the four-digit year.</param>
    /// <exception cref="FormatException">
    /// The semester does not start with a four-digit year, or a date/time text is not in the expected format.
    /// </exception>
    private static void CreateLectureEvents(HtmlNodeCollection events, Lecture lecture)
    {
        var semester = lecture.Semester?.Trim();
        if (semester is not { Length: >= 4 }
            || !int.TryParse(semester[..4], NumberStyles.None, CultureInfo.InvariantCulture, out int year))
        {
            throw new FormatException(
                $"Cannot read a four-digit year from semester '{lecture.Semester}' of lecture {lecture.Url}.");
        }

        // NOTE(review): if Events is not loaded for an existing lecture (no Include / lazy loading),
        // old rows are not removed here - confirm how LectureContext loads this navigation.
        db.RemoveRange(lecture.Events);
        lecture.Events.Clear();

        foreach (var item in events)
        {
            LectureEvent lectureEvent = new();
            var day = item.SelectSingleNode("*[@class='date']");
            var time = item.SelectSingleNode("*[@class='time']");
            var room = item.SelectSingleNode("*[@class='room']");

            DateTime date = new();
            if (day != null)
            {
                // The page gives "dd.MM." without a year, so the semester's year is appended.
                // NOTE(review): presumably wrong for winter-semester dates that fall in the following
                // calendar year - verify against real data.
                date = DateTime.ParseExact(
                    day.InnerText.Trim() + year.ToString(CultureInfo.InvariantCulture),
                    "dd.MM.yyyy",
                    CultureInfo.InvariantCulture);
            }

            if (time != null)
            {
                var times = time.InnerText.Split(" - ");
                var from = TimeSpan.ParseExact(times[0].Trim(), "hh\\:mm", CultureInfo.InvariantCulture);
                var to = TimeSpan.ParseExact(times[1].Trim(), "hh\\:mm", CultureInfo.InvariantCulture);
                lectureEvent.From = date.Add(from);
                lectureEvent.To = date.Add(to);
            }

            if (room != null)
            {
                lectureEvent.Location = room.InnerText;
            }

            lecture.Events.Add(lectureEvent);
        }
    }
}