Lectureplan/UWScraper/Program.cs
Robin Weichselbraun 03ddeba846 Performance update with HTMLAgilityPack
Added Branches
Added LinksToScrape
Added ScrapedLinks
2024-10-12 17:36:23 +02:00

319 lines
9.0 KiB
C#

using System.Collections.Generic;
using System.Globalization;
using System.Net.NetworkInformation;
using System.Text.Encodings.Web;
using System.Web;
using System.Xml.Linq;
using HtmlAgilityPack;
using OpenQA.Selenium;
using OpenQA.Selenium.BiDi.Modules.Script;
using OpenQA.Selenium.Chrome;
using OpenQA.Selenium.DevTools.V127.Target;
using OpenQA.Selenium.Support.UI;
using UWLib;
using static Microsoft.EntityFrameworkCore.DbLoggerCategory;
/// <summary>
/// Console crawler for the u:find course directory: walks the directory pages with a
/// Chrome driver, queues the discovered sub-directory and lecture pages, and stores the
/// scraped lectures together with their events through <see cref="LectureContext"/>.
/// </summary>
internal class Program
{
    /// <summary>URLs that count as scraped (this run, or within the last 24 hours when resuming).</summary>
    static readonly HashSet<string> checkedUrls = new();

    /// <summary>Work queue; the element at index 0 is scraped next.</summary>
    static readonly List<string> urlsToCheck = [];

    /// <summary>When set, only already known lecture pages are re-scraped and no new links are followed.</summary>
    static bool refreshLectures = false;

    static readonly LectureContext db = new();
    static ChromeDriver? driver = null;

    /// <summary>
    /// Winter-term events whose month lies before this month are dated in the calendar year
    /// after the term's start year. Heuristic — adjust if the site lists pre-term events.
    /// </summary>
    const int WinterTermRolloverBeforeMonth = 9;

    /// <summary>
    /// Entry point. Without arguments the persisted queue is resumed (or seeded with the
    /// directory root); with <c>-r</c> / <c>--refresh-lectures</c> all stored lectures are re-scraped.
    /// The crawl stops at the first exception and prints the offending URL.
    /// </summary>
    /// <param name="args">Command line arguments.</param>
    private static void Main(string[] args)
    {
        // Clean-up runs on every exit path, including the early return after an error.
        AppDomain.CurrentDomain.ProcessExit += (_, _) => Done();

        var service = ChromeDriverService.CreateDefaultService();
        service.HideCommandPromptWindow = true;
        var options = new ChromeOptions();
        //options.AddArgument("--headless");
        driver = new ChromeDriver(service, options);

        if (args.Any(x => x == "-r" || x == "--refresh-lectures"))
        {
            refreshLectures = true;
            Console.WriteLine("Refreshing lectures");
        }

        if (args.Length == 0)
        {
            // Restore the persisted queue in priority order: lecture pages are stored with
            // Sort 0 and directory pages with Sort 1, mirroring the in-memory queue where
            // lecture links are put at the front.
            urlsToCheck.AddRange([.. db.LinksToScrape.OrderBy(x => x.Sort).Select(x => x.Url)]);

            // Everything scraped within the last 24 hours counts as already checked.
            var cutoff = DateTime.Now.AddDays(-1);
            checkedUrls.UnionWith([.. db.ScrapedLinks.Where(x => x.LastScrape > cutoff).Select(x => x.Url)]);
            urlsToCheck.RemoveAll(checkedUrls.Contains);

            if (urlsToCheck.Count == 0)
            {
                // Nothing pending: start from the directory root (Vorlesungsverzeichnis).
                urlsToCheck.Add("https://ufind.univie.ac.at/de/vvz.html");
            }
        }

        if (refreshLectures)
        {
            urlsToCheck.AddRange([.. db.Lectures.Select(x => x.Url)]);
        }

        Console.CursorVisible = false;
        int top = Console.CursorTop;
        while (urlsToCheck.Count > 0)
        {
            var url = urlsToCheck[0];
            try
            {
                FindUrls(url);
            }
            catch (Exception e)
            {
                Console.Clear();
                Console.WriteLine($"Url: {url}");
                Console.WriteLine();
                Console.WriteLine(e);
                return;
            }

            // Redraw the two progress lines in place.
            Console.CursorLeft = 0;
            Console.CursorTop = top;
            Console.WriteLine($"Urls checked: {checkedUrls.Count}");
            Console.WriteLine($"Urls to check: {urlsToCheck.Count}");
        }
        Console.WriteLine("Done");
    }

    /// <summary>
    /// Process-exit handler: closes the browser, restores the cursor and waits for a key
    /// press so the console output stays readable.
    /// </summary>
    private static void Done()
    {
        driver?.Quit();
        Console.CursorVisible = true;
        Console.ReadLine();
    }

    /// <summary>
    /// Resolves an <c>href</c> attribute value against the URL of the page it was found on.
    /// </summary>
    /// <param name="relativeUrl">Raw attribute value; may still contain HTML entities such as <c>&amp;amp;</c>.</param>
    /// <param name="baseUrl">Absolute URL of the page containing the link.</param>
    /// <returns>The absolute URL of the link target.</returns>
    static string GetAbsoluteUrl(string relativeUrl, string baseUrl)
    {
        // Decode entities before resolving, so decoded characters are escaped by Uri
        // instead of being left raw in the result.
        var decodedHref = HttpUtility.HtmlDecode(relativeUrl);
        var baseUri = new Uri(baseUrl);
        return new Uri(baseUri, decodedHref).AbsoluteUri;
    }

    /// <summary>
    /// Loads <paramref name="source"/> in the browser, stores the lecture if it is a course
    /// page, queues the links found on it (unless only refreshing) and marks the page as scraped.
    /// </summary>
    /// <param name="source">Absolute URL of the page to scrape.</param>
    /// <exception cref="InvalidOperationException">The Chrome driver has not been created.</exception>
    private static void FindUrls(string source)
    {
        var browser = driver ?? throw new InvalidOperationException("The Chrome driver has not been started.");
        browser.Navigate().GoToUrl(source);

        // Wait until the page shows directory links, course links or a course title.
        // A timeout propagates to Main, which stops the crawl.
        WebDriverWait wait = new(browser, TimeSpan.FromSeconds(2));
        wait.Until(d => d.FindElements(By.XPath("//a[starts-with(@href, 'vvz_sub.html')]")).Count > 0 ||
            d.FindElements(By.XPath("//a[starts-with(@href, 'course.html')]")).Count > 0 ||
            d.FindElements(By.XPath("//h1/*/*[@class='what']")).Count > 0
        );

        HtmlDocument doc = new();
        doc.LoadHtml(browser.PageSource);
        var root = doc.DocumentNode;

        if (source.Contains("course.html", StringComparison.Ordinal))
        {
            CreateLecture(source, root);
        }
        if (!refreshLectures)
        {
            FindPathLinks(source, root);
            FindCourseLinks(source, root);
        }

        RefreshScrapedLink(source);
        RemoveLinkToScrape(source);
        db.SaveChanges();

        checkedUrls.Add(source);
        // Drop every queued occurrence so the page is not scraped a second time.
        urlsToCheck.RemoveAll(queued => queued == source);
    }

    /// <summary>Creates or updates the scrape record of <paramref name="source"/> with the current time.</summary>
    /// <param name="source">URL that has just been scraped.</param>
    private static void RefreshScrapedLink(string source)
    {
        var scrapedLink = db.ScrapedLinks.Find(source);
        if (scrapedLink is null)
        {
            scrapedLink = new ScrapedLink { Url = source };
            db.ScrapedLinks.Add(scrapedLink);
        }
        scrapedLink.LastScrape = DateTime.Now;
    }

    /// <summary>
    /// Queues the course pages linked from the page that are marked with an
    /// <c>abbr</c> element whose title contains "Vorlesung". They are placed at the front of the queue.
    /// </summary>
    /// <param name="source">URL of the page the links were found on.</param>
    /// <param name="root">Document node of the loaded page.</param>
    private static void FindCourseLinks(string source, HtmlNode root)
    {
        var courseLinkParents = root.SelectNodes("//a[starts-with(@href, 'course.html')]/..");
        if (courseLinkParents is null)
        {
            return;
        }

        foreach (var parent in courseLinkParents)
        {
            if (parent.SelectSingleNode("abbr[contains(@title,'Vorlesung')]") is null)
            {
                continue;
            }
            var link = parent.SelectSingleNode("a[contains(@href, 'course.html')]");
            if (link is null)
            {
                continue;
            }

            var url = GetAbsoluteUrl(link.GetAttributeValue("href", ""), source);
            if (url == source || checkedUrls.Contains(url))
            {
                continue;
            }

            // The same course can be listed under several directory paths; go through the
            // existence check instead of adding blindly, which would produce a duplicate key.
            AddLinkToScrape(url, 0);

            // Move (not duplicate) the URL to the front of the queue.
            urlsToCheck.Remove(url);
            urlsToCheck.Insert(0, url);
        }
    }

    /// <summary>Queues the sub-directory pages (<c>vvz_sub.html</c>) linked from the page at the end of the queue.</summary>
    /// <param name="source">URL of the page the links were found on.</param>
    /// <param name="root">Document node of the loaded page.</param>
    private static void FindPathLinks(string source, HtmlNode root)
    {
        var pathLinks = root.SelectNodes("//a[starts-with(@href, 'vvz_sub.html')]");
        if (pathLinks is null)
        {
            return;
        }

        foreach (var link in pathLinks)
        {
            var url = GetAbsoluteUrl(link.GetAttributeValue("href", ""), source);
            if (url == source || checkedUrls.Contains(url))
            {
                continue;
            }

            AddLinkToScrape(url, 1);
            // Linear search is fine here: its cost is negligible next to a page load.
            if (!urlsToCheck.Contains(url))
            {
                urlsToCheck.Add(url);
            }
        }
    }

    /// <summary>Removes <paramref name="url"/> from the persisted queue if it is stored there.</summary>
    /// <param name="url">URL to remove.</param>
    private static void RemoveLinkToScrape(string url)
    {
        var link = db.LinksToScrape.Find(url);
        if (link is not null)
        {
            db.LinksToScrape.Remove(link);
        }
    }

    /// <summary>Adds <paramref name="url"/> to the persisted queue unless it is already stored or pending.</summary>
    /// <param name="url">URL to persist.</param>
    /// <param name="sort">Priority value stored with the link (this file uses 0 for lecture pages, 1 for directory pages).</param>
    private static void AddLinkToScrape(string url, int sort)
    {
        if (db.LinksToScrape.Find(url) is null)
        {
            db.LinksToScrape.Add(new LinkToScrape { Url = url, Sort = sort });
        }
    }

    /// <summary>
    /// Creates the lecture stored under <paramref name="source"/> or updates the existing one
    /// with the branch, title, description and events found on the page.
    /// </summary>
    /// <param name="source">Absolute URL of the course page; its <c>lv</c> query value becomes the id of a new lecture.</param>
    /// <param name="root">Document node of the loaded course page.</param>
    private static void CreateLecture(string source, HtmlNode root)
    {
        var branch = root.SelectSingleNode("/html/body/main/div[1]/div[1]/a");
        var what = root.SelectSingleNode("//h1/*/*[@class='what']");
        var when = root.SelectSingleNode("//h1/*/*[@class='when']");
        var info = root.SelectSingleNode("//*[@class='info list']");
        var events = root.SelectNodes("//ul[@class='classes events list']/li");

        Lecture? lecture = db.Lectures.FirstOrDefault(l => l.Url == source);
        bool isNew = lecture is null;
        if (lecture is null)
        {
            lecture = new Lecture();
            var query = HttpUtility.ParseQueryString(new Uri(source).Query);
            if (query.AllKeys.Contains("lv"))
            {
                lecture.Id = int.Parse(query["lv"] ?? "0", CultureInfo.InvariantCulture);
            }
            if (when != null)
            {
                lecture.Semester = when.InnerText;
            }
            lecture.Url = source;
            db.Lectures.Add(lecture);
        }

        if (branch != null)
        {
            lecture.Branch = branch.InnerText;
        }
        if (what != null)
        {
            lecture.Title = what.InnerText;
        }
        if (info != null)
        {
            lecture.Description = info.InnerHtml;
        }
        if (events != null)
        {
            if (!isNew)
            {
                // The query above does not load related rows. Load the stored events explicitly
                // so they can be replaced even if lazy loading is not configured
                // (NOTE(review): assumes Events is a mapped collection navigation — confirm).
                db.Entry(lecture).Collection(l => l.Events).Load();
            }
            CreateLectureEvents(events, lecture);
        }
    }

    /// <summary>
    /// Replaces the events of <paramref name="lecture"/> with the ones listed on the course page.
    /// </summary>
    /// <param name="events">The <c>li</c> nodes of the page's event list.</param>
    /// <param name="lecture">Lecture whose events are replaced; its semester must start with a four-digit year.</param>
    /// <exception cref="InvalidOperationException">The semester of the lecture does not start with a four-digit year.</exception>
    private static void CreateLectureEvents(HtmlNodeCollection events, Lecture lecture)
    {
        var semester = lecture.Semester;
        if (semester is null || semester.Length < 4 ||
            !int.TryParse(semester[..4], NumberStyles.None, CultureInfo.InvariantCulture, out int year))
        {
            throw new InvalidOperationException(
                $"Cannot derive the year from semester '{semester}' of lecture '{lecture.Url}'.");
        }

        // NOTE(review): assumes semester strings look like "2024W" / "2024S" — confirm.
        // A winter term runs into the next calendar year, so its early-year dates
        // must not be dated with the term's start year.
        bool winterTerm = semester.IndexOf('W', 4) >= 0;

        db.RemoveRange(lecture.Events);
        lecture.Events.Clear();

        foreach (var item in events)
        {
            LectureEvent lectureEvent = new();
            var day = item.SelectSingleNode("*[@class='date']");
            var time = item.SelectSingleNode("*[@class='time']");
            var room = item.SelectSingleNode("*[@class='room']");

            DateTime date = new();
            if (day != null)
            {
                // The page shows "dd.MM." without a year. Parse day and month against a leap
                // year first so "29.02." is accepted, then apply the real year.
                var dayAndMonth = DateTime.ParseExact(day.InnerText.Trim() + "2000", "dd.MM.yyyy", CultureInfo.InvariantCulture);
                int eventYear = winterTerm && dayAndMonth.Month < WinterTermRolloverBeforeMonth ? year + 1 : year;
                date = new DateTime(eventYear, dayAndMonth.Month, dayAndMonth.Day);
            }
            if (time != null)
            {
                // Expected form: "HH:mm - HH:mm".
                var times = time.InnerText.Split(" - ");
                var from = TimeSpan.ParseExact(times[0], "hh\\:mm", CultureInfo.InvariantCulture);
                var to = TimeSpan.ParseExact(times[1], "hh\\:mm", CultureInfo.InvariantCulture);
                lectureEvent.From = date.Add(from);
                lectureEvent.To = date.Add(to);
            }
            if (room != null)
            {
                lectureEvent.Location = room.InnerText;
            }
            lecture.Events.Add(lectureEvent);
        }
    }
}