319 lines
9.0 KiB
C#
319 lines
9.0 KiB
C#
|
|
using System.Collections.Generic;
|
|
using System.Globalization;
|
|
using System.Net.NetworkInformation;
|
|
using System.Text.Encodings.Web;
|
|
using System.Web;
|
|
using System.Xml.Linq;
|
|
using HtmlAgilityPack;
|
|
using OpenQA.Selenium;
|
|
using OpenQA.Selenium.BiDi.Modules.Script;
|
|
using OpenQA.Selenium.Chrome;
|
|
using OpenQA.Selenium.DevTools.V127.Target;
|
|
using OpenQA.Selenium.Support.UI;
|
|
using UWLib;
|
|
using static Microsoft.EntityFrameworkCore.DbLoggerCategory;
|
|
|
|
|
|
internal class Program
|
|
{
|
|
// URLs that count as already handled: seeded in Main, appended to by FindUrls.
private static readonly List<string> checkedUrls = [];

// Queue of URLs still to visit; Main always processes the first entry.
private static readonly List<string> urlsToCheck = [];

// Set by Main when "-r" or "--refresh-lectures" is passed on the command line.
private static bool refreshLectures = false;

// Database context shared by every method of this class.
private static readonly LectureContext db = new();

// Browser session; created in Main and shut down in Done.
private static ChromeDriver? driver = null;
|
|
/// <summary>
/// Entry point: starts a Chrome session, builds the queue of URLs to visit and processes
/// it one page at a time while keeping a two-line progress display up to date.
/// </summary>
/// <param name="args">
/// Command-line arguments. "-r" or "--refresh-lectures" queues every stored lecture URL
/// and makes FindUrls skip link discovery. With no arguments at all the queue is resumed
/// from the database.
/// </param>
private static void Main(string[] args)
{
    // Done() closes the browser and restores the console when the process exits.
    AppDomain.CurrentDomain.ProcessExit += (s, e) => Done();

    var service = ChromeDriverService.CreateDefaultService();
    service.HideCommandPromptWindow = true;

    var options = new ChromeOptions();
    //options.AddArgument("--headless");
    driver = new ChromeDriver(service, options);

    if (args.Any(x => x == "-r" || x == "--refresh-lectures"))
    {
        refreshLectures = true;

        Console.WriteLine("Refreshing lectures");
    }

    if (args.Length == 0)
    {
        // Resume the persisted queue. Course links are stored with Sort 0 and path links
        // with Sort 1, so ordering by Sort keeps the "courses first" priority that the
        // in-memory queue uses (course links are inserted at the front).
        urlsToCheck.AddRange(db.LinksToScrape.OrderBy(x => x.Sort).Select(x => x.Url).ToList());

        // Everything scraped during the last 24 hours counts as already checked.
        checkedUrls.AddRange(db.ScrapedLinks.Where(x => x.LastScrape > DateTime.Now.AddDays(-1)).Select(x => x.Url).ToList());

        // Remove the recently scraped URLs from the queue; the set makes this a single pass.
        var recentlyChecked = new HashSet<string>(checkedUrls);
        urlsToCheck.RemoveAll(recentlyChecked.Contains);

        if (urlsToCheck.Count == 0)
        {
            // Nothing pending: start at the root of the course directory (Vorlesungsverzeichnis).
            urlsToCheck.Add("https://ufind.univie.ac.at/de/vvz.html");
        }
    }

    if (refreshLectures)
    {
        urlsToCheck.AddRange(db.Lectures.Select(x => x.Url).ToList());
    }

    Console.CursorVisible = false;

    // Row at which the progress display is redrawn after every page.
    int top = Console.CursorTop;

    while (urlsToCheck.Count > 0)
    {
        var url = urlsToCheck[0];

        try
        {
            FindUrls(url);
        }
        catch (Exception e)
        {
            // Any failure aborts the run; show which URL caused it.
            Console.Clear();
            Console.WriteLine($"Url: {url}");
            Console.WriteLine();
            Console.WriteLine(e);

            // Report the aborted run to the caller instead of exiting with 0.
            Environment.ExitCode = 1;
            return;
        }

        Console.CursorLeft = 0;
        Console.CursorTop = top;

        // The counters are left-aligned in a fixed width so that a shorter number
        // overwrites the digits left behind by a longer one from the previous redraw.
        Console.WriteLine($"Urls checked: {checkedUrls.Count,-10}");
        Console.WriteLine($"Urls to check: {urlsToCheck.Count,-10}");
    }

    Console.WriteLine("Done");
}
|
|
|
|
/// <summary>
/// Cleanup run from the process-exit handler: closes the browser, makes the console
/// cursor visible again and then waits for a line of input.
/// </summary>
private static void Done()
{
    try
    {
        driver?.Quit();
    }
    finally
    {
        // Main hides the cursor before its loop; restore it even when closing the
        // browser throws, so the console is not left without a cursor.
        Console.CursorVisible = true;
    }

    Console.ReadLine();
}
|
|
|
|
/// <summary>
/// Resolves <paramref name="relativeUrl"/> against <paramref name="baseUrl"/> and returns
/// the resulting absolute URL with HTML entities decoded.
/// </summary>
/// <param name="relativeUrl">Link target as found in the page, possibly still entity-encoded.</param>
/// <param name="baseUrl">Absolute URL of the page the link was found on.</param>
/// <returns>The HTML-decoded absolute URL.</returns>
static string GetAbsoluteUrl(string relativeUrl, string baseUrl)
{
    Uri resolved = new(new Uri(baseUrl), relativeUrl);
    string absolute = resolved.AbsoluteUri;

    // Decoding happens after resolution, on the final absolute string.
    return HttpUtility.HtmlDecode(absolute);
}
|
|
|
|
/// <summary>
/// Loads <paramref name="source"/> in the browser, extracts lecture data and follow-up
/// links from the page, saves the changes and moves the URL from the pending queue to
/// the checked list.
/// </summary>
/// <param name="source">URL of the page to scrape.</param>
/// <exception cref="InvalidOperationException">The browser session has not been created yet.</exception>
private static void FindUrls(string source)
{
    // Fail with a clear message instead of silently skipping the navigation and then
    // failing later inside WebDriverWait / LoadHtml because the driver is null.
    ChromeDriver browser = driver
        ?? throw new InvalidOperationException("The Chrome driver has not been started.");

    browser.Navigate().GoToUrl(source);

    // wait for the page to load: up to two seconds for a path link, a course link
    // or the lecture title element to appear
    WebDriverWait wait = new(browser, TimeSpan.FromSeconds(2));
    wait.Until(d =>
        d.FindElements(By.XPath("//a[starts-with(@href, 'vvz_sub.html')]")).Count > 0 ||
        d.FindElements(By.XPath("//a[starts-with(@href, 'course.html')]")).Count > 0 ||
        d.FindElements(By.XPath("//h1/*/*[@class='what']")).Count > 0);

    HtmlDocument doc = new();
    doc.LoadHtml(browser.PageSource);
    var root = doc.DocumentNode;

    if (source.Contains("course.html"))
    {
        CreateLecture(source, root);
    }

    // In refresh mode only the known lecture pages are revisited; no new links are collected.
    if (!refreshLectures)
    {
        FindPathLinks(source, root);
        FindCourseLinks(source, root);
    }

    RefreshScrapedLink(source);
    RemoveLinkToScrape(source);
    db.SaveChanges();

    // The in-memory bookkeeping is updated only after the database save succeeded.
    checkedUrls.Add(source);
    urlsToCheck.Remove(source);
}
|
|
|
|
/// <summary>
/// Sets the last-scrape time of the <c>ScrapedLinks</c> entry for <paramref name="source"/>
/// to the current local time, creating the entry first when none exists.
/// Nothing is saved here; the caller is responsible for saving.
/// </summary>
/// <param name="source">URL whose scrape timestamp is updated.</param>
private static void RefreshScrapedLink(string source)
{
    ScrapedLink? existing = db.ScrapedLinks.Find(source);
    ScrapedLink entry = existing ?? new ScrapedLink { Url = source };

    if (existing == null)
    {
        // First time this URL is seen: register the new entry with the context.
        db.ScrapedLinks.Add(entry);
    }

    entry.LastScrape = DateTime.Now;
}
|
|
|
|
/// <summary>
/// Queues the course pages linked from <paramref name="root"/> whose link has a sibling
/// <c>abbr</c> element with "Vorlesung" in its title. Unchecked URLs are stored with
/// sort value 0 and placed at the front of the queue.
/// </summary>
/// <param name="source">URL of the page being scraped; used to resolve relative links.</param>
/// <param name="root">Root node of the parsed page.</param>
private static void FindCourseLinks(string source, HtmlNode root)
{
    var courseLinkParents = root.SelectNodes("//a[starts-with(@href, 'course.html')]/..");
    if (courseLinkParents == null)
    {
        // No course links on this page.
        return;
    }

    foreach (var parent in courseLinkParents)
    {
        if (parent.SelectSingleNode("abbr[contains(@title,'Vorlesung')]") == null)
        {
            continue;
        }

        var link = parent.SelectSingleNode("a[contains(@href, 'course.html')]");
        var url = GetAbsoluteUrl(link.GetAttributeValue("href", ""), source);

        if (checkedUrls.Contains(url))
        {
            continue;
        }

        // Go through AddLinkToScrape, as FindPathLinks does, so that a URL which is
        // already stored or already added during this pass is not added a second time.
        AddLinkToScrape(url, 0);

        // Move the URL to the front instead of inserting a duplicate: a duplicate entry
        // would make the same page get scraped again later.
        urlsToCheck.Remove(url);
        urlsToCheck.Insert(0, url);
    }
}
|
|
|
|
/// <summary>
/// Queues the directory pages (<c>vvz_sub.html</c> links) found under
/// <paramref name="root"/>. Unchecked URLs are stored with sort value 1 and appended
/// to the end of the queue.
/// </summary>
/// <param name="source">URL of the page being scraped; used to resolve relative links.</param>
/// <param name="root">Root node of the parsed page.</param>
private static void FindPathLinks(string source, HtmlNode root)
{
    var pathLinks = root.SelectNodes("//a[starts-with(@href, 'vvz_sub.html')]");
    if (pathLinks == null)
    {
        // No path links on this page.
        return;
    }

    foreach (var link in pathLinks)
    {
        var url = GetAbsoluteUrl(link.GetAttributeValue("href", ""), source);

        if (checkedUrls.Contains(url))
        {
            continue;
        }

        AddLinkToScrape(url, 1);

        // Only queue the URL once. The checked list is consulted when a link is found,
        // not when it is dequeued, so a duplicate entry would be scraped a second time.
        if (!urlsToCheck.Contains(url))
        {
            urlsToCheck.Add(url);
        }
    }
}
|
|
|
|
/// <summary>
/// Removes the <c>LinksToScrape</c> entry for <paramref name="url"/> if one exists.
/// Nothing is saved here.
/// </summary>
/// <param name="url">URL whose pending entry is removed.</param>
private static void RemoveLinkToScrape(string url)
{
    if (db.LinksToScrape.Find(url) is { } pending)
    {
        db.LinksToScrape.Remove(pending);
    }
}
|
|
|
|
/// <summary>
/// Adds a <c>LinksToScrape</c> entry for <paramref name="url"/> unless one can already
/// be found. Nothing is saved here.
/// </summary>
/// <param name="url">URL to remember for a later scrape.</param>
/// <param name="sort">Sort value stored with the new entry.</param>
private static void AddLinkToScrape(string url, int sort)
{
    bool alreadyKnown = db.LinksToScrape.Find(url) != null;
    if (alreadyKnown)
    {
        return;
    }

    db.LinksToScrape.Add(new LinkToScrape { Url = url, Sort = sort });
}
|
|
|
|
/// <summary>
/// Creates or updates the lecture stored for <paramref name="source"/> from the parsed
/// course page. Id, semester and URL are only set when the lecture is created; branch,
/// title, description and events are overwritten whenever the page provides them.
/// </summary>
/// <param name="source">Absolute URL of the course page; also the lookup key for the lecture.</param>
/// <param name="root">Root node of the parsed page.</param>
private static void CreateLecture(string source, HtmlNode root)
{
    var branch = root.SelectSingleNode("/html/body/main/div[1]/div[1]/a");
    var what = root.SelectSingleNode("//h1/*/*[@class='what']");
    var when = root.SelectSingleNode("//h1/*/*[@class='when']");
    var info = root.SelectSingleNode("//*[@class='info list']");
    var events = root.SelectNodes("//ul[@class='classes events list']/li");

    // The lambda parameter is not named "db" so that it no longer shadows the context field.
    Lecture? lecture = db.Lectures.FirstOrDefault(l => l.Url == source);

    if (lecture == null)
    {
        lecture = new Lecture();

        // The query string is only needed for a new lecture: "lv" carries its id.
        var query = HttpUtility.ParseQueryString(new Uri(source).Query);

        // One lookup instead of AllKeys.Contains + indexer; the id is machine data,
        // so it is parsed culture-independently. A non-numeric value still throws.
        if (query["lv"] is string lv)
        {
            lecture.Id = int.Parse(lv, NumberStyles.Integer, CultureInfo.InvariantCulture);
        }

        if (when != null)
        {
            lecture.Semester = when.InnerText;
        }

        lecture.Url = source;

        db.Lectures.Add(lecture);
    }

    if (branch != null)
    {
        lecture.Branch = branch.InnerText;
    }

    if (what != null)
    {
        lecture.Title = what.InnerText;
    }

    if (info != null)
    {
        // The description keeps its markup (InnerHtml), unlike the plain-text fields above.
        lecture.Description = info.InnerHtml;
    }

    if (events != null)
    {
        // NOTE(review): the lookup above does not explicitly load lecture.Events; replacing
        // the old events relies on the context populating that collection - confirm.
        CreateLectureEvents(events, lecture);
    }
}
|
|
|
|
/// <summary>
/// Replaces the events of <paramref name="lecture"/> with the ones listed in
/// <paramref name="events"/>. Each list item may contain a date ("dd.MM." without year),
/// a time range ("hh:mm - hh:mm") and a room.
/// </summary>
/// <param name="events">List items of the page's event list.</param>
/// <param name="lecture">Lecture whose events are rebuilt; its semester supplies the year.</param>
/// <exception cref="FormatException">
/// The semester does not start with a four-digit year, or a date or time text does not
/// have the expected format.
/// </exception>
private static void CreateLectureEvents(HtmlNodeCollection events, Lecture lecture)
{
    // The first four characters of the semester are used as the year of every event.
    // Validate them up front so a missing semester gives a meaningful error instead of a
    // NullReferenceException or ArgumentOutOfRangeException.
    // NOTE(review): if a semester runs across New Year, events after 31 December would
    // need year + 1 - confirm against real data.
    string? semester = lecture.Semester;
    if (semester == null
        || semester.Length < 4
        || !int.TryParse(semester[..4], NumberStyles.Integer, CultureInfo.InvariantCulture, out int year))
    {
        throw new FormatException($"Cannot derive a year from semester '{semester}' of lecture '{lecture.Url}'.");
    }

    // Drop the previously stored events before adding the freshly scraped ones.
    db.RemoveRange(lecture.Events);
    lecture.Events.Clear();

    foreach (var item in events)
    {
        LectureEvent lectureEvent = new();

        var day = item.SelectSingleNode("*[@class='date']");
        var time = item.SelectSingleNode("*[@class='time']");
        var room = item.SelectSingleNode("*[@class='room']");

        // Stays at the DateTime default when the item has no date element.
        DateTime date = new();
        if (day != null)
        {
            // The page gives the date without a year, so the semester's year is appended.
            // Trim tolerates whitespace around the text, which ParseExact would reject.
            date = DateTime.ParseExact(
                day.InnerText.Trim() + year.ToString(CultureInfo.InvariantCulture),
                "dd.MM.yyyy",
                CultureInfo.InvariantCulture);
        }

        if (time != null)
        {
            var text = time.InnerText;
            var times = text.Split(" - ");

            if (times.Length < 2)
            {
                // Clear error instead of an IndexOutOfRangeException on times[1].
                throw new FormatException($"Unexpected time range '{text}' for lecture '{lecture.Url}'.");
            }

            var from = TimeSpan.ParseExact(times[0].Trim(), "hh\\:mm", CultureInfo.InvariantCulture);
            var to = TimeSpan.ParseExact(times[1].Trim(), "hh\\:mm", CultureInfo.InvariantCulture);

            lectureEvent.From = date.Add(from);
            lectureEvent.To = date.Add(to);
        }

        if (room != null)
        {
            lectureEvent.Location = room.InnerText;
        }

        lecture.Events.Add(lectureEvent);
    }
}
|
|
} |