using System.Globalization;
using System.Net.NetworkInformation;
using System.Web;
using System.Xml.Linq;
using OpenQA.Selenium;
using OpenQA.Selenium.BiDi.Modules.Script;
using OpenQA.Selenium.Chrome;
using OpenQA.Selenium.Support.UI;
using UWLib;
using static Microsoft.EntityFrameworkCore.DbLoggerCategory;

/// <summary>
/// Crawls the u:find course directory with a Chrome WebDriver, starting at
/// <c>vvz.html</c>, follows directory (<c>vvz_sub.html</c>) and course
/// (<c>course.html</c>) links, and stores every visited course page as a
/// <see cref="Lecture"/> (with its events) through <see cref="LectureContext"/>.
/// </summary>
internal class Program
{
    private const string StartUrl = "https://ufind.univie.ac.at/de/vvz.html";
    private const string DirectoryLinkXPath = "//a[starts-with(@href, 'vvz_sub.html')]";
    private const string CourseLinkXPath = "//a[starts-with(@href, 'course.html')]";
    private const string TitleXPath = "//h1/*/*[@class='what']";

    // URLs that were already processed (or already exist in the database).
    // A set, because it is only ever used for membership checks.
    static HashSet<string> checkedUrls = new HashSet<string>();

    // Work queue; course pages are inserted at the front, directory pages appended.
    static List<string> urlsToCheck = new List<string>();

    static LectureContext db = new LectureContext();
    static IWebDriver driver = null;

    /// <summary>
    /// Entry point: starts the browser, seeds the visited set from the database
    /// and processes the queue until it is empty.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    private static void Main(string[] args)
    {
        var options = new ChromeOptions();
        //options.AddArgument("--headless");
        driver = new ChromeDriver(options);
        try
        {
            checkedUrls.UnionWith(db.Lectures.Select(x => x.Url).ToList());
            urlsToCheck.Add(StartUrl);

            while (urlsToCheck.Count > 0)
            {
                FindUrls(urlsToCheck[0]);
            }
        }
        finally
        {
            // Always shut the browser and the chromedriver process down,
            // even when the crawl aborts with an exception.
            driver.Quit();
        }
    }

    /// <summary>
    /// Visits <paramref name="source"/>, stores it as a lecture when it is a
    /// course page, queues the links found on it and marks it as processed.
    /// </summary>
    /// <param name="source">Absolute URL of the page to process.</param>
    private static void FindUrls(string source)
    {
        if (checkedUrls.Contains(source) || db.Lectures.Any(x => x.Url == source))
        {
            MarkChecked(source);
            return;
        }

        driver.Navigate().GoToUrl(source);

        if (!WaitForContent())
        {
            // A page without any of the expected elements must not abort the crawl.
            Console.Error.WriteLine($"Timed out waiting for content on {source}; skipping.");
            MarkChecked(source);
            return;
        }

        if (source.Contains("course.html"))
        {
            SaveLecture(source);
        }

        QueueLinks();

        // driver.Close();
        MarkChecked(source);
    }

    /// <summary>Records <paramref name="url"/> as processed and drops it from the queue.</summary>
    private static void MarkChecked(string url)
    {
        checkedUrls.Add(url);
        // The same URL can be queued more than once; remove every occurrence.
        urlsToCheck.RemoveAll(queued => queued == url);
    }

    /// <summary>
    /// Waits up to two seconds until the current page shows a directory link,
    /// a course link or a course title.
    /// </summary>
    /// <returns><c>true</c> when one of the elements appeared; <c>false</c> on timeout.</returns>
    private static bool WaitForContent()
    {
        var wait = new WebDriverWait(driver, TimeSpan.FromSeconds(2));
        try
        {
            wait.Until(d =>
                d.FindElements(By.XPath(DirectoryLinkXPath)).Count > 0
                || d.FindElements(By.XPath(CourseLinkXPath)).Count > 0
                || d.FindElements(By.XPath(TitleXPath)).Count > 0);
            return true;
        }
        catch (WebDriverTimeoutException)
        {
            return false;
        }
    }

    /// <summary>
    /// Reads title, semester, description and events from the currently loaded
    /// course page and saves them as a new <see cref="Lecture"/>.
    /// </summary>
    /// <param name="source">URL of the course page; its <c>lv</c> query parameter becomes the lecture id.</param>
    private static void SaveLecture(string source)
    {
        var lecture = new Lecture();
        lecture.Url = source;

        var query = HttpUtility.ParseQueryString(new Uri(source).Query);
        // query["lv"] is null when the parameter is missing; TryParse then simply fails.
        if (int.TryParse(query["lv"], NumberStyles.Integer, CultureInfo.InvariantCulture, out var id))
        {
            lecture.Id = id;
        }

        // Fallback year for event dates when the page shows no usable semester.
        int year = DateTime.Now.Year;

        var what = driver.FindElements(By.XPath(TitleXPath));
        var when = driver.FindElements(By.XPath("//h1/*/*[@class='when']"));
        var info = driver.FindElements(By.XPath("//*[@class='info list']"));
        var events = driver.FindElements(By.XPath("//ul[@class='classes events list']/li"));

        if (what.Count > 0)
        {
            lecture.Title = what[0].Text;
        }

        if (when.Count > 0)
        {
            var semester = when[0].Text;
            lecture.Semester = semester;

            // The first four characters of the semester text are used as the year.
            if (semester.Length >= 4
                && int.TryParse(semester.Substring(0, 4), NumberStyles.Integer, CultureInfo.InvariantCulture, out var semesterYear))
            {
                year = semesterYear;
            }
        }

        if (info.Count > 0)
        {
            lecture.Description = info[0].GetAttribute("innerHTML");
        }

        foreach (var item in events)
        {
            lecture.Events.Add(ParseEvent(item, year));
        }

        try
        {
            db.Lectures.Add(lecture);
            db.SaveChanges();
        }
        catch (Microsoft.EntityFrameworkCore.DbUpdateException ex)
        {
            Console.Error.WriteLine($"Could not save lecture {source}: {ex.Message}");
            // Drop the failed entities so they do not break every later SaveChanges call.
            db.ChangeTracker.Clear();
        }
    }

    /// <summary>
    /// Builds a <see cref="LectureEvent"/> from one list item of the events list.
    /// Missing or malformed date/time/room parts are left at their defaults.
    /// </summary>
    /// <param name="item">The <c>li</c> element describing the event.</param>
    /// <param name="year">Year appended to the day text before parsing it as <c>dd.MM.yyyy</c>.</param>
    /// <returns>The populated event.</returns>
    private static LectureEvent ParseEvent(IWebElement item, int year)
    {
        var lectureEvent = new LectureEvent();
        var day = item.FindElements(By.XPath("*[@class='date']"));
        var time = item.FindElements(By.XPath("*[@class='time']"));
        var room = item.FindElements(By.XPath("*[@class='room']"));

        DateTime date = new DateTime();
        if (day.Count > 0)
        {
            // NOTE(review): every event gets the semester's year; events of a semester
            // that runs past New Year would need year + 1 - confirm against real pages.
            var dayText = day[0].Text.Trim() + year.ToString(CultureInfo.InvariantCulture);
            if (!DateTime.TryParseExact(dayText, "dd.MM.yyyy", CultureInfo.InvariantCulture, DateTimeStyles.None, out date))
            {
                Console.Error.WriteLine($"Unrecognized event date '{day[0].Text}'.");
                date = new DateTime();
            }
        }

        if (time.Count > 0)
        {
            var times = time[0].Text.Split(" - ");
            if (times.Length >= 2
                && TimeSpan.TryParseExact(times[0].Trim(), "hh\\:mm", CultureInfo.InvariantCulture, out var start)
                && TimeSpan.TryParseExact(times[1].Trim(), "hh\\:mm", CultureInfo.InvariantCulture, out var end))
            {
                lectureEvent.From = date.Add(start);
                lectureEvent.To = date.Add(end);
            }
            else
            {
                Console.Error.WriteLine($"Unrecognized event time '{time[0].Text}'.");
            }
        }

        if (room.Count > 0)
        {
            lectureEvent.Location = room[0].Text;
        }

        return lectureEvent;
    }

    /// <summary>
    /// Queues the links of the currently loaded page: directory links are appended,
    /// course links whose parent carries an <c>abbr</c> titled "Vorlesung" are put
    /// at the front of the queue.
    /// </summary>
    private static void QueueLinks()
    {
        foreach (var link in driver.FindElements(By.XPath(DirectoryLinkXPath)))
        {
            var url = link.GetAttribute("href");
            if (!string.IsNullOrEmpty(url) && !checkedUrls.Contains(url))
            {
                urlsToCheck.Add(url);
            }
        }

        var courseLinkParents = driver.FindElements(By.XPath(CourseLinkXPath + "/.."));
        foreach (var parent in courseLinkParents)
        {
            if (parent.FindElements(By.XPath("abbr[@title='Vorlesung']")).Count == 0)
            {
                continue;
            }

            var link = parent.FindElement(By.XPath("a[starts-with(@href, 'course.html')]"));
            var url = link.GetAttribute("href");
            if (!string.IsNullOrEmpty(url) && !checkedUrls.Contains(url))
            {
                urlsToCheck.Insert(0, url);
            }
        }
    }
}