added basic webscraper

This commit is contained in:
Ksan 2025-10-24 00:24:42 +02:00
parent 7d3a9106dc
commit 7dad75333c
7 changed files with 229 additions and 18 deletions

4
.gitignore vendored
View File

@ -1,5 +1,7 @@
.db/
todo
*temp.java
./src/main/java/dev/ksan/etfoglasiserver/temp.java
HELP.md
.gradle

View File

@ -33,6 +33,8 @@ dependencies {
runtimeOnly 'org.postgresql:postgresql'
testImplementation 'org.springframework.boot:spring-boot-starter-test'
testRuntimeOnly 'org.junit.platform:junit-platform-launcher'
implementation("net.sourceforge.htmlunit:htmlunit:2.70.0")
}
generateJava {

View File

@ -1,17 +1,9 @@
package dev.ksan.etfoglasiserver;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.time.LocalDate;
import java.time.LocalDateTime;
import org.springframework.boot.CommandLineRunner;
import dev.ksan.etfoglasiserver.service.Scraper;
import java.util.Scanner;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.context.annotation.Bean;
import org.springframework.jdbc.core.JdbcTemplate;
@SpringBootApplication
public class EtfoglasiServerApplication {
@ -19,7 +11,36 @@ public class EtfoglasiServerApplication {
/**
 * Boots the Spring application, starts the scraper on its own thread and then serves a
 * minimal console on standard input.
 *
 * <p>Recognised commands:
 * <ul>
 *   <li>{@code stop} - asks the scraper to stop, interrupts its thread and leaves the console
 *       loop;</li>
 *   <li>{@code list} - currently only prints an empty line.</li>
 * </ul>
 * Any other input is ignored.
 *
 * <p>If standard input reaches end-of-file (for example when the process is started without
 * a terminal) the console loop simply ends; the scraper thread keeps running.
 *
 * @param args command-line arguments, forwarded to {@link SpringApplication#run}
 */
public static void main(String[] args) {
    SpringApplication.run(EtfoglasiServerApplication.class, args);
    boolean running = true;
    //temp.run();
    System.out.println("EtfoglasiServerApplication started");

    Scraper scraper = new Scraper();
    Thread webClientThread = new Thread(scraper, "WebClientThread");
    webClientThread.start();

    // try-with-resources closes the scanner (and with it System.in) on every exit path.
    try (Scanner scanner = new Scanner(System.in)) {
        // hasNextLine() returns false on end-of-file, whereas calling nextLine() directly
        // would throw NoSuchElementException once the stream is exhausted.
        while (running && scanner.hasNextLine()) {
            String command = scanner.nextLine();
            switch (command) {
                case "stop":
                    scraper.stop();
                    webClientThread.interrupt();
                    running = false;
                    System.out.println("Stopping...");
                    break;
                case "list":
                    System.out.println();
                    break;
                default:
                    // Unknown commands are ignored.
                    break;
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
}

View File

@ -25,6 +25,32 @@ public class Entry {
/** Creates an entry without assigning any field in the constructor body. */
public Entry() {}
/**
 * Creates an entry from already-prepared values; each argument is stored in the field of the
 * same name without validation or copying.
 *
 * @param title value stored in {@code title}
 * @param time_published value stored in {@code time_published}
 * @param info_entry value stored in {@code info_entry}
 * @param paragraph value stored in {@code paragraph}
 * @param filepath value stored in {@code filepath}
 */
public Entry(
String title,
LocalDateTime time_published,
String info_entry,
String paragraph,
String filepath) {
this.title = title;
this.time_published = time_published;
this.info_entry = info_entry;
this.paragraph = paragraph;
this.filepath = filepath;
}
/**
 * Creates an entry whose body text is supplied as separate paragraphs.
 *
 * <p>The paragraphs are joined with a newline ({@code "\n"}) into a single string, which is
 * then stored exactly as the String-paragraph constructor stores it.
 *
 * @param title value stored in {@code title}
 * @param time_published value stored in {@code time_published}
 * @param info_entry value stored in {@code info_entry}
 * @param paragraph paragraphs to join; must not be {@code null}
 * @param filepath value stored in {@code filepath}
 */
public Entry(
    String title,
    LocalDateTime time_published,
    String info_entry,
    List<String> paragraph,
    String filepath) {
  this(title, time_published, info_entry, String.join("\n", paragraph), filepath);
}
public Entry(EntryDTO entry) {
this.title = entry.getTitle();
this.paragraph = entry.getParagraph();
@ -56,6 +82,7 @@ public class Entry {
/**
 * Replaces the title of this entry.
 *
 * @param title the new title; stored as given
 */
public void setTitle(String title) {
this.title = title;
}
/**
 * Returns the title of this entry.
 *
 * @return the current value of {@code title}
 */
public String getTitle() {
return title;
}

View File

@ -0,0 +1,128 @@
package dev.ksan.etfoglasiserver.service;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.DomElement;
import com.gargoylesoftware.htmlunit.html.HtmlAnchor;
import com.gargoylesoftware.htmlunit.html.HtmlElement;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import dev.ksan.etfoglasiserver.model.Entry;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.ConsoleHandler;
import java.util.logging.Handler;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
 * Background task that repeatedly scrapes the announcement page at
 * {@code https://efee.etf.unibl.org/oglasi/} with HtmlUnit and collects the announcements it
 * finds as {@link Entry} objects.
 *
 * <p>Scraped entries are appended to a static list shared by every instance; readers obtain a
 * snapshot through {@link #getEntries()}. Entries are appended on every pass without any
 * de-duplication, so the list grows for as long as the task runs.
 *
 * <p>The task ends when {@link #stop()} has been called or the executing thread is
 * interrupted. The underlying {@link WebClient} is closed when {@link #run()} returns, so an
 * instance cannot be run a second time.
 */
public class Scraper implements Runnable {

    /** Every entry scraped so far; all access synchronizes on the list itself. */
    private static final List<Entry> entries = new ArrayList<>();

    /** Headless browser used for every request made by this scraper. */
    private final WebClient webClient;

    /** Cleared by {@link #stop()}; volatile so the scraping thread observes the change. */
    private volatile boolean running = true;

    /** Format of the publication timestamp on the page, e.g. {@code 24.10.2025 00:24:42}. */
    final DateTimeFormatter formatter = DateTimeFormatter.ofPattern("dd.MM.yyyy HH:mm:ss");

    /**
     * Creates a scraper backed by a Chrome-emulating client with JavaScript enabled, CSS
     * disabled and script errors not raised as exceptions.
     */
    public Scraper() {
        this.webClient = new WebClient(BrowserVersion.CHROME);
        webClient.getOptions().setJavaScriptEnabled(true);
        webClient.getOptions().setCssEnabled(false);
        webClient.getOptions().setThrowExceptionOnScriptError(false);
    }

    /**
     * Returns the normalized text of the first node matched by {@code xPath} below
     * {@code parent}.
     *
     * @param parent element the expression is evaluated against
     * @param xPath XPath expression selecting the wanted element
     * @return the element's normalized text, or an empty string when nothing matches
     */
    private static String getTextOrEmpty(HtmlElement parent, String xPath) {
        HtmlElement element = parent.getFirstByXPath(xPath);
        return element == null ? "" : element.asNormalizedText();
    }

    /**
     * Returns a snapshot of all entries scraped so far.
     *
     * @return a new list that the caller may modify freely
     */
    static List<Entry> getEntries() {
        synchronized (entries) {
            return new ArrayList<>(entries);
        }
    }

    /** Restricts HtmlUnit's logger, and a console handler added to it, to SEVERE messages. */
    private void configureHtmlUnitLogging() {
        Logger htmlUnitLogger = Logger.getLogger("com.gargoylesoftware.htmlunit");
        htmlUnitLogger.setLevel(Level.SEVERE);
        Handler consoleHandler = new ConsoleHandler();
        consoleHandler.setLevel(Level.SEVERE);
        htmlUnitLogger.addHandler(consoleHandler);
    }

    /**
     * Scrapes the page in a loop until {@link #stop()} is called or the thread is interrupted.
     *
     * <p>A failure during one pass is reported on the console and the loop carries on with
     * the next pass. The web client is closed exactly once, after the loop has ended.
     */
    @Override
    public void run() {
        configureHtmlUnitLogging();
        try {
            while (running && !Thread.currentThread().isInterrupted()) {
                try {
                    System.out.println("Performing WebClient task...");
                    scrapeOnce();
                } catch (InterruptedException e) {
                    // Restore the flag cleared by the exception so the loop condition sees it.
                    Thread.currentThread().interrupt();
                } catch (Exception e) {
                    e.printStackTrace();
                    System.out.println("ERROR: " + e.getMessage());
                }
            }
        } finally {
            // Closing inside the loop would leave every later pass with a closed client.
            this.webClient.close();
        }
        System.out.println("WebScraper thread stopped");
    }

    /**
     * Performs one full pass: loads the main page and scrapes the list behind every
     * {@code <a href="#">} toggle on it.
     *
     * @throws Exception if loading the page, clicking a toggle or parsing an entry fails, or
     *     if the thread is interrupted while pausing between entries
     */
    private void scrapeOnce() throws Exception {
        HtmlPage mainPage = webClient.getPage("https://efee.etf.unibl.org/oglasi/");
        webClient.waitForBackgroundJavaScript(1000);

        List<DomElement> rawToggles = mainPage.getByXPath("//a[@href='#']");
        List<HtmlAnchor> toggles = new ArrayList<>();
        for (DomElement el : rawToggles) {
            if (el instanceof HtmlAnchor) {
                toggles.add((HtmlAnchor) el);
            }
        }

        // The n-th toggle (1-based) is paired with the list element whose id is "ul_id_<n>".
        int groupIndex = 1;
        for (HtmlAnchor anchor : toggles) {
            scrapeGroup(anchor, "ul_id_" + groupIndex);
            groupIndex++;
        }
    }

    /**
     * Clicks one group toggle and stores every {@code <li>} of the associated list as an
     * entry.
     *
     * @param anchor toggle link of the group; the first line of its text is the group name
     * @param listId id of the list element expected on the page after the click
     * @throws Exception if the click fails, a date cannot be parsed with {@link #formatter},
     *     or the thread is interrupted while pausing between entries
     */
    private void scrapeGroup(HtmlAnchor anchor, String listId) throws Exception {
        String groupName = anchor.asNormalizedText().split("\n")[0].trim();
        System.out.println("Group name: " + groupName);

        HtmlPage updatedPage = anchor.click();
        webClient.waitForBackgroundJavaScript(1000);

        DomElement rawElement = updatedPage.getElementById(listId);
        if (!(rawElement instanceof HtmlElement)) {
            System.out.println("An element with id " + listId + " was not found");
            return;
        }
        HtmlElement listElement = (HtmlElement) rawElement;

        List<HtmlElement> items = listElement.getElementsByTagName("li");
        for (HtmlElement item : items) {
            String title = getTextOrEmpty(item, ".//h1");
            String date = getTextOrEmpty(item, ".//h2[1]");
            String info = getTextOrEmpty(item, ".//h2[2]");

            List<String> paragraphs = new ArrayList<>();
            List<HtmlElement> pTags = item.getByXPath(".//p");
            for (HtmlElement pTag : pTags) {
                paragraphs.add(pTag.asNormalizedText());
            }

            Entry entry =
                new Entry(title, LocalDateTime.parse(date, formatter), info, paragraphs, null);
            entry.setParagraph(paragraphs);
            System.out.println(entry);

            // NOTE(review): two-second pause per entry; its purpose is not evident here.
            Thread.sleep(2000);

            // Same monitor as getEntries(), so a reader never copies the list mid-append.
            synchronized (entries) {
                entries.add(entry);
            }
        }
    }

    /** Asks the scraping loop to end once the work currently in progress lets it. */
    public void stop() {
        running = false;
    }
}

View File

@ -44,6 +44,9 @@ public class UserService {
public void updateUser(UserCreationDTO user) {
Optional<User> existingUserOpt = userRepo.findByEmail(user.getEmail());
if(userRepo.findByEmail(user.getNewEmail()).isPresent()) {
throw new RuntimeException("Email taken");
}
if (userRepo.findByEmail(user.getEmail()).isPresent()) {
if (this.isValidEmail(user.getEmail())) {
@ -51,18 +54,16 @@ public class UserService {
if (this.isValidPassword(user.getPassword())) {
User existingUser = existingUserOpt.get();
/*
if(user.getNewEmail() == null){
existingUser.setEmail(user.getEmail());
}else{
if(user.getNewEmail() != null && user.getNewEmail() != existingUser.getEmail()) {
existingUser.setEmail(user.getNewEmail());
}
*/
}else{
existingUser.setEmail(user.getEmail());
}
existingUser.setPassword(user.getPassword());
userRepo.save(existingUser);
} else throw new RuntimeException("Password too short");

View File

@ -0,0 +1,30 @@
package dev.ksan.etfoglasiserver;
import org.springframework.http.HttpEntity;
import org.springframework.http.HttpHeaders;
import org.springframework.http.MediaType;
import org.springframework.web.client.RestTemplate;
import java.util.List;
/**
 * One-off helper that posts a hard-coded list of subject names to the local
 * {@code /subject} endpoint, one JSON request per name.
 */
public class temp {

    /** Endpoint that receives one POST per subject. */
    private static final String URL = "http://localhost:8080/subject";

    /**
     * Posts every name in the built-in subject list as
     * {@code { "name": "<name>", "code": "" }} and prints a confirmation line for each.
     * The list is currently empty, so no request is sent.
     */
    public static void run() {
        System.out.println("started to add valjda");
        List<String> subjects = List.of();

        RestTemplate restTemplate = new RestTemplate();
        HttpHeaders headers = new HttpHeaders();
        headers.setContentType(MediaType.APPLICATION_JSON);

        for (String name : subjects) {
            // Escape the name so quotes or backslashes in it cannot break the JSON document.
            String json = "{ \"name\": \"" + escapeJson(name) + "\", \"code\": \"\" }";
            HttpEntity<String> request = new HttpEntity<>(json, headers);
            restTemplate.postForObject(URL, request, Void.class);
            System.out.println("Added subject: " + name);
        }
    }

    /**
     * Escapes a string for use inside a JSON string literal: the double quote, the backslash
     * and every control character below U+0020 are replaced by their escape sequences.
     *
     * @param value raw text; must not be {@code null}
     * @return the text, safe to place between double quotes in JSON
     */
    private static String escapeJson(String value) {
        StringBuilder escaped = new StringBuilder(value.length() + 8);
        for (int i = 0; i < value.length(); i++) {
            char c = value.charAt(i);
            switch (c) {
                case '"':
                    escaped.append("\\\"");
                    break;
                case '\\':
                    escaped.append("\\\\");
                    break;
                case '\n':
                    escaped.append("\\n");
                    break;
                case '\r':
                    escaped.append("\\r");
                    break;
                case '\t':
                    escaped.append("\\t");
                    break;
                default:
                    if (c < 0x20) {
                        escaped.append(String.format("\\u%04x", (int) c));
                    } else {
                        escaped.append(c);
                    }
                    break;
            }
        }
        return escaped.toString();
    }
}