Skip to content

Commit

Permalink
feat: 베스트 셀러 크롤링 기능 추가 (#34)
Browse files Browse the repository at this point in the history
  • Loading branch information
jwooo committed Mar 27, 2024
1 parent ca31b32 commit 8cc6f05
Show file tree
Hide file tree
Showing 11 changed files with 161 additions and 46 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,5 @@ out/
application-dev.yml
application-prod.yml
application-jwt.yml
application-oauth.yml
application-oauth.yml
application-crawler.yml
3 changes: 2 additions & 1 deletion src/main/java/com/jisungin/exception/ErrorCode.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ public enum ErrorCode {
TALK_ROOM_NOT_FOUND(400, "토크방을 찾을 수 없습니다."),
UNAUTHORIZED_REQUEST(400, "권한이 없는 사용자입니다."),
COMMENT_NOT_FOUND(404, "의견을 찾을 수 없습니다."),
REVIEW_NOT_FOUND(404, "리뷰를 찾을 수 없습니다.");
REVIEW_NOT_FOUND(404, "리뷰를 찾을 수 없습니다."),
REQUEST_TIME_OUT(408, "요청 시간이 만료 되었습니다.");


private final int code;
Expand Down
3 changes: 3 additions & 0 deletions src/main/java/com/jisungin/infra/crawler/Crawler.java
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
package com.jisungin.infra.crawler;

import java.util.Map;

/**
 * Contract for crawling book data.
 */
public interface Crawler {

/**
 * Crawls a single book looked up by its ISBN.
 *
 * @param isbn ISBN of the book to crawl
 * @return the crawled book
 */
CrawlingBook crawlBook(String isbn);

/**
 * Crawls the books of the best-seller list.
 *
 * @return the crawled books; in the Yes24 implementation each key is the value
 *         parsed from the ranking element of the best-seller page
 */
Map<Long, CrawlingBook> crawlBestSellerBook();

}
37 changes: 32 additions & 5 deletions src/main/java/com/jisungin/infra/crawler/CrawlingBook.java
Original file line number Diff line number Diff line change
@@ -1,29 +1,56 @@
package com.jisungin.infra.crawler;

import java.time.LocalDateTime;
import lombok.Builder;
import lombok.Getter;
import lombok.ToString;

@Getter
@ToString
public class CrawlingBook {

private String imageUrl;
private String title;
private String content;
private String isbn;
private String publisher;
private String imageUrl;
private String thumbnail;
private String[] authors;
private LocalDateTime dateTime;

/**
 * Creates a crawled book; exposed through the Lombok-generated builder.
 *
 * The {@code authors} argument is a single raw string and is converted to an
 * array by {@code parseAuthorsToArr} before being stored.
 */
@Builder
private CrawlingBook(String imageUrl, String content) {
this.imageUrl = imageUrl;
private CrawlingBook(String title, String content, String isbn, String publisher, String imageUrl, String thumbnail,
String authors, LocalDateTime dateTime) {
this.title = title;
this.content = content;
this.isbn = isbn;
this.publisher = publisher;
this.imageUrl = imageUrl;
this.thumbnail = thumbnail;
// Raw author text is split into individual names here, not by the caller.
this.authors = parseAuthorsToArr(authors);
this.dateTime = dateTime;
}

/**
 * Static factory that forwards every argument to the builder and returns the built instance.
 *
 * @param authors raw author text; it is split into an array inside the constructor
 * @return a new {@code CrawlingBook}
 */
public static CrawlingBook of(String imageUrl, String content) {
public static CrawlingBook of(String title, String content, String isbn, String publisher, String imageUrl,
String thumbnail, String authors, LocalDateTime dateTime) {
return CrawlingBook.builder()
.imageUrl(imageUrl)
.title(title)
.content(content)
.isbn(isbn)
.publisher(publisher)
.imageUrl(imageUrl)
.thumbnail(thumbnail)
.authors(authors)
.dateTime(dateTime)
.build();
}

/**
 * Reports whether this book has no usable content text.
 *
 * @return {@code true} when the content is {@code null}, empty, or whitespace only
 */
public boolean isBlankContent() {
    // The builder does not enforce a non-null content, so guard before calling isBlank().
    return this.content == null || this.content.isBlank();
}

/**
 * Splits a raw author string into individual author names.
 *
 * <p>Everything from the first role suffix (" 저", " 공저", " 글", " 편저", " 원저") onward is
 * discarded, and the remaining text is split on commas. Surrounding whitespace of each
 * name is removed, so {@code "A, B 공저"} yields {@code ["A", "B"]}.
 *
 * @param authors raw author text, may be {@code null}
 * @return the author names; an empty array when {@code authors} is {@code null} or
 *         consists only of a role suffix
 */
private String[] parseAuthorsToArr(String authors) {
    if (authors == null) {
        return new String[0];
    }

    String[] beforeRole = authors.split(" 저| 공저| 글| 편저| 원저");
    // split() drops trailing empty strings, so an input that is nothing but a role
    // suffix produces a zero-length array; indexing [0] would throw.
    if (beforeRole.length == 0) {
        return new String[0];
    }

    String[] names = beforeRole[0].split(",");
    for (int i = 0; i < names.length; i++) {
        // "A, B" would otherwise keep the blank after the comma as part of the name.
        names[i] = names[i].strip();
    }
    return names;
}

}
1 change: 1 addition & 0 deletions src/main/java/com/jisungin/infra/crawler/Fetcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,6 @@ public interface Fetcher {

/**
 * Fetches the document used to look a book up by ISBN.
 *
 * @param isbn ISBN to look up
 * @return the fetched document
 */
Document fetchIsbn(String isbn);
/**
 * Fetches the document of a single book.
 *
 * @param bookId identifier of the book as used by the crawled site
 * @return the fetched document
 */
Document fetchBook(String bookId);
/**
 * Fetches the document from which the best-seller book ids are parsed.
 *
 * @return the fetched document
 */
Document fetchBestSellerBookId();

}
2 changes: 2 additions & 0 deletions src/main/java/com/jisungin/infra/crawler/Parser.java
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
package com.jisungin.infra.crawler;

import java.util.Map;
import org.jsoup.nodes.Document;

/**
 * Contract for extracting book data from fetched documents.
 */
public interface Parser {

/**
 * Extracts a string from an ISBN lookup document.
 * NOTE(review): presumably the site-specific book id that is then passed to the fetcher — confirm against the crawler.
 *
 * @param doc document to parse
 * @return the extracted value
 */
String parseIsbn(Document doc);
/**
 * Builds a {@link CrawlingBook} from a book document.
 *
 * @param doc document to parse
 * @return the parsed book
 */
CrawlingBook parseBook(Document doc);
/**
 * Extracts the best-seller entries from a document.
 *
 * @param doc document to parse
 * @return book ids keyed by a {@code Long}; in the Yes24 implementation the key is the parsed ranking text
 */
Map<Long, String> parseBestSellerBookId(Document doc);

}
19 changes: 19 additions & 0 deletions src/main/java/com/jisungin/infra/crawler/Yes24Crawler.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
package com.jisungin.infra.crawler;

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CompletableFuture;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Component;

Expand All @@ -17,4 +21,19 @@ public CrawlingBook crawlBook(String isbn) {
return parser.parseBook(fetcher.fetchBook(bookId));
}

/**
 * Crawls every book found on the best-seller page.
 *
 * <p>The detail pages are fetched and parsed concurrently. Each asynchronous task only
 * produces its own result; the result map is filled on the calling thread after all tasks
 * have completed, so the non-thread-safe {@link HashMap} is never written to by several
 * threads at once.
 *
 * @return the crawled books, keyed by the key the parser assigned to each best-seller entry
 * @throws java.util.concurrent.CompletionException if fetching or parsing any book fails
 */
@Override
public Map<Long, CrawlingBook> crawlBestSellerBook() {
    Map<Long, String> bestSellerBookIds = parser.parseBestSellerBookId(fetcher.fetchBestSellerBookId());

    // Start every crawl before waiting on any of them so they overlap.
    // NOTE(review): supplyAsync without an executor runs this blocking I/O on the common pool.
    Map<Long, CompletableFuture<CrawlingBook>> futures = new HashMap<>();
    bestSellerBookIds.forEach((ranking, bookId) -> futures.put(ranking,
            CompletableFuture.supplyAsync(() -> parser.parseBook(fetcher.fetchBook(bookId)))));

    CompletableFuture.allOf(futures.values().toArray(CompletableFuture[]::new)).join();

    // All futures are complete here, so join() returns immediately.
    Map<Long, CrawlingBook> bestSellerBooks = new HashMap<>();
    futures.forEach((ranking, future) -> bestSellerBooks.put(ranking, future.join()));

    return bestSellerBooks;
}

}
23 changes: 0 additions & 23 deletions src/main/java/com/jisungin/infra/crawler/Yes24CrawlerConstant.java

This file was deleted.

44 changes: 39 additions & 5 deletions src/main/java/com/jisungin/infra/crawler/Yes24Fetcher.java
Original file line number Diff line number Diff line change
@@ -1,25 +1,34 @@
package com.jisungin.infra.crawler;

import static com.jisungin.infra.crawler.Yes24CrawlerConstant.*;
import static com.jisungin.infra.crawler.Yes24CrawlerConstant.USER_AGENT;

import com.jisungin.exception.BusinessException;
import com.jisungin.exception.ErrorCode;
import java.net.SocketTimeoutException;
import lombok.Setter;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.stereotype.Component;

@Component
@Setter
@ConfigurationProperties(prefix = "crawler.yes24.fetcher")
public class Yes24Fetcher implements Fetcher {

private String isbnUrl;
private String bookUrl;
private String bestBookUrl;
private String userAgent;

@Override
public Document fetchIsbn(String isbn) {
try {
return Jsoup.connect(getIsbnUrl(isbn))
.timeout(5000)
.userAgent(USER_AGENT)
.userAgent(userAgent)
.ignoreContentType(true)
.get();
} catch (SocketTimeoutException e) {
throw new BusinessException(ErrorCode.REQUEST_TIME_OUT);
} catch (Exception e) {
throw new BusinessException(ErrorCode.BOOK_NOT_FOUND);
}
Expand All @@ -30,12 +39,37 @@ public Document fetchBook(String bookId) {
try {
return Jsoup.connect(getBookUrl(bookId))
.timeout(5000)
.userAgent(USER_AGENT)
.userAgent(userAgent)
.ignoreContentType(true)
.get();
} catch (SocketTimeoutException e) {
throw new BusinessException(ErrorCode.REQUEST_TIME_OUT);
} catch (Exception e) {
throw new BusinessException(ErrorCode.BOOK_NOT_FOUND);
}
}

/**
 * Downloads the page located at the configured {@code bestBookUrl}.
 *
 * @return the fetched document
 * @throws BusinessException with {@code REQUEST_TIME_OUT} on a socket timeout, or with
 *         {@code BOOK_NOT_FOUND} on any other failure
 */
@Override
public Document fetchBestSellerBookId() {
    try {
        var connection = Jsoup.connect(bestBookUrl);
        connection.timeout(5000);
        connection.userAgent(userAgent);
        connection.ignoreContentType(true);
        return connection.get();
    } catch (SocketTimeoutException timeout) {
        throw new BusinessException(ErrorCode.REQUEST_TIME_OUT);
    } catch (Exception failure) {
        throw new BusinessException(ErrorCode.BOOK_NOT_FOUND);
    }
}

/**
 * Builds the ISBN lookup URL by appending the ISBN to the configured {@code isbnUrl}.
 *
 * @param isbn ISBN to append
 * @return the full lookup URL
 */
private String getIsbnUrl(String isbn) {
    final String lookupUrl = isbnUrl + isbn;
    return lookupUrl;
}

/**
 * Builds the book page URL by appending the book id to the configured {@code bookUrl}.
 *
 * @param bookId book id to append
 * @return the full book URL
 */
private String getBookUrl(String bookId) {
    final String detailUrl = bookUrl + bookId;
    return detailUrl;
}

}
69 changes: 59 additions & 10 deletions src/main/java/com/jisungin/infra/crawler/Yes24Parser.java
Original file line number Diff line number Diff line change
@@ -1,29 +1,78 @@
package com.jisungin.infra.crawler;

import static com.jisungin.infra.crawler.Yes24CrawlerConstant.BOOK_CONTENT_CSS;
import static com.jisungin.infra.crawler.Yes24CrawlerConstant.BOOK_IMAGE_ATTR;
import static com.jisungin.infra.crawler.Yes24CrawlerConstant.BOOK_IMAGE_CSS;
import static com.jisungin.infra.crawler.Yes24CrawlerConstant.ISBN_ATTR;
import static com.jisungin.infra.crawler.Yes24CrawlerConstant.ISBN_CSS;

import com.jayway.jsonpath.JsonPath;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import lombok.Setter;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.safety.Safelist;
import org.jsoup.select.Elements;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.stereotype.Component;

@Component
@Setter
@ConfigurationProperties(prefix = "crawler.yes24.parser")
public class Yes24Parser implements Parser {

private String isbnCss;
private String isbnAttr;
private String bookContentCss;
private String bookJsonCss;
private String bestRankingCss;
private String bestIdCss;
private String bestIdAttrs;

/**
 * Selects the elements matching the ISBN CSS query and returns the value of the
 * ISBN attribute read from that selection.
 *
 * @param doc document to search
 * @return the attribute value
 */
@Override
public String parseIsbn(Document doc) {
return doc.select(ISBN_CSS).attr(ISBN_ATTR);
return doc.select(isbnCss).attr(isbnAttr);
}

/**
 * Builds a {@link CrawlingBook} from a book document.
 *
 * Title, ISBN, image URL, publisher, authors and publication date are read from the
 * JSON found under {@code bookJsonCss}; the content is the text under
 * {@code bookContentCss}, cleaned with {@code Safelist.none()}.
 *
 * @param doc book document to parse
 * @return the parsed book
 */
@Override
public CrawlingBook parseBook(Document doc) {
String image = doc.select(BOOK_IMAGE_CSS).attr(BOOK_IMAGE_ATTR);
String content = Jsoup.clean(doc.select(BOOK_CONTENT_CSS).text(), Safelist.none());
// The selected element's inner HTML is treated as a JSON document.
String json = doc.select(bookJsonCss).html();

String title = parseJsonToString(json, "$.name");
String isbn = parseJsonToString(json, "$.workExample[0].isbn");
String imageUrl = parseJsonToString(json, "$.image");
String publisher = parseJsonToString(json, "$.publisher.name");
String authors = parseJsonToString(json, "$.author.name");
// The thumbnail URL is derived from the image URL by swapping "XL" for "M".
// NOTE(review): replace() changes every "XL" in the URL, not only a size marker — confirm the URL format.
String thumbnail = imageUrl.replace("XL", "M");
String content = Jsoup.clean(doc.select(bookContentCss).text(), Safelist.none());
// The publication date string becomes midnight of that day (see parseDate).
LocalDateTime dateTime = parseDate(parseJsonToString(json, "$.workExample[0].datePublished"));

return CrawlingBook.of(title, content, isbn, publisher, imageUrl, thumbnail, authors, dateTime);
}

/**
 * Pairs each ranking element with the book id found at the same position.
 *
 * <p>Rankings come from {@code bestRankingCss}; book ids are the {@code bestIdAttrs}
 * attribute values of the elements matching {@code bestIdCss}. The i-th ranking is mapped
 * to the i-th book id.
 *
 * @param doc best-seller document
 * @return book ids keyed by parsed ranking
 */
@Override
public Map<Long, String> parseBestSellerBookId(Document doc) {
    Elements rankingElements = doc.select(bestRankingCss);
    Elements idElements = doc.select(bestIdCss);
    List<String> ids = idElements.eachAttr(bestIdAttrs);
    int rankingCount = rankingElements.size();

    return IntStream.range(0, rankingCount)
            .mapToObj(Integer::valueOf)
            .collect(Collectors.toMap(
                    index -> parseRanking(rankingElements.get(index)),
                    index -> ids.get(index)));
}

/**
 * Converts the text of a ranking element to a number.
 *
 * @param rankingElement element whose text is the ranking
 * @return the ranking as a {@code Long}
 * @throws NumberFormatException if the text is not a valid long
 */
private Long parseRanking(Element rankingElement) {
    String rankingText = rankingElement.text();
    return Long.valueOf(rankingText);
}

/**
 * Reads the value at a JsonPath expression from a JSON string.
 *
 * @param json JSON text to read from
 * @param path JsonPath expression
 * @return the value at {@code path}, as a string
 */
private String parseJsonToString(String json, String path) {
    String value = JsonPath.read(json, path);
    return value;
}

return CrawlingBook.of(image, content);
/**
 * Parses a date string and returns midnight at the start of that day.
 *
 * @param dateString date text accepted by {@link LocalDate#parse(CharSequence)}
 * @return the start of the parsed day
 */
private LocalDateTime parseDate(String dateString) {
    LocalDate date = LocalDate.parse(dateString);
    return date.atStartOfDay();
}

}
3 changes: 2 additions & 1 deletion src/main/resources/application.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,5 @@ spring:
prod-env:
- prod
include:
oauth
- oauth
- crawler

0 comments on commit 8cc6f05

Please sign in to comment.