Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/convert r2dbc #22

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions build.gradle.kts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ plugins {
id("org.jlleitschuh.gradle.ktlint") version "11.3.2"
kotlin("jvm") version "1.9.21"
kotlin("plugin.spring") version "1.9.21"
kotlin("plugin.jpa") version "1.9.24"
}

group = "com.yourssu"
Expand All @@ -29,8 +28,10 @@ dependencies {
implementation("org.jetbrains.kotlin:kotlin-reflect")
implementation("io.projectreactor.kotlin:reactor-kotlin-extensions")
implementation("org.jetbrains.kotlinx:kotlinx-coroutines-reactor")
implementation("org.springframework.boot:spring-boot-starter-data-jpa")
runtimeOnly("com.mysql:mysql-connector-j")
// implementation("org.springframework.boot:spring-boot-starter-data-jpa")
implementation("org.springframework.boot:spring-boot-starter-data-r2dbc")
// implementation("io.r2dbc:r2dbc-pool")
implementation("org.mariadb:r2dbc-mariadb:1.1.2")

implementation("org.springdoc:springdoc-openapi-starter-webmvc-ui:2.1.0")

Expand Down
6 changes: 6 additions & 0 deletions src/main/kotlin/com/yourssu/search/SearchApplication.kt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,13 @@ package com.yourssu.search

import org.springframework.boot.autoconfigure.SpringBootApplication
import org.springframework.boot.runApplication
import org.springframework.data.r2dbc.config.EnableR2dbcAuditing
import org.springframework.data.r2dbc.repository.config.EnableR2dbcRepositories
import org.springframework.scheduling.annotation.EnableScheduling

@EnableScheduling
@EnableR2dbcRepositories
@EnableR2dbcAuditing
@SpringBootApplication
class SearchApplication

Expand Down
15 changes: 15 additions & 0 deletions src/main/kotlin/com/yourssu/search/crawling/config/R2dbcConfig.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
package com.yourssu.search.crawling.config

import io.r2dbc.spi.ConnectionFactory
import org.springframework.context.annotation.Bean
import org.springframework.context.annotation.Configuration
import org.springframework.data.r2dbc.core.R2dbcEntityTemplate

@Configuration
class R2dbcConfig(private val connectionFactory: ConnectionFactory) {

    /**
     * Exposes an [R2dbcEntityTemplate] backed by the application's
     * [ConnectionFactory], enabling entity-level reactive queries
     * outside of the repository abstraction.
     */
    @Bean
    fun r2dbcEntityTemplate(): R2dbcEntityTemplate = R2dbcEntityTemplate(connectionFactory)
}
Original file line number Diff line number Diff line change
@@ -1,25 +1,17 @@
package com.yourssu.search.crawling.domain

import jakarta.persistence.Column
import jakarta.persistence.Entity
import jakarta.persistence.EnumType
import jakarta.persistence.Enumerated
import jakarta.persistence.GeneratedValue
import jakarta.persistence.GenerationType
import jakarta.persistence.Id
import jakarta.persistence.Table
import org.springframework.data.annotation.Id
import org.springframework.data.relational.core.mapping.Column
import org.springframework.data.relational.core.mapping.Table

@Entity
@Table(name = "information_url")
@Table("information_url")
class InformationUrl(
@field:Id
@field:GeneratedValue(strategy = GenerationType.IDENTITY)
@Id
val id: Long? = null,

@field:Column(name = "content_url", nullable = false, unique = true, length = 500)
@Column("content_url")
val contentUrl: String,

@field:Column(name = "source_type", nullable = false)
@field:Enumerated(EnumType.STRING)
@Column("source_type")
val sourceType: SourceType
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
package com.yourssu.search.crawling.repository

import com.yourssu.search.crawling.domain.Information
import org.springframework.data.elasticsearch.repository.CoroutineElasticsearchRepository

/**
 * Coroutine-based Elasticsearch repository for [Information] documents,
 * keyed by their String id. Inherits the suspending CRUD operations from
 * [CoroutineElasticsearchRepository]; no custom query methods are declared.
 */
interface CoroutineInformationRepository : CoroutineElasticsearchRepository<Information, String>
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,11 @@ package com.yourssu.search.crawling.repository

import com.yourssu.search.crawling.domain.InformationUrl
import com.yourssu.search.crawling.domain.SourceType
import org.springframework.data.jpa.repository.JpaRepository
import kotlinx.coroutines.flow.Flow
import org.springframework.data.repository.kotlin.CoroutineCrudRepository
import org.springframework.stereotype.Repository

@Repository
interface InformationUrlRepository : JpaRepository<InformationUrl, String> {
fun findAllBySourceType(sourceType: SourceType): List<InformationUrl>
interface InformationUrlRepository : CoroutineCrudRepository<InformationUrl, Long> {
fun findAllBySourceType(sourceType: SourceType): Flow<InformationUrl>
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
package com.yourssu.search.crawling.scheduler

import com.yourssu.search.crawling.service.CrawlingService
import kotlinx.coroutines.runBlocking
import org.slf4j.LoggerFactory
import org.springframework.scheduling.annotation.Scheduled
import org.springframework.stereotype.Component

@Component
class CrawlingScheduler(
    private val crawlingService: CrawlingService
) {
    private val log = LoggerFactory.getLogger(this::class.java)

    /**
     * Runs every day at midnight (cron fields: sec min hour day month weekday)
     * and executes every registered crawling strategy in turn.
     *
     * [runBlocking] bridges the scheduler's blocking thread into the suspend
     * world; the thread is held for the whole crawl, which is acceptable for
     * a once-a-day batch job.
     *
     * Each strategy is isolated in its own try/catch so that one failing
     * crawl does not abort the remaining strategies.
     */
    @Scheduled(cron = "0 0 0 * * ?") // run daily at midnight
    fun scheduleCrawling() = runBlocking {
        // Iterate over the injected strategy map's keys and run each crawl.
        crawlingService.strategies.keys.forEach { key ->
            try {
                crawlingService.executeCrawling(key)
            } catch (e: Exception) {
                // Log and continue: a single broken source must not stop the batch.
                log.error("Crawling failed for strategy '{}'", key, e)
            }
        }
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,12 @@ import com.yourssu.search.crawling.repository.InformationRepository
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.withContext
import org.springframework.stereotype.Service
import org.springframework.transaction.annotation.Transactional

@Service
class CrawlingService(
private val strategies: Map<String, CrawlingStrategy>,
private val informationRepository: InformationRepository,
public val strategies: Map<String, CrawlingStrategy>,
private val informationRepository: InformationRepository
) {

suspend fun executeCrawling(strategyKey: String) {
Expand All @@ -17,6 +18,7 @@ class CrawlingService(
strategy.crawl()
}

@Transactional
suspend fun deleteData() {
withContext(Dispatchers.IO) {
informationRepository.deleteAll()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@ package com.yourssu.search.crawling.service

/**
 * Strategy contract for crawling a single content source.
 *
 * Implementations appear to be registered as named Spring components
 * (e.g. "fun", "notice") and selected by key at execution time —
 * confirm against CrawlingService's strategy map.
 */
interface CrawlingStrategy {
    // Performs one complete crawl run for this strategy's source.
    suspend fun crawl()
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import kotlin.time.measureTimedValue
@Component("fun")
class FunCrawlingStrategy(
private val crawlingUtils: CrawlingUtils
): CrawlingStrategy {
) : CrawlingStrategy {
private val log = LoggerFactory.getLogger(this::class.java)

override suspend fun crawl() {
Expand All @@ -25,8 +25,10 @@ class FunCrawlingStrategy(

val toSaveDocuments: List<Element> =
crawlingUtils.filteringToSaveDocuments(allDocuments, SourceType.FUN, urlSelector)
val flattenedDocuments: List<Element> = allDocuments.flatten()

crawlingUtils.crawlingContents(
// toSaveDocuments = flattenedDocuments,
toSaveDocuments = toSaveDocuments,
titleSelector = ".content .title",
contentSelector = "div .description p",
Expand All @@ -38,4 +40,4 @@ class FunCrawlingStrategy(
}
log.info("all time use {}", duration.duration.inWholeSeconds)
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import kotlin.time.measureTimedValue
@Component("notice")
class NoticeCrawlingStrategy(
private val crawlingUtils: CrawlingUtils
): CrawlingStrategy {
) : CrawlingStrategy {
private val log = LoggerFactory.getLogger(this::class.java)

override suspend fun crawl() {
Expand All @@ -25,12 +25,12 @@ class NoticeCrawlingStrategy(

val toSaveDocuments: List<Element> =
crawlingUtils.filteringToSaveDocuments(
lists = allDocuments,
sourceType = SourceType.NOTICE,
urlSelector = urlSelector
lists = allDocuments, sourceType = SourceType.NOTICE, urlSelector = urlSelector
)
val flattenedDocuments: List<Element> = allDocuments.flatten()

crawlingUtils.crawlingContents(
// toSaveDocuments = flattenedDocuments,
toSaveDocuments = toSaveDocuments,
titleSelector = ".notice_col3 a .d-inline-blcok.m-pt-5",
contentSelector = "div.bg-white p",
Expand All @@ -42,4 +42,4 @@ class NoticeCrawlingStrategy(
}
log.info("all time use {}", duration.duration.inWholeSeconds)
}
}
}
74 changes: 46 additions & 28 deletions src/main/kotlin/com/yourssu/search/crawling/utils/CrawlingUtils.kt
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,14 @@ package com.yourssu.search.crawling.utils
import com.yourssu.search.crawling.domain.Information
import com.yourssu.search.crawling.domain.InformationUrl
import com.yourssu.search.crawling.domain.SourceType
import com.yourssu.search.crawling.repository.InformationRepository
import com.yourssu.search.crawling.repository.CoroutineInformationRepository
import com.yourssu.search.crawling.repository.InformationUrlRepository
import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.Job
import kotlinx.coroutines.channels.Channel
import kotlinx.coroutines.coroutineScope
import kotlinx.coroutines.flow.toList
import kotlinx.coroutines.joinAll
import kotlinx.coroutines.launch
import kotlinx.coroutines.withContext
Expand All @@ -18,6 +19,7 @@ import org.jsoup.nodes.Element
import org.slf4j.LoggerFactory
import org.springframework.beans.factory.annotation.Value
import org.springframework.stereotype.Component
import org.springframework.transaction.annotation.Transactional
import java.io.FileNotFoundException
import java.time.LocalDate
import java.util.concurrent.atomic.AtomicBoolean
Expand All @@ -26,13 +28,14 @@ import java.util.regex.Pattern

@Component
class CrawlingUtils(
private val informationRepository: InformationRepository,
private val coroutineElasticsearchRepository: CoroutineInformationRepository,
private val informationUrlRepository: InformationUrlRepository,

@Value("\${general.user-agent}")
private val userAgent: String,

private val coroutineScope: CoroutineScope = CoroutineScope(Dispatchers.IO),
private val coroutineInformationRepository: CoroutineInformationRepository
) {
private val log = LoggerFactory.getLogger(this::class.java)

Expand All @@ -43,7 +46,7 @@ class CrawlingUtils(
): List<Element> {
val savedData: List<InformationUrl>
withContext(Dispatchers.IO) {
savedData = informationUrlRepository.findAllBySourceType(sourceType)
savedData = informationUrlRepository.findAllBySourceType(sourceType).toList()
}
val savedUrls = savedData.map { it.contentUrl }

Expand Down Expand Up @@ -97,7 +100,6 @@ class CrawlingUtils(
resultList
}


private suspend fun fetchPage(baseUrl: String, pageNumber: Int, ulSelector: String): List<Element> {
val document = Jsoup.connect("$baseUrl/$pageNumber")
.userAgent(userAgent)
Expand All @@ -117,6 +119,7 @@ class CrawlingUtils(
return contents
}

@Transactional
suspend fun crawlingContents(
toSaveDocuments: List<Element>,
titleSelector: String,
Expand All @@ -126,8 +129,6 @@ class CrawlingUtils(
favicon: String?,
sourceType: SourceType
) {
val urlChannel = Channel<InformationUrl>(Channel.UNLIMITED)

val contentJobs: List<Job> = toSaveDocuments.map { element ->
coroutineScope.launch {
val rawDate = element.selectFirst(dateSelector)?.text() ?: ""
Expand All @@ -151,32 +152,37 @@ class CrawlingUtils(
return@launch
}

urlChannel.send(InformationUrl(contentUrl = contentUrl, sourceType = sourceType))

informationRepository.save(
Information(
title = title,
content = content.toString().trim(),
date = extractedDate,
contentUrl = contentUrl,
imgList = imgList,
favicon = favicon,
source = sourceType.value
try {
// `InformationUrl`을 즉시 저장
informationUrlRepository.save(
InformationUrl(
contentUrl = contentUrl,
sourceType = sourceType
)
)
log.info("Saved URL: $contentUrl")

// `Information`도 저장
coroutineInformationRepository.save(
Information(
title = title,
content = content.toString().trim(),
date = extractedDate,
contentUrl = contentUrl,
imgList = imgList,
favicon = favicon,
source = sourceType.value
)
)
)
// log.info("Saved Information for URL: $contentUrl")
} catch (e: Exception) {
// log.error("Error saving URL or Information for $contentUrl", e)
}
}
}

contentJobs.joinAll()
urlChannel.close()

val toSaveUrls = mutableListOf<InformationUrl>()
for (url in urlChannel) {
toSaveUrls.add(url)
}

val distinctUrls = toSaveUrls.distinctBy { it.contentUrl }
informationUrlRepository.saveAll(distinctUrls)
log.info("Crawling and saving completed.")
}

private fun extractDate(dateStr: String): LocalDate? {
Expand All @@ -197,4 +203,16 @@ class CrawlingUtils(
null // 정규표현식에 맞지 않으면 null 반환
}
}
}

/*@Transactional
suspend fun saveAllWithRollback(urls: List<InformationUrl>) {
urls.forEachIndexed { index, url ->
informationUrlRepository.save(url)

// 인위적으로 예외 발생
if (index == 90) {
throw RuntimeException("Simulated exception for rollback")
}
}
}*/
}
Loading