Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update Lindera to 0.38.0 #85

Merged
merged 1 commit into from
Nov 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,19 @@ on:

jobs:
create-release:
name: Create Release
name: Upload artifact
runs-on: ubuntu-latest
steps:
- id: create-release
uses: actions/create-release@v1.0.0
uses: softprops/action-gh-release@v2
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
name: Release ${{ github.ref_name }}
tag_name: ${{ github.ref }}
release_name: Release ${{ github.ref }}
draft: false
prerelease: false
generate_release_notes: true

publish-crates:
name: Publish crate
Expand Down
133 changes: 0 additions & 133 deletions CHANGES.md

This file was deleted.

23 changes: 12 additions & 11 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,29 +1,30 @@
[package]
name = "lindera-tantivy"
version = "0.32.2"
version = "0.38.0"
edition = "2021"
description = "Lindera Tokenizer for Tantivy."
documentation = "https://docs.rs/lindera-tantivy"
homepage = "https://github.com/lindera-morphology/lindera-tantivy"
repository = "https://github.com/lindera-morphology/lindera-tantivy"
homepage = "https://github.com/lindera/lindera-tantivy"
repository = "https://github.com/lindera/lindera-tantivy"
readme = "README.md"
keywords = ["tokenizer", "tantivy", "lindera"]
categories = ["text-processing"]
license = "MIT"

[features]
default = []
ipadic = ["lindera-tokenizer/ipadic"] # Japanese dictionary
unidic = ["lindera-tokenizer/unidic"] # Japanese dictionary
ko-dic = ["lindera-tokenizer/ko-dic"] # Korean dictionary
cc-cedict = ["lindera-tokenizer/cc-cedict"] # Chinese dictionary
default = [] # No dictionaries included
ipadic = ["lindera/ipadic"] # Include IPADIC dictionary (Japanese)
ipadic-neologd = ["lindera/ipadic-neologd"] # Include IPADIC NEologd dictionary (Japanese)
unidic = ["lindera/unidic"] # Include UniDic dictionary (Japanese)
ko-dic = ["lindera/ko-dic"] # Include ko-dic dictionary (Korean)
cc-cedict = ["lindera/cc-cedict"] # Include CC-CEDICT dictionary (Chinese)
compress = ["lindera/compress"] # Compress dictionaries

[dependencies]
tantivy-tokenizer-api = "0.3.0"
tantivy = "0.22.0"

lindera-core = "0.32.2"
lindera-dictionary = "0.32.2"
lindera-tokenizer = "0.32.2"
lindera = "0.38.0"

[dev-dependencies]
criterion = { version = "0.5.1", features = ["html_reports"] }
Expand Down
150 changes: 138 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,21 +12,24 @@ The following example enables IPADIC.

```
[dependencies]
lindera-core = "0.32.2"
lindera-dictionary = "0.32.2"
lindera-tantivy = { version = "0.32.2", features = ["ipadic"] }
lindera = "0.38"
lindera-tantivy = { version = "0.38.0", features = ["ipadic"] }
```

### Basic example

```rust
fn main() -> tantivy::Result<()> {
use tantivy::{
collector::TopDocs, doc, query::QueryParser, schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions}, Document, Index, TantivyDocument
collector::TopDocs,
doc,
query::QueryParser,
schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
Document, Index, TantivyDocument,
};

use lindera_core::mode::Mode;
use lindera_dictionary::{DictionaryLoader, DictionaryConfig, DictionaryKind};
use lindera::dictionary::DictionaryKind;
use lindera::{dictionary::load_dictionary_from_kind, mode::Mode, segmenter::Segmenter};
use lindera_tantivy::tokenizer::LinderaTokenizer;

// create schema builder
Expand Down Expand Up @@ -75,12 +78,11 @@ fn main() -> tantivy::Result<()> {
let index = Index::create_in_ram(schema.clone());

// Tokenizer with IPADIC
let dictionary_config = DictionaryConfig {
kind: Some(DictionaryKind::IPADIC),
path: None,
};
let dictionary = DictionaryLoader::load_dictionary_from_config(dictionary_config).unwrap();
let tokenizer = LinderaTokenizer::new(dictionary, None, Mode::Normal);
let mode = Mode::Normal;
let dictionary = load_dictionary_from_kind(DictionaryKind::IPADIC).unwrap();
let user_dictionary = None;
let segmenter = Segmenter::new(mode, dictionary, user_dictionary);
let tokenizer = LinderaTokenizer::from_segmenter(segmenter);

// register Lindera tokenizer
index.tokenizers().register("lang_ja", tokenizer);
Expand Down Expand Up @@ -138,6 +140,130 @@ fn main() -> tantivy::Result<()> {
}
```

### Configuration by YAML

```rust
use std::path::PathBuf;

fn main() -> tantivy::Result<()> {
use tantivy::{
collector::TopDocs,
doc,
query::QueryParser,
schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions},
Document, Index, TantivyDocument,
};

use lindera_tantivy::tokenizer::LinderaTokenizer;

// create schema builder
let mut schema_builder = Schema::builder();

// add id field
let id = schema_builder.add_text_field(
"id",
TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_tokenizer("raw")
.set_index_option(IndexRecordOption::Basic),
)
.set_stored(),
);

// add title field
let title = schema_builder.add_text_field(
"title",
TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_tokenizer("lang_ja")
.set_index_option(IndexRecordOption::WithFreqsAndPositions),
)
.set_stored(),
);

// add body field
let body = schema_builder.add_text_field(
"body",
TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_tokenizer("lang_ja")
.set_index_option(IndexRecordOption::WithFreqsAndPositions),
)
.set_stored(),
);

// build schema
let schema = schema_builder.build();

// create index on memory
let index = Index::create_in_ram(schema.clone());

// Build tokenizer with config file
let config_file = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
.join("./examples")
.join("lindera.yml");
let tokenizer = LinderaTokenizer::from_file(config_file.as_path())?;

// register Lindera tokenizer
index.tokenizers().register("lang_ja", tokenizer);

// create index writer
let mut index_writer = index.writer(50_000_000)?;

// add document
index_writer.add_document(doc!(
id => "1",
title => "成田国際空港",
body => "成田国際空港(なりたこくさいくうこう、英: Narita International Airport)は、千葉県成田市南東部から芝山町北部にかけて建設された日本最大の国際拠点空港である。首都圏東部(東京の東60km)に位置している。空港コードはNRT。"
)).unwrap();

// add document
index_writer.add_document(doc!(
id => "2",
title => "東京国際空港",
body => "東京国際空港(とうきょうこくさいくうこう、英語: Tokyo International Airport)は、東京都大田区にある日本最大の空港。通称は羽田空港(はねだくうこう、英語: Haneda Airport)であり、単に「羽田」と呼ばれる場合もある。空港コードはHND。"
)).unwrap();

// add document
index_writer.add_document(doc!(
id => "3",
title => "関西国際空港",
body => "関西国際空港(かんさいこくさいくうこう、英: Kansai International Airport)は大阪市の南西35㎞に位置する西日本の国際的な玄関口であり、関西三空港の一つとして大阪国際空港(伊丹空港)、神戸空港とともに関西エアポート株式会社によって一体運営が行われている。"
)).unwrap();

// commit
index_writer.commit()?;

// create reader
let reader = index.reader()?;

// create searcher
let searcher = reader.searcher();

// create query parser
let query_parser = QueryParser::for_index(&index, vec![title, body]);

// parse query
let query_str = "TOKYO";
let query = query_parser.parse_query(query_str)?;
println!("Query String: {}", query_str);

// search
println!("Parsed Query: {:?}", query);
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
println!("Search Result:");
for (_, doc_address) in top_docs {
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
println!("{}", retrieved_doc.to_json(&schema));
}

Ok(())
}
```

## API reference

The API reference is available. Please see following URL:
Expand Down
Loading
Loading