ES对word等文档进行全文索引
大约 2 分钟
ES对word等文档进行全文索引
安装es和插件
Ingest-attachment插件下载(注意:可单独下载的最后一个插件版本是 8.3.3,从 8.4.0 开始该功能已内置于 Elasticsearch,不再单独发布;插件版本必须与 ES 版本一致,本文使用 7.12.1)、IK分词器
# Create a dedicated Docker network for the Elasticsearch container.
docker network create es-net
# Start Elasticsearch 7.12.1 in the background as a single node with a 256 MB JVM heap.
# Index data is kept in the named volume "es-data"; the host directory /data/es/plugins
# is mounted as the container's plugin directory. Ports 9200 and 9300 are published.
docker run -d --name es \
-e "discovery.type=single-node" \
-e "ES_JAVA_OPTS=-Xms256m -Xmx256m" \
-v es-data:/usr/share/elasticsearch/data \
-v /data/es/plugins:/usr/share/elasticsearch/plugins \
--privileged \
--network es-net \
-p 9200:9200 \
-p 9300:9300 \
elasticsearch:7.12.1
使用
文本抽取管道
创建
- attachment 是管道的名称
PUT /_ingest/pipeline/attachment
{
  // Extracts text from the Base64 payload in "content", then drops the payload
  // so the raw file is not stored in the index.
  "description": "提取附件信息",
  "processors": [
    {
      "attachment": {
        "field": "content",
        "ignore_missing": true
      }
    },
    {
      // ignore_missing mirrors the attachment processor: a document without
      // "content" would otherwise make the remove processor fail.
      "remove": {
        "field": "content",
        "ignore_missing": true
      }
    }
  ]
}
删除
# Deletes the ingest pipeline named "attachment".
DELETE /_ingest/pipeline/attachment
索引
PUT {{domain}}/docwrite
{
  "mappings": {
    "properties": {
      // Metadata fields supplied by the client.
      "id":   { "type": "keyword" },
      "name": { "type": "text", "analyzer": "ik_max_word" },
      "type": { "type": "keyword" },
      "url":  { "type": "keyword" },
      // Text extracted by the "attachment" ingest processor.
      "attachment": {
        "properties": {
          "content": { "type": "text", "analyzer": "ik_smart" }
        }
      }
    }
  }
}
文档-上传文件
POST {{domain}}/docwrite/_doc?pipeline=attachment
{
  "id": 1,
  "name": "宇墨大论文",
  "type": "word",
  "url": "http://xxx",
  // Base64 of the file bytes; consumed by the "attachment" pipeline.
  "content": "文件的Base64编码"
}
查文档
GET {{domain}}/docwrite/_search
{
  // Return only the metadata fields, not the extracted text.
  "_source": ["name", "type", "url"],
  "query": {
    // "match_all":{}
    "match": {
      "attachment.content": {
        "query": "角色大比拼",
        "analyzer": "ik_smart"
      }
    }
  }
}
JAVA整合
RestTemplate配置
/**
 * Spring configuration that exposes a {@link RestTemplate} backed by an
 * Apache HttpClient with explicit connection and timeout settings.
 */
@Configuration
public class HttpClientConfig {

    /** Connect and socket timeout in milliseconds (5000 * 100 = 500,000 ms). */
    private static final int CONNECT_AND_SOCKET_TIMEOUT_MS = 5000 * 100;

    /** Timeout in milliseconds for leasing a connection from the pool. */
    private static final int CONNECTION_REQUEST_TIMEOUT_MS = 5000;

    /** Maximum lifetime of a pooled connection, in seconds. */
    private static final long CONNECTION_TTL_SECONDS = 30;

    /**
     * Builds the application-wide {@link RestTemplate}.
     *
     * @return a RestTemplate whose requests go through the configured HttpClient
     */
    @Bean
    public RestTemplate restTemplate() {
        RestTemplateBuilder builder = new RestTemplateBuilder();
        return builder.requestFactory(this::buildRequestFactory).build();
    }

    /** Creates the HttpClient-based request factory with the timeouts defined above. */
    private HttpComponentsClientHttpRequestFactory buildRequestFactory() {
        RequestConfig requestConfig = RequestConfig.custom()
                .setConnectTimeout(CONNECT_AND_SOCKET_TIMEOUT_MS)
                .setSocketTimeout(CONNECT_AND_SOCKET_TIMEOUT_MS)
                .setConnectionRequestTimeout(CONNECTION_REQUEST_TIMEOUT_MS)
                .build();

        CloseableHttpClient client = HttpClients.custom()
                .setConnectionTimeToLive(CONNECTION_TTL_SECONDS, TimeUnit.SECONDS)
                .setDefaultRequestConfig(requestConfig)
                .build();

        return new HttpComponentsClientHttpRequestFactory(client);
    }
}
实体类
/**
 * Document stored in the {@code docwrite} Elasticsearch index.
 *
 * <p>The index is not created from this class ({@code createIndex = false}).
 * NOTE(review): the {@code name} analyzer here is {@code ik_smart}; confirm it
 * agrees with the analyzer declared in the actual index mapping.
 */
@Data
@AllArgsConstructor
@NoArgsConstructor
@Accessors(chain = true)
@Document(indexName = "docwrite", replicas = 1, shards = 1, createIndex = false)
public class FileObj {

    /** File id. */
    @Id
    @Field(index = true, store = true, type = FieldType.Keyword)
    private String id;

    /** File name. */
    @Field(index = true, store = true, type = FieldType.Text, analyzer = "ik_smart")
    private String name;

    /** File type, e.g. pdf, word or txt. */
    @Field(index = true, store = true, type = FieldType.Keyword)
    private String type;

    /** URL of the file. */
    @Field(index = true, store = true, type = FieldType.Keyword)
    private String url;

    /**
     * Base64-encoded file bytes sent with the indexing request.
     * Added so the five-argument constructor used when uploading exists;
     * presumably consumed by the "attachment" ingest pipeline — verify against
     * the pipeline definition.
     */
    private String content;

    /**
     * Creates a document without file content; kept so code written against
     * the original four-field class still compiles.
     *
     * @param id   file id
     * @param name file name
     * @param type file type
     * @param url  file URL
     */
    public FileObj(String id, String name, String type, String url) {
        this.id = id;
        this.name = name;
        this.type = type;
        this.url = url;
    }
}
Controller
/**
 * Indexes an uploaded file through the {@code attachment} ingest pipeline.
 *
 * <p>The file bytes are Base64-encoded, wrapped in a {@link FileObj}, serialized
 * to JSON and posted to the {@code docwrite} index.
 *
 * @param file the uploaded file
 * @return the raw response body returned by Elasticsearch
 * @throws java.io.UncheckedIOException if the uploaded file cannot be read
 */
@PostMapping("/test3")
public String test3(@RequestParam MultipartFile file) {
    // Reading the upload is the only step that throws a checked IOException.
    // Fail loudly with the cause instead of swallowing it and returning a placeholder.
    final byte[] bytes;
    try {
        bytes = file.getBytes();
    } catch (IOException e) {
        throw new java.io.UncheckedIOException("Failed to read uploaded file", e);
    }
    String base64EncodedString = Base64.getEncoder().encodeToString(bytes);

    // Elasticsearch host, port and target index.
    String elasticHost = "http://192.168.67.5";
    int elasticPort = 9200;
    String indexName = "docwrite";
    String url = String.format("%s:%d/%s/_doc?pipeline=attachment", elasticHost, elasticPort, indexName);

    // Build the document JSON.
    String jsonString = JSON.toJSONString(new FileObj("1", "宇墨的大论文1", "word", "http://xxx", base64EncodedString));

    // Send it as application/json.
    HttpHeaders headers = new HttpHeaders();
    headers.setContentType(MediaType.APPLICATION_JSON);
    HttpEntity<String> requestEntity = new HttpEntity<>(jsonString, headers);

    // Post the request and return the response body.
    ResponseEntity<String> response = restTemplate.postForEntity(url, requestEntity, String.class);
    return response.getBody();
}
