跳至主要內容

ES对word等文档进行全文索引

程序员李某某大约 2 分钟

ES对word等文档进行全文索引

安装es和插件

Ingest-attachment 插件下载；IK 分词器下载。注意：插件版本必须与所使用的 ES 版本完全一致（本文使用 7.12.1）。Ingest-attachment 插件的最后版本是 8.8.3，从 8.4.0 版本之后不再更新。

# Create a dedicated Docker network for the Elasticsearch container.
docker network create es-net

# Start a detached, single-node Elasticsearch 7.12.1 container named "es":
#   - JVM heap fixed at 256 MB via ES_JAVA_OPTS
#   - index data kept in the "es-data" volume
#   - plugins loaded from the host directory /data/es/plugins
#   - container ports 9200 and 9300 published on the same host ports
docker run \
    --detach \
    --name es \
    --network es-net \
    --privileged \
    --publish 9200:9200 \
    --publish 9300:9300 \
    --volume es-data:/usr/share/elasticsearch/data \
    --volume /data/es/plugins:/usr/share/elasticsearch/plugins \
    --env "discovery.type=single-node" \
    --env "ES_JAVA_OPTS=-Xms256m -Xmx256m" \
    elasticsearch:7.12.1

使用

文本抽取管道

创建

  • attachment 是管道的名称
PUT /_ingest/pipeline/attachment

{
    "description": "提取附件信息",
    "processors": [
        {
            "attachment": {
                "field": "content",
                "ignore_missing": true
            }
        },
        {
            "remove": {
                "field": "content",
                "ignore_missing": true
            }
        }
    ]
}

删除

DELETE /_ingest/pipeline/attachment

索引

PUT {{domain}}/docwrite

{
    "mappings": {
        "properties": {
            "id": {
                "type": "keyword"
            },
            "name": {
                "type": "text",
                "analyzer": "ik_max_word"
            },
            "type": {
                "type": "keyword"
            },
            "url": {
                "type": "keyword"
            },
            "attachment": {
                "properties": {
                    "content": {
                        "type": "text",
                        "analyzer": "ik_smart"
                    }
                }
            }
        }
    }
}

文档-上传文件

POST {{domain}}/docwrite/_doc?pipeline=attachment

{
    "id":1,
    "name":"宇墨大论文",
    "type":"word",
    "url":"http://xxx",
    "content":"文件的Base64编码"
}

查文档

GET {{domain}}/docwrite/_search

{
    "_source":["name","type","url"],
    "query":{
        // "match_all":{}
        "match":{
            "attachment.content":{
                "query":"角色大比拼",
                "analyzer":"ik_smart"
            }
        }
    }
}

JAVA整合

RestTemplate配置

/**
 * Spring configuration that exposes a {@link RestTemplate} backed by an
 * Apache HttpClient instance with explicit timeouts.
 */
@Configuration
public class HttpClientConfig {

    /**
     * Builds the {@link RestTemplate} bean using the request factory
     * produced by {@link #requestFactory()}.
     *
     * @return a RestTemplate that sends requests through Apache HttpClient
     */
    @Bean
    public RestTemplate restTemplate() {
        RestTemplateBuilder builder = new RestTemplateBuilder();
        return builder.requestFactory(() -> requestFactory()).build();
    }

    /**
     * Creates the HttpClient-based request factory.
     *
     * @return a request factory wrapping a freshly built HttpClient
     */
    private HttpComponentsClientHttpRequestFactory requestFactory() {
        // Connect and socket timeouts are 5000*100 = 500000; the wait for a
        // pooled connection is 5000.
        // NOTE(review): the long timeouts are presumably meant for large
        // Base64 uploads - confirm they are intentional.
        RequestConfig timeouts = RequestConfig.custom()
                .setConnectionRequestTimeout(5000)
                .setConnectTimeout(5000 * 100)
                .setSocketTimeout(5000 * 100)
                .build();

        CloseableHttpClient client = HttpClients.custom()
                .setDefaultRequestConfig(timeouts)
                .setConnectionTimeToLive(30, TimeUnit.SECONDS)
                .build();

        return new HttpComponentsClientHttpRequestFactory(client);
    }
}

实体类

/**
 * Payload describing a file that is posted to the {@code docwrite} index.
 *
 * <p>Besides the metadata fields it carries the Base64-encoded file in
 * {@code content}, so the Lombok-generated all-args constructor takes
 * (id, name, type, url, content).
 */
@Data
@AllArgsConstructor
@NoArgsConstructor
@Accessors(chain = true)
@Document(indexName = "docwrite", replicas = 1, shards = 1, createIndex = false)
public class FileObj {

    /** File id. */
    @Id
    @Field(index = true, store = true, type = FieldType.Keyword)
    private String id;

    /**
     * File name.
     * NOTE(review): the index mapping for docwrite declares ik_max_word for
     * this field while the annotation says ik_smart; confirm which is intended.
     */
    @Field(index = true, store = true, type = FieldType.Text, analyzer = "ik_smart")
    private String name;

    /** File type, e.g. pdf, word or txt. */
    @Field(index = true, store = true, type = FieldType.Keyword)
    private String type;

    /** URL of the file. */
    @Field(index = true, store = true, type = FieldType.Keyword)
    private String url;

    /**
     * Base64-encoded file content, sent under the "content" key that the
     * attachment ingest pipeline reads and then removes.
     */
    private String content;
}

Controller

/**
 * Demo endpoint: Base64-encodes the uploaded file and posts it to the
 * {@code docwrite} index through the {@code attachment} ingest pipeline.
 *
 * <p>The document id, name, type and url are fixed sample values.
 *
 * @param file the uploaded file to index
 * @return the raw response body returned by Elasticsearch
 * @throws java.io.UncheckedIOException if the uploaded file cannot be read
 */
@PostMapping("/test3")
public String test3(@RequestParam MultipartFile file) {
    final String base64EncodedString;
    try {
        base64EncodedString = Base64.getEncoder().encodeToString(file.getBytes());
    } catch (IOException e) {
        // Surface the failure with its cause instead of discarding it and
        // returning a placeholder string.
        throw new java.io.UncheckedIOException(
                "Failed to read uploaded file: " + file.getOriginalFilename(), e);
    }

    // Elasticsearch host, port and index name
    String elasticHost = "http://192.168.67.5";
    int elasticPort = 9200;
    String indexName = "docwrite";
    String url = String.format("%s:%d/%s/_doc?pipeline=attachment", elasticHost, elasticPort, indexName);

    // Build the document JSON: id, name, type, url, Base64 content
    String jsonString = JSON.toJSONString(new FileObj("1", "宇墨的大论文1", "word", "http://xxx", base64EncodedString));

    // Request headers
    HttpHeaders headers = new HttpHeaders();
    headers.setContentType(MediaType.APPLICATION_JSON);

    HttpEntity<String> requestEntity = new HttpEntity<>(jsonString, headers);

    // Send the request and return the response body
    ResponseEntity<String> response = restTemplate.postForEntity(url, requestEntity, String.class);
    return response.getBody();
}
上次编辑于:
贡献者: liyuanhao,李元昊