321 lines
11 KiB
Python
321 lines
11 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
# -*- coding: utf-8 -*-
|
|||
|
|
"""完整导入小学英语核心词汇到数据库(包含所有字段)"""
|
|||
|
|
|
|||
|
|
import pandas as pd
|
|||
|
|
import mysql.connector
|
|||
|
|
from datetime import datetime
|
|||
|
|
import re
|
|||
|
|
|
|||
|
|
# 数据库配置
|
|||
|
|
db_config = {
|
|||
|
|
'host': 'localhost',
|
|||
|
|
'port': 3306,
|
|||
|
|
'user': 'root',
|
|||
|
|
'password': 'JKjk20011115',
|
|||
|
|
'database': 'ai_english_learning',
|
|||
|
|
'charset': 'utf8mb4'
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
# 词汇书ID
|
|||
|
|
BOOK_ID = 'primary_core_1000'
|
|||
|
|
|
|||
|
|
def clean_text(text):
|
|||
|
|
"""清理文本,处理nan和空值"""
|
|||
|
|
if pd.isna(text) or str(text).strip() == '' or str(text).strip() == 'nan':
|
|||
|
|
return None
|
|||
|
|
return str(text).strip()
|
|||
|
|
|
|||
|
|
def extract_part_of_speech(translation):
|
|||
|
|
"""从中文翻译中提取词性"""
|
|||
|
|
if not translation:
|
|||
|
|
return 'noun'
|
|||
|
|
|
|||
|
|
pos_map = {
|
|||
|
|
'v.': 'verb',
|
|||
|
|
'n.': 'noun',
|
|||
|
|
'adj.': 'adjective',
|
|||
|
|
'adv.': 'adverb',
|
|||
|
|
'prep.': 'preposition',
|
|||
|
|
'conj.': 'conjunction',
|
|||
|
|
'pron.': 'pronoun',
|
|||
|
|
'interj.': 'interjection'
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
for abbr, full in pos_map.items():
|
|||
|
|
if abbr in translation or abbr.replace('.', '') in translation:
|
|||
|
|
return full
|
|||
|
|
|
|||
|
|
# 中文词性判断
|
|||
|
|
if '动' in translation:
|
|||
|
|
return 'verb'
|
|||
|
|
elif '形' in translation or '容' in translation:
|
|||
|
|
return 'adjective'
|
|||
|
|
elif '副' in translation:
|
|||
|
|
return 'adverb'
|
|||
|
|
elif '介' in translation:
|
|||
|
|
return 'preposition'
|
|||
|
|
elif '连' in translation:
|
|||
|
|
return 'conjunction'
|
|||
|
|
|
|||
|
|
return 'noun' # 默认名词
|
|||
|
|
|
|||
|
|
def import_words_from_excel(file_path):
|
|||
|
|
"""从Excel导入单词"""
|
|||
|
|
try:
|
|||
|
|
# 读取Excel文件
|
|||
|
|
print(f"📖 正在读取文件: {file_path}")
|
|||
|
|
df = pd.read_excel(file_path)
|
|||
|
|
|
|||
|
|
print(f"📊 文件列名: {df.columns.tolist()}")
|
|||
|
|
print(f"📊 总行数: {len(df)}")
|
|||
|
|
|
|||
|
|
# 连接数据库
|
|||
|
|
conn = mysql.connector.connect(**db_config)
|
|||
|
|
cursor = conn.cursor()
|
|||
|
|
|
|||
|
|
# 先清空旧数据
|
|||
|
|
print("\n清理旧数据...")
|
|||
|
|
cursor.execute("DELETE FROM ai_vocabulary_book_words WHERE book_id = %s", (BOOK_ID,))
|
|||
|
|
cursor.execute("""
|
|||
|
|
DELETE v FROM ai_vocabulary v
|
|||
|
|
LEFT JOIN ai_vocabulary_book_words bw ON bw.vocabulary_id = v.id
|
|||
|
|
WHERE bw.id IS NULL
|
|||
|
|
""")
|
|||
|
|
conn.commit()
|
|||
|
|
|
|||
|
|
# 准备SQL语句
|
|||
|
|
insert_vocab_sql = """
|
|||
|
|
INSERT INTO ai_vocabulary
|
|||
|
|
(word, phonetic_us, phonetic_uk, phonetic, level, frequency, is_active,
|
|||
|
|
word_root, synonyms, antonyms, derivatives, collocations, created_at, updated_at)
|
|||
|
|
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
insert_definition_sql = """
|
|||
|
|
INSERT INTO ai_vocabulary_definitions
|
|||
|
|
(vocabulary_id, part_of_speech, definition_en, definition_cn, sort_order, created_at)
|
|||
|
|
VALUES (%s, %s, %s, %s, %s, %s)
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
insert_example_sql = """
|
|||
|
|
INSERT INTO ai_vocabulary_examples
|
|||
|
|
(vocabulary_id, sentence_en, sentence_cn, sort_order, created_at)
|
|||
|
|
VALUES (%s, %s, %s, %s, %s)
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
insert_book_word_sql = """
|
|||
|
|
INSERT INTO ai_vocabulary_book_words
|
|||
|
|
(book_id, vocabulary_id, sort_order, created_at)
|
|||
|
|
VALUES (%s, %s, %s, %s)
|
|||
|
|
"""
|
|||
|
|
|
|||
|
|
success_count = 0
|
|||
|
|
error_count = 0
|
|||
|
|
|
|||
|
|
# 遍历每一行
|
|||
|
|
for index, row in df.iterrows():
|
|||
|
|
try:
|
|||
|
|
# 提取基本数据
|
|||
|
|
word = clean_text(row.get('Word'))
|
|||
|
|
if not word:
|
|||
|
|
continue
|
|||
|
|
|
|||
|
|
# 音标
|
|||
|
|
phonetic_us = clean_text(row.get('美式音标'))
|
|||
|
|
phonetic_uk = clean_text(row.get('英式音标'))
|
|||
|
|
phonetic = phonetic_us or phonetic_uk
|
|||
|
|
|
|||
|
|
# 释义
|
|||
|
|
translation_cn = clean_text(row.get('中文含义'))
|
|||
|
|
translation_en = clean_text(row.get('英文翻译(对应中文含义)'))
|
|||
|
|
|
|||
|
|
if not translation_cn:
|
|||
|
|
print(f"⚠️ 跳过 {word}:缺少中文含义")
|
|||
|
|
continue
|
|||
|
|
|
|||
|
|
# 如果没有英文翻译,使用单词本身
|
|||
|
|
if not translation_en:
|
|||
|
|
translation_en = word
|
|||
|
|
|
|||
|
|
# 提取词性
|
|||
|
|
part_of_speech = extract_part_of_speech(translation_cn)
|
|||
|
|
|
|||
|
|
# 例句
|
|||
|
|
example_en = clean_text(row.get('例句'))
|
|||
|
|
example_cn = clean_text(row.get('例句中文翻译'))
|
|||
|
|
|
|||
|
|
# 词根
|
|||
|
|
word_root = clean_text(row.get('词根'))
|
|||
|
|
|
|||
|
|
# 同义词(处理为JSON)
|
|||
|
|
synonyms_text = clean_text(row.get('同义词(含义)'))
|
|||
|
|
synonyms_json = '[]'
|
|||
|
|
if synonyms_text:
|
|||
|
|
# 分号分隔,格式:word1(含义1);word2(含义2)
|
|||
|
|
import json
|
|||
|
|
syn_list = []
|
|||
|
|
for syn in synonyms_text.split(';'):
|
|||
|
|
syn = syn.strip()
|
|||
|
|
if syn:
|
|||
|
|
syn_list.append(syn)
|
|||
|
|
synonyms_json = json.dumps(syn_list, ensure_ascii=False)
|
|||
|
|
|
|||
|
|
# 反义词(处理为JSON)
|
|||
|
|
antonyms_text = clean_text(row.get('反义词(含义)'))
|
|||
|
|
antonyms_json = '[]'
|
|||
|
|
if antonyms_text:
|
|||
|
|
import json
|
|||
|
|
ant_list = []
|
|||
|
|
for ant in antonyms_text.split(';'):
|
|||
|
|
ant = ant.strip()
|
|||
|
|
if ant:
|
|||
|
|
ant_list.append(ant)
|
|||
|
|
antonyms_json = json.dumps(ant_list, ensure_ascii=False)
|
|||
|
|
|
|||
|
|
# 派生词(处理为JSON)
|
|||
|
|
derivatives_text = clean_text(row.get('派生词(含义)'))
|
|||
|
|
derivatives_json = '[]'
|
|||
|
|
if derivatives_text:
|
|||
|
|
import json
|
|||
|
|
der_list = []
|
|||
|
|
for der in derivatives_text.split(';'):
|
|||
|
|
der = der.strip()
|
|||
|
|
if der:
|
|||
|
|
der_list.append(der)
|
|||
|
|
derivatives_json = json.dumps(der_list, ensure_ascii=False)
|
|||
|
|
|
|||
|
|
# 词组搭配(处理为JSON)
|
|||
|
|
phrases_text = clean_text(row.get('词组搭配(中文含义)'))
|
|||
|
|
collocations_json = '[]'
|
|||
|
|
if phrases_text:
|
|||
|
|
import json
|
|||
|
|
col_list = []
|
|||
|
|
for phrase in phrases_text.split(';'):
|
|||
|
|
phrase = phrase.strip()
|
|||
|
|
if phrase:
|
|||
|
|
col_list.append(phrase)
|
|||
|
|
collocations_json = json.dumps(col_list, ensure_ascii=False)
|
|||
|
|
|
|||
|
|
# 插入词汇
|
|||
|
|
now = datetime.now()
|
|||
|
|
cursor.execute(insert_vocab_sql, (
|
|||
|
|
word,
|
|||
|
|
phonetic_us,
|
|||
|
|
phonetic_uk,
|
|||
|
|
phonetic,
|
|||
|
|
'beginner',
|
|||
|
|
index + 1,
|
|||
|
|
True,
|
|||
|
|
word_root,
|
|||
|
|
synonyms_json,
|
|||
|
|
antonyms_json,
|
|||
|
|
derivatives_json,
|
|||
|
|
collocations_json,
|
|||
|
|
now,
|
|||
|
|
now
|
|||
|
|
))
|
|||
|
|
|
|||
|
|
vocab_id = cursor.lastrowid
|
|||
|
|
|
|||
|
|
# 插入主要释义
|
|||
|
|
cursor.execute(insert_definition_sql, (
|
|||
|
|
vocab_id,
|
|||
|
|
part_of_speech,
|
|||
|
|
translation_en, # ✅ 使用正确的英文翻译
|
|||
|
|
translation_cn,
|
|||
|
|
0,
|
|||
|
|
now
|
|||
|
|
))
|
|||
|
|
|
|||
|
|
# 插入例句
|
|||
|
|
if example_en and example_cn:
|
|||
|
|
# 处理多个例句(用分号分隔)
|
|||
|
|
examples_en = example_en.split(';')
|
|||
|
|
examples_cn = example_cn.split(';')
|
|||
|
|
|
|||
|
|
for i, (ex_en, ex_cn) in enumerate(zip(examples_en, examples_cn)):
|
|||
|
|
ex_en = ex_en.strip()
|
|||
|
|
ex_cn = ex_cn.strip()
|
|||
|
|
if ex_en and ex_cn:
|
|||
|
|
cursor.execute(insert_example_sql, (
|
|||
|
|
vocab_id,
|
|||
|
|
ex_en,
|
|||
|
|
ex_cn,
|
|||
|
|
i,
|
|||
|
|
now
|
|||
|
|
))
|
|||
|
|
|
|||
|
|
# 关联到词汇书
|
|||
|
|
cursor.execute(insert_book_word_sql, (
|
|||
|
|
BOOK_ID,
|
|||
|
|
vocab_id,
|
|||
|
|
index,
|
|||
|
|
now
|
|||
|
|
))
|
|||
|
|
|
|||
|
|
success_count += 1
|
|||
|
|
if success_count % 50 == 0:
|
|||
|
|
print(f"✅ 已导入 {success_count} 个单词...")
|
|||
|
|
conn.commit()
|
|||
|
|
|
|||
|
|
except Exception as e:
|
|||
|
|
error_count += 1
|
|||
|
|
print(f"❌ 导入第 {index + 1} 行失败: {e}")
|
|||
|
|
print(f" 单词: {word if 'word' in locals() else 'N/A'}")
|
|||
|
|
|
|||
|
|
# 提交事务
|
|||
|
|
conn.commit()
|
|||
|
|
|
|||
|
|
# 更新词汇书的总单词数
|
|||
|
|
cursor.execute(
|
|||
|
|
"UPDATE ai_vocabulary_books SET total_words = %s WHERE id = %s",
|
|||
|
|
(success_count, BOOK_ID)
|
|||
|
|
)
|
|||
|
|
conn.commit()
|
|||
|
|
|
|||
|
|
print(f"\n🎉 导入完成!")
|
|||
|
|
print(f"✅ 成功: {success_count} 个单词")
|
|||
|
|
print(f"❌ 失败: {error_count} 个单词")
|
|||
|
|
|
|||
|
|
# 验证数据
|
|||
|
|
cursor.execute(
|
|||
|
|
"SELECT COUNT(*) FROM ai_vocabulary_book_words WHERE book_id = %s",
|
|||
|
|
(BOOK_ID,)
|
|||
|
|
)
|
|||
|
|
count = cursor.fetchone()[0]
|
|||
|
|
print(f"📊 词汇书中共有 {count} 个单词")
|
|||
|
|
|
|||
|
|
# 检查释义数量
|
|||
|
|
cursor.execute("""
|
|||
|
|
SELECT COUNT(DISTINCT d.vocabulary_id)
|
|||
|
|
FROM ai_vocabulary_book_words bw
|
|||
|
|
JOIN ai_vocabulary_definitions d ON d.vocabulary_id = bw.vocabulary_id
|
|||
|
|
WHERE bw.book_id = %s
|
|||
|
|
""", (BOOK_ID,))
|
|||
|
|
def_count = cursor.fetchone()[0]
|
|||
|
|
print(f"📊 有释义的单词: {def_count} 个")
|
|||
|
|
|
|||
|
|
# 检查例句数量
|
|||
|
|
cursor.execute("""
|
|||
|
|
SELECT COUNT(DISTINCT e.vocabulary_id)
|
|||
|
|
FROM ai_vocabulary_book_words bw
|
|||
|
|
JOIN ai_vocabulary_examples e ON e.vocabulary_id = bw.vocabulary_id
|
|||
|
|
WHERE bw.book_id = %s
|
|||
|
|
""", (BOOK_ID,))
|
|||
|
|
ex_count = cursor.fetchone()[0]
|
|||
|
|
print(f"📊 有例句的单词: {ex_count} 个")
|
|||
|
|
|
|||
|
|
except Exception as e:
|
|||
|
|
print(f"❌ 导入失败: {e}")
|
|||
|
|
import traceback
|
|||
|
|
traceback.print_exc()
|
|||
|
|
finally:
|
|||
|
|
if cursor:
|
|||
|
|
cursor.close()
|
|||
|
|
if conn:
|
|||
|
|
conn.close()
|
|||
|
|
|
|||
|
|
if __name__ == '__main__':
|
|||
|
|
import_words_from_excel('data/小学.xlsx')
|