Files
ai_english/data/import_primary_words.py
2025-11-17 13:39:05 +08:00

211 lines
7.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""导入小学英语核心词汇到数据库"""
import os
import re
import unicodedata
import uuid
from datetime import datetime

import mysql.connector
import pandas as pd
# Database connection settings passed to mysql.connector.connect().
# Every credential can be overridden with an AI_ENGLISH_DB_* environment
# variable so that secrets do not have to live in source control; the literals
# are the previously hard-coded values, kept only as backward-compatible
# fallbacks.
# NOTE(review): a real password is committed here - rotate it and remove the
# fallback once all environments supply AI_ENGLISH_DB_PASSWORD.
db_config = {
    'host': os.environ.get('AI_ENGLISH_DB_HOST', 'localhost'),
    'port': int(os.environ.get('AI_ENGLISH_DB_PORT', '3306')),
    'user': os.environ.get('AI_ENGLISH_DB_USER', 'root'),
    'password': os.environ.get('AI_ENGLISH_DB_PASSWORD', 'JKjk20011115'),
    'database': os.environ.get('AI_ENGLISH_DB_NAME', 'ai_english_learning'),
    'charset': 'utf8mb4',
}

# Identifier of the vocabulary book the imported words are linked to.
BOOK_ID = 'primary_core_1000'
def generate_uuid():
    """Return a newly generated random UUID (uuid4) in its string form."""
    fresh_id = uuid.uuid4()
    return f"{fresh_id}"
# Part-of-speech abbreviations searched for in the Chinese gloss, in priority
# order (first match wins; no match means 'noun').  The look-behind keeps
# "v." from matching inside "adv.".
_POS_PATTERNS = (
    ('verb', re.compile(r'(?<![A-Za-z])v[ti]?\.')),
    ('adjective', re.compile(r'(?<![A-Za-z])adj\.')),
    ('adverb', re.compile(r'(?<![A-Za-z])adv\.')),
    ('preposition', re.compile(r'(?<![A-Za-z])prep\.')),
    ('conjunction', re.compile(r'(?<![A-Za-z])conj\.')),
)

# Upsert of the word itself.  "id = LAST_INSERT_ID(id)" makes cursor.lastrowid
# report the existing row's id when the word is already present.
_INSERT_VOCAB_SQL = """
    INSERT INTO ai_vocabulary
        (word, phonetic, level, frequency, is_active, created_at, updated_at)
    VALUES (%s, %s, %s, %s, %s, %s, %s)
    ON DUPLICATE KEY UPDATE
        id = LAST_INSERT_ID(id),
        phonetic = VALUES(phonetic),
        level = VALUES(level),
        frequency = VALUES(frequency),
        updated_at = VALUES(updated_at)
"""

# NOTE(review): definitions and examples are plain INSERTs, so importing the
# same spreadsheet twice presumably duplicates them - confirm against the
# table constraints.
_INSERT_DEFINITION_SQL = """
    INSERT INTO ai_vocabulary_definitions
        (vocabulary_id, part_of_speech, definition_en, definition_cn, sort_order, created_at)
    VALUES (%s, %s, %s, %s, %s, %s)
"""

_INSERT_EXAMPLE_SQL = """
    INSERT INTO ai_vocabulary_examples
        (vocabulary_id, sentence_en, sentence_cn, sort_order, created_at)
    VALUES (%s, %s, %s, %s, %s)
"""

_INSERT_BOOK_WORD_SQL = """
    INSERT INTO ai_vocabulary_book_words
        (book_id, vocabulary_id, sort_order, created_at)
    VALUES (%s, %s, %s, %s)
    ON DUPLICATE KEY UPDATE sort_order = VALUES(sort_order)
"""


def _clean_cell(value):
    """Return a spreadsheet cell as stripped text, or '' when it is empty/NaN."""
    if value is None or pd.isna(value):
        return ''
    return str(value).strip()


def _detect_part_of_speech(translation):
    """Guess the part of speech from abbreviations such as 'v.' in the gloss.

    The text is NFKC-normalised first so that full-width spellings of the
    abbreviations are recognised as well.  Returns 'noun' when nothing matches.
    """
    normalized = unicodedata.normalize('NFKC', translation)
    for part_of_speech, pattern in _POS_PATTERNS:
        if pattern.search(normalized):
            return part_of_speech
    return 'noun'


def _first_example(text):
    """Return the first sentence of a semicolon-separated example list.

    Both the ASCII ';' and the full-width '；' are accepted as separators.
    """
    return text.replace('；', ';').split(';', 1)[0].strip()


def _import_row(cursor, index, row):
    """Write one spreadsheet row to the database using *cursor*.

    Returns False (writing nothing) when the row has no word, True otherwise.
    Any database error propagates to the caller, which owns the transaction.
    """
    word = _clean_cell(row.get('Word', ''))
    if not word:
        return False

    # Prefer the American phonetic transcription, fall back to the British one.
    phonetic = (
        _clean_cell(row.get('美式音标', ''))
        or _clean_cell(row.get('英式音标', ''))
        or None
    )
    translation = _clean_cell(row.get('中文含义', ''))
    part_of_speech = _detect_part_of_speech(translation)
    example_en = _first_example(_clean_cell(row.get('例句', '')))
    example_cn = _first_example(_clean_cell(row.get('例句中文翻译', '')))

    now = datetime.now()
    cursor.execute(_INSERT_VOCAB_SQL, (
        word,
        phonetic,
        'beginner',  # primary-school vocabulary is stored at the beginner level
        index + 1,   # the row number doubles as the frequency rank
        True,
        now,
        now,
    ))
    vocab_id = cursor.lastrowid

    if translation:
        cursor.execute(_INSERT_DEFINITION_SQL, (
            vocab_id,
            part_of_speech,
            word,  # no English definition available; the word itself is stored
            translation,
            0,
            now,
        ))

    # Only the first example pair is stored, and only when both halves exist.
    if example_en and example_cn:
        cursor.execute(_INSERT_EXAMPLE_SQL, (
            vocab_id,
            example_en,
            example_cn,
            0,
            now,
        ))

    cursor.execute(_INSERT_BOOK_WORD_SQL, (
        BOOK_ID,
        vocab_id,
        index,
        now,
    ))
    return True


def import_words_from_excel(file_path):
    """Import the words of an Excel sheet into the vocabulary tables.

    The sheet is read with pandas; the columns used are 'Word', '美式音标',
    '英式音标', '中文含义', '例句' and '例句中文翻译'.  Each row is written in its
    own transaction: it is committed on success and rolled back on failure, so
    a failing row leaves no partial data behind.  Afterwards the book's
    total_words is updated and the number of linked words is printed.

    Args:
        file_path: Path of the .xlsx file to import.

    Returns:
        None.  Progress and errors are reported on stdout; exceptions are
        caught and printed rather than propagated.
    """
    # Bound up front so the finally clause is safe even when reading the file
    # or connecting fails.
    conn = None
    cursor = None
    try:
        print(f"📖 正在读取文件: {file_path}")
        df = pd.read_excel(file_path)
        print(f"📊 文件列名: {df.columns.tolist()}")
        print(f"📊 总行数: {len(df)}")
        print("\n前5行数据预览:")
        print(df.head())

        conn = mysql.connector.connect(**db_config)
        cursor = conn.cursor()

        success_count = 0
        error_count = 0
        for index, row in df.iterrows():
            try:
                imported = _import_row(cursor, index, row)
                if imported:
                    conn.commit()
            except Exception as e:
                # Discard whatever part of this row was already written.
                conn.rollback()
                error_count += 1
                print(f"❌ 导入第 {index + 1} 行失败: {e}")
                print(f" 数据: {row.to_dict()}")
                continue
            if not imported:
                continue
            success_count += 1
            if success_count % 50 == 0:
                print(f"✅ 已导入 {success_count} 个单词...")

        # Record the number of imported words on the vocabulary book.
        cursor.execute(
            "UPDATE ai_vocabulary_books SET total_words = %s WHERE id = %s",
            (success_count, BOOK_ID)
        )
        conn.commit()
        print(f"\n🎉 导入完成!")
        print(f"✅ 成功: {success_count} 个单词")
        print(f"❌ 失败: {error_count} 个单词")

        # Sanity check: count the words now linked to the book.
        cursor.execute(
            "SELECT COUNT(*) FROM ai_vocabulary_book_words WHERE book_id = %s",
            (BOOK_ID,)
        )
        count = cursor.fetchone()[0]
        print(f"📊 词汇书中共有 {count} 个单词")
    except Exception as e:
        print(f"❌ 导入失败: {e}")
        import traceback
        traceback.print_exc()
    finally:
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()
if __name__ == '__main__':
    # Script entry point: import the primary-school word list from its
    # default location (relative to the current working directory).
    excel_path = 'data/小学.xlsx'
    import_words_from_excel(excel_path)