Files
ai_english/data/import_senior_high_words.py

266 lines
9.7 KiB
Python
Raw Normal View History

2025-11-17 13:39:05 +08:00
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""导入高中英语词汇到数据库"""
import pandas as pd
import mysql.connector
from datetime import datetime
import json
# 数据库配置
db_config = {
'host': 'localhost',
'port': 3306,
'user': 'root',
'password': 'JKjk20011115',
'database': 'ai_english_learning',
'charset': 'utf8mb4'
}
# 词汇书ID
BOOK_ID = 'senior_high_3500'
def clean_text(text):
"""清理文本处理nan和空值"""
if pd.isna(text) or str(text).strip() == '' or str(text).strip() == 'nan':
return None
return str(text).strip()
def extract_part_of_speech(translation):
"""从中文翻译中提取词性"""
if not translation:
return 'noun'
pos_map = {
'v.': 'verb',
'n.': 'noun',
'adj.': 'adjective',
'adv.': 'adverb',
'prep.': 'preposition',
'conj.': 'conjunction',
'pron.': 'pronoun',
'interj.': 'interjection'
}
for abbr, full in pos_map.items():
if abbr in translation or abbr.replace('.', '') in translation:
return full
# 中文词性判断
if '' in translation:
return 'verb'
elif '' in translation or '' in translation:
return 'adjective'
elif '' in translation:
return 'adverb'
elif '' in translation:
return 'preposition'
elif '' in translation:
return 'conjunction'
return 'noun'
def import_words_from_excel(file_path):
"""从Excel导入单词"""
try:
print(f"📖 正在读取文件: {file_path}")
df = pd.read_excel(file_path)
print(f"📊 文件列名: {df.columns.tolist()}")
print(f"📊 总行数: {len(df)}")
conn = mysql.connector.connect(**db_config)
cursor = conn.cursor()
# 清理旧数据
print("\n清理旧数据...")
cursor.execute("DELETE FROM ai_vocabulary_book_words WHERE book_id = %s", (BOOK_ID,))
cursor.execute("""
DELETE v FROM ai_vocabulary v
LEFT JOIN ai_vocabulary_book_words bw ON bw.vocabulary_id = v.id
WHERE bw.id IS NULL
""")
conn.commit()
# SQL语句
insert_vocab_sql = """
INSERT INTO ai_vocabulary
(word, phonetic_us, phonetic_uk, phonetic, level, frequency, is_active,
word_root, synonyms, antonyms, derivatives, collocations, created_at, updated_at)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
"""
insert_definition_sql = """
INSERT INTO ai_vocabulary_definitions
(vocabulary_id, part_of_speech, definition_en, definition_cn, sort_order, created_at)
VALUES (%s, %s, %s, %s, %s, %s)
"""
insert_example_sql = """
INSERT INTO ai_vocabulary_examples
(vocabulary_id, sentence_en, sentence_cn, sort_order, created_at)
VALUES (%s, %s, %s, %s, %s)
"""
insert_book_word_sql = """
INSERT INTO ai_vocabulary_book_words
(book_id, vocabulary_id, sort_order, created_at)
VALUES (%s, %s, %s, %s)
"""
success_count = 0
error_count = 0
for index, row in df.iterrows():
try:
word = clean_text(row.get('Word'))
if not word:
continue
# 检查单词是否已存在
cursor.execute("SELECT id FROM ai_vocabulary WHERE word = %s", (word,))
existing = cursor.fetchone()
if existing:
# 单词已存在使用现有ID
vocab_id = existing[0]
else:
# 单词不存在,插入新记录
# 音标
phonetic_us = clean_text(row.get('美式音标'))
phonetic_uk = clean_text(row.get('英式音标'))
phonetic = phonetic_us or phonetic_uk
# 释义
translation_cn = clean_text(row.get('中文含义'))
translation_en = clean_text(row.get('英文翻译(对应中文含义)'))
if not translation_cn:
print(f"⚠️ 跳过 {word}:缺少中文含义")
continue
if not translation_en:
translation_en = word
part_of_speech = extract_part_of_speech(translation_cn)
# 例句
example_en = clean_text(row.get('例句'))
example_cn = clean_text(row.get('例句中文翻译'))
# 词根
word_root = clean_text(row.get('词根'))
# 同义词
synonyms_text = clean_text(row.get('同义词(含义)'))
synonyms_json = '[]'
if synonyms_text:
syn_list = [syn.strip() for syn in synonyms_text.split('') if syn.strip()]
synonyms_json = json.dumps(syn_list, ensure_ascii=False)
# 反义词
antonyms_text = clean_text(row.get('反义词(含义)'))
antonyms_json = '[]'
if antonyms_text:
ant_list = [ant.strip() for ant in antonyms_text.split('') if ant.strip()]
antonyms_json = json.dumps(ant_list, ensure_ascii=False)
# 派生词
derivatives_text = clean_text(row.get('派生词(含义)'))
derivatives_json = '[]'
if derivatives_text:
der_list = [der.strip() for der in derivatives_text.split('') if der.strip()]
derivatives_json = json.dumps(der_list, ensure_ascii=False)
# 词组搭配
phrases_text = clean_text(row.get('词组搭配(中文含义)'))
collocations_json = '[]'
if phrases_text:
col_list = [phrase.strip() for phrase in phrases_text.split('') if phrase.strip()]
collocations_json = json.dumps(col_list, ensure_ascii=False)
# 插入词汇
now = datetime.now()
cursor.execute(insert_vocab_sql, (
word, phonetic_us, phonetic_uk, phonetic,
'intermediate', # 高中难度
index + 1, True,
word_root, synonyms_json, antonyms_json,
derivatives_json, collocations_json,
now, now
))
vocab_id = cursor.lastrowid
# 插入释义
cursor.execute(insert_definition_sql, (
vocab_id, part_of_speech, translation_en,
translation_cn, 0, now
))
# 插入例句
if example_en and example_cn:
examples_en = example_en.split('')
examples_cn = example_cn.split('')
for i, (ex_en, ex_cn) in enumerate(zip(examples_en, examples_cn)):
ex_en = ex_en.strip()
ex_cn = ex_cn.strip()
if ex_en and ex_cn:
cursor.execute(insert_example_sql, (
vocab_id, ex_en, ex_cn, i, now
))
# 关联到词汇书(无论单词是否已存在,都要关联)
now = datetime.now()
try:
cursor.execute(insert_book_word_sql, (
BOOK_ID, vocab_id, index, now
))
except Exception as link_error:
# 如果关联已存在,跳过
if '1062' not in str(link_error): # 不是重复键错误
raise
success_count += 1
if success_count % 100 == 0:
print(f"✅ 已处理 {success_count} 个单词...")
conn.commit()
except Exception as e:
error_count += 1
print(f"❌ 导入第 {index + 1} 行失败: {e}")
conn.commit()
# 更新词汇书总数
cursor.execute(
"UPDATE ai_vocabulary_books SET total_words = %s WHERE id = %s",
(success_count, BOOK_ID)
)
conn.commit()
print(f"\n🎉 高中词汇导入完成!")
print(f"✅ 成功: {success_count} 个单词")
print(f"❌ 失败: {error_count} 个单词")
# 验证
cursor.execute(
"SELECT COUNT(*) FROM ai_vocabulary_book_words WHERE book_id = %s",
(BOOK_ID,)
)
print(f"📊 词汇书中共有 {cursor.fetchone()[0]} 个单词")
except Exception as e:
print(f"❌ 导入失败: {e}")
import traceback
traceback.print_exc()
finally:
if 'cursor' in locals():
cursor.close()
if 'conn' in locals():
conn.close()
if __name__ == '__main__':
import_words_from_excel('data/高中英语词汇.xlsx')