Files
ai_english/import_cet4_words.py
2025-11-17 14:09:17 +08:00

267 lines
9.9 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""导入大学英语四级核心词汇到数据库"""
import pandas as pd
import mysql.connector
from datetime import datetime
import json
# 数据库配置
db_config = {
'host': 'localhost',
'port': 3306,
'user': 'root',
'password': 'JKjk20011115',
'database': 'ai_english_learning',
'charset': 'utf8mb4'
}
# 词汇书ID
BOOK_ID = 'cet4_core_2500'
def clean_text(text):
"""清理文本处理nan和空值"""
if pd.isna(text) or str(text).strip() == '' or str(text).strip() == 'nan':
return None
return str(text).strip()
def extract_part_of_speech(translation):
"""从中文翻译中提取词性"""
if not translation:
return 'noun'
pos_map = {
'v.': 'verb',
'n.': 'noun',
'adj.': 'adjective',
'adv.': 'adverb',
'prep.': 'preposition',
'conj.': 'conjunction',
'pron.': 'pronoun',
'interj.': 'interjection'
}
for abbr, full in pos_map.items():
if abbr in translation or abbr.replace('.', '') in translation:
return full
# 中文词性判断
if '' in translation:
return 'verb'
elif '' in translation or '' in translation:
return 'adjective'
elif '' in translation:
return 'adverb'
elif '' in translation:
return 'preposition'
elif '' in translation:
return 'conjunction'
return 'noun'
def import_words_from_excel(file_path):
"""从Excel导入单词"""
try:
print(f"📖 正在读取文件: {file_path}")
df = pd.read_excel(file_path)
print(f"📊 文件列名: {df.columns.tolist()}")
print(f"📊 总行数: {len(df)}")
conn = mysql.connector.connect(**db_config)
cursor = conn.cursor()
# 清理旧数据
print("\n清理旧数据...")
cursor.execute("DELETE FROM ai_vocabulary_book_words WHERE book_id = %s", (BOOK_ID,))
cursor.execute("""
DELETE v FROM ai_vocabulary v
LEFT JOIN ai_vocabulary_book_words bw ON bw.vocabulary_id = v.id
WHERE bw.id IS NULL
""")
conn.commit()
# SQL语句
insert_vocab_sql = """
INSERT INTO ai_vocabulary
(word, phonetic_us, phonetic_uk, phonetic, level, frequency, is_active,
word_root, synonyms, antonyms, derivatives, collocations, created_at, updated_at)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
"""
insert_definition_sql = """
INSERT INTO ai_vocabulary_definitions
(vocabulary_id, part_of_speech, definition_en, definition_cn, sort_order, created_at)
VALUES (%s, %s, %s, %s, %s, %s)
"""
insert_example_sql = """
INSERT INTO ai_vocabulary_examples
(vocabulary_id, sentence_en, sentence_cn, sort_order, created_at)
VALUES (%s, %s, %s, %s, %s)
"""
insert_book_word_sql = """
INSERT INTO ai_vocabulary_book_words
(book_id, vocabulary_id, sort_order, created_at)
VALUES (%s, %s, %s, %s)
"""
success_count = 0
error_count = 0
for index, row in df.iterrows():
try:
# 尝试多个可能的列名
word = clean_text(row.get('Word') or row.get('单词(Word)') or row.get('单词'))
if not word:
continue
# 检查单词是否已存在
cursor.execute("SELECT id FROM ai_vocabulary WHERE word = %s", (word,))
existing = cursor.fetchone()
if existing:
# 单词已存在使用现有ID
vocab_id = existing[0]
else:
# 单词不存在,插入新记录
# 音标
phonetic_us = clean_text(row.get('美式音标'))
phonetic_uk = clean_text(row.get('英式音标'))
phonetic = phonetic_us or phonetic_uk
# 释义
translation_cn = clean_text(row.get('中文含义'))
translation_en = clean_text(row.get('英文翻译(对应中文含义)') or row.get('英文翻译'))
if not translation_cn:
print(f"⚠️ 跳过 {word}:缺少中文含义")
continue
if not translation_en:
translation_en = word
part_of_speech = extract_part_of_speech(translation_cn)
# 例句
example_en = clean_text(row.get('例句'))
example_cn = clean_text(row.get('例句中文翻译'))
# 词根
word_root = clean_text(row.get('词根') or row.get('词根/词源'))
# 同义词
synonyms_text = clean_text(row.get('同义词(含义)'))
synonyms_json = '[]'
if synonyms_text:
syn_list = [syn.strip() for syn in synonyms_text.split('') if syn.strip()]
synonyms_json = json.dumps(syn_list, ensure_ascii=False)
# 反义词
antonyms_text = clean_text(row.get('反义词(含义)'))
antonyms_json = '[]'
if antonyms_text:
ant_list = [ant.strip() for ant in antonyms_text.split('') if ant.strip()]
antonyms_json = json.dumps(ant_list, ensure_ascii=False)
# 派生词
derivatives_text = clean_text(row.get('派生词(含义)'))
derivatives_json = '[]'
if derivatives_text:
der_list = [der.strip() for der in derivatives_text.split('') if der.strip()]
derivatives_json = json.dumps(der_list, ensure_ascii=False)
# 词组搭配
phrases_text = clean_text(row.get('词组搭配(中文含义)'))
collocations_json = '[]'
if phrases_text:
col_list = [phrase.strip() for phrase in phrases_text.split('') if phrase.strip()]
collocations_json = json.dumps(col_list, ensure_ascii=False)
# 插入词汇
now = datetime.now()
cursor.execute(insert_vocab_sql, (
word, phonetic_us, phonetic_uk, phonetic,
'intermediate', # CET-4难度
index + 1, True,
word_root, synonyms_json, antonyms_json,
derivatives_json, collocations_json,
now, now
))
vocab_id = cursor.lastrowid
# 插入释义
cursor.execute(insert_definition_sql, (
vocab_id, part_of_speech, translation_en,
translation_cn, 0, now
))
# 插入例句
if example_en and example_cn:
examples_en = example_en.split('')
examples_cn = example_cn.split('')
for i, (ex_en, ex_cn) in enumerate(zip(examples_en, examples_cn)):
ex_en = ex_en.strip()
ex_cn = ex_cn.strip()
if ex_en and ex_cn:
cursor.execute(insert_example_sql, (
vocab_id, ex_en, ex_cn, i, now
))
# 关联到词汇书(无论单词是否已存在,都要关联)
now = datetime.now()
try:
cursor.execute(insert_book_word_sql, (
BOOK_ID, vocab_id, index, now
))
except Exception as link_error:
# 如果关联已存在,跳过
if '1062' not in str(link_error): # 不是重复键错误
raise
success_count += 1
if success_count % 100 == 0:
print(f"✅ 已处理 {success_count} 个单词...")
conn.commit()
except Exception as e:
error_count += 1
print(f"❌ 导入第 {index + 1} 行失败: {e}")
conn.commit()
# 更新词汇书总数
cursor.execute(
"UPDATE ai_vocabulary_books SET total_words = %s WHERE id = %s",
(success_count, BOOK_ID)
)
conn.commit()
print(f"\n🎉 大学英语四级核心词汇导入完成!")
print(f"✅ 成功: {success_count} 个单词")
print(f"❌ 失败: {error_count} 个单词")
# 验证
cursor.execute(
"SELECT COUNT(*) FROM ai_vocabulary_book_words WHERE book_id = %s",
(BOOK_ID,)
)
print(f"📊 词汇书中共有 {cursor.fetchone()[0]} 个单词")
except Exception as e:
print(f"❌ 导入失败: {e}")
import traceback
traceback.print_exc()
finally:
if 'cursor' in locals():
cursor.close()
if 'conn' in locals():
conn.close()
if __name__ == '__main__':
import_words_from_excel('data/大学英语四级核心词汇.xlsx')