commit
92e122297f
@ -0,0 +1,2 @@
|
||||
.venv/
|
||||
src/__pycache__/
|
||||
@ -0,0 +1,15 @@
|
||||
# Requirements
|
||||
### Minimal versions
|
||||
python 3.7
|
||||
postgresql 11
|
||||
|
||||
### Database schema
|
||||
includes a generated `TSVECTOR` column `geodata_search_ts` with a GIN index `geodata_search_idx`
|
||||
```sql
|
||||
ALTER TABLE geodata ADD COLUMN geodata_search_ts TSVECTOR
|
||||
GENERATED ALWAYS AS (to_tsvector('russian', geodata.description)) STORED
|
||||
```
|
||||
(use coalesce() if combining multiple columns)
|
||||
```sql
|
||||
CREATE INDEX geodata_search_idx ON geodata USING GIN (geodata_search_ts);
|
||||
```
|
||||
@ -0,0 +1,15 @@
|
||||
{
|
||||
"folders": [
|
||||
{
|
||||
"path": "."
|
||||
}
|
||||
],
|
||||
"settings": {
|
||||
"python.defaultInterpreterPath": ".venv/bin/python",
|
||||
"python.formatting.blackPath": "/usr/bin/black",
|
||||
"python.linting.flake8Path": "/usr/bin/flake8",
|
||||
"python.linting.flake8Args": [
|
||||
"--black-config=''"
|
||||
]
|
||||
}
|
||||
}
|
||||
@ -0,0 +1,18 @@
|
||||
anyio==3.5.0
|
||||
asgiref==3.5.0
|
||||
click==8.0.4
|
||||
et-xmlfile==1.1.0
|
||||
fastapi==0.74.1
|
||||
greenlet==1.1.2
|
||||
h11==0.13.0
|
||||
idna==3.3
|
||||
openpyxl==3.0.9
|
||||
psycopg2==2.9.3
|
||||
pydantic==1.9.0
|
||||
python-multipart==0.0.5
|
||||
six==1.16.0
|
||||
sniffio==1.2.0
|
||||
SQLAlchemy==1.4.31
|
||||
starlette==0.17.1
|
||||
typing_extensions==4.1.1
|
||||
uvicorn==0.17.5
|
||||
@ -0,0 +1,14 @@
|
||||
anyio==3.5.0
|
||||
et-xmlfile==1.1.0
|
||||
fastapi==0.74.1
|
||||
greenlet==1.1.2
|
||||
idna==3.3
|
||||
openpyxl==3.0.9
|
||||
psycopg2==2.9.3
|
||||
pydantic==1.9.0
|
||||
python-multipart==0.0.5
|
||||
six==1.16.0
|
||||
sniffio==1.2.0
|
||||
SQLAlchemy==1.4.31
|
||||
starlette==0.17.1
|
||||
typing_extensions==4.1.1
|
||||
@ -0,0 +1,62 @@
|
||||
from typing import List
|
||||
|
||||
from sqlalchemy.orm import Session
|
||||
from sqlalchemy.sql.expression import func
|
||||
from sqlalchemy import inspect
|
||||
|
||||
from . import models, schemas
|
||||
from .database import Base
|
||||
|
||||
|
||||
def get_item(db: Session, item_id: int):
    """Return a single dataset description by its primary key, or None."""
    query = db.query(models.Item)
    matching = query.filter(models.Item.id == item_id)
    return matching.first()
|
||||
|
||||
|
||||
def get_item_by_description(db: Session, needle: str, skip: int = 0, limit: int = 20):
    """Full-text search over dataset descriptions.

    *needle* may use websearch syntax, which PostgreSQL turns into a tsquery:
    - plain text: the given words in any order
    - "quoted text": the given words in the given order
    - OR: any of the given words
    - -word: must not contain the word

    Returns a page (skip/limit) of matching Items ordered by id.
    """
    # hardcode russian for now
    # built-in postgres websearch_to_tsquery() is good
    # https://www.postgresql.org/docs/11/textsearch-controls.html#TEXTSEARCH-PARSING-QUERIES
    # TODO: add requested search columns [0]
    items_table = inspect(models.Item).local_table
    # BUG FIX: the mapped "geodata" table has no "ts" column — the generated
    # search-vector column is "geodata_search_ts" (see models.Item and the
    # README schema); accessing ".c.ts" raised AttributeError at query time.
    ts_match = items_table.c.geodata_search_ts.op("@@")(
        func.websearch_to_tsquery('"russian"', needle)
    )
    result = (
        db.query(models.Item)
        .filter(ts_match)
        .order_by(items_table.c.id)
        .offset(skip)
        .limit(limit)
        .all()
    )
    return result
|
||||
|
||||
|
||||
def get_items(db: Session, skip: int = 0, limit: int = 20):
    """Return one page (skip/limit) of the dataset descriptions in the system."""
    page = db.query(models.Item).offset(skip).limit(limit)
    return page.all()
|
||||
|
||||
|
||||
def insert_items(db: Session, items: List[schemas.ItemCreate]):
    """Bulk-import dataset descriptions.

    Returns a (accepted, inserted) tuple: how many rows were handed in and
    how many actually landed in the table.
    """
    accepted = len(items)
    count_before = db.query(models.Item).count()
    # https://docs.sqlalchemy.org/en/14/dialects/postgresql.html#psycopg2-fast-execution-helpers
    db.add_all(models.ItemCreate(**entry.dict()) for entry in items)
    db.commit()
    count_after = db.query(models.Item).count()
    return (accepted, count_after - count_before)
|
||||
|
||||
|
||||
def get_headers(db: Session):
    """Return every header-mapping row (full table titles)."""
    headers_query = db.query(models.Header)
    return headers_query.all()
|
||||
@ -0,0 +1,18 @@
|
||||
from sqlalchemy import create_engine
|
||||
from sqlalchemy.ext.declarative import declarative_base
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
|
||||
# Database connection parameters
# NOTE(review): credentials are hardcoded ("PASSWORD" placeholder) — move them
# to environment/config before deployment.
# NOTE(review): name is misspelled ("SQALCHEMY"); kept as-is since it is a
# module-level name that may be referenced elsewhere.
SQALCHEMY_DATABASE_URL = "postgresql+psycopg2://geodata:PASSWORD@postgres/geodata"


# Engine with psycopg2 fast-executemany settings tuned for bulk inserts
# (used by crud.insert_items).
engine = create_engine(
    SQALCHEMY_DATABASE_URL,
    # https://docs.sqlalchemy.org/en/14/dialects/postgresql.html#psycopg2-connect-arguments
    executemany_mode="values_only",
    executemany_values_page_size=10000,
    executemany_batch_page_size=500,
)
# Session factory; sessions must be closed by the caller (see main.get_db).
SessionLocal = sessionmaker(autocommit=False, bind=engine)

# Declarative base shared by all ORM models.
Base = declarative_base()
|
||||
@ -0,0 +1,139 @@
|
||||
from datetime import datetime
|
||||
from re import IGNORECASE, sub as substitute
|
||||
from typing import List
|
||||
from urllib.parse import unquote_plus
|
||||
|
||||
from fastapi import Depends, FastAPI, File, HTTPException, UploadFile
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from . import crud, models, schemas, spreadsheet
|
||||
from .database import SessionLocal, engine
|
||||
|
||||
|
||||
# TODO: switch for Alembic [2]
# Create any missing tables at import time (no migration tooling yet).
models.Base.metadata.create_all(bind=engine)

app = FastAPI()
|
||||
|
||||
# Dependency
|
||||
def get_db():
    """FastAPI dependency: yield a database session, closing it afterwards."""
    session = SessionLocal()
    try:
        yield session
    finally:
        session.close()
|
||||
|
||||
|
||||
@app.post(
    "/items/",
    responses={
        422: {
            "description": "Загружен некорректный файл (ожидался .xlsx). "
            "Неизвестный заголовок. Или обнаружены данные в столбце без заголовка. "
            "Необходимо использовать файлы установленного образца."
        },
    },
    response_model=schemas.InsertStatus,
)
def create_items(file: UploadFile = File(...), db: Session = Depends(get_db)):
    """Import dataset descriptions into the database from an Excel file."""
    try:
        start = datetime.now()
        # parse spreadsheet into a collection of header and data iterators
        # TODO: Multiple worksheets per Workbook or one? [3]
        # assume one for now (second one have strange formatting)
        with spreadsheet.parse(file=file) as spreadsheet_parse:
            sheet = spreadsheet_parse[0]
            # TODO use fullfledged validation framework? [6]
            # ad-hoc for now:
            # fetch known headers from the database
            # NB: we never check the actual order of the columns, so we might suffer for it

            # we have to construct a collection of known headers anyway,
            # might as well make it a set
            unknown_headers = set(sheet["header"]) - set(
                dbh.spreadsheet for dbh in crud.get_headers(db=db)
            )
            if unknown_headers:
                raise HTTPException(
                    status_code=422,
                    detail="Unknown headers in the spreadsheet: {}. Check the coordinated format".format(
                        unknown_headers
                    ),
                )

            # construct a list of schemas.Item items
            # NOTE(review): row cells are mapped onto schema fields purely by
            # position — assumes spreadsheet column order matches
            # schemas.ItemCreate field order; verify against the template file.
            spreadsheet_item_list = [
                schemas.ItemCreate(
                    **{
                        key: row[i]
                        for i, key in enumerate(schemas.ItemCreate.__fields__.keys())
                    }
                )
                for row in sheet["data"]
            ]
            # dump all the data into database
            accepted, processed = crud.insert_items(db=db, items=spreadsheet_item_list)

    # openpyxl could not open the upload as an .xlsx workbook
    except spreadsheet.InvalidFileException:
        raise HTTPException(
            status_code=422, detail="Invalid file upload (expected .xlsx)"
        )
    # raised by spreadsheet.parse when a header-less column contains data
    except spreadsheet.DataInUnnamedColumnException:
        raise HTTPException(
            status_code=422, detail="Data is found in a column with empty header"
        )

    # accepted != processed means some rows were silently not inserted
    return schemas.InsertStatus(
        status="Success" if accepted == processed else "Warning",
        accepted=accepted,
        processed=processed,
        process_time=datetime.now() - start,
    )
|
||||
|
||||
|
||||
@app.get(
    "/item/{item_id}",
    response_model=schemas.Item,
    responses={404: {"description": "Такой набор данных отсутствует"}},
)
def read_item(item_id: int, db: Session = Depends(get_db)):
    """Individual page for one dataset description."""
    item = crud.get_item(db=db, item_id=item_id)
    if item is None:
        raise HTTPException(status_code=404, detail="Item not found")
    return item
|
||||
|
||||
|
||||
@app.get("/items/", response_model=List[schemas.Item])
def read_items(skip: int = 0, limit: int = 20, db: Session = Depends(get_db)):
    """Paged list of the dataset descriptions available in the system."""
    items = crud.get_items(db=db, skip=skip, limit=limit)
    return items
|
||||
|
||||
|
||||
@app.get(
    "/search/",
    response_model=List[schemas.Item],
    responses={400: {"description": "Запрос слишком короткий (минимум 3 символа)"}},
)
def search(q: str, skip: int = 0, limit: int = 20, db: Session = Depends(get_db)):
    """Full-text search over dataset description attributes.

    Supported query syntax (converted into a websearch tsquery):
    - plain text: the given words in any order
    - "quoted text": the given words in the given order
    - OR / ИЛИ: any of the given words
    - -word: must not contain the word

    Raises 400 when the decoded, normalized query is shorter than 3 characters.
    """
    # FIX: the 400-response description above had a typo ("минимумл").
    q = unquote_plus(q)
    # replace all full ИЛИ words with OR so PostgreSQL's websearch parser sees it
    q = substitute(r"\bИЛИ\b", "OR", q, flags=IGNORECASE)
    # NB: length is checked after normalization, so a bare "ИЛИ" (-> "OR") is rejected
    if len(q) < 3:
        raise HTTPException(status_code=400, detail="Query too short")
    return crud.get_item_by_description(db=db, needle=q, skip=skip, limit=limit)
|
||||
|
||||
|
||||
@app.get("/headers/", response_model=List[schemas.Header])
def read_headers(db: Session = Depends(get_db)):
    """Full human-readable column titles for the tables."""
    headers = crud.get_headers(db=db)
    return headers
|
||||
@ -0,0 +1,50 @@
|
||||
from sqlalchemy import Column, Integer, String, DateTime
|
||||
|
||||
# we might need vector concat later, then we'll have to bring in sqlalchemy_utils
|
||||
# https://sqlalchemy-utils.readthedocs.io/en/latest/_modules/sqlalchemy_utils/types/ts_vector.html
|
||||
from sqlalchemy.dialects.postgresql import TSVECTOR
|
||||
|
||||
from .database import Base
|
||||
|
||||
|
||||
class Header(Base):
    """Mapping between database column names and spreadsheet header titles."""

    __tablename__ = "headers"

    # database-side column name (primary key)
    database = Column(String, primary_key=True)
    # human-readable spreadsheet header this column corresponds to
    spreadsheet = Column(String)
|
||||
|
||||
|
||||
class ItemBase(Base):
    """ORM mapping for the "geodata" table: one dataset description per row.

    Every payload column is a plain String — values are stored exactly as they
    appear in the imported spreadsheet cells (including dates; see the `date`
    column below).
    """

    __tablename__ = "geodata"

    id = Column(Integer, primary_key=True, index=True)
    fadr = Column(String)
    oldid = Column(String)
    category = Column(String)
    basin = Column(String)
    deposit = Column(String)
    well = Column(String)
    depth = Column(String)
    stratum = Column(String)
    owner = Column(String)
    org = Column(String)
    ownercontacts = Column(String)
    samplelist = Column(String)
    # free-text description — feeds the generated geodata_search_ts vector
    description = Column(String)
    # NB: "formdimentions" spelling matches the live column name — do not "fix"
    formdimentions = Column(String)
    datalist = Column(String)
    resolution = Column(String)
    date = Column(String)
    additionalinfo = Column(String)
    scanner = Column(String)
    comment = Column(String)
    continuation = Column(String)
|
||||
|
||||
|
||||
class ItemCreate(ItemBase):
    """Insert-side model: ItemBase without the generated search-vector column.

    NOTE(review): subclassing an already-mapped class produces single-table
    inheritance over "geodata" with no discriminator — confirm this is the
    intended way to exclude geodata_search_ts from inserts.
    """

    pass
|
||||
|
||||
|
||||
class Item(ItemBase):
    """Read-side model: ItemBase plus the full-text search vector."""

    # TSVECTOR column generated by the database (GENERATED ALWAYS AS ... STORED,
    # see the README schema); never written by the application.
    geodata_search_ts = Column(TSVECTOR)
|
||||
@ -0,0 +1,69 @@
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Optional, Union
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class HeaderBase(BaseModel):
    """Pair of a database column name and its spreadsheet header title."""

    # database-side column name
    database: str
    # human-readable spreadsheet header
    spreadsheet: str
|
||||
|
||||
|
||||
class HeaderCreate(HeaderBase):
    """Create schema: identical to HeaderBase for now."""

    pass
|
||||
|
||||
|
||||
class Header(HeaderBase):
    """Header of a geological dataset description
    (field-expedition survey data); read schema built from ORM rows."""

    class Config:
        # allow pydantic to read attributes off SQLAlchemy objects
        orm_mode = True
|
||||
|
||||
|
||||
class ItemBase(BaseModel):
    """Description of a geological dataset gathered during field expeditions.

    All fields are optional free-form strings taken straight from spreadsheet
    cells; `date` may also arrive as a parsed datetime.
    NOTE(review): field declaration order must match the spreadsheet column
    order — main.create_items maps row cells onto these fields positionally.
    """

    fadr: Optional[str] = None
    oldid: Optional[str] = None
    category: Optional[str] = None
    basin: Optional[str] = None
    deposit: Optional[str] = None
    well: Optional[str] = None
    depth: Optional[str] = None
    stratum: Optional[str] = None
    owner: Optional[str] = None
    org: Optional[str] = None
    ownercontacts: Optional[str] = None
    samplelist: Optional[str] = None
    description: Optional[str] = None
    # NB: "formdimentions" spelling matches the database column — do not "fix"
    formdimentions: Optional[str] = None
    datalist: Optional[str] = None
    resolution: Optional[str] = None
    # spreadsheet date cells may come through as datetime objects
    date: Optional[Union[datetime, str]] = None
    additionalinfo: Optional[str] = None
    scanner: Optional[str] = None
    comment: Optional[str] = None
    continuation: Optional[str] = None
|
||||
|
||||
|
||||
class ItemCreate(ItemBase):
    """Create schema: identical to ItemBase for now."""

    pass
|
||||
|
||||
|
||||
class Item(ItemBase):
    """Read schema: an item as returned by the API, with its database id."""

    id: int

    class Config:
        # allow pydantic to read attributes off SQLAlchemy objects
        orm_mode = True
|
||||
|
||||
|
||||
class InsertStatus(BaseModel):
    """Basic insertion status metrics reported after a bulk import."""

    # "Success" when accepted == processed, "Warning" otherwise
    status: str
    # number of rows parsed from the spreadsheet
    accepted: int
    # number of rows actually added to the table
    processed: int
    # wall-clock duration of the whole import
    process_time: timedelta
|
||||
@ -0,0 +1,103 @@
|
||||
from openpyxl import Workbook, load_workbook
|
||||
from openpyxl.utils.exceptions import InvalidFileException
|
||||
from fastapi import File, UploadFile
|
||||
|
||||
from asyncio import run as asyncio_run
|
||||
from contextlib import closing, contextmanager
|
||||
from itertools import chain
|
||||
from os import remove
|
||||
from pathlib import Path
|
||||
from shutil import copyfileobj
|
||||
from tempfile import NamedTemporaryFile
|
||||
|
||||
|
||||
class DataInUnnamedColumnException(Exception):
    """Raised when a column without a header cell contains data —
    every column holding data has to be named."""
|
||||
|
||||
|
||||
@contextmanager
def parse(file: UploadFile = File(...)):
    """Context manager: parse an uploaded .xlsx into per-sheet iterators.

    Yields a list with one dict per sheet (currently only the active sheet):

    [
        {"header": header_iterator,"data": data_iterator}, # sheet1
        {"header": header_iterator,"data": data_iterator}, # sheet2
        # etc
    ]

    Raises DataInUnnamedColumnException if any cell right of the last named
    header column holds data; openpyxl's InvalidFileException propagates for
    non-.xlsx uploads. The upload and the temp copy are cleaned up on exit.
    """
    # prepare return list
    result = []

    suffix = Path(file.filename).suffix
    try:
        # TODO: decide if we use subdir in /tmp here, then create it [5]
        # NOTE(review): /tmp/notgeo must already exist or NamedTemporaryFile
        # fails — and then the finally block below hits NameError on `tmp` /
        # `spreadsheet_file`, masking the original error. Confirm and harden.
        with NamedTemporaryFile(delete=False, suffix=suffix, dir="/tmp/notgeo") as tmp:
            # spool the upload to disk so openpyxl can open it by path
            copyfileobj(file.file, tmp)
            spreadsheet_file = Path(tmp.name)

        # Unlike a normal workbook, a read-only workbook will use lazy loading.
        # The workbook must be explicitly closed with the close() method.

        # we use contextlib.closing() to do it for us
        with closing(load_workbook(spreadsheet_file, read_only=True)) as wb:
            # TODO: Multiple worksheets per Workbook or one? [3]
            # assume one for now (second one have strange formatting)
            ws = wb.active
            # assume headers are in the top row
            header = [
                cell
                for row in ws.iter_rows(max_row=1, values_only=True)
                for cell in row
            ]

            # assume data stretch is continuous
            # find first occurrence of None in header and assert that
            # no unnamed column contains any data
            # .index is 0-based and min_col= is 1-based so we want to
            # go for header.index(None)+1
            # unpack row generator into separate cell iterators, chain them together,
            # and make sure none of the cells contains any data
            try:
                last_column_with_header = header.index(None)
                if any(
                    chain(
                        *(
                            ws.iter_rows(
                                min_col=last_column_with_header + 1, values_only=True
                            )
                        )
                    )
                ):
                    raise DataInUnnamedColumnException(
                        "Data is found in a column with empty header"
                    )
            except ValueError:
                # header.index(None) couldn't find anything,
                # all cell in the first row are filled,
                # so all columns have headers,
                # we can safely continue
                last_column_with_header = len(header)

            # Construct spreadsheet data iterator
            # .index is 0-based and max_col= is 1-based so we might've wanted to
            # go for header.index(None)+1, but max_col= range is including
            # and we want to only include non-empty columns, and go for
            # the previous one (-1) so in the end
            # max_col=header.index(None)+1-1
            data = ws.iter_rows(
                min_row=2, max_col=last_column_with_header, values_only=True
            )
            result.append(
                {"header": (cell for cell in header if cell is not None), "data": data}
            )
            # END [3]

        yield result
    finally:
        # clean up by explicitly closing the files and removing temporary spreadsheet
        # file.close() is a coroutine on starlette's UploadFile, hence asyncio_run
        asyncio_run(file.close())
        tmp.close()
        remove(spreadsheet_file)
|
||||
Loading…
Reference in new issue